From bd00a934c653fb1666fa7d18267644b4e9d14e5e Mon Sep 17 00:00:00 2001
From: Eli Friedman <eli.friedman@gmail.com>
Date: Fri, 4 Nov 2011 23:46:11 +0000
Subject: Enhanced vzeroupper insertion pass that avoids inserting vzeroupper
 where it is unnecessary through local analysis.  Patch from Bruno Cardoso
 Lopes, with some additional changes.

I'm going to wait for any review comments and perform some additional testing before turning this on by default.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@143750 91177308-0d34-0410-b5e6-96231b3b80d8
---
 test/CodeGen/X86/avx-vzeroupper.ll | 83 ++++++++++++++++++++++++++++++++------
 1 file changed, 70 insertions(+), 13 deletions(-)

(limited to 'test/CodeGen')

diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index eaf236c6c7..bf4ab5be15 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,26 +1,83 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
 
-define <4 x float> @do_sse_local(<4 x float> %a) nounwind uwtable readnone ssp {
-entry:
-  %add.i = fadd <4 x float> %a, %a
-  ret <4 x float> %add.i
-}
+declare <4 x float> @do_sse(<4 x float>)
+declare <8 x float> @do_avx(<8 x float>)
+declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
+@x = common global <4 x float> zeroinitializer, align 16
+@g = common global <8 x float> zeroinitializer, align 32
+
+;; Basic checking - don't emit any vzeroupper instruction
 
 ; CHECK: _test00
 define <4 x float> @test00(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
 entry:
+  ; CHECK-NOT: vzeroupper
   %add.i = fadd <4 x float> %a, %b
+  %call3 = call <4 x float> @do_sse(<4 x float> %add.i) nounwind
+  ; CHECK: ret
+  ret <4 x float> %call3
+}
+
+;; Check 256-bit parameter passing
+
+; CHECK: _test01
+define <8 x float> @test01(<4 x float> %a, <4 x float> %b, <8 x float> %c) nounwind uwtable ssp {
+entry:
+  %tmp = load <4 x float>* @x, align 16
   ; CHECK: vzeroupper
   ; CHECK-NEXT: callq _do_sse
-  %call3 = tail call <4 x float> @do_sse(<4 x float> %add.i) nounwind
-  %sub.i = fsub <4 x float> %call3, %add.i
+  %call = tail call <4 x float> @do_sse(<4 x float> %tmp) nounwind
+  store <4 x float> %call, <4 x float>* @x, align 16
   ; CHECK-NOT: vzeroupper
-  ; CHECK: callq _do_sse_local
-  %call8 = tail call <4 x float> @do_sse_local(<4 x float> %sub.i)
+  ; CHECK: callq _do_sse
+  %call2 = tail call <4 x float> @do_sse(<4 x float> %call) nounwind
+  store <4 x float> %call2, <4 x float>* @x, align 16
+  ; CHECK: ret
+  ret <8 x float> %c
+}
+
+;; Test that the pass converges and that vzeroupper is only issued when necessary;
+;; for this function it should be issued only once.
+
+; CHECK: _test02
+define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %add.i = fadd <4 x float> %a, %b
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  ; CHECK: LBB
+  ; CHECK-NOT: vzeroupper
+  %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
+  %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+  ; CHECK: callq _do_sse
+  %call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
+  ; CHECK-NEXT: callq _do_sse
+  %call7 = tail call <4 x float> @do_sse(<4 x float> %call5) nounwind
+  %tmp11 = load <8 x float>* @g, align 32
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %tmp11, i8 1) nounwind
   ; CHECK: vzeroupper
-  ; CHECK-NEXT: jmp _do_sse
-  %call10 = tail call <4 x float> @do_sse(<4 x float> %call8) nounwind
-  ret <4 x float> %call10
+  ; CHECK-NEXT: callq _do_sse
+  %call14 = tail call <4 x float> @do_sse(<4 x float> %0) nounwind
+  %1 = add nsw i32 %i.018, 1
+  %exitcond = icmp eq i32 %1, 4
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret <4 x float> %call14
 }
 
-declare <4 x float> @do_sse(<4 x float>)
+;; Check that we also perform vzeroupper when we return from a function.
+
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+entry:
+  %shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; CHECK-NOT: vzeroupper
+  ; CHECK: call
+  %call = call <8 x float> @do_avx(<8 x float> %shuf) nounwind
+  %shuf2 = shufflevector <8 x float> %call, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; CHECK: vzeroupper
+  ; CHECK: ret
+  ret <4 x float> %shuf2
+}
-- 
cgit v1.2.3-70-g09d2