diff options
author | Alexander Kornienko <alexfh@google.com> | 2013-03-14 10:51:38 +0000 |
---|---|---|
committer | Alexander Kornienko <alexfh@google.com> | 2013-03-14 10:51:38 +0000 |
commit | 647735c781c5b37061ee03d6e9e6c7dda92218e2 (patch) | |
tree | 5a5e56606d41060263048b5a5586b3d2380898ba /test/Transforms/LoopVectorize/X86 | |
parent | 6aed25d93d1cfcde5809a73ffa7dc1b0d6396f66 (diff) | |
parent | f635ef401786c84df32090251a8cf45981ecca33 (diff) |
Updating branches/google/stable to r176857
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/stable@177040 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'test/Transforms/LoopVectorize/X86')
-rw-r--r-- | test/Transforms/LoopVectorize/X86/avx1.ll | 4 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/conversion-cost.ll | 2 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/cost-model.ll | 5 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/gcc-examples.ll | 27 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll | 28 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/no-vector.ll | 22 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll | 52 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/parallel-loops.ll | 114 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/reduction-crash.ll | 35 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/small-size.ll | 170 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/struct-store.ll | 27 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/unroll-small-loops.ll | 50 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/unroll_selection.ll | 71 | ||||
-rw-r--r-- | test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll | 150 |
14 files changed, 747 insertions, 10 deletions
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll index a2d176a534..6c0366eae9 100644 --- a/test/Transforms/LoopVectorize/X86/avx1.ll +++ b/test/Transforms/LoopVectorize/X86/avx1.ll @@ -27,7 +27,7 @@ define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwta ;CHECK: @read_mod_i64 -;CHECK: load <8 x i64> +;CHECK: load <2 x i64> ;CHECK: ret i32 define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 0 @@ -37,7 +37,7 @@ define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp { %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] %2 = getelementptr inbounds i64* %a, i64 %indvars.iv %3 = load i64* %2, align 4 - %4 = mul i64 %3, 3 + %4 = add i64 %3, 3 store i64 %4, i64* %2, align 4 %indvars.iv.next = add i64 %indvars.iv, 1 %lftr.wideiv = trunc i64 %indvars.iv.next to i32 diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 8f1bb545fa..23d9233544 100644 --- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ;CHECK: @conversion_cost1 -;CHECK: store <2 x i8> +;CHECK: store <32 x i8> ;CHECK: ret define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 3 diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll index 628f9912c8..b7f479acf9 100644 --- a/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -8,8 +8,11 @@ target triple = "x86_64-apple-macosx10.8.0" @d = common global [2048 x i32] zeroinitializer, align 16 @a = common global [2048 x i32] zeroinitializer, align 16 +; The program below 
gathers and scatters data. We better not vectorize it. ;CHECK: cost_model_1 -;CHECK: <4 x i32> +;CHECK-NOT: <2 x i32> +;CHECK-NOT: <4 x i32> +;CHECK-NOT: <8 x i32> ;CHECK: ret void define void @cost_model_1() nounwind uwtable noinline ssp { entry: diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll index 574c529834..d2d0eac305 100644 --- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll @@ -1,4 +1,5 @@ -; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -force-vector-unroll=0 -dce -instcombine -S | FileCheck %s -check-prefix=UNROLL target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -9,10 +10,19 @@ target triple = "x86_64-apple-macosx10.8.0" ; Select VF = 8; ;CHECK: @example1 -;CHECK: load <8 x i32> -;CHECK: add nsw <8 x i32> -;CHECK: store <8 x i32> +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> ;CHECK: ret void + +;UNROLL: @example1 +;UNROLL: load <4 x i32> +;UNROLL: load <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: add nsw <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example1() nounwind uwtable ssp { br label %1 @@ -34,13 +44,18 @@ define void @example1() nounwind uwtable ssp { ret void } - -; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. +; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. 
;CHECK: @example10b ;CHECK: load <4 x i16> ;CHECK: sext <4 x i16> ;CHECK: store <4 x i32> ;CHECK: ret void +;UNROLL: @example10b +;UNROLL: load <4 x i16> +;UNROLL: load <4 x i16> +;UNROLL: store <4 x i32> +;UNROLL: store <4 x i32> +;UNROLL: ret void define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp { br label %1 diff --git a/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll new file mode 100644 index 0000000000..186fba87d6 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/min-trip-count-switch.ll @@ -0,0 +1,28 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -vectorizer-min-trip-count=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; CHECK: <4 x float> +define void @trivial_loop(float* nocapture %a) nounwind uwtable optsize { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds float* %a, i64 %indvars.iv + %0 = load float* %arrayidx, align 4, !tbaa !0 + %add = fadd float %0, 1.000000e+00 + store float %add, float* %arrayidx, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 8 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"float", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git 
a/test/Transforms/LoopVectorize/X86/no-vector.ll b/test/Transforms/LoopVectorize/X86/no-vector.ll new file mode 100644 index 0000000000..692eec9895 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/no-vector.ll @@ -0,0 +1,22 @@ +; RUN: opt -S -mtriple=i386-unknown-freebsd -mcpu=i486 -loop-vectorize < %s + +define i32 @PR14639(i8* nocapture %s, i32 %len) nounwind { +entry: + %cmp4 = icmp sgt i32 %len, 0 + br i1 %cmp4, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %i.06 = phi i32 [ %inc, %for.body ], [ 0, %entry ] + %r.05 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8* %s, i32 %i.06 + %0 = load i8* %arrayidx, align 1 + %conv = sext i8 %0 to i32 + %xor = xor i32 %conv, %r.05 + %inc = add nsw i32 %i.06, 1 + %exitcond = icmp eq i32 %inc, %len + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %r.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ] + ret i32 %r.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll new file mode 100644 index 0000000000..452d0df133 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/parallel-loops-after-reg2mem.ll @@ -0,0 +1,52 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; The parallel loop has been invalidated by the new memory accesses introduced +; by reg2mem (Loop::isParallel() starts to return false). Ensure the loop is +; now non-vectorizable. 
+ +;CHECK-NOT: <4 x i32> +define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + %indvars.iv.next.reg2mem = alloca i64 + %indvars.iv.reg2mem = alloca i64 + %"reg2mem alloca point" = bitcast i32 0 to i32 + store i64 0, i64* %indvars.iv.reg2mem + br label %for.body + +for.body: ; preds = %for.body.for.body_crit_edge, %entry + %indvars.iv.reload = load i64* %indvars.iv.reg2mem + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv.reload + %0 = load i32* %arrayidx, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv.reload + %1 = load i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3 + store i32 %0, i32* %arrayidx4, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next = add i64 %indvars.iv.reload, 1 + ; A new store without the parallel metadata here: + store i64 %indvars.iv.next, i64* %indvars.iv.next.reg2mem + %indvars.iv.next.reload1 = load i64* %indvars.iv.next.reg2mem + %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next.reload1 + %2 = load i32* %arrayidx6, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + store i32 %2, i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %indvars.iv.next.reload = load i64* %indvars.iv.next.reg2mem + %lftr.wideiv = trunc i64 %indvars.iv.next.reload to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body.for.body_crit_edge, !llvm.loop.parallel !3 + +for.body.for.body_crit_edge: ; preds = %for.body + %indvars.iv.next.reload2 = load i64* %indvars.iv.next.reg2mem + store i64 %indvars.iv.next.reload2, i64* %indvars.iv.reg2mem + br label %for.body + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata 
!{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !3} diff --git a/test/Transforms/LoopVectorize/X86/parallel-loops.ll b/test/Transforms/LoopVectorize/X86/parallel-loops.ll new file mode 100644 index 0000000000..f648722734 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/parallel-loops.ll @@ -0,0 +1,114 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; A tricky loop: +; +; void loop(int *a, int *b) { +; for (int i = 0; i < 512; ++i) { +; a[a[i]] = b[i]; +; a[i] = b[i+1]; +; } +;} + +;CHECK: @loop +;CHECK-NOT: <4 x i32> +define void @loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4, !tbaa !0 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4, !tbaa !0 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3 + store i32 %0, i32* %arrayidx4, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %2 = load i32* %arrayidx6, align 4, !tbaa !0 + store i32 %2, i32* %arrayidx2, align 4, !tbaa !0 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} + +; The same loop with parallel loop metadata added to the loop branch +; and the memory instructions. 
+ +;CHECK: @parallel_loop +;CHECK: <4 x i32> +define void @parallel_loop(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3 + ; This store might have originated from inlining a function with a parallel + ; loop. Refers to a list with the "original loop reference" (!4) also included. + store i32 %0, i32* %arrayidx4, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !5 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %2 = load i32* %arrayidx6, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + store i32 %2, i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !3 + +for.end: ; preds = %for.body + ret void +} + +; The same loop with an illegal parallel loop metadata: the memory +; accesses refer to a different loop's identifier. 
+ +;CHECK: @mixed_metadata +;CHECK-NOT: <4 x i32> + +define void @mixed_metadata(i32* nocapture %a, i32* nocapture %b) nounwind uwtable { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32* %b, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + %arrayidx2 = getelementptr inbounds i32* %a, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + %idxprom3 = sext i32 %1 to i64 + %arrayidx4 = getelementptr inbounds i32* %a, i64 %idxprom3 + ; This refers to the loop marked with !7 which we are not in at the moment. + ; It should prevent detecting as a parallel loop. + store i32 %0, i32* %arrayidx4, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !7 + %indvars.iv.next = add i64 %indvars.iv, 1 + %arrayidx6 = getelementptr inbounds i32* %b, i64 %indvars.iv.next + %2 = load i32* %arrayidx6, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + store i32 %2, i32* %arrayidx2, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !6 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 512 + br i1 %exitcond, label %for.end, label %for.body, !llvm.loop.parallel !6 + +for.end: ; preds = %for.body + ret void +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} +!3 = metadata !{metadata !3} +!4 = metadata !{metadata !4} +!5 = metadata !{metadata !3, metadata !4} +!6 = metadata !{metadata !6} +!7 = metadata !{metadata !7} diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll new file mode 100644 index 0000000000..f580846a02 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -loop-vectorize -mcpu=prescott 
< %s | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" +target triple = "i386-apple-darwin" + +; PR15344 +define void @test1(float* nocapture %arg, i32 %arg1) nounwind { +; CHECK: @test1 +; CHECK: preheader +; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0 +; CHECK: vector.memcheck + +bb: + br label %bb2 + +bb2: ; preds = %bb + %tmp = load double* null, align 8 + br i1 undef, label %bb3, label %bb12 + +bb3: ; preds = %bb3, %bb2 + %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ] + %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ] + %tmp6 = getelementptr inbounds [16 x double]* undef, i32 0, i32 %tmp5 + %tmp7 = load double* %tmp6, align 4 + %tmp8 = add nsw i32 %tmp5, 1 + %tmp9 = fadd fast double %tmp4, undef + %tmp10 = getelementptr inbounds float* %arg, i32 %tmp5 + store float undef, float* %tmp10, align 4 + %tmp11 = icmp eq i32 %tmp8, %arg1 + br i1 %tmp11, label %bb12, label %bb3 + +bb12: ; preds = %bb3, %bb2 + %tmp13 = phi double [ %tmp, %bb2 ], [ %tmp9, %bb3 ] + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/small-size.ll b/test/Transforms/LoopVectorize/X86/small-size.ll new file mode 100644 index 0000000000..f390b33c03 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/small-size.ll @@ -0,0 +1,170 @@ +; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 +@G = common global [32 x [1024 x i32]] zeroinitializer, align 16 +@ub = common global [1024 x i32] 
zeroinitializer, align 16 +@uc = common global [1024 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@fa = common global [1024 x float] zeroinitializer, align 16 +@fb = common global [1024 x float] zeroinitializer, align 16 +@ic = common global [1024 x i32] zeroinitializer, align 16 +@da = common global [1024 x float] zeroinitializer, align 16 +@db = common global [1024 x float] zeroinitializer, align 16 +@dc = common global [1024 x float] zeroinitializer, align 16 +@dd = common global [1024 x float] zeroinitializer, align 16 +@dj = common global [1024 x i32] zeroinitializer, align 16 + +; We can optimize this test without a tail. +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +; Can't vectorize in 'optsize' mode because we need a tail. 
+;CHECK: @example2 +;CHECK-NOT: store <4 x i32> +;CHECK: ret void +define void @example2(i32 %n, i32 %x) optsize { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph5, label %.preheader + +..preheader_crit_edge: ; preds = %.lr.ph5 + %phitmp = sext i32 %n to i64 + br label %.preheader + +.preheader: ; preds = %..preheader_crit_edge, %0 + %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ] + %2 = icmp eq i32 %n, 0 + br i1 %2, label %._crit_edge, label %.lr.ph + +.lr.ph5: ; preds = %0, %.lr.ph5 + %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ] + %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6 + store i32 %x, i32* %3, align 4 + %indvars.iv.next7 = add i64 %indvars.iv6, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ] + %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ] + %4 = add nsw i32 %.02, -1 + %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %6 = load i32* %5, align 4 + %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %8 = load i32* %7, align 4 + %9 = and i32 %8, %6 + %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %9, i32* %10, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %11 = icmp eq i32 %4, 0 + br i1 %11, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %.preheader + ret void +} + +; N is unknown, we need a tail. Can't vectorize. 
+;CHECK: @example3 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + + +; We can't vectorize this one because we need a runtime ptr check. +;CHECK: @example23 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + + +; We CAN vectorize this example because the pointers are marked as noalias. 
+;CHECK: @example23b +;CHECK: <4 x i32> +;CHECK: ret void +define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + + diff --git a/test/Transforms/LoopVectorize/X86/struct-store.ll b/test/Transforms/LoopVectorize/X86/struct-store.ll new file mode 100644 index 0000000000..a995e43a5a --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/struct-store.ll @@ -0,0 +1,27 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-unknown-linux-gnu -S + +; Make sure we are not crashing on this one. 
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@glbl = external global [16 x { i64, i64 }], align 16 + +declare void @fn() + +define void @test() { +entry: + br label %loop + +loop: + %indvars.iv = phi i64 [ %indvars.iv.next, %loop ], [ 0, %entry ] + %tmp = getelementptr inbounds [16 x { i64, i64 }]* @glbl, i64 0, i64 %indvars.iv + store { i64, i64 } { i64 ptrtoint (void ()* @fn to i64), i64 0 }, { i64, i64 }* %tmp, align 16 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 16 + br i1 %exitcond, label %loop, label %exit + +exit: + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll new file mode 100644 index 0000000000..ef63a145d0 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/unroll-small-loops.ll @@ -0,0 +1,50 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" +;CHECK: @foo +;CHECK: load <4 x i32> +;CHECK-NOT: load <4 x i32> +;CHECK: store <4 x i32> +;CHECK-NOT: store <4 x i32> +;CHECK: ret +define i32 @foo(i32* nocapture %A) nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = add nsw i32 %3, 6 + store i32 %4, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp 
eq i32 %lftr.wideiv, 100 + br i1 %exitcond, label %5, label %1 + +; <label>:5 ; preds = %1 + ret i32 undef +} + +;CHECK: @bar +;CHECK: store <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret +define i32 @bar(i32* nocapture %A, i32 %n) nounwind uwtable ssp { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph, label %._crit_edge + +.lr.ph: ; preds = %0, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i32* %A, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = add nsw i32 %3, 6 + store i32 %4, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret i32 undef +} diff --git a/test/Transforms/LoopVectorize/X86/unroll_selection.ll b/test/Transforms/LoopVectorize/X86/unroll_selection.ll new file mode 100644 index 0000000000..2d7b663804 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/unroll_selection.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -force-vector-width=4 -force-vector-unroll=0 -dce -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +; Don't unroll when we have register pressure. 
+;CHECK: reg_pressure +;CHECK: load <4 x double> +;CHECK-NOT: load <4 x double> +;CHECK: store <4 x double> +;CHECK-NOT: store <4 x double> +;CHECK: ret +define void @reg_pressure(double* nocapture %A, i32 %n) nounwind uwtable ssp { + %1 = sext i32 %n to i64 + br label %2 + +; <label>:2 ; preds = %2, %0 + %indvars.iv = phi i64 [ %indvars.iv.next, %2 ], [ %1, %0 ] + %3 = getelementptr inbounds double* %A, i64 %indvars.iv + %4 = load double* %3, align 8 + %5 = fadd double %4, 3.000000e+00 + %6 = fmul double %4, 2.000000e+00 + %7 = fadd double %5, %6 + %8 = fadd double %7, 2.000000e+00 + %9 = fmul double %8, 5.000000e-01 + %10 = fadd double %6, %9 + %11 = fsub double %10, %5 + %12 = fadd double %4, %11 + %13 = fdiv double %8, %12 + %14 = fmul double %13, %8 + %15 = fmul double %6, %14 + %16 = fmul double %5, %15 + %17 = fadd double %16, -3.000000e+00 + %18 = fsub double %4, %5 + %19 = fadd double %6, %18 + %20 = fadd double %13, %19 + %21 = fadd double %20, %17 + %22 = fadd double %21, 3.000000e+00 + %23 = fmul double %4, %22 + store double %23, double* %3, align 8 + %indvars.iv.next = add i64 %indvars.iv, -1 + %24 = trunc i64 %indvars.iv to i32 + %25 = icmp eq i32 %24, 0 + br i1 %25, label %26, label %2 + +; <label>:26 ; preds = %2 + ret void +} + +; This is a small loop. Unroll it twice. 
+;CHECK: small_loop +;CHECK: xor +;CHECK: xor +;CHECK: ret +define void @small_loop(i16* nocapture %A, i64 %n) nounwind uwtable ssp { + %1 = icmp eq i64 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %i.01 = phi i64 [ %5, %.lr.ph ], [ 0, %0 ] + %2 = getelementptr inbounds i16* %A, i64 %i.01 + %3 = load i16* %2, align 2 + %4 = xor i16 %3, 3 + store i16 %4, i16* %2, align 2 + %5 = add i64 %i.01, 1 + %exitcond = icmp eq i64 %5, %n + br i1 %exitcond, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} diff --git a/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll new file mode 100644 index 0000000000..59bb8d0054 --- /dev/null +++ b/test/Transforms/LoopVectorize/X86/vector_ptr_load_store.ll @@ -0,0 +1,150 @@ +; RUN: opt -loop-vectorize -mcpu=corei7-avx -debug -S < %s 2>&1 | FileCheck %s +; REQUIRES: asserts + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +%0 = type { %0*, %1 } +%1 = type { i8*, i32 } + +@p = global [2048 x [8 x i32*]] zeroinitializer, align 16 +@q = global [2048 x i16] zeroinitializer, align 16 +@r = global [2048 x i16] zeroinitializer, align 16 + +; Tests for widest type +; Ensure that we count the pointer store in the first test case. We have a +; consecutive vector of pointers store, therefore we should count it towards the +; widest vector count. 
+; +; CHECK: test_consecutive_store +; CHECK: The Widest type: 64 bits +define void @test_consecutive_store(%0**, %0**, %0** nocapture) nounwind ssp uwtable align 2 { + %4 = load %0** %2, align 8 + %5 = icmp eq %0** %0, %1 + br i1 %5, label %12, label %6 + +; <label>:6 ; preds = %3 + br label %7 + +; <label>:7 ; preds = %7, %6 + %8 = phi %0** [ %0, %6 ], [ %9, %7 ] + store %0* %4, %0** %8, align 8 + %9 = getelementptr inbounds %0** %8, i64 1 + %10 = icmp eq %0** %9, %1 + br i1 %10, label %11, label %7 + +; <label>:11 ; preds = %7 + br label %12 + +; <label>:12 ; preds = %11, %3 + ret void +} + +; However, if the store of a set of pointers is not to consecutive memory we do +; NOT count the store towards the widest vector type. +; In the test case below we add i16 values and store them in an array of pointers, +; therefore the widest type should be i16. +; int* p[2048][8]; +; short q[2048]; +; for (int y = 0; y < 8; ++y) +; for (int i = 0; i < 1024; ++i) { +; p[i][y] = (int*) (1 + q[i]); +; } +; CHECK: test_nonconsecutive_store +; CHECK: The Widest type: 16 bits +define void @test_nonconsecutive_store() nounwind ssp uwtable { + br label %1 + +; <label>:1 ; preds = %14, %0 + %2 = phi i64 [ 0, %0 ], [ %15, %14 ] + br label %3 + +; <label>:3 ; preds = %3, %1 + %4 = phi i64 [ 0, %1 ], [ %11, %3 ] + %5 = getelementptr inbounds [2048 x i16]* @q, i64 0, i64 %4 + %6 = load i16* %5, align 2 + %7 = sext i16 %6 to i64 + %8 = add i64 %7, 1 + %9 = inttoptr i64 %8 to i32* + %10 = getelementptr inbounds [2048 x [8 x i32*]]* @p, i64 0, i64 %4, i64 %2 + store i32* %9, i32** %10, align 8 + %11 = add i64 %4, 1 + %12 = trunc i64 %11 to i32 + %13 = icmp ne i32 %12, 1024 + br i1 %13, label %3, label %14 + +; <label>:14 ; preds = %3 + %15 = add i64 %2, 1 + %16 = trunc i64 %15 to i32 + %17 = icmp ne i32 %16, 8 + br i1 %17, label %1, label %18 + +; <label>:18 ; preds = %14 + ret void +} + + +@ia = global [1024 x i32*] zeroinitializer, align 16 +@ib = global [1024 x i32] zeroinitializer, align 16 
+@ic = global [1024 x i8] zeroinitializer, align 16 +@p2 = global [2048 x [8 x i32*]] zeroinitializer, align 16 +@q2 = global [2048 x i16] zeroinitializer, align 16 + +;; Now we check the same rules for loads. We should take consecutive loads of +;; pointer types into account. +; CHECK: test_consecutive_ptr_load +; CHECK: The Widest type: 64 bits +define i8 @test_consecutive_ptr_load() nounwind readonly ssp uwtable { + br label %1 + +; <label>:1 ; preds = %1, %0 + %2 = phi i64 [ 0, %0 ], [ %10, %1 ] + %3 = phi i8 [ 0, %0 ], [ %9, %1 ] + %4 = getelementptr inbounds [1024 x i32*]* @ia, i32 0, i64 %2 + %5 = load i32** %4, align 4 + %6 = ptrtoint i32* %5 to i64 + %7 = trunc i64 %6 to i8 + %8 = add i8 %3, 1 + %9 = add i8 %7, %8 + %10 = add i64 %2, 1 + %11 = icmp ne i64 %10, 1024 + br i1 %11, label %1, label %12 + +; <label>:12 ; preds = %1 + %13 = phi i8 [ %9, %1 ] + ret i8 %13 +} + +;; However, we should not take non-consecutive loads of pointers into account. +; CHECK: test_nonconsecutive_ptr_load +; CHECK: The Widest type: 16 bits +define void @test_nonconsecutive_ptr_load() nounwind ssp uwtable { + br label %1 + +; <label>:1 ; preds = %13, %0 + %2 = phi i64 [ 0, %0 ], [ %14, %13 ] + br label %3 + +; <label>:3 ; preds = %3, %1 + %4 = phi i64 [ 0, %1 ], [ %10, %3 ] + %5 = getelementptr inbounds [2048 x [8 x i32*]]* @p2, i64 0, i64 %4, i64 %2 + %6 = getelementptr inbounds [2048 x i16]* @q2, i64 0, i64 %4 + %7 = load i32** %5, align 2 + %8 = ptrtoint i32* %7 to i64 + %9 = trunc i64 %8 to i16 + store i16 %9, i16* %6, align 8 + %10 = add i64 %4, 1 + %11 = trunc i64 %10 to i32 + %12 = icmp ne i32 %11, 1024 + br i1 %12, label %3, label %13 + +; <label>:13 ; preds = %3 + %14 = add i64 %2, 1 + %15 = trunc i64 %14 to i32 + %16 = icmp ne i32 %15, 8 + br i1 %16, label %1, label %17 + +; <label>:17 ; preds = %13 + ret void +} + |