From 5d79bb8770a5a655af0dccc87b952c3ea9bad45e Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Fri, 1 Mar 2013 19:07:31 +0000
Subject: LoopVectorize: Don't hang forever if a PHI only has skipped PHI uses.

Fixes PR15384.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176366 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a696a2ffba..ef9c7c9db2 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2724,6 +2724,9 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // Is this a bin op ?
     FoundBinOp |= !isa<PHINode>(Iter);
 
+    // Remember the current instruction.
+    Instruction *OldIter = Iter;
+
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
@@ -2749,7 +2752,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       if (isa<PHINode>(Iter) && isa<PHINode>(U) &&
           U->getParent() != TheLoop->getHeader() &&
           TheLoop->contains(U) &&
-          Iter->getNumUses() > 1)
+          Iter->hasNUsesOrMore(2))
         continue;
 
       // We can't have multiple inside users.
@@ -2769,6 +2772,10 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       Iter = U;
     }
 
+    // If all uses were skipped this can't be a reduction variable.
+    if (Iter == OldIter)
+      return false;
+
     // We found a reduction var if we have reached the original
     // phi node and we only have a single instruction with out-of-loop
     // users.
-- 
cgit v1.2.3-70-g09d2


From 5290baacb8ca4fb75d798e873a441cad11cbfb2c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 2 Mar 2013 01:33:49 +0000
Subject: PR14448 - prevent the loop vectorizer from vectorizing the same loop
 twice. The LoopVectorizer often runs multiple times on the same function due
 to inlining. When this happens the loop vectorizer often vectorizes the same
 loops multiple times, increasing code size and adding unneeded branches. With
 this patch, the vectorizer during vectorization puts metadata on scalar loops
 and marks them as 'already vectorized' so that it knows to ignore them when
 it sees them a second time.

PR14448.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176399 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp      | 18 ++++++
 test/Transforms/LoopVectorize/vectorize-once.ll | 75 +++++++++++++++++++++++++
 2 files changed, 93 insertions(+)
 create mode 100644 test/Transforms/LoopVectorize/vectorize-once.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ef9c7c9db2..0d11372808 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -116,6 +116,12 @@ static const unsigned TinyTripCountUnrollThreshold = 128;
 /// number of pointers. Notice that the check is quadratic!
 static const unsigned RuntimeMemoryCheckThreshold = 4;
 
+/// We use a metadata with this name  to indicate that a scalar loop was
+/// vectorized and that we don't need to re-vectorize it if we run into it
+/// again.
+static const char*
+AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized";
+
 namespace {
 
 // Forward declarations.
@@ -1159,6 +1165,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
+  // Mark the old scalar loop with metadata that tells us not to vectorize this
+  // loop again if we run into it.
+  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
+  OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
+
   // Some loops have a single integer induction variable, while other loops
   // don't. One example is c++ iterators that often have multiple pointer
   // induction variables. In the code below we also support a case where we
@@ -2224,6 +2235,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
   BasicBlock *Header = TheLoop->getHeader();
 
+  // If we marked the scalar loop as "already vectorized" then no need
+  // to vectorize it again.
+  if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) {
+    DEBUG(dbgs() << "LV: This loop was vectorized before\n");
+    return false;
+  }
+
   // For each block in the loop.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
diff --git a/test/Transforms/LoopVectorize/vectorize-once.ll b/test/Transforms/LoopVectorize/vectorize-once.ll
new file mode 100644
index 0000000000..ac1694802a
--- /dev/null
+++ b/test/Transforms/LoopVectorize/vectorize-once.ll
@@ -0,0 +1,75 @@
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S -simplifycfg | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;
+; We want to make sure that we are vectorizeing the scalar loop only once
+; even if the pass manager runs the vectorizer multiple times due to inlining.
+
+
+; This test checks that we add metadata to vectorized loops
+; CHECK: _Z4foo1Pii
+; CHECK: <4 x i32>
+; CHECK: llvm.vectorizer.already_vectorized
+; CHECK: ret
+
+; This test comes from the loop:
+;
+;int foo (int *A, int n) {
+;  return std::accumulate(A, A + n, 0);
+;}
+define i32 @_Z4foo1Pii(i32* %A, i32 %n) #0 {
+entry:
+  %idx.ext = sext i32 %n to i64
+  %add.ptr = getelementptr inbounds i32* %A, i64 %idx.ext
+  %cmp3.i = icmp eq i32 %n, 0
+  br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i:                                       ; preds = %entry, %for.body.i
+  %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+  %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+  %0 = load i32* %__first.addr.04.i, align 4, !tbaa !0
+  %add.i = add nsw i32 %0, %__init.addr.05.i
+  %incdec.ptr.i = getelementptr inbounds i32* %__first.addr.04.i, i64 1
+  %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+  br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
+  %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+  ret i32 %__init.addr.0.lcssa.i
+}
+
+; This test checks that we don't vectorize loops that are marked with the "already vectorized" metadata.
+; CHECK: _Z4foo2Pii
+; CHECK-NOT: <4 x i32>
+; CHECK: llvm.vectorizer.already_vectorized
+; CHECK: ret
+define i32 @_Z4foo2Pii(i32* %A, i32 %n) #0 {
+entry:
+  %idx.ext = sext i32 %n to i64
+  %add.ptr = getelementptr inbounds i32* %A, i64 %idx.ext
+  %cmp3.i = icmp eq i32 %n, 0
+  br i1 %cmp3.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i
+
+for.body.i:                                       ; preds = %entry, %for.body.i
+  %__init.addr.05.i = phi i32 [ %add.i, %for.body.i ], [ 0, %entry ]
+  %__first.addr.04.i = phi i32* [ %incdec.ptr.i, %for.body.i ], [ %A, %entry ]
+  %0 = load i32* %__first.addr.04.i, align 4, !tbaa !0
+  %add.i = add nsw i32 %0, %__init.addr.05.i
+  %incdec.ptr.i = getelementptr inbounds i32* %__first.addr.04.i, i64 1
+  %cmp.i = icmp eq i32* %incdec.ptr.i, %add.ptr
+  br i1 %cmp.i, label %_ZSt10accumulateIPiiET0_T_S2_S1_.exit, label %for.body.i, !llvm.vectorizer.already_vectorized !3
+
+_ZSt10accumulateIPiiET0_T_S2_S1_.exit:            ; preds = %for.body.i, %entry
+  %__init.addr.0.lcssa.i = phi i32 [ 0, %entry ], [ %add.i, %for.body.i ]
+  ret i32 %__init.addr.0.lcssa.i
+}
+
+attributes #0 = { nounwind readonly ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{}
+
-- 
cgit v1.2.3-70-g09d2


From f22d9cfa6d145ce26930804cc3ac54340f49c38b Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Fri, 8 Mar 2013 16:58:37 +0000
Subject: Insert the reduction start value into the first bypass block to
 preserve domination.

Fixes PR15344.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176701 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         |  2 +-
 .../LoopVectorize/X86/reduction-crash.ll           | 35 ++++++++++++++++++++++
 2 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/reduction-crash.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0d11372808..11f4b02204 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1643,7 +1643,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // To do so, we need to generate the 'identity' vector and overide
     // one of the elements with the incoming scalar reduction. We need
     // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator());
+    Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator());
 
     // This is the vector-clone of the value that leaves the loop.
     VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
diff --git a/test/Transforms/LoopVectorize/X86/reduction-crash.ll b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
new file mode 100644
index 0000000000..44702c8dcf
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/reduction-crash.ll
@@ -0,0 +1,35 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+target triple = "i386-apple-darwin"
+
+; PR15344
+define void @test1(float* nocapture %arg, i32 %arg1) nounwind {
+; CHECK: @test1
+; CHECK: preheader
+; CHECK: insertelement <2 x double> zeroinitializer, double %tmp, i32 0
+; CHECK: vector.memcheck
+
+bb:
+  br label %bb2
+
+bb2:                                              ; preds = %bb
+  %tmp = load double* null, align 8
+  br i1 undef, label %bb3, label %bb12
+
+bb3:                                              ; preds = %bb3, %bb2
+  %tmp4 = phi double [ %tmp9, %bb3 ], [ %tmp, %bb2 ]
+  %tmp5 = phi i32 [ %tmp8, %bb3 ], [ 0, %bb2 ]
+  %tmp6 = getelementptr inbounds [16 x double]* undef, i32 0, i32 %tmp5
+  %tmp7 = load double* %tmp6, align 4
+  %tmp8 = add nsw i32 %tmp5, 1
+  %tmp9 = fadd fast double %tmp4, undef
+  %tmp10 = getelementptr inbounds float* %arg, i32 %tmp5
+  store float undef, float* %tmp10, align 4
+  %tmp11 = icmp eq i32 %tmp8, %arg1
+  br i1 %tmp11, label %bb12, label %bb3
+
+bb12:                                             ; preds = %bb3, %bb2
+  %tmp13 = phi double [ %tmp, %bb2 ], [ %tmp9, %bb3 ]
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From 56ee544a3af2b019329f06422c00e8a3645b895c Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Sat, 9 Mar 2013 15:56:34 +0000
Subject: LoopVectorizer: Ignore dbg.value instructions

We want vectorization to happen at -g. Ignore calls to the dbg.value intrinsic
and don't transfer them to the vectorized code.

radar://13378964

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176768 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 13 +++++-
 test/Transforms/LoopVectorize/dbg.value.ll | 70 ++++++++++++++++++++++++++++++
 2 files changed, 81 insertions(+), 2 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/dbg.value.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 11f4b02204..e1f2932231 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2088,6 +2088,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
     }
 
     case Instruction::Call: {
+      // Ignore dbg.value instructions.
+      if (isa<DbgValueInst>(it))
+        break;
+
       Module *M = BB->getParent()->getParent();
       CallInst *CI = cast<CallInst>(it);
       Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI);
@@ -2324,9 +2328,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         return false;
       }// end of PHI handling
 
-      // We still don't handle functions.
+      // We still don't handle functions. However, we can ignore dbg.value
+      // calls and we do handle certain intrinsic and libm functions.
       CallInst *CI = dyn_cast<CallInst>(it);
-      if (CI && !getIntrinsicIDForCall(CI, TLI)) {
+      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgValueInst>(CI)) {
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
@@ -3263,6 +3268,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 
     // For each instruction in the old loop.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      // Skip dbg.value instructions.
+      if (isa<DbgValueInst>(it))
+        continue;
+
       unsigned C = getInstructionCost(it, VF);
       Cost += C;
       DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
diff --git a/test/Transforms/LoopVectorize/dbg.value.ll b/test/Transforms/LoopVectorize/dbg.value.ll
new file mode 100644
index 0000000000..a2ea9511bb
--- /dev/null
+++ b/test/Transforms/LoopVectorize/dbg.value.ll
@@ -0,0 +1,70 @@
+; RUN: opt < %s -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine | FileCheck %s
+; Make sure we vectorize with debugging turned on.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@A = global [1024 x i32] zeroinitializer, align 16
+@B = global [1024 x i32] zeroinitializer, align 16
+@C = global [1024 x i32] zeroinitializer, align 16
+
+; CHECK: @test
+define i32 @test() #0 {
+entry:
+  tail call void @llvm.dbg.value(metadata !1, i64 0, metadata !9), !dbg !18
+  br label %for.body, !dbg !18
+
+for.body:
+  ;CHECK: load <4 x i32>
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @B, i64 0, i64 %indvars.iv, !dbg !19
+  %0 = load i32* %arrayidx, align 4, !dbg !19, !tbaa !21
+  %arrayidx2 = getelementptr inbounds [1024 x i32]* @C, i64 0, i64 %indvars.iv, !dbg !19
+  %1 = load i32* %arrayidx2, align 4, !dbg !19, !tbaa !21
+  %add = add nsw i32 %1, %0, !dbg !19
+  %arrayidx4 = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv, !dbg !19
+  store i32 %add, i32* %arrayidx4, align 4, !dbg !19, !tbaa !21
+  %indvars.iv.next = add i64 %indvars.iv, 1, !dbg !18
+  tail call void @llvm.dbg.value(metadata !{null}, i64 0, metadata !9), !dbg !18
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32, !dbg !18
+  %exitcond = icmp ne i32 %lftr.wideiv, 1024, !dbg !18
+  br i1 %exitcond, label %for.body, label %for.end, !dbg !18
+
+for.end:
+  ret i32 0, !dbg !24
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable "fp-contract-model"="standard" "no-frame-pointer-elim" "no-frame-pointer-elim-non-leaf" "realign-stack" "relocation-model"="pic" "ssp-buffers-size"="8" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"test", metadata !"/path/to/somewhere", metadata !"clang", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !2, metadata !11, metadata !""}
+!1 = metadata !{i32 0}
+!2 = metadata !{metadata !3}
+!3 = metadata !{i32 786478, i32 0, metadata !4, metadata !"test", metadata !"test", metadata !"test", metadata !4, i32 5, metadata !5, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @test, null, null, metadata !8, i32 5}
+!4 = metadata !{i32 786473, metadata !"test", metadata !"/path/to/somewhere", null}
+!5 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !6, i32 0, i32 0}
+!6 = metadata !{metadata !7}
+!7 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5}
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786688, metadata !10, metadata !"i", metadata !4, i32 6, metadata !7, i32 0, i32 0}
+!10 = metadata !{i32 786443, metadata !3, i32 6, i32 0, metadata !4, i32 0}
+!11 = metadata !{metadata !12, metadata !16, metadata !17}
+!12 = metadata !{i32 786484, i32 0, null, metadata !"A", metadata !"A", metadata !"", metadata !4, i32 1, metadata !13, i32 0, i32 1, [1024 x i32]* @A, null}
+!13 = metadata !{i32 786433, null, metadata !"", null, i32 0, i64 32768, i64 32, i32 0, i32 0, metadata !7, metadata !14, i32 0, i32 0}
+!14 = metadata !{metadata !15}
+!15 = metadata !{i32 786465, i64 0, i64 1024}
+!16 = metadata !{i32 786484, i32 0, null, metadata !"B", metadata !"B", metadata !"", metadata !4, i32 2, metadata !13, i32 0, i32 1, [1024 x i32]* @B, null}
+!17 = metadata !{i32 786484, i32 0, null, metadata !"C", metadata !"C", metadata !"", metadata !4, i32 3, metadata !13, i32 0, i32 1, [1024 x i32]* @C, null} 
+!18 = metadata !{i32 6, i32 0, metadata !10, null}
+!19 = metadata !{i32 7, i32 0, metadata !20, null}
+!20 = metadata !{i32 786443, metadata !10, i32 6, i32 0, metadata !4, i32 1}
+!21 = metadata !{metadata !"int", metadata !22}
+!22 = metadata !{metadata !"omnipotent char", metadata !23}
+!23 = metadata !{metadata !"Simple C/C++ TBAA"}
+!24 = metadata !{i32 9, i32 0, metadata !3, null}
-- 
cgit v1.2.3-70-g09d2


From 738295e4570f502360c11bc51843f5a8516a9526 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Sat, 9 Mar 2013 16:27:27 +0000
Subject: LoopVectorizer: Ignore all dbg intrinisic

Ignore all DbgIntriniscInfo instructions instead of just DbgValueInst.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176769 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e1f2932231..3da0f5d210 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2088,8 +2088,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
     }
 
     case Instruction::Call: {
-      // Ignore dbg.value instructions.
-      if (isa<DbgValueInst>(it))
+      // Ignore dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(it))
         break;
 
       Module *M = BB->getParent()->getParent();
@@ -2328,10 +2328,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         return false;
       }// end of PHI handling
 
-      // We still don't handle functions. However, we can ignore dbg.value
+      // We still don't handle functions. However, we can ignore dbg intrinsic
       // calls and we do handle certain intrinsic and libm functions.
       CallInst *CI = dyn_cast<CallInst>(it);
-      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgValueInst>(CI)) {
+      if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) {
         DEBUG(dbgs() << "LV: Found a call site.\n");
         return false;
       }
@@ -3268,8 +3268,8 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 
     // For each instruction in the old loop.
     for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-      // Skip dbg.value instructions.
-      if (isa<DbgValueInst>(it))
+      // Skip dbg intrinsics.
+      if (isa<DbgInfoIntrinsic>(it))
         continue;
 
       unsigned C = getInstructionCost(it, VF);
-- 
cgit v1.2.3-70-g09d2


From 0d932717d8d22b0e747b15fddea2c718043e4d51 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sat, 9 Mar 2013 19:22:40 +0000
Subject: Remove a source of nondeterminism from the LoopVectorizer.

This made us emit runtime checks in a random order. Hopefully bootstrap
miscompares will go away now.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@176775 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 3da0f5d210..07dd453424 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -419,7 +419,7 @@ public:
 
   /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
   /// respective Store/Load instruction(s) to calculate aliasing.
-  typedef DenseMap<Value*, Instruction* > AliasMap;
+  typedef MapVector<Value*, Instruction* > AliasMap;
   typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
 
   /// Returns true if it is legal to vectorize this loop.
-- 
cgit v1.2.3-70-g09d2


From d517da33b712b5d8d687ee5e5974056a4787ec4f Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Thu, 14 Mar 2013 18:54:36 +0000
Subject: LoopVectorize: Invert case when we use a vector cmp value to query
 select cost

We generate a select with a vectorized condition argument when the condition is
NOT loop invariant. Not the other way around.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@177098 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         |  2 +-
 .../LoopVectorize/X86/vector-scalar-select-cost.ll | 62 ++++++++++++++++++++++
 2 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 07dd453424..930d9c412f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3338,7 +3338,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
     bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
     Type *CondTy = SI->getCondition()->getType();
-    if (ScalarCond)
+    if (!ScalarCond)
       CondTy = VectorType::get(CondTy, VF);
 
     return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
diff --git a/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
new file mode 100644
index 0000000000..d1d23aa92c
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/vector-scalar-select-cost.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s  -loop-vectorize -mcpu=core2 -debug-only=loop-vectorize 2>&1 -S | FileCheck %s
+
+; Make sure we use the right select kind when querying select costs.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
+; CHECK: Checking a loop in "scalarselect"
+define void @scalarselect(i1 %cond) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+; A scalar select has a cost of 1 on core2
+; CHECK: cost of 1 for VF 2 {{.*}}  select i1 %cond, i32 %6, i32 0
+  %sel = select i1 %cond, i32 %6, i32 zeroinitializer
+  store i32 %sel, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+; CHECK: Checking a loop in "vectorselect"
+define void @vectorselect(i1 %cond) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  %8 = icmp ult i64 %indvars.iv, 8
+; A vector select has a cost of 4 on core2
+; CHECK: cost of 4 for VF 2 {{.*}}  select i1 %8, i32 %6, i32 0
+  %sel = select i1 %8, i32 %6, i32 zeroinitializer
+  store i32 %sel, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %9, label %1
+
+; <label>:9                                       ; preds = %1
+  ret void
+}
+
-- 
cgit v1.2.3-70-g09d2


From ac2cc0170f7d13e7c2e87f7728c99a6d07876f5c Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Thu, 4 Apr 2013 23:26:27 +0000
Subject: LoopVectorizer: Pass OperandValueKind information to the cost model

Pass down the fact that an operand is going to be a vector of constants.

This should bring the performance of MultiSource/Benchmarks/PAQ8p/paq8p on x86
back. It had degraded to scalar performance due to my pervious shift cost change
that made all shifts expensive on x86.

radar://13576547

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@178809 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 15 ++++++++++--
 .../LoopVectorize/X86/constant-vector-operand.ll   | 28 ++++++++++++++++++++++
 2 files changed, 41 insertions(+), 2 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/constant-vector-operand.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 930d9c412f..acf2b819b8 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3331,8 +3331,19 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   case Instruction::AShr:
   case Instruction::And:
   case Instruction::Or:
-  case Instruction::Xor:
-    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy);
+  case Instruction::Xor: {
+    // Certain instructions can be cheaper to vectorize if they have a constant
+    // second vector operand. One example of this are shifts on x86.
+    TargetTransformInfo::OperandValueKind Op1VK =
+      TargetTransformInfo::OK_AnyValue;
+    TargetTransformInfo::OperandValueKind Op2VK =
+      TargetTransformInfo::OK_AnyValue;
+
+    if (isa<ConstantInt>(I->getOperand(1)))
+      Op2VK = TargetTransformInfo::OK_UniformConstantValue;
+
+    return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK);
+  }
   case Instruction::Select: {
     SelectInst *SI = cast<SelectInst>(I);
     const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
diff --git a/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll b/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
new file mode 100644
index 0000000000..6c924409af
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
@@ -0,0 +1,28 @@
+; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize -dce -instcombine -S < %s | FileCheck %s
+
+@B = common global [1024 x i32] zeroinitializer, align 16
+@A = common global [1024 x i32] zeroinitializer, align 16
+
+; We use to not vectorize this loop because the shift was deemed to expensive.
+; Now that we differentiate shift cost base on the operand value kind, we will
+; vectorize this loop.
+; CHECK: ashr <4 x i32>
+define void @f() {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @B, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %shl = ashr i32 %0, 3
+  %arrayidx2 = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  store i32 %shl, i32* %arrayidx2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From 08a0e8f8dbae6314dade60e1ef92e73e89c5ff23 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Fri, 12 Apr 2013 15:15:19 +0000
Subject: LoopVectorizer: integer division is not a reduction operation

Don't classify idiv/udiv as a reduction operation. Integer division is lossy.
For example : (1 / 2) * 4 != 4/2.

Example:

int a[] = { 2, 5, 2, 2}
int x = 80;

for()
  x /= a[i];

Scalar:
  x /= 2 // = 40
  x /= 5 // = 8
  x /= 2 // = 4
  x /= 2 // = 2

Vectorized:

 <80, 1> / <2,5> //= <40,0>
 <40, 0> / <2,2> //= <20,0>

 20*0 = 0

radar://13640654

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179381 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         |  2 --
 test/Transforms/LoopVectorize/no_idiv_reduction.ll | 24 ++++++++++++++++++++++
 2 files changed, 24 insertions(+), 2 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/no_idiv_reduction.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index acf2b819b8..238fc8bfb7 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2833,8 +2833,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
   case Instruction::Sub:
   case Instruction::Add:
     return Kind == RK_IntegerAdd;
-  case Instruction::SDiv:
-  case Instruction::UDiv:
   case Instruction::Mul:
     return Kind == RK_IntegerMult;
   case Instruction::And:
diff --git a/test/Transforms/LoopVectorize/no_idiv_reduction.ll b/test/Transforms/LoopVectorize/no_idiv_reduction.ll
new file mode 100644
index 0000000000..cdfb3fd66f
--- /dev/null
+++ b/test/Transforms/LoopVectorize/no_idiv_reduction.ll
@@ -0,0 +1,24 @@
+; RUN: opt -loop-vectorize -force-vector-width=2 -force-vector-unroll=1 -S < %s | FileCheck %s
+@a = common global [128 x i32] zeroinitializer, align 16
+
+;; Must not vectorize division reduction. Division is lossy.
+define i32 @g() {
+entry:
+  br label %for.body
+
+for.body:
+  ; CHECK: @g
+  ; CHECK-NOT: sdiv <2 x i32>
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %r.05 = phi i32 [ 80, %entry ], [ %div, %for.body ]
+  %arrayidx = getelementptr inbounds [128 x i32]* @a, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %div = sdiv i32 %r.05, %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %div
+}
-- 
cgit v1.2.3-70-g09d2


From 403fc143709bfe5cf014929d633c0b1c28f6e0c9 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Thu, 18 Apr 2013 14:29:13 +0000
Subject: LoopVectorize: Use a set to avoid longer cycles in the reduction
 chain too.

Fixes PR15748.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179757 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 14 ++++++--------
 test/Transforms/LoopVectorize/phi-hang.ll  | 18 ++++++++++++++++++
 2 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 238fc8bfb7..f40964f813 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2733,7 +2733,10 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // used as reduction variables (such as ADD). We may have a single
   // out-of-block user. The cycle must end with the original PHI.
   Instruction *Iter = Phi;
-  while (true) {
+
+  // Avoid cycles in the chain.
+  SmallPtrSet<Instruction *, 8> VisitedInsts;
+  while (VisitedInsts.insert(Iter)) {
     // If the instruction has no users then this is a broken
     // chain and can't be a reduction variable.
     if (Iter->use_empty())
@@ -2747,9 +2750,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // Is this a bin op ?
     FoundBinOp |= !isa<PHINode>(Iter);
 
-    // Remember the current instruction.
-    Instruction *OldIter = Iter;
-
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
@@ -2795,10 +2795,6 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       Iter = U;
     }
 
-    // If all uses were skipped this can't be a reduction variable.
-    if (Iter == OldIter)
-      return false;
-
     // We found a reduction var if we have reached the original
     // phi node and we only have a single instruction with out-of-loop
     // users.
@@ -2814,6 +2810,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       return FoundBinOp && ExitInstruction;
     }
   }
+
+  return false;
 }
 
 bool
diff --git a/test/Transforms/LoopVectorize/phi-hang.ll b/test/Transforms/LoopVectorize/phi-hang.ll
index b80d45995d..bbce239afa 100644
--- a/test/Transforms/LoopVectorize/phi-hang.ll
+++ b/test/Transforms/LoopVectorize/phi-hang.ll
@@ -27,3 +27,21 @@ bb5:                                              ; preds = %bb4, %bb1
 bb11:                                             ; preds = %bb5
   ret void
 }
+
+; PR15748
+define void @test2() {
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi i32 [ 0, %bb ], [ %tmp5, %bb1 ]
+  %tmp2 = phi i32 [ 0, %bb ], [ 1, %bb1 ]
+  %tmp3 = phi i32 [ 0, %bb ], [ %tmp4, %bb1 ]
+  %tmp4 = or i32 %tmp2, %tmp3
+  %tmp5 = add nsw i32 %tmp, 1
+  %tmp6 = icmp eq i32 %tmp5, 0
+  br i1 %tmp6, label %bb7, label %bb1
+
+bb7:                                              ; preds = %bb1
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From a3fb330d05e85107d01ecf133355d0c6a88196fd Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Thu, 18 Apr 2013 17:22:34 +0000
Subject: LoopVectorizer: Recognize min/max reductions

A min/max operation is represented by a select(cmp(lt/le/gt/ge, X, Y), X, Y)
sequence in LLVM. If we see such a sequence we can treat it just as any other
commutative binary instruction and reduce it.

This appears to help bzip2 by about 1.5% on an imac12,2.

radar://12960601

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179773 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp        | 243 +++++++++++--
 test/Transforms/LoopVectorize/minmax_reduction.ll | 399 ++++++++++++++++++++++
 2 files changed, 608 insertions(+), 34 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/minmax_reduction.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f40964f813..7a3d678b6b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -343,6 +343,7 @@ public:
     RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
     RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
+    RK_IntegerMinMax, //< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
     RK_FloatMult    ///< Product of floats.
   };
@@ -361,8 +362,9 @@ public:
     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
       Kind(RK_NoReduction) {}
 
-    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
-        : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
+                        CmpInst::Predicate P)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxPred(P) {}
 
     // The starting value of the reduction.
     // It does not have to be zero!
@@ -371,6 +373,25 @@ public:
     Instruction *LoopExitInstr;
     // The kind of the reduction.
     ReductionKind Kind;
+    // If this a min/max reduction the kind of reduction.
+    CmpInst::Predicate MinMaxPred;
+  };
+
+  /// This POD struct holds information about a potential reduction operation.
+  struct ReductionInstDesc {
+    ReductionInstDesc(bool IsRedux, Instruction *I) :
+      IsReduction(IsRedux), PatternLastInst(I), Predicate(ICmpInst::ICMP_EQ) {}
+
+    ReductionInstDesc(Instruction *I, CmpInst::Predicate P) :
+      IsReduction(true), PatternLastInst(I), Predicate(P) {}
+
+    // Is this instruction a reduction candidate.
+    bool IsReduction;
+    // The last instruction in a min/max pattern (select of the select(icmp())
+    // pattern), or the current reduction instruction otherwise.
+    Instruction *PatternLastInst;
+    // If this is a min/max pattern the comparison predicate.
+    CmpInst::Predicate Predicate;
   };
 
   // This POD struct holds information about the memory runtime legality
@@ -487,9 +508,13 @@ private:
   /// Returns True, if 'Phi' is the kind of reduction variable for type
   /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
   bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Returns true if the instruction I can be a reduction variable of type
-  /// 'Kind'.
-  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns a struct describing if the instruction 'I' can be a reduction
+  /// variable of type 'Kind'. If the reduction is a min/max pattern of
+  /// select(icmp()) this function advances the instruction pointer 'I' from the
+  /// compare instruction to the select instruction and stores this pointer in
+  /// 'PatternLastInst' member of the returned struct.
+  ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
+                                     ReductionInstDesc Desc);
   /// Returns the induction kind of Phi. This function may return NoInduction
   /// if the PHI is not an induction variable.
   InductionKind isInductionVariable(PHINode *Phi);
@@ -1437,7 +1462,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp,
+                     CmpInst::Predicate Pred) {
   switch (K) {
   case LoopVectorizationLegality:: RK_IntegerXor:
   case LoopVectorizationLegality:: RK_IntegerAdd:
@@ -1456,6 +1482,28 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
   case LoopVectorizationLegality:: RK_FloatAdd:
     // Adding zero to a number does not change it.
     return ConstantFP::get(Tp, 0.0L);
+  case LoopVectorizationLegality:: RK_IntegerMinMax:
+    switch(Pred) {
+    default: llvm_unreachable("Unknown min/max predicate");
+    case CmpInst::ICMP_ULT:
+    case CmpInst::ICMP_ULE:
+      return ConstantInt::getAllOnesValue(Tp);
+    case CmpInst::ICMP_UGT:
+    case CmpInst::ICMP_UGE:
+      return ConstantInt::get(Tp, 0);
+    case CmpInst::ICMP_SLT:
+    case CmpInst::ICMP_SLE: {
+      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
+      return ConstantInt::get(Tp->getContext(),
+                              APInt::getSignedMaxValue(BitWidth));
+    }
+    case CmpInst::ICMP_SGT:
+    case CmpInst::ICMP_SGE: {
+      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
+      return ConstantInt::get(Tp->getContext(),
+                              APInt::getSignedMinValue(BitWidth));
+    }
+    }
   default:
     llvm_unreachable("Unknown reduction kind");
   }
@@ -1566,7 +1614,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
 }
 
 /// This function translates the reduction kind to an LLVM binary operator.
-static Instruction::BinaryOps
+static unsigned
 getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
   switch (Kind) {
     case LoopVectorizationLegality::RK_IntegerAdd:
@@ -1583,11 +1631,20 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
       return Instruction::FMul;
     case LoopVectorizationLegality::RK_FloatAdd:
       return Instruction::FAdd;
+    case LoopVectorizationLegality::RK_IntegerMinMax:
+      return Instruction::ICmp;
     default:
       llvm_unreachable("Unknown reduction operation");
   }
 }
 
+Value *createMinMaxOp(IRBuilder<> &Builder, ICmpInst::Predicate P, Value *Left,
+                      Value *Right) {
+  Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+  Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
+  return Select;
+}
+
 void
 InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -1651,7 +1708,8 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
-    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType());
+    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType(),
+                                          RdxDesc.MinMaxPred);
     Constant *Identity = ConstantVector::getSplat(VF, Iden);
 
     // This vector is the Identity vector where the first element is the
@@ -1699,10 +1757,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Reduce all of the unrolled parts into a single vector.
     Value *ReducedPartRdx = RdxParts[0];
+    unsigned Op = getReductionBinOp(RdxDesc.Kind);
     for (unsigned part = 1; part < UF; ++part) {
-      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
-      ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx,
-                                           "bin.rdx");
+      if (Op != Instruction::ICmp)
+        ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
+                                             RdxParts[part], ReducedPartRdx,
+                                             "bin.rdx");
+      else
+        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxPred,
+                                        ReducedPartRdx, RdxParts[part]);
     }
 
     // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
@@ -1727,8 +1790,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind);
-      TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx");
+      if (Op != Instruction::ICmp)
+        TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
+                                     "bin.rdx");
+      else
+        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxPred, TmpVec, Shuf);
     }
 
     // The result is in the first element of the vector.
@@ -2315,6 +2381,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
           continue;
         }
+        if (AddReductionVar(Phi, RK_IntegerMinMax)) {
+          DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
         if (AddReductionVar(Phi, RK_FloatMult)) {
           DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n");
           continue;
@@ -2734,6 +2804,14 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // out-of-block user. The cycle must end with the original PHI.
   Instruction *Iter = Phi;
 
+  // To recognize min/max patterns formed by a icmp select sequence, we store
+  // the number of instruction we saw from the recognized min/max pattern,
+  // such that we don't stop when we see the phi has two uses (one by the select
+  // and one by the icmp) and to make sure we only see exactly the two
+  // instructions.
+  unsigned NumICmpSelectPatternInst = 0;
+  ReductionInstDesc ReduxDesc(false, 0);
+
   // Avoid cycles in the chain.
   SmallPtrSet<Instruction *, 8> VisitedInsts;
   while (VisitedInsts.insert(Iter)) {
@@ -2778,23 +2856,35 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
           Iter->hasNUsesOrMore(2))
         continue;
 
-      // We can't have multiple inside users.
-      if (FoundInBlockUser)
+      // We can't have multiple inside users except for a combination of
+      // icmp/select both using the phi.
+      if (FoundInBlockUser && !NumICmpSelectPatternInst)
         return false;
       FoundInBlockUser = true;
 
       // Any reduction instr must be of one of the allowed kinds.
-      if (!isReductionInstr(U, Kind))
+      ReduxDesc = isReductionInstr(U, Kind, ReduxDesc);
+      if (!ReduxDesc.IsReduction)
         return false;
 
+      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) ||
+                                       isa<SelectInst>(U)))
+          ++NumICmpSelectPatternInst;
+
       // Reductions of instructions such as Div, and Sub is only
       // possible if the LHS is the reduction variable.
-      if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter)
+      if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
+          !isa<ICmpInst>(U) && U->getOperand(0) != Iter)
         return false;
 
-      Iter = U;
+      Iter = ReduxDesc.PatternLastInst;
     }
 
+    // This means we have seen one but not the other instruction of the
+    // pattern or more than just a select and cmp.
+    if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2)
+      return false;
+
     // We found a reduction var if we have reached the original
     // phi node and we only have a single instruction with out-of-loop
     // users.
@@ -2803,7 +2893,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       AllowedExit.insert(ExitInstruction);
 
       // Save the description of this reduction variable.
-      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
+                             ReduxDesc.Predicate);
       Reductions[Phi] = RD;
       // We've ended the cycle. This is a reduction variable if we have an
       // outside user and it has a binary op.
@@ -2814,36 +2905,120 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   return false;
 }
 
-bool
+static CmpInst::Predicate getPredicateSense(CmpInst::Predicate P,
+                                            bool ShouldRevert) {
+  if (!ShouldRevert) return P;
+
+  switch(P) {
+  default:
+    llvm_unreachable("Unknown predicate sense");
+  case CmpInst::ICMP_UGT:
+  case CmpInst::ICMP_UGE:
+    return CmpInst::ICMP_ULT;
+  case CmpInst::ICMP_SGT:
+  case CmpInst::ICMP_SGE:
+    return CmpInst::ICMP_SLT;
+  case CmpInst::ICMP_ULT:
+  case CmpInst::ICMP_ULE:
+    return CmpInst::ICMP_UGT;
+  case CmpInst::ICMP_SLT:
+  case CmpInst::ICMP_SLE:
+    return CmpInst::ICMP_SGT;
+  }
+}
+
+/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
+/// pattern corresponding to a min(X, Y) or max(X, Y).
+static LoopVectorizationLegality::ReductionInstDesc
+isMinMaxSelectCmpPattern(Instruction *I) {
+
+  assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
+         "Expect a select instruction");
+  ICmpInst *Cmp = 0;
+  SelectInst *Select = 0;
+
+  // Look for a select(icmp(),...) pattern. Only handle integer reductions for
+  // now.
+  if ((Select = dyn_cast<SelectInst>(I))) {
+    if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+    // Only handle the single user case
+    if (!Cmp->hasOneUse())
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+  } else if ((Cmp = dyn_cast<ICmpInst>(I))) {
+    // Only handle the single user case.
+    if (!Cmp->hasOneUse())
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+    // Look for the select.
+    if (!(Select = dyn_cast<SelectInst>(*I->use_begin())))
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+    // Compare must be the first operand of the select.
+    if (Select->getOperand(0) != Cmp)
+      return LoopVectorizationLegality::ReductionInstDesc(false, I);
+  }
+
+  CmpInst::Predicate Pred = Cmp->getPredicate();
+
+  // Only (u/s)lt/gt/ge/le are min or max patterns.
+  if (Pred == CmpInst::ICMP_EQ ||
+      Pred == CmpInst::ICMP_NE)
+    return LoopVectorizationLegality::ReductionInstDesc(false, I);
+
+  Value *SelectOp1 = Select->getOperand(1);
+  Value *SelectOp2 = Select->getOperand(2);
+
+  Value *CmpLeft = Cmp->getOperand(0);
+  Value *CmpRight = Cmp->getOperand(1);
+
+  // Can have reversed sense.
+  // select(slt(X, Y), Y, X) == select(sge(X, Y), X, Y).
+  bool IsInverted = (SelectOp2 == CmpLeft && SelectOp1 == CmpRight);
+  bool IsMinMaxPattern = (SelectOp1 == CmpLeft && SelectOp2 == CmpRight) ||
+    IsInverted;
+
+  // Advance the instruction pointer from the icmp to the select instruction.
+  if (IsMinMaxPattern) {
+    CmpInst::Predicate P = getPredicateSense(Pred, IsInverted);
+    return LoopVectorizationLegality::ReductionInstDesc(Select, P);
+  }
+
+  return LoopVectorizationLegality::ReductionInstDesc(false, I);
+}
+
+LoopVectorizationLegality::ReductionInstDesc
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
-                                            ReductionKind Kind) {
+                                            ReductionKind Kind,
+                                            ReductionInstDesc Desc) {
   bool FP = I->getType()->isFloatingPointTy();
   bool FastMath = (FP && I->isCommutative() && I->isAssociative());
-
   switch (I->getOpcode()) {
   default:
-    return false;
+    return ReductionInstDesc(false, I);
   case Instruction::PHI:
       if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
-        return false;
-    // possibly.
-    return true;
+        return ReductionInstDesc(false, I);
+    return ReductionInstDesc(I, Desc.Predicate);
   case Instruction::Sub:
   case Instruction::Add:
-    return Kind == RK_IntegerAdd;
+    return ReductionInstDesc(Kind == RK_IntegerAdd, I);
   case Instruction::Mul:
-    return Kind == RK_IntegerMult;
+    return ReductionInstDesc(Kind == RK_IntegerMult, I);
   case Instruction::And:
-    return Kind == RK_IntegerAnd;
+    return ReductionInstDesc(Kind == RK_IntegerAnd, I);
   case Instruction::Or:
-    return Kind == RK_IntegerOr;
+    return ReductionInstDesc(Kind == RK_IntegerOr, I);
   case Instruction::Xor:
-    return Kind == RK_IntegerXor;
+    return ReductionInstDesc(Kind == RK_IntegerXor, I);
   case Instruction::FMul:
-    return Kind == RK_FloatMult && FastMath;
+    return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
   case Instruction::FAdd:
-    return Kind == RK_FloatAdd && FastMath;
-   }
+    return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+  case Instruction::ICmp:
+  case Instruction::Select:
+    if (Kind != RK_IntegerMinMax)
+      return ReductionInstDesc(false, I);
+    return isMinMaxSelectCmpPattern(I);
+  }
 }
 
 LoopVectorizationLegality::InductionKind
diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
new file mode 100644
index 0000000000..90c45bbc66
--- /dev/null
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -0,0 +1,399 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck %s
+@A = common global [1024 x i32] zeroinitializer, align 16
+
+; Signed tests.
+
+; Turn this into a max reduction.
+; CHECK: @max_red
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sgt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a max reduction. The select has its inputs reversed therefore
+; this is a max reduction.
+; CHECK: @max_red_inverse_select
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @max_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp slt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK: @min_red
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp slt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction. The select has its inputs reversed therefore
+; this is a min reduction.
+; CHECK: @min_red_inverse_select
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @min_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sgt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Unsigned tests.
+
+; Turn this into a max reduction.
+; CHECK: @umax_red
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ugt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a max reduction. The select has its inputs reversed therefore
+; this is a max reduction.
+; CHECK: @umax_red_inverse_select
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umax_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ult i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction.
+; CHECK: @umin_red
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ult i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; Turn this into a min reduction. The select has its inputs reversed therefore
+; this is a min reduction.
+; CHECK: @umin_red_inverse_select
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @umin_red_inverse_select(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ugt i32 %max.red.08, %0
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; SGE -> SLT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK: @sge_min_red
+; CHECK: icmp sge <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp slt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sge_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sge i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; SLE -> SGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK: @sle_min_red
+; CHECK: icmp sle <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp sgt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @sle_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp sle i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; UGE -> ULT
+; Turn this into a min reduction (select inputs are reversed).
+; CHECK: @uge_min_red
+; CHECK: icmp uge <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ult <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @uge_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp uge i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; ULE -> UGT
+; Turn this into a max reduction (select inputs are reversed).
+; CHECK: @ule_min_red
+; CHECK: icmp ule <2 x i32>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: icmp ugt <2 x i32>
+; CHECK: select <2 x i1>
+
+define i32 @ule_min_red(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %cmp3 = icmp ule i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %max.red.08, i32 %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; No reduction.
+; CHECK: @no_red_1
+; CHECK-NOT: icmp <2 x i32>
+define i32 @no_red_1(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx1 = getelementptr inbounds [1024 x i32]* @A, i64 1, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %1 = load i32* %arrayidx1, align 4
+  %cmp3 = icmp sgt i32 %0, %1
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
+
+; CHECK: @no_red_2
+; CHECK-NOT: icmp <2 x i32>
+define i32 @no_red_2(i32 %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi i32 [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x i32]* @A, i64 0, i64 %indvars.iv
+  %arrayidx1 = getelementptr inbounds [1024 x i32]* @A, i64 1, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %1 = load i32* %arrayidx1, align 4
+  %cmp3 = icmp sgt i32 %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, i32 %0, i32 %1
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 %max.red.0
+}
-- 
cgit v1.2.3-70-g09d2


From 4b15d6ae49b407cfcf526a3f7bec7ab15fa803c0 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Thu, 18 Apr 2013 20:13:04 +0000
Subject: Fix a -Wdocumentation warning

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179789 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 7a3d678b6b..6ca7622963 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -343,7 +343,7 @@ public:
     RK_IntegerOr,   ///< Bitwise or logical OR of numbers.
     RK_IntegerAnd,  ///< Bitwise or logical AND of numbers.
     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
-    RK_IntegerMinMax, //< Min/max implemented in terms of select(cmp()).
+    RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
     RK_FloatMult    ///< Product of floats.
   };
-- 
cgit v1.2.3-70-g09d2


From d717e202a2d50d47b96534dbf67b8aa6ea01b912 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Fri, 19 Apr 2013 21:03:36 +0000
Subject: LoopVectorizer: Use matcher from PatternMatch.h for the min/max
 patterns

Also make some static function class functions to avoid having to mention the
class namespace for enums all the time.

No functionality change intended.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@179886 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 206 ++++++++++++++---------------
 1 file changed, 102 insertions(+), 104 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 6ca7622963..0c88ba7835 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -78,6 +78,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
@@ -87,6 +88,7 @@
 #include <map>
 
 using namespace llvm;
+using namespace llvm::PatternMatch;
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
@@ -357,14 +359,23 @@ public:
     IK_ReversePtrInduction  ///< Reverse ptr indvar. Step = - sizeof(elem).
   };
 
+  // This enum represents the kind of minmax reduction.
+  enum MinMaxReductionKind {
+    MRK_Invalid,
+    MRK_UIntMin,
+    MRK_UIntMax,
+    MRK_SIntMin,
+    MRK_SIntMax
+  };
+
   /// This POD struct holds information about reduction variables.
   struct ReductionDescriptor {
     ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
-      Kind(RK_NoReduction) {}
+      Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {}
 
     ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K,
-                        CmpInst::Predicate P)
-        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxPred(P) {}
+                        MinMaxReductionKind MK)
+        : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {}
 
     // The starting value of the reduction.
     // It does not have to be zero!
@@ -374,16 +385,16 @@ public:
     // The kind of the reduction.
     ReductionKind Kind;
     // If this a min/max reduction the kind of reduction.
-    CmpInst::Predicate MinMaxPred;
+    MinMaxReductionKind MinMaxKind;
   };
 
   /// This POD struct holds information about a potential reduction operation.
   struct ReductionInstDesc {
     ReductionInstDesc(bool IsRedux, Instruction *I) :
-      IsReduction(IsRedux), PatternLastInst(I), Predicate(ICmpInst::ICMP_EQ) {}
+      IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {}
 
-    ReductionInstDesc(Instruction *I, CmpInst::Predicate P) :
-      IsReduction(true), PatternLastInst(I), Predicate(P) {}
+    ReductionInstDesc(Instruction *I, MinMaxReductionKind K) :
+      IsReduction(true), PatternLastInst(I), MinMaxKind(K) {}
 
     // Is this instruction a reduction candidate.
     bool IsReduction;
@@ -391,7 +402,7 @@ public:
     // pattern), or the current reduction instruction otherwise.
     Instruction *PatternLastInst;
     // If this is a min/max pattern the comparison predicate.
-    CmpInst::Predicate Predicate;
+    MinMaxReductionKind MinMaxKind;
   };
 
   // This POD struct holds information about the memory runtime legality
@@ -482,6 +493,11 @@ public:
 
   /// Returns the information that we collected about runtime memory check.
   RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+
+  /// This function returns the identity element (or neutral element) for
+  /// the operation K.
+  static Constant *getReductionIdentity(ReductionKind K, Type *Tp,
+                                        MinMaxReductionKind MinMaxK);
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -514,7 +530,11 @@ private:
   /// compare instruction to the select instruction and stores this pointer in
   /// 'PatternLastInst' member of the returned struct.
   ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind,
-                                     ReductionInstDesc Desc);
+                                     ReductionInstDesc &Desc);
+  /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
+  /// pattern corresponding to a min(X, Y) or max(X, Y).
+  static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I,
+                                                    ReductionInstDesc &Prev);
   /// Returns the induction kind of Phi. This function may return NoInduction
   /// if the PHI is not an induction variable.
   InductionKind isInductionVariable(PHINode *Phi);
@@ -1461,44 +1481,40 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
-static Constant*
-getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp,
-                     CmpInst::Predicate Pred) {
+Constant*
+LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp,
+                                                MinMaxReductionKind MinMaxK) {
   switch (K) {
-  case LoopVectorizationLegality:: RK_IntegerXor:
-  case LoopVectorizationLegality:: RK_IntegerAdd:
-  case LoopVectorizationLegality:: RK_IntegerOr:
+  case RK_IntegerXor:
+  case RK_IntegerAdd:
+  case RK_IntegerOr:
     // Adding, Xoring, Oring zero to a number does not change it.
     return ConstantInt::get(Tp, 0);
-  case LoopVectorizationLegality:: RK_IntegerMult:
+  case RK_IntegerMult:
     // Multiplying a number by 1 does not change it.
     return ConstantInt::get(Tp, 1);
-  case LoopVectorizationLegality:: RK_IntegerAnd:
+  case RK_IntegerAnd:
     // AND-ing a number with an all-1 value does not change it.
     return ConstantInt::get(Tp, -1, true);
-  case LoopVectorizationLegality:: RK_FloatMult:
+  case  RK_FloatMult:
     // Multiplying a number by 1 does not change it.
     return ConstantFP::get(Tp, 1.0L);
-  case LoopVectorizationLegality:: RK_FloatAdd:
+  case  RK_FloatAdd:
     // Adding zero to a number does not change it.
     return ConstantFP::get(Tp, 0.0L);
-  case LoopVectorizationLegality:: RK_IntegerMinMax:
-    switch(Pred) {
+  case  RK_IntegerMinMax:
+    switch(MinMaxK) {
     default: llvm_unreachable("Unknown min/max predicate");
-    case CmpInst::ICMP_ULT:
-    case CmpInst::ICMP_ULE:
+    case MRK_UIntMin:
       return ConstantInt::getAllOnesValue(Tp);
-    case CmpInst::ICMP_UGT:
-    case CmpInst::ICMP_UGE:
+    case MRK_UIntMax:
       return ConstantInt::get(Tp, 0);
-    case CmpInst::ICMP_SLT:
-    case CmpInst::ICMP_SLE: {
+    case MRK_SIntMin: {
       unsigned BitWidth = Tp->getPrimitiveSizeInBits();
       return ConstantInt::get(Tp->getContext(),
                               APInt::getSignedMaxValue(BitWidth));
     }
-    case CmpInst::ICMP_SGT:
-    case CmpInst::ICMP_SGE: {
+    case LoopVectorizationLegality::MRK_SIntMax: {
       unsigned BitWidth = Tp->getPrimitiveSizeInBits();
       return ConstantInt::get(Tp->getContext(),
                               APInt::getSignedMinValue(BitWidth));
@@ -1638,8 +1654,26 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
   }
 }
 
-Value *createMinMaxOp(IRBuilder<> &Builder, ICmpInst::Predicate P, Value *Left,
+Value *createMinMaxOp(IRBuilder<> &Builder,
+                      LoopVectorizationLegality::MinMaxReductionKind RK,
+                      Value *Left,
                       Value *Right) {
+  CmpInst::Predicate P = CmpInst::ICMP_NE;
+  switch (RK) {
+  default:
+    llvm_unreachable("Unknown min/max reduction kind");
+  case LoopVectorizationLegality::MRK_UIntMin:
+    P = CmpInst::ICMP_ULT;
+    break;
+  case LoopVectorizationLegality::MRK_UIntMax:
+    P = CmpInst::ICMP_UGT;
+    break;
+  case LoopVectorizationLegality::MRK_SIntMin:
+    P = CmpInst::ICMP_SLT;
+    break;
+  case LoopVectorizationLegality::MRK_SIntMax:
+    P = CmpInst::ICMP_SGT;
+  }
   Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
   return Select;
@@ -1708,8 +1742,10 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
-    Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType(),
-                                          RdxDesc.MinMaxPred);
+    Constant *Iden =
+      LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+                                                      VecTy->getScalarType(),
+                                                      RdxDesc.MinMaxKind);
     Constant *Identity = ConstantVector::getSplat(VF, Iden);
 
     // This vector is the Identity vector where the first element is the
@@ -1764,7 +1800,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                              RdxParts[part], ReducedPartRdx,
                                              "bin.rdx");
       else
-        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxPred,
+        ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind,
                                         ReducedPartRdx, RdxParts[part]);
     }
 
@@ -1794,7 +1830,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                      "bin.rdx");
       else
-        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxPred, TmpVec, Shuf);
+        TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf);
     }
 
     // The result is in the first element of the vector.
@@ -2894,7 +2930,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
       // Save the description of this reduction variable.
       ReductionDescriptor RD(RdxStart, ExitInstruction, Kind,
-                             ReduxDesc.Predicate);
+                             ReduxDesc.MinMaxKind);
       Reductions[Phi] = RD;
       // We've ended the cycle. This is a reduction variable if we have an
       // outside user and it has a binary op.
@@ -2905,90 +2941,52 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   return false;
 }
 
-static CmpInst::Predicate getPredicateSense(CmpInst::Predicate P,
-                                            bool ShouldRevert) {
-  if (!ShouldRevert) return P;
-
-  switch(P) {
-  default:
-    llvm_unreachable("Unknown predicate sense");
-  case CmpInst::ICMP_UGT:
-  case CmpInst::ICMP_UGE:
-    return CmpInst::ICMP_ULT;
-  case CmpInst::ICMP_SGT:
-  case CmpInst::ICMP_SGE:
-    return CmpInst::ICMP_SLT;
-  case CmpInst::ICMP_ULT:
-  case CmpInst::ICMP_ULE:
-    return CmpInst::ICMP_UGT;
-  case CmpInst::ICMP_SLT:
-  case CmpInst::ICMP_SLE:
-    return CmpInst::ICMP_SGT;
-  }
-}
-
 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
 /// pattern corresponding to a min(X, Y) or max(X, Y).
-static LoopVectorizationLegality::ReductionInstDesc
-isMinMaxSelectCmpPattern(Instruction *I) {
+LoopVectorizationLegality::ReductionInstDesc
+LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) {
 
   assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
          "Expect a select instruction");
   ICmpInst *Cmp = 0;
   SelectInst *Select = 0;
 
-  // Look for a select(icmp(),...) pattern. Only handle integer reductions for
-  // now.
-  if ((Select = dyn_cast<SelectInst>(I))) {
-    if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
-      return LoopVectorizationLegality::ReductionInstDesc(false, I);
-    // Only handle the single user case
-    if (!Cmp->hasOneUse())
-      return LoopVectorizationLegality::ReductionInstDesc(false, I);
-  } else if ((Cmp = dyn_cast<ICmpInst>(I))) {
-    // Only handle the single user case.
-    if (!Cmp->hasOneUse())
-      return LoopVectorizationLegality::ReductionInstDesc(false, I);
-    // Look for the select.
-    if (!(Select = dyn_cast<SelectInst>(*I->use_begin())))
-      return LoopVectorizationLegality::ReductionInstDesc(false, I);
-    // Compare must be the first operand of the select.
-    if (Select->getOperand(0) != Cmp)
-      return LoopVectorizationLegality::ReductionInstDesc(false, I);
-  }
-
-  CmpInst::Predicate Pred = Cmp->getPredicate();
-
-  // Only (u/s)lt/gt/ge/le are min or max patterns.
-  if (Pred == CmpInst::ICMP_EQ ||
-      Pred == CmpInst::ICMP_NE)
-    return LoopVectorizationLegality::ReductionInstDesc(false, I);
-
-  Value *SelectOp1 = Select->getOperand(1);
-  Value *SelectOp2 = Select->getOperand(2);
+  // We must handle the select(cmp()) as a single instruction. Advance to the
+  // select.
+  if ((Cmp = dyn_cast<ICmpInst>(I))) {
+    if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
+      return ReductionInstDesc(false, I);
+    return ReductionInstDesc(Select, Prev.MinMaxKind);
+  }
+
+  // Only handle single use cases for now.
+  if (!(Select = dyn_cast<SelectInst>(I)))
+    return ReductionInstDesc(false, I);
+  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
+    return ReductionInstDesc(false, I);
+  if (!Cmp->hasOneUse())
+    return ReductionInstDesc(false, I);
 
   Value *CmpLeft = Cmp->getOperand(0);
   Value *CmpRight = Cmp->getOperand(1);
 
-  // Can have reversed sense.
-  // select(slt(X, Y), Y, X) == select(sge(X, Y), X, Y).
-  bool IsInverted = (SelectOp2 == CmpLeft && SelectOp1 == CmpRight);
-  bool IsMinMaxPattern = (SelectOp1 == CmpLeft && SelectOp2 == CmpRight) ||
-    IsInverted;
-
-  // Advance the instruction pointer from the icmp to the select instruction.
-  if (IsMinMaxPattern) {
-    CmpInst::Predicate P = getPredicateSense(Pred, IsInverted);
-    return LoopVectorizationLegality::ReductionInstDesc(Select, P);
-  }
-
-  return LoopVectorizationLegality::ReductionInstDesc(false, I);
+  // Look for a min/max pattern.
+  if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_UIntMin);
+  else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_UIntMax);
+  else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_SIntMax);
+  else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_SIntMin);
+
+  return ReductionInstDesc(false, I);
 }
 
 LoopVectorizationLegality::ReductionInstDesc
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind,
-                                            ReductionInstDesc Desc) {
+                                            ReductionInstDesc &Prev) {
   bool FP = I->getType()->isFloatingPointTy();
   bool FastMath = (FP && I->isCommutative() && I->isAssociative());
   switch (I->getOpcode()) {
@@ -2997,7 +2995,7 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
   case Instruction::PHI:
       if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
         return ReductionInstDesc(false, I);
-    return ReductionInstDesc(I, Desc.Predicate);
+    return ReductionInstDesc(I, Prev.MinMaxKind);
   case Instruction::Sub:
   case Instruction::Add:
     return ReductionInstDesc(Kind == RK_IntegerAdd, I);
@@ -3017,7 +3015,7 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
   case Instruction::Select:
     if (Kind != RK_IntegerMinMax)
       return ReductionInstDesc(false, I);
-    return isMinMaxSelectCmpPattern(I);
+    return isMinMaxSelectCmpPattern(I, Prev);
   }
 }
 
-- 
cgit v1.2.3-70-g09d2


From a8958769ea5583d58ae51931d8b1271c23fd20ea Mon Sep 17 00:00:00 2001
From: Pekka Jaaskelainen <pekka.jaaskelainen@tut.fi>
Date: Tue, 23 Apr 2013 08:08:51 +0000
Subject: Refuse to (even try to) vectorize loops which have uniform writes,
 even if erroneously annotated with the parallel loop metadata.

Fixes Bug 15794:
"Loop Vectorizer: Crashes with the use of llvm.loop.parallel metadata"



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180081 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 18 +++----
 .../X86/illegal-parallel-loop-uniform-write.ll     | 58 ++++++++++++++++++++++
 2 files changed, 67 insertions(+), 9 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0c88ba7835..2e42585e35 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2548,13 +2548,6 @@ LoopVectorizationLegality::hasPossibleGlobalWriteReorder(
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
 
-  if (TheLoop->isAnnotatedParallel()) {
-    DEBUG(dbgs()
-          << "LV: A loop annotated parallel, ignore memory dependency "
-          << "checks.\n");
-    return true;
-  }
-
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
   // Holds the Load and Store *instructions*.
@@ -2577,7 +2570,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayReadFromMemory()) {
         LoadInst *Ld = dyn_cast<LoadInst>(it);
         if (!Ld) return false;
-        if (!Ld->isSimple()) {
+        if (!Ld->isSimple() && !TheLoop->isAnnotatedParallel()) {
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
           return false;
         }
@@ -2589,7 +2582,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayWriteToMemory()) {
         StoreInst *St = dyn_cast<StoreInst>(it);
         if (!St) return false;
-        if (!St->isSimple()) {
+        if (!St->isSimple() && !TheLoop->isAnnotatedParallel()) {
           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
           return false;
         }
@@ -2636,6 +2629,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       ReadWrites.insert(std::make_pair(Ptr, ST));
   }
 
+  if (TheLoop->isAnnotatedParallel()) {
+    DEBUG(dbgs()
+          << "LV: A loop annotated parallel, ignore memory dependency "
+          << "checks.\n");
+    return true;
+  }
+
   for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) {
     LoadInst *LD = cast<LoadInst>(*I);
     Value* Ptr = LD->getPointerOperand();
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
new file mode 100644
index 0000000000..bc606f3ebd
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+;CHECK: @foo
+;CHECK-NOT: <4 x 
+
+; Function Attrs: nounwind uwtable 
+define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
+entry:
+  %cmp27 = icmp sgt i32 %m, 0
+  br i1 %cmp27, label %for.body3.lr.ph.us, label %for.end15
+
+for.end.us:                                       ; preds = %for.body3.us
+  %arrayidx9.us = getelementptr inbounds i32* %b, i64 %indvars.iv33
+  %0 = load i32* %arrayidx9.us, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+  %add10.us = add nsw i32 %0, 3
+  store i32 %add10.us, i32* %arrayidx9.us, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next34 = add i64 %indvars.iv33, 1
+  %lftr.wideiv35 = trunc i64 %indvars.iv.next34 to i32
+  %exitcond36 = icmp eq i32 %lftr.wideiv35, %m
+  br i1 %exitcond36, label %for.end15, label %for.body3.lr.ph.us, !llvm.loop.parallel !5
+
+for.body3.us:                                     ; preds = %for.body3.us, %for.body3.lr.ph.us
+  %indvars.iv29 = phi i64 [ 0, %for.body3.lr.ph.us ], [ %indvars.iv.next30, %for.body3.us ]
+  %1 = trunc i64 %indvars.iv29 to i32
+  %add4.us = add i32 %add.us, %1
+  %idxprom.us = sext i32 %add4.us to i64
+  %arrayidx.us = getelementptr inbounds i32* %a, i64 %idxprom.us
+  %2 = load i32* %arrayidx.us, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+  %add5.us = add nsw i32 %2, 1
+  store i32 %add5.us, i32* %arrayidx7.us, align 4, !tbaa !0, !llvm.mem.parallel_loop_access !3
+  %indvars.iv.next30 = add i64 %indvars.iv29, 1
+  %lftr.wideiv31 = trunc i64 %indvars.iv.next30 to i32
+  %exitcond32 = icmp eq i32 %lftr.wideiv31, %m
+  br i1 %exitcond32, label %for.end.us, label %for.body3.us, !llvm.loop.parallel !4
+
+for.body3.lr.ph.us:                               ; preds = %for.end.us, %entry
+  %indvars.iv33 = phi i64 [ %indvars.iv.next34, %for.end.us ], [ 0, %entry ]
+  %3 = trunc i64 %indvars.iv33 to i32
+  %add.us = add i32 %3, %k
+  %arrayidx7.us = getelementptr inbounds i32* %a, i64 %indvars.iv33
+  br label %for.body3.us
+
+for.end15:                                        ; preds = %for.end.us, %entry
+  ret void
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
+!3 = metadata !{metadata !4, metadata !5}
+!4 = metadata !{metadata !4}
+!5 = metadata !{metadata !5}
+
-- 
cgit v1.2.3-70-g09d2


From 2e59a125fce4235beeb4e72cafbcbe62854ed77c Mon Sep 17 00:00:00 2001
From: Pekka Jaaskelainen <pekka.jaaskelainen@tut.fi>
Date: Tue, 23 Apr 2013 16:44:43 +0000
Subject: Call the potentially costly isAnnotatedParallel() only once. Made the
 uniform write test's checks a bit stricter.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180119 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp                        | 8 +++++---
 .../LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll      | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2e42585e35..162587c565 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2556,6 +2556,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
+  const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+
   // For each block.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2570,7 +2572,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayReadFromMemory()) {
         LoadInst *Ld = dyn_cast<LoadInst>(it);
         if (!Ld) return false;
-        if (!Ld->isSimple() && !TheLoop->isAnnotatedParallel()) {
+        if (!Ld->isSimple() && !IsAnnotatedParallel) {
           DEBUG(dbgs() << "LV: Found a non-simple load.\n");
           return false;
         }
@@ -2582,7 +2584,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       if (it->mayWriteToMemory()) {
         StoreInst *St = dyn_cast<StoreInst>(it);
         if (!St) return false;
-        if (!St->isSimple() && !TheLoop->isAnnotatedParallel()) {
+        if (!St->isSimple() && !IsAnnotatedParallel) {
           DEBUG(dbgs() << "LV: Found a non-simple store.\n");
           return false;
         }
@@ -2629,7 +2631,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
       ReadWrites.insert(std::make_pair(Ptr, ST));
   }
 
-  if (TheLoop->isAnnotatedParallel()) {
+  if (IsAnnotatedParallel) {
     DEBUG(dbgs()
           << "LV: A loop annotated parallel, ignore memory dependency "
           << "checks.\n");
diff --git a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
index bc606f3ebd..20b0c74215 100644
--- a/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
+++ b/test/Transforms/LoopVectorize/X86/illegal-parallel-loop-uniform-write.ll
@@ -4,7 +4,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-unknown-linux-gnu"
 
 ;CHECK: @foo
-;CHECK-NOT: <4 x 
+;CHECK-NOT: <4 x i32>
+;CHECK: ret void
 
 ; Function Attrs: nounwind uwtable 
 define void @foo(i32* nocapture %a, i32* nocapture %b, i32 %k, i32 %m) #0 {
-- 
cgit v1.2.3-70-g09d2


From a7d9a6ee63bec70fecea79b85a30108ed3e8fabd Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 23 Apr 2013 17:12:42 +0000
Subject: LoopVectorizer: Fix 15830. When scalarizing and unrolling stores make
 sure that the order in which the elements are scalarized is the same as the
 original order. This fixes a miscompilation in FreeBSD's regex library.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180121 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp |  8 +++----
 test/Transforms/LoopVectorize/bsd_regex.ll | 36 ++++++++++++++++++++++++++++++
 2 files changed, 40 insertions(+), 4 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/bsd_regex.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 162587c565..0988a4032f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1085,10 +1085,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   // Create a new entry in the WidenMap and initialize it to Undef or Null.
   VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
 
-  // For each scalar that we create:
-  for (unsigned Width = 0; Width < VF; ++Width) {
-    // For each vector unroll 'part':
-    for (unsigned Part = 0; Part < UF; ++Part) {
+  // For each vector unroll 'part':
+  for (unsigned Part = 0; Part < UF; ++Part) {
+    // For each scalar that we create:
+    for (unsigned Width = 0; Width < VF; ++Width) {
       Instruction *Cloned = Instr->clone();
       if (!IsVoidRetTy)
         Cloned->setName(Instr->getName() + ".cloned");
diff --git a/test/Transforms/LoopVectorize/bsd_regex.ll b/test/Transforms/LoopVectorize/bsd_regex.ll
new file mode 100644
index 0000000000..a2aef1c368
--- /dev/null
+++ b/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=2 < %s | FileCheck %s
+
+;PR 15830.
+
+;CHECK: foo
+; When scalarizing stores we need to preserve the original order.
+; Make sure that we are extracting in the correct order (0101, and not 0011).
+;CHECK: extractelement <2 x i64> {{.*}}, i32 0
+;CHECK: extractelement <2 x i64> {{.*}}, i32 1
+;CHECK: extractelement <2 x i64> {{.*}}, i32 0
+;CHECK: extractelement <2 x i64> {{.*}}, i32 1
+;CHECK: store
+;CHECK: store
+;CHECK: store
+;CHECK: store
+;CHECK: ret
+
+define i32 @foo(i32* nocapture %A) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %0 = shl nsw i64 %indvars.iv, 2
+  %arrayidx = getelementptr inbounds i32* %A, i64 %0
+  store i32 4, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
+
+
-- 
cgit v1.2.3-70-g09d2


From b03ad17536097a0f172428c939e80ce7657e201b Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Wed, 24 Apr 2013 16:15:58 +0000
Subject: LoopVectorizer: Bail out if we don't have datalayout we need it

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180195 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp                   | 5 +++++
 test/Transforms/LoopVectorize/X86/constant-vector-operand.ll | 2 ++
 test/Transforms/LoopVectorize/bsd_regex.ll                   | 2 ++
 test/Transforms/LoopVectorize/minmax_reduction.ll            | 5 ++++-
 4 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0988a4032f..ac0925df2a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -707,6 +707,11 @@ struct LoopVectorize : public LoopPass {
     AA = getAnalysisIfAvailable<AliasAnalysis>();
     TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
 
+    if (DL == NULL) {
+      DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout");
+      return false;
+    }
+
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
 
diff --git a/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll b/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
index 6c924409af..f4c07b4b24 100644
--- a/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
+++ b/test/Transforms/LoopVectorize/X86/constant-vector-operand.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -mtriple=x86_64-apple-darwin -mcpu=core2 -loop-vectorize -dce -instcombine -S < %s | FileCheck %s
 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
 @B = common global [1024 x i32] zeroinitializer, align 16
 @A = common global [1024 x i32] zeroinitializer, align 16
 
diff --git a/test/Transforms/LoopVectorize/bsd_regex.ll b/test/Transforms/LoopVectorize/bsd_regex.ll
index a2aef1c368..a14b92d229 100644
--- a/test/Transforms/LoopVectorize/bsd_regex.ll
+++ b/test/Transforms/LoopVectorize/bsd_regex.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=2 < %s | FileCheck %s
 
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
 ;PR 15830.
 
 ;CHECK: foo
diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
index 90c45bbc66..99dd09386f 100644
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -1,4 +1,7 @@
-; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=1 < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -dce -instcombine -force-vector-width=2 -force-vector-unroll=1  < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
 @A = common global [1024 x i32] zeroinitializer, align 16
 
 ; Signed tests.
-- 
cgit v1.2.3-70-g09d2


From a4b8b4ccc9ecd6863606c7cdde6ec1b38734708a Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Wed, 24 Apr 2013 16:16:01 +0000
Subject: LoopVectorize: Scalarize padded types

This patch disables memory-instruction vectorization for types that need padding
bytes, e.g., x86_fp80 has 10 bytes store size with 6 bytes padding in darwin on
x86_64. Because the load/store vectorization is performed by the bit casting to
a packed vector, which has incompatible memory layout due to the lack of padding
bytes, the present vectorizer produces inconsistent result for memory
instructions of those types.
This patch checks an equality of the AllocSize of a scalar type and allocated
size for each vector element, to ensure that there is no padding bytes and the
array can be read/written using vector operations.

Patch by Daisuke Takahashi!

Fixes PR15758.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180196 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 10 +++++++-
 .../LoopVectorize/X86/x86_fp80-vector-store.ll     | 29 ++++++++++++++++++++++
 2 files changed, 38 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac0925df2a..56a9a2d3d4 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -956,6 +956,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
 
+  unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
+  unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
+
+  if (ScalarAllocatedSize != VectorElementSize)
+    return scalarizeInstruction(Instr);
+
   // If the pointer is loop invariant or if it is non consecutive,
   // scalarize the load.
   int Stride = Legal->isConsecutivePtr(Ptr);
@@ -3558,7 +3564,9 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     // Scalarized loads/stores.
     int Stride = Legal->isConsecutivePtr(Ptr);
     bool Reverse = Stride < 0;
-    if (0 == Stride) {
+    unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
+    unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
+    if (0 == Stride || ScalarAllocatedSize != VectorElementSize) {
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
diff --git a/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
new file mode 100644
index 0000000000..b66119f4ef
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/x86_fp80-vector-store.ll
@@ -0,0 +1,29 @@
+; RUN: opt -O3 -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.7.0"
+
+@x = common global [1024 x x86_fp80] zeroinitializer, align 16
+
+;CHECK: @example
+;CHECK-NOT: bitcast x86_fp80* {{%[^ ]+}} to <{{[2-9][0-9]*}} x x86_fp80>*
+;CHECK: store
+;CHECK: ret void
+
+define void @example() nounwind ssp uwtable {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %conv = sitofp i32 1 to x86_fp80
+  %arrayidx = getelementptr inbounds [1024 x x86_fp80]* @x, i64 0, i64 %indvars.iv
+  store x86_fp80 %conv, x86_fp80* %arrayidx, align 16
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From 84875bad9c32b1110c6b8abd4f1346cc0dca761b Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Wed, 24 Apr 2013 16:16:03 +0000
Subject: LoopVectorizer: Change variable name Stride to ConsecutiveStride

This makes it easier to read the code.

No functionality change.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180197 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 56a9a2d3d4..8372bd0723 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -964,10 +964,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
 
   // If the pointer is loop invariant or if it is non consecutive,
   // scalarize the load.
-  int Stride = Legal->isConsecutivePtr(Ptr);
-  bool Reverse = Stride < 0;
+  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+  bool Reverse = ConsecutiveStride < 0;
   bool UniformLoad = LI && Legal->isUniform(Ptr);
-  if (Stride == 0 || UniformLoad)
+  if (!ConsecutiveStride || UniformLoad)
     return scalarizeInstruction(Instr);
 
   Constant *Zero = Builder.getInt32(0);
@@ -3562,11 +3562,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
         TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
 
     // Scalarized loads/stores.
-    int Stride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = Stride < 0;
+    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+    bool Reverse = ConsecutiveStride < 0;
     unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy);
     unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF;
-    if (0 == Stride || ScalarAllocatedSize != VectorElementSize) {
+    if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) {
       unsigned Cost = 0;
       // The cost of extracting from the value vector and pointer vector.
       Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-- 
cgit v1.2.3-70-g09d2


From 975b1ddf60387139357c8cbbaeb613de5a39294f Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 25 Apr 2013 19:55:03 +0000
Subject: LoopVectorizer: No need to generate pointer disambiguation checks
 between readonly pointers.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180570 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 16 +++++++---
 .../LoopVectorize/runtime-check-readonly.ll        | 36 ++++++++++++++++++++++
 2 files changed, 48 insertions(+), 4 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/runtime-check-readonly.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 8372bd0723..0faab7a3c0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -419,7 +419,7 @@ public:
     }
 
     /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr);
 
     /// This flag indicates if we need to add the runtime check.
     bool Need;
@@ -429,6 +429,8 @@ public:
     SmallVector<const SCEV*, 2> Starts;
     /// Holds the pointer value at the end of the loop.
     SmallVector<const SCEV*, 2> Ends;
+    /// Holds the information if this pointer is used for writing to memory.
+    SmallVector<bool, 2> IsWritePtr;
   };
 
   /// A POD for saving information about induction variables.
@@ -787,7 +789,8 @@ struct LoopVectorize : public LoopPass {
 
 void
 LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
-                                                       Loop *Lp, Value *Ptr) {
+                                                       Loop *Lp, Value *Ptr,
+                                                       bool WritePtr) {
   const SCEV *Sc = SE->getSCEV(Ptr);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
   assert(AR && "Invalid addrec expression");
@@ -796,6 +799,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
   Pointers.push_back(Ptr);
   Starts.push_back(AR->getStart());
   Ends.push_back(ScEnd);
+  IsWritePtr.push_back(WritePtr);
 }
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -1166,6 +1170,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
 
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
+      // No need to check if two readonly pointers intersect.
+      if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j])
+        continue;
+
       Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
       Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
       Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy, "bc");
@@ -2678,7 +2686,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) {
     Value *V = (*MI).first;
     if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V);
+      PtrRtCheck.insert(SE, TheLoop, V, true);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
     } else {
       CanDoRT = false;
@@ -2688,7 +2696,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
   for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) {
     Value *V = (*MI).first;
     if (hasComputableBounds(V)) {
-      PtrRtCheck.insert(SE, TheLoop, V);
+      PtrRtCheck.insert(SE, TheLoop, V, false);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
     } else {
       CanDoRT = false;
diff --git a/test/Transforms/LoopVectorize/runtime-check-readonly.ll b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
new file mode 100644
index 0000000000..4145d134fd
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-check-readonly.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: add_ints
+;CHECK: br
+;CHECK: getelementptr
+;CHECK-NEXT: getelementptr
+;CHECK-NEXT: icmp uge
+;CHECK-NEXT: icmp uge
+;CHECK-NEXT: icmp uge
+;CHECK-NEXT: icmp uge
+;CHECK-NEXT: and
+;CHECK: ret
+define void @add_ints(i32* nocapture %A, i32* nocapture %B, i32* nocapture %C) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %C, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx4 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 200
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From 7557e521e50180817a165f8c897220b0e2020b7b Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 26 Apr 2013 05:08:59 +0000
Subject: LoopVectorizer:  Calculate the number of pointers to disambiguate at
 runtime based on the numbers of reads and writes.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@180593 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp     | 15 +++--
 test/Transforms/LoopVectorize/runtime-limit.ll | 84 ++++++++++++++++++++++++++
 2 files changed, 95 insertions(+), 4 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/runtime-limit.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0faab7a3c0..9a832f7a12 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -114,9 +114,9 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
 /// We don't unroll loops with a known constant trip count below this number.
 static const unsigned TinyTripCountUnrollThreshold = 128;
 
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
-static const unsigned RuntimeMemoryCheckThreshold = 4;
+/// When performing memory disambiguation checks at runtime do not make more
+/// than this number of comparisons.
+static const unsigned RuntimeMemoryCheckThreshold = 8;
 
 /// We use a metadata with this name  to indicate that a scalar loop was
 /// vectorized and that we don't need to re-vectorize it if we run into it
@@ -2679,6 +2679,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     return true;
   }
 
+  unsigned NumReadPtrs = 0;
+  unsigned NumWritePtrs = 0;
+
   // Find pointers with computable bounds. We are going to use this information
   // to place a runtime bound check.
   bool CanDoRT = true;
@@ -2687,6 +2690,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     Value *V = (*MI).first;
     if (hasComputableBounds(V)) {
       PtrRtCheck.insert(SE, TheLoop, V, true);
+      NumWritePtrs++;
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
     } else {
       CanDoRT = false;
@@ -2697,6 +2701,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
     Value *V = (*MI).first;
     if (hasComputableBounds(V)) {
       PtrRtCheck.insert(SE, TheLoop, V, false);
+      NumReadPtrs++;
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n");
     } else {
       CanDoRT = false;
@@ -2706,7 +2711,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
 
   // Check that we did not collect too many pointers or found a
   // unsizeable pointer.
-  if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+  unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1));
+  DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n");
+  if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) {
     PtrRtCheck.reset();
     CanDoRT = false;
   }
diff --git a/test/Transforms/LoopVectorize/runtime-limit.ll b/test/Transforms/LoopVectorize/runtime-limit.ll
new file mode 100644
index 0000000000..d7839746f0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-limit.ll
@@ -0,0 +1,84 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; We are vectorizing with 6 runtime checks.
+;CHECK: func1x6
+;CHECK: <4 x i32>
+;CHECK: ret
+define i32 @func1x6(i32* nocapture %out, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.016 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %i.016
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %B, i64 %i.016
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32* %C, i64 %i.016
+  %2 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %add, %2
+  %arrayidx4 = getelementptr inbounds i32* %E, i64 %i.016
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %add3, %3
+  %arrayidx6 = getelementptr inbounds i32* %F, i64 %i.016
+  %4 = load i32* %arrayidx6, align 4
+  %add7 = add nsw i32 %add5, %4
+  %arrayidx8 = getelementptr inbounds i32* %out, i64 %i.016
+  store i32 %add7, i32* %arrayidx8, align 4
+  %inc = add i64 %i.016, 1
+  %exitcond = icmp eq i64 %inc, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 undef
+}
+
+; We are not vectorizing with 12 runtime checks.
+;CHECK: func2x6
+;CHECK-NOT: <4 x i32>
+;CHECK: ret
+define i32 @func2x6(i32* nocapture %out, i32* nocapture %out2, i32* nocapture %A, i32* nocapture %B, i32* nocapture %C, i32* nocapture %D, i32* nocapture %E, i32* nocapture %F) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.037 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %i.037
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32* %B, i64 %i.037
+  %1 = load i32* %arrayidx1, align 4
+  %add = add nsw i32 %1, %0
+  %arrayidx2 = getelementptr inbounds i32* %C, i64 %i.037
+  %2 = load i32* %arrayidx2, align 4
+  %add3 = add nsw i32 %add, %2
+  %arrayidx4 = getelementptr inbounds i32* %E, i64 %i.037
+  %3 = load i32* %arrayidx4, align 4
+  %add5 = add nsw i32 %add3, %3
+  %arrayidx6 = getelementptr inbounds i32* %F, i64 %i.037
+  %4 = load i32* %arrayidx6, align 4
+  %add7 = add nsw i32 %add5, %4
+  %arrayidx8 = getelementptr inbounds i32* %out, i64 %i.037
+  store i32 %add7, i32* %arrayidx8, align 4
+  %5 = load i32* %arrayidx, align 4
+  %6 = load i32* %arrayidx1, align 4
+  %add11 = add nsw i32 %6, %5
+  %7 = load i32* %arrayidx2, align 4
+  %add13 = add nsw i32 %add11, %7
+  %8 = load i32* %arrayidx4, align 4
+  %add15 = add nsw i32 %add13, %8
+  %9 = load i32* %arrayidx6, align 4
+  %add17 = add nsw i32 %add15, %9
+  %arrayidx18 = getelementptr inbounds i32* %out2, i64 %i.037
+  store i32 %add17, i32* %arrayidx18, align 4
+  %inc = add i64 %i.037, 1
+  %exitcond = icmp eq i64 %inc, 256
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 undef
+}
+
-- 
cgit v1.2.3-70-g09d2


From 4bcd5f888fa762613cf8096a79ba7b8a72665de2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 3 May 2013 17:42:55 +0000
Subject: LoopVectorizer: Add support for if-conversion of PHINodes with 3+
 incoming values. By supporting the vectorization of PHINodes with more than
 two incoming values we can increase the complexity of nested if statements.

We can now vectorize this loop:

int foo(int *A, int *B, int n) {
  for (int i=0; i < n; i++) {
    int x = 9;
    if (A[i] > B[i]) {
      if (A[i] > 19) {
        x = 3;
      } else if (B[i] < 4 ) {
        x = 4;
      } else {
        x = 5;
      }
    }
    A[i] = x;
  }
}



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181037 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 49 +++++++++++++---------
 .../Transforms/LoopVectorize/if-conversion-nest.ll | 48 +++++++++++++++++++++
 2 files changed, 77 insertions(+), 20 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/if-conversion-nest.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9a832f7a12..b00b50e761 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1982,18 +1982,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
         // We know that all PHIs in non header blocks are converted into
         // selects, so we don't have to worry about the insertion order and we
         // can just use the builder.
-
         // At this point we generate the predication tree. There may be
         // duplications since this is a simple recursive scan, but future
         // optimizations will clean it up.
-        VectorParts Cond = createEdgeMask(P->getIncomingBlock(0),
-                                               P->getParent());
 
-        for (unsigned part = 0; part < UF; ++part) {
-        VectorParts &In0 = getVectorValue(P->getIncomingValue(0));
-        VectorParts &In1 = getVectorValue(P->getIncomingValue(1));
-          Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part],
-                                             "predphi");
+        unsigned NumIncoming = P->getNumIncomingValues();
+        assert(NumIncoming > 1 && "Invalid PHI");
+
+        // Generate a sequence of selects of the form:
+        // SELECT(Mask3, In3,
+        //      SELECT(Mask2, In2,
+        //                   ( ...)))
+        for (unsigned In = 0; In < NumIncoming; In++) {
+          VectorParts Cond = createEdgeMask(P->getIncomingBlock(In),
+                                            P->getParent());
+          VectorParts &In0 = getVectorValue(P->getIncomingValue(In));
+
+          for (unsigned part = 0; part < UF; ++part) {
+            // We don't need to 'select' the first PHI operand because it is
+            // the default value if all of the other masks don't match.
+            if (In == 0)
+              Entry[part] = In0[part];
+            else
+              // Select between the current value and the previous incoming edge
+              // based on the incoming mask.
+              Entry[part] = Builder.CreateSelect(Cond[part], In0[part],
+                                                 Entry[part], "predphi");
+          }
         }
         continue;
       }
@@ -2273,12 +2288,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
     if (!isa<BranchInst>(BB->getTerminator()))
       return false;
 
-    // We must have at most two predecessors because we need to convert
-    // all PHIs to selects.
-    unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
-    if (Preds > 2)
-      return false;
-
     // We must be able to predicate all blocks that need to be predicated.
     if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
       return false;
@@ -2376,12 +2385,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
          ++it) {
 
       if (PHINode *Phi = dyn_cast<PHINode>(it)) {
-        // This should not happen because the loop should be normalized.
-        if (Phi->getNumIncomingValues() != 2) {
-          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
-          return false;
-        }
-
         // Check that this PHI type is allowed.
         if (!Phi->getType()->isIntegerTy() &&
             !Phi->getType()->isFloatingPointTy() &&
@@ -2396,6 +2399,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         if (*bb != Header)
           continue;
 
+        // We only allow if-converted PHIs with more than two incoming values.
+        if (Phi->getNumIncomingValues() != 2) {
+          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+          return false;
+        }
+
         // This is the value coming from the preheader.
         Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
         // Check if this is an induction variable.
diff --git a/test/Transforms/LoopVectorize/if-conversion-nest.ll b/test/Transforms/LoopVectorize/if-conversion-nest.ll
new file mode 100644
index 0000000000..f44862a2eb
--- /dev/null
+++ b/test/Transforms/LoopVectorize/if-conversion-nest.ll
@@ -0,0 +1,48 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -enable-if-conversion -dce -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+;CHECK: @foo
+;CHECK: icmp sgt
+;CHECK: icmp sgt
+;CHECK: icmp slt
+;CHECK: select <4 x i1>
+;CHECK: %[[P1:.*]] = select <4 x i1>
+;CHECK: xor <4 x i1>
+;CHECK: and <4 x i1>
+;CHECK: select <4 x i1> %{{.*}}, <4 x i32> %{{.*}}, <4 x i32> %[[P1]]
+;CHECK: ret
+define i32 @foo(i32* nocapture %A, i32* nocapture %B, i32 %n) {
+entry:
+  %cmp26 = icmp sgt i32 %n, 0
+  br i1 %cmp26, label %for.body, label %for.end
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %if.end14 ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %0 = load i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv
+  %1 = load i32* %arrayidx2, align 4
+  %cmp3 = icmp sgt i32 %0, %1
+  br i1 %cmp3, label %if.then, label %if.end14
+
+if.then:
+  %cmp6 = icmp sgt i32 %0, 19
+  br i1 %cmp6, label %if.end14, label %if.else
+
+if.else:
+  %cmp10 = icmp slt i32 %1, 4
+  %. = select i1 %cmp10, i32 4, i32 5
+  br label %if.end14
+
+if.end14:
+  %x.0 = phi i32 [ 9, %for.body ], [ 3, %if.then ], [ %., %if.else ]  ; <------------- A PHI with 3 entries that we can still vectorize.
+  store i32 %x.0, i32* %arrayidx, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret i32 undef
+}
-- 
cgit v1.2.3-70-g09d2


From 5c332dbd30d9398ed25b30c3080506f7b8e92290 Mon Sep 17 00:00:00 2001
From: Dmitri Gribenko <gribozavr@gmail.com>
Date: Sun, 5 May 2013 00:40:33 +0000
Subject: Add ArrayRef constructor from None, and do the cleanups that this
 constructor enables

Patch by Robert Wilhelm.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181138 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/ADT/ArrayRef.h                         |  9 ++++++++-
 include/llvm/CodeGen/LiveRangeEdit.h                |  3 +--
 include/llvm/CodeGen/MachineTraceMetrics.h          |  7 +++----
 include/llvm/IR/Intrinsics.h                        |  9 ++++-----
 include/llvm/MC/MCParser/MCAsmParser.h              |  7 +++----
 include/llvm/Support/SourceMgr.h                    | 12 ++++++------
 lib/AsmParser/LLParser.cpp                          |  2 +-
 lib/Bitcode/Reader/BitcodeReader.cpp                |  2 +-
 lib/CodeGen/SelectionDAG/SelectionDAG.cpp           |  4 ++--
 lib/CodeGen/ShrinkWrapping.cpp                      |  4 ++--
 lib/IR/ConstantsContext.h                           |  2 +-
 lib/IR/PassManager.cpp                              |  4 ++--
 lib/IR/Type.cpp                                     |  4 ++--
 lib/MC/MCParser/AsmParser.cpp                       |  6 +++---
 lib/Support/YAMLParser.cpp                          |  2 +-
 lib/Target/ARM/AsmParser/ARMAsmParser.cpp           |  4 ++--
 lib/Target/X86/AsmParser/X86AsmParser.cpp           |  4 ++--
 lib/Transforms/InstCombine/InstructionCombining.cpp |  2 +-
 lib/Transforms/ObjCARC/ObjCARCOpts.cpp              |  3 +--
 lib/Transforms/Utils/ValueMapper.cpp                |  2 +-
 lib/Transforms/Vectorize/LoopVectorize.cpp          |  2 +-
 21 files changed, 48 insertions(+), 46 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index c555c1c2b1..d4152ec727 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_ADT_ARRAYREF_H
 #define LLVM_ADT_ARRAYREF_H
 
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SmallVector.h"
 #include <vector>
 
@@ -49,6 +50,9 @@ namespace llvm {
     /// Construct an empty ArrayRef.
     /*implicit*/ ArrayRef() : Data(0), Length(0) {}
 
+    /// Construct an empty ArrayRef from None.
+    /*implicit*/ ArrayRef(NoneType) : Data(0), Length(0) {}
+
     /// Construct an ArrayRef from a single element.
     /*implicit*/ ArrayRef(const T &OneElt)
       : Data(&OneElt), Length(1) {}
@@ -174,9 +178,12 @@ namespace llvm {
   public:
     typedef T *iterator;
 
-    /// Construct an empty ArrayRef.
+    /// Construct an empty MutableArrayRef.
     /*implicit*/ MutableArrayRef() : ArrayRef<T>() {}
 
+    /// Construct an empty MutableArrayRef from None.
+    /*implicit*/ MutableArrayRef(NoneType) : ArrayRef<T>() {}
+
     /// Construct an MutableArrayRef from a single element.
     /*implicit*/ MutableArrayRef(T &OneElt) : ArrayRef<T>(OneElt) {}
 
diff --git a/include/llvm/CodeGen/LiveRangeEdit.h b/include/llvm/CodeGen/LiveRangeEdit.h
index 8a32a3c11a..e59276fd6d 100644
--- a/include/llvm/CodeGen/LiveRangeEdit.h
+++ b/include/llvm/CodeGen/LiveRangeEdit.h
@@ -196,8 +196,7 @@ public:
   /// allocator.  These registers should not be split into new intervals
   /// as currently those new intervals are not guaranteed to spill.
   void eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead,
-                         ArrayRef<unsigned> RegsBeingSpilled 
-                          = ArrayRef<unsigned>());
+                         ArrayRef<unsigned> RegsBeingSpilled = None);
 
   /// calculateRegClassAndHint - Recompute register class and hint for each new
   /// register.
diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h
index 4e087fc62d..9794707e35 100644
--- a/include/llvm/CodeGen/MachineTraceMetrics.h
+++ b/include/llvm/CodeGen/MachineTraceMetrics.h
@@ -263,10 +263,9 @@ public:
     /// trace. Likewise, extra resources required by the specified scheduling
     /// classes are included. For the caller to account for extra machine
     /// instructions, it must first resolve each instruction's scheduling class.
-    unsigned getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks =
-                               ArrayRef<const MachineBasicBlock*>(),
-                               ArrayRef<const MCSchedClassDesc*> ExtraInstrs =
-                               ArrayRef<const MCSchedClassDesc*>()) const;
+    unsigned getResourceLength(
+                ArrayRef<const MachineBasicBlock*> Extrablocks = None,
+                ArrayRef<const MCSchedClassDesc*> ExtraInstrs = None) const;
 
     /// Return the length of the (data dependency) critical path through the
     /// trace.
diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index e69ddec1e3..c81d110e3b 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@@ -45,12 +45,12 @@ namespace Intrinsic {
   
   /// Intrinsic::getName(ID) - Return the LLVM name for an intrinsic, such as
   /// "llvm.ppc.altivec.lvx".
-  std::string getName(ID id, ArrayRef<Type*> Tys = ArrayRef<Type*>());
-  
+  std::string getName(ID id, ArrayRef<Type*> Tys = None);
+
   /// Intrinsic::getType(ID) - Return the function type for an intrinsic.
   ///
   FunctionType *getType(LLVMContext &Context, ID id,
-                        ArrayRef<Type*> Tys = ArrayRef<Type*>());
+                        ArrayRef<Type*> Tys = None);
 
   /// Intrinsic::isOverloaded(ID) - Returns true if the intrinsic can be
   /// overloaded.
@@ -67,8 +67,7 @@ namespace Intrinsic {
   /// using iAny, fAny, vAny, or iPTRAny).  For a declaration of an overloaded
   /// intrinsic, Tys must provide exactly one type for each overloaded type in
   /// the intrinsic.
-  Function *getDeclaration(Module *M, ID id,
-                           ArrayRef<Type*> Tys = ArrayRef<Type*>());
+  Function *getDeclaration(Module *M, ID id, ArrayRef<Type*> Tys = None);
 
   /// Map a GCC builtin name to an intrinsic ID.
   ID getIntrinsicForGCCBuiltin(const char *Prefix, const char *BuiltinName);
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index 7f012c65b0..dcc9886333 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -122,14 +122,14 @@ public:
   ///
   /// \return The return value is true, if warnings are fatal.
   virtual bool Warning(SMLoc L, const Twine &Msg,
-                       ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) = 0;
+                       ArrayRef<SMRange> Ranges = None) = 0;
 
   /// Error - Emit an error at the location \p L, with the message \p Msg.
   ///
   /// \return The return value is always true, as an idiomatic convenience to
   /// clients.
   virtual bool Error(SMLoc L, const Twine &Msg,
-                     ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) = 0;
+                     ArrayRef<SMRange> Ranges = None) = 0;
 
   /// Lex - Get the next AsmToken in the stream, possibly handling file
   /// inclusion first.
@@ -139,8 +139,7 @@ public:
   const AsmToken &getTok();
 
   /// \brief Report an error at the current lexer location.
-  bool TokError(const Twine &Msg,
-                ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
+  bool TokError(const Twine &Msg, ArrayRef<SMRange> Ranges = None);
 
   /// parseIdentifier - Parse an identifier or string (as a quoted identifier)
   /// and set \p Res to the identifier contents.
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 02abf92daa..d67914a1b8 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -145,8 +145,8 @@ public:
   /// @param ShowColors - Display colored messages if output is a terminal and
   /// the default error handler is used.
   void PrintMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
-                    ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
-                    ArrayRef<SMFixIt> FixIts = ArrayRef<SMFixIt>(),
+                    ArrayRef<SMRange> Ranges = None,
+                    ArrayRef<SMFixIt> FixIts = None,
                     bool ShowColors = true) const;
 
 
@@ -155,9 +155,9 @@ public:
   ///
   /// @param Msg If non-null, the kind of message (e.g., "error") which is
   /// prefixed to the message.
-  SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg, 
-                          ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
-                          ArrayRef<SMFixIt> FixIts = ArrayRef<SMFixIt>()) const;
+  SMDiagnostic GetMessage(SMLoc Loc, DiagKind Kind, const Twine &Msg,
+                          ArrayRef<SMRange> Ranges = None,
+                          ArrayRef<SMFixIt> FixIts = None) const;
 
   /// PrintIncludeStack - Prints the names of included files and the line of the
   /// file they were included from.  A diagnostic handler can use this before
@@ -227,7 +227,7 @@ public:
                int Line, int Col, SourceMgr::DiagKind Kind,
                StringRef Msg, StringRef LineStr,
                ArrayRef<std::pair<unsigned,unsigned> > Ranges,
-               ArrayRef<SMFixIt> FixIts = ArrayRef<SMFixIt>());
+               ArrayRef<SMFixIt> FixIts = None);
 
   const SourceMgr *getSourceMgr() const { return SM; }
   SMLoc getLoc() const { return Loc; }
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 998bca5212..62d8070d18 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -528,7 +528,7 @@ bool LLParser::ParseMDNodeID(MDNode *&Result) {
   if (Result) return false;
 
   // Otherwise, create MDNode forward reference.
-  MDNode *FwdNode = MDNode::getTemporary(Context, ArrayRef<Value*>());
+  MDNode *FwdNode = MDNode::getTemporary(Context, None);
   ForwardRefMDNodes[MID] = std::make_pair(FwdNode, Lex.getLoc());
 
   if (NumberedMetadata.size() <= MID)
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index f34884391a..e6ff4b43b1 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -405,7 +405,7 @@ Value *BitcodeReaderMDValueList::getValueFwdRef(unsigned Idx) {
   }
 
   // Create and return a placeholder, which will later be RAUW'd.
-  Value *V = MDNode::getTemporary(Context, ArrayRef<Value*>());
+  Value *V = MDNode::getTemporary(Context, None);
   MDValuePtrs[Idx] = V;
   return V;
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 5e30c251e7..15235c8ac3 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5252,7 +5252,7 @@ SDNode *SelectionDAG::MorphNodeTo(SDNode *N, unsigned Opc,
 MachineSDNode *
 SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT) {
   SDVTList VTs = getVTList(VT);
-  return getMachineNode(Opcode, dl, VTs, ArrayRef<SDValue>());
+  return getMachineNode(Opcode, dl, VTs, None);
 }
 
 MachineSDNode *
@@ -5288,7 +5288,7 @@ SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT,
 MachineSDNode *
 SelectionDAG::getMachineNode(unsigned Opcode, DebugLoc dl, EVT VT1, EVT VT2) {
   SDVTList VTs = getVTList(VT1, VT2);
-  return getMachineNode(Opcode, dl, VTs, ArrayRef<SDValue>());
+  return getMachineNode(Opcode, dl, VTs, None);
 }
 
 MachineSDNode *
diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp
index 9ab491808f..2feea59c03 100644
--- a/lib/CodeGen/ShrinkWrapping.cpp
+++ b/lib/CodeGen/ShrinkWrapping.cpp
@@ -70,14 +70,14 @@ ShrinkWrapFunc("shrink-wrap-func", cl::Hidden,
 
 // Debugging level for shrink wrapping.
 enum ShrinkWrapDebugLevel {
-  None, BasicInfo, Iterations, Details
+  Disabled, BasicInfo, Iterations, Details
 };
 
 static cl::opt<enum ShrinkWrapDebugLevel>
 ShrinkWrapDebugging("shrink-wrap-dbg", cl::Hidden,
   cl::desc("Print shrink wrapping debugging information"),
   cl::values(
-    clEnumVal(None      , "disable debug output"),
+    clEnumVal(Disabled  , "disable debug output"),
     clEnumVal(BasicInfo , "print basic DF sets"),
     clEnumVal(Iterations, "print SR sets for each iteration"),
     clEnumVal(Details   , "print all DF sets"),
diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h
index e9958589f5..32bed95e21 100644
--- a/lib/IR/ConstantsContext.h
+++ b/lib/IR/ConstantsContext.h
@@ -318,7 +318,7 @@ struct ExprMapKeyType {
       ArrayRef<Constant*> ops,
       unsigned short flags = 0,
       unsigned short optionalflags = 0,
-      ArrayRef<unsigned> inds = ArrayRef<unsigned>())
+      ArrayRef<unsigned> inds = None)
         : opcode(opc), subclassoptionaldata(optionalflags), subclassdata(flags),
         operands(ops.begin(), ops.end()), indices(inds.begin(), inds.end()) {}
   uint8_t opcode;
diff --git a/lib/IR/PassManager.cpp b/lib/IR/PassManager.cpp
index 3c968aac16..387094a0e3 100644
--- a/lib/IR/PassManager.cpp
+++ b/lib/IR/PassManager.cpp
@@ -42,14 +42,14 @@ namespace llvm {
 
 // Different debug levels that can be enabled...
 enum PassDebugLevel {
-  None, Arguments, Structure, Executions, Details
+  Disabled, Arguments, Structure, Executions, Details
 };
 
 static cl::opt<enum PassDebugLevel>
 PassDebugging("debug-pass", cl::Hidden,
                   cl::desc("Print PassManager debugging information"),
                   cl::values(
-  clEnumVal(None      , "disable debug output"),
+  clEnumVal(Disabled  , "disable debug output"),
   clEnumVal(Arguments , "print pass arguments to pass to 'opt'"),
   clEnumVal(Structure , "print pass structure before run()"),
   clEnumVal(Executions, "print pass name before it is executed"),
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index 1e6a51ab10..46c61fc06e 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -380,7 +380,7 @@ FunctionType *FunctionType::get(Type *ReturnType,
 }
 
 FunctionType *FunctionType::get(Type *Result, bool isVarArg) {
-  return get(Result, ArrayRef<Type *>(), isVarArg);
+  return get(Result, None, isVarArg);
 }
 
 /// isValidReturnType - Return true if the specified type is valid as a return
@@ -499,7 +499,7 @@ StructType *StructType::create(LLVMContext &Context, StringRef Name) {
 }
 
 StructType *StructType::get(LLVMContext &Context, bool isPacked) {
-  return get(Context, llvm::ArrayRef<Type*>(), isPacked);
+  return get(Context, None, isPacked);
 }
 
 StructType *StructType::get(Type *type, ...) {
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index a903771bdb..2368e9e86a 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -201,9 +201,9 @@ public:
   }
 
   virtual bool Warning(SMLoc L, const Twine &Msg,
-                       ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
+                       ArrayRef<SMRange> Ranges = None);
   virtual bool Error(SMLoc L, const Twine &Msg,
-                     ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
+                     ArrayRef<SMRange> Ranges = None);
 
   virtual const AsmToken &Lex();
 
@@ -286,7 +286,7 @@ private:
 
   void PrintMacroInstantiations();
   void PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
-                    ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) const {
+                    ArrayRef<SMRange> Ranges = None) const {
     SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
   }
   static void DiagHandler(const SMDiagnostic &Diag, void *Context);
diff --git a/lib/Support/YAMLParser.cpp b/lib/Support/YAMLParser.cpp
index 2cead20c0b..213f5e1568 100644
--- a/lib/Support/YAMLParser.cpp
+++ b/lib/Support/YAMLParser.cpp
@@ -260,7 +260,7 @@ public:
   Token getNext();
 
   void printError(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Message,
-                  ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+                  ArrayRef<SMRange> Ranges = None) {
     SM.PrintMessage(Loc, Kind, Message, Ranges);
   }
 
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index 114cc9e5c0..1dd2953b29 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -86,11 +86,11 @@ class ARMAsmParser : public MCTargetAsmParser {
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
 
   bool Warning(SMLoc L, const Twine &Msg,
-               ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+               ArrayRef<SMRange> Ranges = None) {
     return Parser.Warning(L, Msg, Ranges);
   }
   bool Error(SMLoc L, const Twine &Msg,
-             ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) {
+             ArrayRef<SMRange> Ranges = None) {
     return Parser.Error(L, Msg, Ranges);
   }
 
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 5c09434854..68908abb57 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -477,7 +477,7 @@ private:
   MCAsmLexer &getLexer() const { return Parser.getLexer(); }
 
   bool Error(SMLoc L, const Twine &Msg,
-             ArrayRef<SMRange> Ranges = ArrayRef<SMRange>(),
+             ArrayRef<SMRange> Ranges = None,
              bool MatchingInlineAsm = false) {
     if (MatchingInlineAsm) return true;
     return Parser.Error(L, Msg, Ranges);
@@ -2204,7 +2204,7 @@ MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   assert(!Operands.empty() && "Unexpect empty operand list!");
   X86Operand *Op = static_cast<X86Operand*>(Operands[0]);
   assert(Op->isToken() && "Leading operand should always be a mnemonic!");
-  ArrayRef<SMRange> EmptyRanges = ArrayRef<SMRange>();
+  ArrayRef<SMRange> EmptyRanges = None;
 
   // First, handle aliases that expand to multiple instructions.
   // FIXME: This should be replaced with a real .td file alias mechanism.
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index c6115e3e91..ec10751202 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -1483,7 +1483,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) {
       Module *M = II->getParent()->getParent()->getParent();
       Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing);
       InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(),
-                         ArrayRef<Value *>(), "", II->getParent());
+                         None, "", II->getParent());
     }
     return EraseInstFromFunction(MI);
   }
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index e3d51d5d74..43e2e20035 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -1472,8 +1472,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
         CallInst *NewCall =
           CallInst::Create(getReleaseCallee(F.getParent()),
                            Call->getArgOperand(0), "", Call);
-        NewCall->setMetadata(ImpreciseReleaseMDKind,
-                             MDNode::get(C, ArrayRef<Value *>()));
+        NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None));
 
         DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) "
               "since x is otherwise unused.\nOld: " << *Call << "\nNew: "
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index b5941bdf24..544c5eed7e 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -57,7 +57,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
       return VM[V] = const_cast<Value*>(V);
     
     // Create a dummy node in case we have a metadata cycle.
-    MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>());
+    MDNode *Dummy = MDNode::getTemporary(V->getContext(), None);
     VM[V] = Dummy;
     
     // Check all operands to see if any need to be remapped.
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index b00b50e761..2250712a8e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1231,7 +1231,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Mark the old scalar loop with metadata that tells us not to vectorize this
   // loop again if we run into it.
-  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), ArrayRef<Value*>());
+  MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None);
   OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD);
 
   // Some loops have a single integer induction variable, while other loops
-- 
cgit v1.2.3-70-g09d2


From c1738fdadd1c969f13bbf09fe9c36fff56ccd709 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Sun, 5 May 2013 01:54:42 +0000
Subject: LoopVectorize: We don't need an identity element for min/max
 reductions

We can just use the initial element that feeds the reduction.

  max(max(x, y), z) == max(max(x,y), max(x,z))

radar://13723044

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181141 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp        | 51 +++++++++--------------
 test/Transforms/LoopVectorize/minmax_reduction.ll |  5 ++-
 2 files changed, 23 insertions(+), 33 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2250712a8e..1c7397bfa6 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -498,8 +498,7 @@ public:
 
   /// This function returns the identity element (or neutral element) for
   /// the operation K.
-  static Constant *getReductionIdentity(ReductionKind K, Type *Tp,
-                                        MinMaxReductionKind MinMaxK);
+  static Constant *getReductionIdentity(ReductionKind K, Type *Tp);
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -1501,8 +1500,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 Constant*
-LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp,
-                                                MinMaxReductionKind MinMaxK) {
+LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) {
   switch (K) {
   case RK_IntegerXor:
   case RK_IntegerAdd:
@@ -1521,24 +1519,6 @@ LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp,
   case  RK_FloatAdd:
     // Adding zero to a number does not change it.
     return ConstantFP::get(Tp, 0.0L);
-  case  RK_IntegerMinMax:
-    switch(MinMaxK) {
-    default: llvm_unreachable("Unknown min/max predicate");
-    case MRK_UIntMin:
-      return ConstantInt::getAllOnesValue(Tp);
-    case MRK_UIntMax:
-      return ConstantInt::get(Tp, 0);
-    case MRK_SIntMin: {
-      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
-      return ConstantInt::get(Tp->getContext(),
-                              APInt::getSignedMaxValue(BitWidth));
-    }
-    case LoopVectorizationLegality::MRK_SIntMax: {
-      unsigned BitWidth = Tp->getPrimitiveSizeInBits();
-      return ConstantInt::get(Tp->getContext(),
-                              APInt::getSignedMinValue(BitWidth));
-    }
-    }
   default:
     llvm_unreachable("Unknown reduction kind");
   }
@@ -1761,16 +1741,23 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Find the reduction identity variable. Zero for addition, or, xor,
     // one for multiplication, -1 for And.
-    Constant *Iden =
-      LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
-                                                      VecTy->getScalarType(),
-                                                      RdxDesc.MinMaxKind);
-    Constant *Identity = ConstantVector::getSplat(VF, Iden);
-
-    // This vector is the Identity vector where the first element is the
-    // incoming scalar reduction.
-    Value *VectorStart = Builder.CreateInsertElement(Identity,
-                                                     RdxDesc.StartValue, Zero);
+    Value *Identity;
+    Value *VectorStart;
+    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax)
+      // MinMax reduction have the start value as their identify.
+      VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
+                                                         "minmax.ident");
+    else {
+      Constant *Iden =
+        LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
+                                                        VecTy->getScalarType());
+      Identity = ConstantVector::getSplat(VF, Iden);
+
+      // This vector is the Identity vector where the first element is the
+      // incoming scalar reduction.
+      VectorStart = Builder.CreateInsertElement(Identity,
+                                                RdxDesc.StartValue, Zero);
+    }
 
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
index 99dd09386f..36a8758e2c 100644
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -6,8 +6,11 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ; Signed tests.
 
-; Turn this into a max reduction.
+; Turn this into a max reduction. Make sure we use a splat to initialize the
+; vector for the reduction.
 ; CHECK: @max_red
+; CHECK: %[[VAR:.*]] = insertelement <2 x i32> undef, i32 %max, i32 0
+; CHECK: {{.*}} = shufflevector <2 x i32> %[[VAR]], <2 x i32> undef, <2 x i32> zeroinitializer
 ; CHECK: icmp sgt <2 x i32>
 ; CHECK: select <2 x i1>
 ; CHECK: middle.block
-- 
cgit v1.2.3-70-g09d2


From f852472823fd2182a3ca54bdf4d30ad8a6a6cd57 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Sun, 5 May 2013 01:54:44 +0000
Subject: LoopVectorizer: Cleanup of miminimum/maximum pattern match code

No need for setting the operands. The pointers are going to be bound by the
matcher.

radar://13723044

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181142 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1c7397bfa6..9f90587551 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2991,8 +2991,8 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns
   if (!Cmp->hasOneUse())
     return ReductionInstDesc(false, I);
 
-  Value *CmpLeft = Cmp->getOperand(0);
-  Value *CmpRight = Cmp->getOperand(1);
+  Value *CmpLeft;
+  Value *CmpRight;
 
   // Look for a min/max pattern.
   if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
-- 
cgit v1.2.3-70-g09d2


From 87defd0924e08dd9c9db51e2fb208f289fa6adf7 Mon Sep 17 00:00:00 2001
From: Arnold Schwaighofer <aschwaighofer@apple.com>
Date: Sun, 5 May 2013 01:54:48 +0000
Subject: LoopVectorize: Add support for floating point min/max reductions

Add support for min/max reductions when "no-nans-float-math" is enabled. This
allows us to assume we have ordered floating point math and treat ordered and
unordered predicates equally.

radar://13723044

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181144 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp        |  91 +++-
 test/Transforms/LoopVectorize/minmax_reduction.ll | 480 ++++++++++++++++++++++
 2 files changed, 549 insertions(+), 22 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9f90587551..c7ff7c3fce 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -335,7 +335,7 @@ public:
                             DominatorTree *DT, TargetTransformInfo* TTI,
                             AliasAnalysis *AA, TargetLibraryInfo *TLI)
       : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
-        Induction(0) {}
+        Induction(0), HasFunNoNaNAttr(false) {}
 
   /// This enum represents the kinds of reductions that we support.
   enum ReductionKind {
@@ -347,7 +347,8 @@ public:
     RK_IntegerXor,  ///< Bitwise or logical XOR of numbers.
     RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()).
     RK_FloatAdd,    ///< Sum of floats.
-    RK_FloatMult    ///< Product of floats.
+    RK_FloatMult,   ///< Product of floats.
+    RK_FloatMinMax  ///< Min/max implemented in terms of select(cmp()).
   };
 
   /// This enum represents the kinds of inductions that we support.
@@ -365,7 +366,9 @@ public:
     MRK_UIntMin,
     MRK_UIntMax,
     MRK_SIntMin,
-    MRK_SIntMax
+    MRK_SIntMax,
+    MRK_FloatMin,
+    MRK_FloatMax
   };
 
   /// This POD struct holds information about reduction variables.
@@ -586,6 +589,8 @@ private:
   /// We need to check that all of the pointers in this list are disjoint
   /// at runtime.
   RuntimePointerCheck PtrRtCheck;
+  /// Can we assume the absence of NaNs.
+  bool HasFunNoNaNAttr;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1648,6 +1653,8 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) {
       return Instruction::FAdd;
     case LoopVectorizationLegality::RK_IntegerMinMax:
       return Instruction::ICmp;
+    case LoopVectorizationLegality::RK_FloatMinMax:
+      return Instruction::FCmp;
     default:
       llvm_unreachable("Unknown reduction operation");
   }
@@ -1672,8 +1679,21 @@ Value *createMinMaxOp(IRBuilder<> &Builder,
     break;
   case LoopVectorizationLegality::MRK_SIntMax:
     P = CmpInst::ICMP_SGT;
+    break;
+  case LoopVectorizationLegality::MRK_FloatMin:
+    P = CmpInst::FCMP_OLT;
+    break;
+  case LoopVectorizationLegality::MRK_FloatMax:
+    P = CmpInst::FCMP_OGT;
+    break;
   }
-  Value *Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
+  Value *Cmp;
+  if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax)
+    Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
+  else
+    Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
+
   Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
   return Select;
 }
@@ -1743,11 +1763,12 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // one for multiplication, -1 for And.
     Value *Identity;
     Value *VectorStart;
-    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax)
+    if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax ||
+        RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) {
       // MinMax reduction have the start value as their identify.
       VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue,
                                                          "minmax.ident");
-    else {
+    } else {
       Constant *Iden =
         LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind,
                                                         VecTy->getScalarType());
@@ -1801,7 +1822,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *ReducedPartRdx = RdxParts[0];
     unsigned Op = getReductionBinOp(RdxDesc.Kind);
     for (unsigned part = 1; part < UF; ++part) {
-      if (Op != Instruction::ICmp)
+      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
         ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op,
                                              RdxParts[part], ReducedPartRdx,
                                              "bin.rdx");
@@ -1832,7 +1853,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
                                     ConstantVector::get(ShuffleMask),
                                     "rdx.shuf");
 
-      if (Op != Instruction::ICmp)
+      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
         TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf,
                                      "bin.rdx");
       else
@@ -2363,6 +2384,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
     return false;
   }
 
+  // Look for the attribute signaling the absence of NaNs.
+  Function &F = *Header->getParent();
+  if (F.hasFnAttribute("no-nans-fp-math"))
+    HasFunNoNaNAttr = F.getAttributes().getAttribute(
+      AttributeSet::FunctionIndex,
+      "no-nans-fp-math").getValueAsString() == "true";
+
   // For each block in the loop.
   for (Loop::block_iterator bb = TheLoop->block_begin(),
        be = TheLoop->block_end(); bb != be; ++bb) {
@@ -2444,6 +2472,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
           DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n");
           continue;
         }
+        if (AddReductionVar(Phi, RK_FloatMinMax)) {
+          DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
 
         DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
         return false;
@@ -2869,7 +2901,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // such that we don't stop when we see the phi has two uses (one by the select
   // and one by the icmp) and to make sure we only see exactly the two
   // instructions.
-  unsigned NumICmpSelectPatternInst = 0;
+  unsigned NumCmpSelectPatternInst = 0;
   ReductionInstDesc ReduxDesc(false, 0);
 
   // Avoid cycles in the chain.
@@ -2918,7 +2950,7 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
       // We can't have multiple inside users except for a combination of
       // icmp/select both using the phi.
-      if (FoundInBlockUser && !NumICmpSelectPatternInst)
+      if (FoundInBlockUser && !NumCmpSelectPatternInst)
         return false;
       FoundInBlockUser = true;
 
@@ -2927,14 +2959,15 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
       if (!ReduxDesc.IsReduction)
         return false;
 
-      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) ||
-                                       isa<SelectInst>(U)))
-          ++NumICmpSelectPatternInst;
+      if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U)))
+          ++NumCmpSelectPatternInst;
+      if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U)))
+          ++NumCmpSelectPatternInst;
 
       // Reductions of instructions such as Div, and Sub is only
       // possible if the LHS is the reduction variable.
       if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) &&
-          !isa<ICmpInst>(U) && U->getOperand(0) != Iter)
+          !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter)
         return false;
 
       Iter = ReduxDesc.PatternLastInst;
@@ -2942,7 +2975,8 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 
     // This means we have seen one but not the other instruction of the
     // pattern or more than just a select and cmp.
-    if (Kind == RK_IntegerMinMax && NumICmpSelectPatternInst != 2)
+    if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) &&
+        NumCmpSelectPatternInst != 2)
       return false;
 
     // We found a reduction var if we have reached the original
@@ -2968,16 +3002,17 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
 /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction
 /// pattern corresponding to a min(X, Y) or max(X, Y).
 LoopVectorizationLegality::ReductionInstDesc
-LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionInstDesc &Prev) {
+LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I,
+                                                    ReductionInstDesc &Prev) {
 
-  assert((isa<ICmpInst>(I) || isa<SelectInst>(I)) &&
+  assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) &&
          "Expect a select instruction");
-  ICmpInst *Cmp = 0;
+  Instruction *Cmp = 0;
   SelectInst *Select = 0;
 
   // We must handle the select(cmp()) as a single instruction. Advance to the
   // select.
-  if ((Cmp = dyn_cast<ICmpInst>(I))) {
+  if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) {
     if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin())))
       return ReductionInstDesc(false, I);
     return ReductionInstDesc(Select, Prev.MinMaxKind);
@@ -2986,7 +3021,8 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns
   // Only handle single use cases for now.
   if (!(Select = dyn_cast<SelectInst>(I)))
     return ReductionInstDesc(false, I);
-  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))))
+  if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) &&
+      !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0))))
     return ReductionInstDesc(false, I);
   if (!Cmp->hasOneUse())
     return ReductionInstDesc(false, I);
@@ -3003,6 +3039,14 @@ LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, ReductionIns
     return ReductionInstDesc(Select, MRK_SIntMax);
   else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
     return ReductionInstDesc(Select, MRK_SIntMin);
+  else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMin);
+  else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMax);
+  else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMin);
+  else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select))
+    return ReductionInstDesc(Select, MRK_FloatMax);
 
   return ReductionInstDesc(false, I);
 }
@@ -3017,7 +3061,8 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
   default:
     return ReductionInstDesc(false, I);
   case Instruction::PHI:
-      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd))
+      if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd &&
+                 Kind != RK_FloatMinMax))
         return ReductionInstDesc(false, I);
     return ReductionInstDesc(I, Prev.MinMaxKind);
   case Instruction::Sub:
@@ -3035,9 +3080,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I);
   case Instruction::FAdd:
     return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I);
+  case Instruction::FCmp:
   case Instruction::ICmp:
   case Instruction::Select:
-    if (Kind != RK_IntegerMinMax)
+    if (Kind != RK_IntegerMinMax &&
+        (!HasFunNoNaNAttr || Kind != RK_FloatMinMax))
       return ReductionInstDesc(false, I);
     return isMinMaxSelectCmpPattern(I, Prev);
   }
diff --git a/test/Transforms/LoopVectorize/minmax_reduction.ll b/test/Transforms/LoopVectorize/minmax_reduction.ll
index 36a8758e2c..502fd8b938 100644
--- a/test/Transforms/LoopVectorize/minmax_reduction.ll
+++ b/test/Transforms/LoopVectorize/minmax_reduction.ll
@@ -3,6 +3,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 @A = common global [1024 x i32] zeroinitializer, align 16
+@fA = common global [1024 x float] zeroinitializer, align 16
+@dA = common global [1024 x double] zeroinitializer, align 16
 
 ; Signed tests.
 
@@ -403,3 +405,481 @@ for.body:
 for.end:
   ret i32 %max.red.0
 }
+
+; Float tests.
+
+; Maximum.
+
+; Turn this into a max reduction in the presence of a no-nans-fp-math attribute.
+; CHECK: @max_red_float
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ogt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @max_red_float_ge
+; CHECK: fcmp oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @max_red_float_ge(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp oge float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_max_red_float
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp olt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_max_red_float_le
+; CHECK: fcmp ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_max_red_float_le(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ole float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @unordered_max_red
+; CHECK: fcmp ugt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ugt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @unordered_max_red_float_ge
+; CHECK: fcmp uge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_max_red_float_ge(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp uge float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_unordered_max_red
+; CHECK: fcmp ult <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ult float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; CHECK: @inverted_unordered_max_red_float_le
+; CHECK: fcmp ule <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_max_red_float_le(float %max) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ule float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %max.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+; Minimum.
+
+; Turn this into a min reduction in the presence of a no-nans-fp-math attribute.
+; CHECK: @min_red_float
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp olt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @min_red_float_le
+; CHECK: fcmp ole <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @min_red_float_le(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ole float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_min_red_float
+; CHECK: fcmp ogt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ogt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_min_red_float_ge
+; CHECK: fcmp oge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_min_red_float_ge(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp oge float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @unordered_min_red
+; CHECK: fcmp ult <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ult float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @unordered_min_red_float_le
+; CHECK: fcmp ule <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @unordered_min_red_float_le(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ule float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %0, float %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_unordered_min_red
+; CHECK: fcmp ugt <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ugt float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; CHECK: @inverted_unordered_min_red_float_ge
+; CHECK: fcmp uge <2 x float>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x float>
+; CHECK: select <2 x i1>
+
+define float @inverted_unordered_min_red_float_ge(float %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi float [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp uge float %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, float %min.red.08, float %0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %min.red.0
+}
+
+; Make sure we handle doubles, too.
+; CHECK: @min_red_double
+; CHECK: fcmp olt <2 x double>
+; CHECK: select <2 x i1>
+; CHECK: middle.block
+; CHECK: fcmp olt <2 x double>
+; CHECK: select <2 x i1>
+
+define double @min_red_double(double %min) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %min.red.08 = phi double [ %min, %entry ], [ %min.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x double]* @dA, i64 0, i64 %indvars.iv
+  %0 = load double* %arrayidx, align 4
+  %cmp3 = fcmp olt double %0, %min.red.08
+  %min.red.0 = select i1 %cmp3, double %0, double %min.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret double %min.red.0
+}
+
+
+; Don't this into a max reduction. The no-nans-fp-math attribute is missing
+; CHECK: @max_red_float_nans
+; CHECK-NOT: <2 x float>
+
+define float @max_red_float_nans(float %max) {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %max.red.08 = phi float [ %max, %entry ], [ %max.red.0, %for.body ]
+  %arrayidx = getelementptr inbounds [1024 x float]* @fA, i64 0, i64 %indvars.iv
+  %0 = load float* %arrayidx, align 4
+  %cmp3 = fcmp ogt float %0, %max.red.08
+  %max.red.0 = select i1 %cmp3, float %0, float %max.red.08
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret float %max.red.0
+}
+
+
+attributes #0 = { "no-nans-fp-math"="true" } 
-- 
cgit v1.2.3-70-g09d2


From acc47c738d5888f7445275095a174c7f48296449 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer <benny.kra@googlemail.com>
Date: Sun, 5 May 2013 14:54:52 +0000
Subject: LoopVectorize: Print values instead of pointers in debug output.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181157 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c7ff7c3fce..9a85ee812c 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2804,8 +2804,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
                                         Inst,
                                         WriteObjects,
                                         MaxByteWidth)) {
-        DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
-              << *UI <<"\n");
+        DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI
+                     << "\n");
         return false;
       }
 
@@ -2848,8 +2848,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() {
                                         Inst,
                                         WriteObjects,
                                         MaxByteWidth)) {
-        DEBUG(dbgs() << "LV: Found a possible read-write reorder:"
-              << *UI <<"\n");
+        DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI
+                     << "\n");
         return false;
       }
     }
-- 
cgit v1.2.3-70-g09d2


From 37d38b7668f3dd64e2263426c29253ace50865b3 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 6 May 2013 03:06:36 +0000
Subject: Update the comment to mention that we use TTI.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@181178 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9a85ee812c..491e9cc248 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8,9 +8,9 @@
 //===----------------------------------------------------------------------===//
 //
 // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizer uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
+// and generates target-independent LLVM-IR.
+// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
+// of instructions in order to estimate the profitability of vectorization.
 //
 // The loop vectorizer combines consecutive loop iterations into a single
 // 'wide' iteration. After this transformation the index is incremented
-- 
cgit v1.2.3-70-g09d2


From c7594e48edb1b7a2ec9fc81981e539eac0604905 Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Wed, 8 May 2013 18:13:06 +0000
Subject: Merging r181286:
 ------------------------------------------------------------------------
 r181286 | arnolds | 2013-05-06 21:37:05 -0700 (Mon, 06 May 2013) | 7 lines

LoopVectorize: getConsecutiveVector must respect signed arithmetic

We were passing an i32 to ConstantInt::get where an i64 was needed and we must
also pass the sign if we pass negatives numbers. The start index passed to
getConsecutiveVector must also be signed.

Should fix PR15882.
------------------------------------------------------------------------


git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_33@181455 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         | 11 +--
 test/Transforms/LoopVectorize/reverse_induction.ll | 79 ++++++++++++++++++++++
 2 files changed, 85 insertions(+), 5 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/reverse_induction.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 491e9cc248..ad4a6c7c41 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -216,7 +216,7 @@ private:
   /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
   /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
   /// The sequence starts at StartIndex.
-  Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+  Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate);
 
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
@@ -829,7 +829,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   return Shuf;
 }
 
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx,
                                                  bool Negate) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
   assert(Val->getType()->getScalarType()->isIntegerTy() &&
@@ -842,8 +842,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
 
   // Create a vector of consecutive numbers from zero to VF.
   for (int i = 0; i < VLen; ++i) {
-    int Idx = Negate ? (-i): i;
-    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+    int64_t Idx = Negate ? (-i) : i;
+    Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate));
   }
 
   // Add the consecutive indices to the vector value.
@@ -2072,7 +2072,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
           // After broadcasting the induction variable we need to make the
           // vector consecutive by adding  ... -3, -2, -1, 0.
           for (unsigned part = 0; part < UF; ++part)
-            Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true);
+            Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part,
+                                               true);
           continue;
         }
 
diff --git a/test/Transforms/LoopVectorize/reverse_induction.ll b/test/Transforms/LoopVectorize/reverse_induction.ll
new file mode 100644
index 0000000000..f43f02bc31
--- /dev/null
+++ b/test/Transforms/LoopVectorize/reverse_induction.ll
@@ -0,0 +1,79 @@
+; RUN: opt < %s -loop-vectorize -force-vector-unroll=2 -force-vector-width=4 -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; Make sure consecutive vector generates correct negative indices.
+; PR15882
+
+; CHECK: reverse_induction_i64
+; CHECK: add <4 x i64> %[[SPLAT:.*]], <i64 0, i64 -1, i64 -2, i64 -3>
+; CHECK: add <4 x i64> %[[SPLAT]], <i64 -4, i64 -5, i64 -6, i64 -7>
+
+define i32 @reverse_induction_i64(i64 %startval, i32 * %ptr) {
+entry:
+  br label %for.body
+
+for.body:
+  %add.i7 = phi i64 [ %startval, %entry ], [ %add.i, %for.body ]
+  %i.06 = phi i32 [ 0, %entry ], [ %inc4, %for.body ]
+  %redux5 = phi i32 [ 0, %entry ], [ %inc.redux, %for.body ]
+  %add.i = add i64 %add.i7, -1
+  %kind_.i = getelementptr inbounds i32* %ptr, i64 %add.i
+  %tmp.i1 = load i32* %kind_.i, align 4
+  %inc.redux = add i32 %tmp.i1, %redux5
+  %inc4 = add i32 %i.06, 1
+  %exitcond = icmp ne i32 %inc4, 1024
+  br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+  ret i32 %inc.redux
+}
+
+; CHECK: reverse_induction_i128
+; CHECK: add <4 x i128> %[[SPLAT:.*]], <i128 0, i128 -1, i128 -2, i128 -3>
+; CHECK: add <4 x i128> %[[SPLAT]], <i128 -4, i128 -5, i128 -6, i128 -7>
+define i32 @reverse_induction_i128(i128 %startval, i32 * %ptr) {
+entry:
+  br label %for.body
+
+for.body:
+  %add.i7 = phi i128 [ %startval, %entry ], [ %add.i, %for.body ]
+  %i.06 = phi i32 [ 0, %entry ], [ %inc4, %for.body ]
+  %redux5 = phi i32 [ 0, %entry ], [ %inc.redux, %for.body ]
+  %add.i = add i128 %add.i7, -1
+  %kind_.i = getelementptr inbounds i32* %ptr, i128 %add.i
+  %tmp.i1 = load i32* %kind_.i, align 4
+  %inc.redux = add i32 %tmp.i1, %redux5
+  %inc4 = add i32 %i.06, 1
+  %exitcond = icmp ne i32 %inc4, 1024
+  br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+  ret i32 %inc.redux
+}
+
+; CHECK: reverse_induction_i16
+; CHECK: add <4 x i16> %[[SPLAT:.*]], <i16 0, i16 -1, i16 -2, i16 -3>
+; CHECK: add <4 x i16> %[[SPLAT]], <i16 -4, i16 -5, i16 -6, i16 -7>
+
+define i32 @reverse_induction_i16(i16 %startval, i32 * %ptr) {
+entry:
+  br label %for.body
+
+for.body:
+  %add.i7 = phi i16 [ %startval, %entry ], [ %add.i, %for.body ]
+  %i.06 = phi i32 [ 0, %entry ], [ %inc4, %for.body ]
+  %redux5 = phi i32 [ 0, %entry ], [ %inc.redux, %for.body ]
+  %add.i = add i16 %add.i7, -1
+  %kind_.i = getelementptr inbounds i32* %ptr, i16 %add.i
+  %tmp.i1 = load i32* %kind_.i, align 4
+  %inc.redux = add i32 %tmp.i1, %redux5
+  %inc4 = add i32 %i.06, 1
+  %exitcond = icmp ne i32 %inc4, 1024
+  br i1 %exitcond, label %for.body, label %loopend
+
+loopend:
+  ret i32 %inc.redux
+}
+
+
-- 
cgit v1.2.3-70-g09d2


From 1df0680640af28109fdf60d73f56fe8b0520d8fe Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Wed, 22 May 2013 17:30:06 +0000
Subject: Merging r182485:
 ------------------------------------------------------------------------
 r182485 | arnolds | 2013-05-22 09:54:56 -0700 (Wed, 22 May 2013) | 7 lines

LoopVectorize: Make Value pointers that could be RAUW'ed a VH

The Value pointers we store in the induction variable list can be RAUW'ed by a
call to SCEVExpander::expandCodeFor, use a TrackingVH instead. Do the same thing
in some other places where we store pointers that could potentially be RAUW'ed.

Fixes PR16073.
------------------------------------------------------------------------


git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_33@182492 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp     |  7 ++--
 test/Transforms/LoopVectorize/value-ptr-bug.ll | 50 ++++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/value-ptr-bug.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ad4a6c7c41..4f6f51c9bb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -80,6 +80,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -382,7 +383,7 @@ public:
 
     // The starting value of the reduction.
     // It does not have to be zero!
-    Value *StartValue;
+    TrackingVH<Value> StartValue;
     // The instruction who's value is used outside the loop.
     Instruction *LoopExitInstr;
     // The kind of the reduction.
@@ -427,7 +428,7 @@ public:
     /// This flag indicates if we need to add the runtime check.
     bool Need;
     /// Holds the pointers that we need to check.
-    SmallVector<Value*, 2> Pointers;
+    SmallVector<TrackingVH<Value>, 2> Pointers;
     /// Holds the pointer value at the beginning of the loop.
     SmallVector<const SCEV*, 2> Starts;
     /// Holds the pointer value at the end of the loop.
@@ -441,7 +442,7 @@ public:
     InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
     InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
     /// Start value.
-    Value *StartValue;
+    TrackingVH<Value> StartValue;
     /// Induction kind.
     InductionKind IK;
   };
diff --git a/test/Transforms/LoopVectorize/value-ptr-bug.ll b/test/Transforms/LoopVectorize/value-ptr-bug.ll
new file mode 100644
index 0000000000..f376656f07
--- /dev/null
+++ b/test/Transforms/LoopVectorize/value-ptr-bug.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-unroll=1 -dce -instcombine < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; PR16073
+
+; Because we were caching value pointers accross a function call that could RAUW
+; we would generate an undefined value store below:
+; SCEVExpander::expandCodeFor would change a value (the start value of an
+; induction) that we cached in the induction variable list.
+
+; CHECK: test_vh
+; CHECK-NOT: store <4 x i8> undef
+
+define void @test_vh(i32* %ptr265, i32* %ptr266, i32 %sub267) {
+entry:
+  br label %loop
+
+loop:
+  %inc = phi i32 [ %sub267, %entry ], [ %add, %loop]
+  %ext.inc = sext i32 %inc to i64
+  %add.ptr265 = getelementptr inbounds i32* %ptr265, i64 %ext.inc
+  %add.ptr266 = getelementptr inbounds i32* %ptr266, i64 %ext.inc
+  %add = add i32 %inc, 9
+  %cmp = icmp slt i32 %add, 140
+  br i1 %cmp, label %block1, label %loop
+
+block1:
+  %sub267.lcssa = phi i32 [ %add, %loop ]
+  %add.ptr266.lcssa = phi i32* [ %add.ptr266, %loop ]
+  %add.ptr265.lcssa = phi i32* [ %add.ptr265, %loop ]
+  %tmp29 = bitcast i32* %add.ptr265.lcssa to i8*
+  %tmp30 = bitcast i32* %add.ptr266.lcssa to i8*
+  br label %do.body272
+
+do.body272:
+  %row_width.5 = phi i32 [ %sub267.lcssa, %block1 ], [ %dec, %do.body272 ]
+  %sp.4 = phi i8* [ %tmp30, %block1 ], [ %incdec.ptr273, %do.body272 ]
+  %dp.addr.4 = phi i8* [ %tmp29, %block1 ], [ %incdec.ptr274, %do.body272 ]
+  %incdec.ptr273 = getelementptr inbounds i8* %sp.4, i64 1
+  %tmp31 = load i8* %sp.4, align 1
+  %incdec.ptr274 = getelementptr inbounds i8* %dp.addr.4, i64 1
+  store i8 %tmp31, i8* %dp.addr.4, align 1
+  %dec = add i32 %row_width.5, -1
+  %cmp276 = icmp eq i32 %dec, 0
+  br i1 %cmp276, label %loop.exit, label %do.body272
+
+loop.exit:
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From da897215b28fe4f1f94aa5ce5cfa54c7a757064d Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Tue, 28 May 2013 18:37:09 +0000
Subject: Merging r182656:
 ------------------------------------------------------------------------
 r182656 | d0k | 2013-05-24 11:05:35 -0700 (Fri, 24 May 2013) | 3 lines

LoopVectorize: LoopSimplify can't canonicalize loops with an indirectbr in it, don't assert on those cases.

Fixes PR16139.
------------------------------------------------------------------------


git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_33@182785 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp   |  5 ++++-
 test/Transforms/LoopVectorize/lcssa-crash.ll | 11 +++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4f6f51c9bb..bc420bd6a9 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2308,7 +2308,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
 }
 
 bool LoopVectorizationLegality::canVectorize() {
-  assert(TheLoop->getLoopPreheader() && "No preheader!!");
+  // We must have a loop in canonical form. Loops with indirectbr in them cannot
+  // be canonicalized.
+  if (!TheLoop->getLoopPreheader())
+    return false;
 
   // We can only vectorize innermost loops.
   if (TheLoop->getSubLoopsVector().size())
diff --git a/test/Transforms/LoopVectorize/lcssa-crash.ll b/test/Transforms/LoopVectorize/lcssa-crash.ll
index 06b3b08aa0..de6be54849 100644
--- a/test/Transforms/LoopVectorize/lcssa-crash.ll
+++ b/test/Transforms/LoopVectorize/lcssa-crash.ll
@@ -27,3 +27,14 @@ for.end.i.i.i:
   unreachable
 }
 
+; PR16139
+define void @test2(i8* %x) {
+entry:
+  indirectbr i8* %x, [ label %L0, label %L1 ]
+
+L0:
+  br label %L0
+
+L1:
+  ret void
+}
-- 
cgit v1.2.3-70-g09d2


From 1419ad7b434bdc1e1ab805752fba1a5b7c02244d Mon Sep 17 00:00:00 2001
From: Bill Wendling <isanbard@gmail.com>
Date: Tue, 4 Jun 2013 04:35:28 +0000
Subject: Merging r183035:
 ------------------------------------------------------------------------
 r183035 | arnolds | 2013-05-31 12:53:50 -0700 (Fri, 31 May 2013) | 7 lines

LoopVectorize: PHIs with only outside users should prevent vectorization

We check that instructions in the loop don't have outside users (except if
they are reduction values). Unfortunately, we skipped this check for
if-convertable PHIs.

Fixes PR16184.
------------------------------------------------------------------------


git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_33@183189 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp       | 43 +++++++++++++++++-------
 test/Transforms/LoopVectorize/no_outside_user.ll | 41 ++++++++++++++++++++++
 2 files changed, 71 insertions(+), 13 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/no_outside_user.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index bc420bd6a9..08d372512d 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -2378,6 +2378,26 @@ bool LoopVectorizationLegality::canVectorize() {
   return true;
 }
 
+/// \brief Check that the instruction has outside loop users and is not an
+/// identified reduction variable.
+static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst,
+                               SmallPtrSet<Value *, 4> &Reductions) {
+  // Reduction instructions are allowed to have exit users. All other
+  // instructions must not have external users.
+  if (!Reductions.count(Inst))
+    //Check that all of the users of the loop are inside the BB.
+    for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+         I != E; ++I) {
+      Instruction *U = cast<Instruction>(*I);
+      // This user may be a reduction exit value.
+      if (!TheLoop->contains(U)) {
+        DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+        return true;
+      }
+    }
+  return false;
+}
+
 bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
   BasicBlock *Header = TheLoop->getHeader();
@@ -2416,8 +2436,13 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
         // If this PHINode is not in the header block, then we know that we
         // can convert it to select during if-conversion. No need to check if
         // the PHIs in this block are induction or reduction variables.
-        if (*bb != Header)
-          continue;
+        if (*bb != Header) {
+          // Check that this instruction has no outside users or is an
+          // identified reduction value with an outside user.
+          if(!hasOutsideLoopUser(TheLoop, it, AllowedExit))
+            continue;
+          return false;
+        }
 
         // We only allow if-converted PHIs with more than two incoming values.
         if (Phi->getNumIncomingValues() != 2) {
@@ -2510,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
 
       // Reduction instructions are allowed to have exit users.
       // All other instructions must not have external users.
-      if (!AllowedExit.count(it))
-        //Check that all of the users of the loop are inside the BB.
-        for (Value::use_iterator I = it->use_begin(), E = it->use_end();
-             I != E; ++I) {
-          Instruction *U = cast<Instruction>(*I);
-          // This user may be a reduction exit value.
-          if (!TheLoop->contains(U)) {
-            DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
-            return false;
-          }
-        }
+      if (hasOutsideLoopUser(TheLoop, it, AllowedExit))
+        return false;
+
     } // next instr.
 
   }
diff --git a/test/Transforms/LoopVectorize/no_outside_user.ll b/test/Transforms/LoopVectorize/no_outside_user.ll
new file mode 100644
index 0000000000..6f0357c5e5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/no_outside_user.ll
@@ -0,0 +1,41 @@
+; RUN: opt -S -loop-vectorize -force-vector-unroll=1 -force-vector-width=2 < %s | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
+
+@f = common global i32 0, align 4
+@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
+@c = common global i32 0, align 4
+@a = common global i32 0, align 4
+@b = common global i32 0, align 4
+@e = common global i32 0, align 4
+
+; We used to vectorize this loop. But it has a value that is used outside of the
+; and is not a recognized reduction variable "tmp17".
+
+; CHECK-NOT: <2 x i32>
+
+define i32 @main()  {
+bb:
+  %b.promoted = load i32* @b, align 4
+  br label %.lr.ph.i
+
+.lr.ph.i:
+  %tmp8 = phi i32 [ %tmp18, %bb16 ], [ %b.promoted, %bb ]
+  %tmp2 = icmp sgt i32 %tmp8, 10
+  br i1 %tmp2, label %bb16, label %bb10
+
+bb10:
+  br label %bb16
+
+bb16:
+  %tmp17 = phi i32 [ 0, %bb10 ], [ 1, %.lr.ph.i ]
+  %tmp18 = add nsw i32 %tmp8, 1
+  %tmp19 = icmp slt i32 %tmp18, 4
+  br i1 %tmp19, label %.lr.ph.i, label %f1.exit.loopexit
+
+f1.exit.loopexit:
+  %.lcssa = phi i32 [ %tmp17, %bb16 ]
+  ret i32 %.lcssa
+}
+
+
-- 
cgit v1.2.3-70-g09d2


From 7dfcb84fc16b3bf6b2379713b53090757f0a45f9 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard@amd.com>
Date: Mon, 8 Jul 2013 21:20:34 +0000
Subject: Fix for a regression caused by the LoopVectorizer when vectorizing
 loops with memory accesses to non-zero address spaces. It simply dropped the
 AS info. Fixes PR16306.

Merged from r184103
Author: Pekka Jaaskelainen <pekka.jaaskelainen@tut.fi>
Date:   Mon Jun 17 18:49:06 2013 +0000

git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/release_33@185869 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp         |  6 +--
 .../LoopVectorize/multiple-address-spaces.ll       | 47 ++++++++++++++++++++++
 2 files changed, 50 insertions(+), 3 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/multiple-address-spaces.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 08d372512d..11ee99ddf1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -964,7 +964,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
   unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
-
+  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
   unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy);
   unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF;
 
@@ -1039,7 +1039,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
         PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
       }
 
-      Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+      Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
       Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
     }
   }
@@ -1055,7 +1055,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
       PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
     }
 
-    Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+    Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace));
     Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
     cast<LoadInst>(LI)->setAlignment(Alignment);
     Entry[Part] = Reverse ? reverseVector(LI) :  LI;
diff --git a/test/Transforms/LoopVectorize/multiple-address-spaces.ll b/test/Transforms/LoopVectorize/multiple-address-spaces.ll
new file mode 100644
index 0000000000..70ced7c406
--- /dev/null
+++ b/test/Transforms/LoopVectorize/multiple-address-spaces.ll
@@ -0,0 +1,47 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-unroll=1 -force-vector-width=4 -dce -instcombine -S | FileCheck %s
+
+; From a simple program with two address spaces:
+; char Y[4*10000] __attribute__((address_space(1)));
+; char X[4*10000];
+; int main() {
+;    for (int i = 0; i < 4*10000; ++i)
+;        X[i] = Y[i] + 1;
+;    return 0;
+;}
+
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@Y = common addrspace(1) global [40000 x i8] zeroinitializer, align 16
+@X = common global [40000 x i8] zeroinitializer, align 16
+
+;CHECK: @main
+;CHECK: bitcast i8 addrspace(1)* %{{.*}} to <4 x i8> addrspace(1)*
+;CHECK: bitcast i8* %{{.*}} to <4 x i8>*
+
+; Function Attrs: nounwind uwtable
+define i32 @main() #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds [40000 x i8] addrspace(1)* @Y, i64 0, i64 %indvars.iv
+  %0 = load i8 addrspace(1)* %arrayidx, align 1, !tbaa !0
+  %add = add i8 %0, 1
+  %arrayidx3 = getelementptr inbounds [40000 x i8]* @X, i64 0, i64 %indvars.iv
+  store i8 %add, i8* %arrayidx3, align 1, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 40000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret i32 0
+}
+
+attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = metadata !{metadata !"omnipotent char", metadata !1}
+!1 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
cgit v1.2.3-70-g09d2