1 files changed, 666 insertions, 193 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9bbd9ab60b..f944d9b4fc 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7,15 +7,31 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This is a simple loop vectorizer. We currently only support single block
-// loops. We have a very simple and restrictive legality check: we need to read
-// and write from disjoint memory locations. We still don't have a cost model.
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizes uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iteration into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
 // This pass has three parts:
 // 1. The main loop pass that drives the different parts.
 // 2. LoopVectorizationLegality - A helper class that checks for the legality
 //    of the vectorization.
 // 3. SingleBlockLoopVectorizer - A helper class that performs the actual
 //    widening of instructions.
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
 //
 //===----------------------------------------------------------------------===//
 #define LV_NAME "loop-vectorize"
@@ -54,40 +70,49 @@ static cl::opt<unsigned>
 DefaultVectorizationFactor("default-loop-vectorize-width",
                           cl::init(4), cl::Hidden,
                           cl::desc("Set the default loop vectorization width"));
-
 namespace {
 
-/// Vectorize a simple loop. This class performs the widening of simple single
-/// basic block loops into vectors. It does not perform any
-/// vectorization-legality checks, and just does it.  It widens the vectors
-/// to a given vectorization factor (VF).
+// Forward declaration.
+class LoopVectorizationLegality;
+
+/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+///   counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+///   instructions.
+/// SingleBlockLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The SingleBlockLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
 class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
-  SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li,
+  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
                             LPPassManager *Lpm, unsigned VecWidth):
-  Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
-   Builder(0), Induction(0), OldInduction(0) { }
-
-  ~SingleBlockLoopVectorizer() {
-    delete Builder;
-  }
+  OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
-  void vectorize() {
+  void vectorize(LoopVectorizationLegality *Legal) {
     ///Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop();
+    createEmptyLoop(Legal);
     /// Widen each instruction in the old loop to a new one in the new loop.
-    vectorizeLoop();
+    /// Use the Legality module to find the induction and reduction variables.
+   vectorizeLoop(Legal);
     // register the new loop.
     cleanup();
  }
 
 private:
   /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop();
+  void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop();
+  void vectorizeLoop(LoopVectorizationLegality *Legal);
   /// Insert the new loop to the loop hierarchy and pass manager.
   void cleanup();
 
@@ -106,10 +131,6 @@ private:
   /// for each element in the vector. Starting from zero.
   Value *getConsecutiveVector(Value* Val);
 
-  /// Check that the GEP operands are all uniform except for the last index
-  /// which has to be the induction variable.
-  bool isConsecutiveGep(GetElementPtrInst *Gep);
-
   /// When we go over instructions in the basic block we rely on previous
   /// values within the current basic block or on loop invariant values.
   /// When we widen (vectorize) values we place them in the map. If the values
@@ -117,10 +138,14 @@ private:
   /// broadcast them into a vector.
   Value *getVectorValue(Value *V);
 
+  /// Get a uniform vector of constant integers. We use this to get
+  /// vectors of ones and zeros for the reduction code.
+  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+
   typedef DenseMap<Value*, Value*> ValueMap;
 
   /// The original loop.
-  Loop *Orig;
+  Loop *OrigLoop;
   // Scev analysis to use.
   ScalarEvolution *SE;
   // Loop Info.
@@ -131,10 +156,21 @@ private:
   unsigned VF;
 
   // The builder that we use
-  IRBuilder<> *Builder;
+  IRBuilder<> Builder;
 
   // --- Vectorization state ---
 
+  /// Middle Block between the vector and the scalar.
+  BasicBlock *LoopMiddleBlock;
+  ///The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  ///The vector loop body.
+  BasicBlock *LoopVectorBody;
+  ///The scalar loop body.
+  BasicBlock *LoopScalarBody;
+  ///The first bypass block.
+  BasicBlock *LoopBypassBlock;
+
   /// The new Induction variable which was added to the new block.
   PHINode *Induction;
   /// The induction variable of the old basic block.
@@ -143,14 +179,55 @@ private:
   ValueMap WidenMap;
 };
 
-/// Perform the vectorization legality check. This class does not look at the
-/// profitability of vectorization, only the legality. At the moment the checks
-/// are very simple and focus on single basic block loops with a constant
-/// iteration count and no reductions.
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+///   will change the order of memory accesses in a way that will change the
+///   correctness of the program.
+/// * Scalars checks - The code in canVectorizeBlock checks for a number
+///   of different conditions, such as the availability of a single induction
+///   variable, that all types are supported and vectorize-able, etc.
+/// This code reflects the capabilities of SingleBlockLoopVectorizer.
+/// This class is also used by SingleBlockLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
 class LoopVectorizationLegality {
 public:
   LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
-  TheLoop(Lp), SE(Se), DL(Dl) { }
+  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
+
+  /// This represents the kinds of reductions that we support.
+  /// We use the enum values to hold the 'identity' value for
+  /// each operand. This value does not change the result if applied.
+  enum ReductionKind {
+    NoReduction = -1, /// Not a reduction.
+    IntegerAdd  = 0,  /// Sum of numbers.
+    IntegerMult = 1  /// Product of numbers.
+  };
+
+  /// This POD struct holds information about reduction variables.
+  struct ReductionDescriptor {
+    // Default C'tor
+    ReductionDescriptor():
+    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+
+    // C'tor.
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
+    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+    // The starting value of the reduction.
+    // It does not have to be zero!
+    Value *StartValue;
+    // The instruction who's value is used outside the loop.
+    Instruction *LoopExitInstr;
+    // The kind of the reduction.
+    ReductionKind Kind;
+  };
+
+  /// ReductionList contains the reduction descriptors for all
+  /// of the reductions that were found in the loop.
+  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
   /// Returns the maximum vectorization factor that we *can* use to vectorize
   /// this loop. This does not mean that it is profitable to vectorize this
@@ -158,15 +235,42 @@ public:
   /// can vectorize to any SIMD width below this number.
   unsigned getLoopMaxVF();
 
+  /// Returns the Induction variable.
+  PHINode *getInduction() {return Induction;}
+
+  /// Returns the reduction variables found in the loop.
+  ReductionList *getReductionVars() { return &Reductions; }
+
+  /// Check if the pointer returned by this GEP is consecutive
+  /// when the index is vectorized. This happens when the last
+  /// index of the GEP is consecutive, like the induction variable.
+  /// This check allows us to vectorize A[idx] into a wide load/store.
+  bool isConsecutiveGep(Value *Ptr);
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
   /// and we only need to check individual instructions.
   bool canVectorizeBlock(BasicBlock &BB);
 
+  /// When we vectorize loops we may change the order in which
+  /// we read and write from memory. This method checks if it is
+  /// legal to vectorize the code, considering only memory constrains.
+  /// Returns true if BB is vectorizable
+  bool canVectorizeMemory(BasicBlock &BB);
+
   // Check if a pointer value is known to be disjoint.
   // Example: Alloca, Global, NoAlias.
-  bool isidentifiedSafeObject(Value* Val);
+  bool isIdentifiedSafeObject(Value* Val);
+
+  /// Returns True, if 'Phi' is the kind of reduction variable for type
+  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+  /// Returns true if the instruction I can be a reduction variable of type
+  /// 'Kind'.
+  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns True, if 'Phi' is an induction variable.
+  bool isInductionVariable(PHINode *Phi);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -174,6 +278,16 @@ private:
   ScalarEvolution *SE;
   /// DataLayout analysis.
   DataLayout *DL;
+
+  //  ---  vectorization state --- //
+
+  /// Holds the induction variable.
+  PHINode *Induction;
+  /// Holds the reduction variables.
+  ReductionList Reductions;
+  /// Allowed outside users. This holds the reduction
+  /// vars which can be accessed from outside the loop.
+  SmallPtrSet<Value*, 4> AllowedExit;
 };
 
 struct LoopVectorize : public LoopPass {
@@ -188,7 +302,7 @@ struct LoopVectorize : public LoopPass {
   LoopInfo *LI;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
-    // Only vectorize innermost loops.
+    // We only vectorize innermost loops.
     if (!L->empty())
       return false;
 
@@ -203,7 +317,8 @@ struct LoopVectorize : public LoopPass {
     LoopVectorizationLegality LVL(L, SE, DL);
     unsigned MaxVF = LVL.getLoopMaxVF();
 
-    // Check that we can vectorize using the chosen vectorization width.
+    // Check that we can vectorize this loop using the chosen vectorization
+    // width.
     if (MaxVF < DefaultVectorizationFactor) {
       DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
       return false;
@@ -211,9 +326,9 @@ struct LoopVectorize : public LoopPass {
 
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
 
-    // If we decided that is is *legal* to vectorizer the loop. Do it.
+    // If we decided that it is *legal* to vectorizer the loop then do it.
     SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
-    LB.vectorize();
+    LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
     return true;
@@ -222,6 +337,7 @@ struct LoopVectorize : public LoopPass {
   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
     LoopPass::getAnalysisUsage(AU);
     AU.addRequiredID(LoopSimplifyID);
+    AU.addRequiredID(LCSSAID);
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
   }
@@ -241,9 +357,9 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
   Value *UndefVal = UndefValue::get(VTy);
   // Insert the value into a new vector.
-  Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero);
+  Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
-  Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros,
+  Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
                                              "broadcast");
   // We are accessing the induction variable. Make sure to promote the
   // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
@@ -269,11 +385,11 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
   assert(Cv->getType() == Val->getType() && "Invalid consecutive vec");
-  return Builder->CreateAdd(Val, Cv, "induction");
+  return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-
-bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
+bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+  GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
     return false;
 
@@ -282,7 +398,7 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 
   // Check that all of the gep indices are uniform except for the last.
   for (unsigned i = 0; i < NumOperands - 1; ++i)
-    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig))
+    if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return false;
 
   // We can emit wide load/stores only of the last index is the induction
@@ -301,17 +417,29 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) {
 }
 
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+  assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
-  ValueMap::iterator it = WidenMap.find(V);
-  if (it != WidenMap.end())
-     return it->second;
+  Value *&MapEntry = WidenMap[V];
+  if (MapEntry)
+    return MapEntry;
 
   // Broadcast V and save the value for future uses.
   Value *B = getBroadcastInstrs(V);
-  WidenMap[V] = B;
+  MapEntry = B;
   return B;
 }
 
+Constant*
+SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+  SmallVector<Constant*, 8> Indices;
+  // Create a vector of consecutive numbers from zero to VF.
+  for (unsigned i = 0; i < VF; ++i)
+    Indices.push_back(ConstantInt::get(ScalarTy, Val));
+
+  // Add the consecutive indices to the vector value.
+  return ConstantVector::get(Indices);
+}
+
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
@@ -354,7 +482,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   if (!IsVoidRetTy)
     VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
 
-  // For each scalar that we create.
+  // For each scalar that we create:
   for (unsigned i = 0; i < VF; ++i) {
     Instruction *Cloned = Instr->clone();
     if (!IsVoidRetTy)
@@ -364,31 +492,31 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
       Value *Op = Params[op];
       // Param is a vector. Need to extract the right lane.
       if (Op->getType()->isVectorTy())
-        Op = Builder->CreateExtractElement(Op, Builder->getInt32(i));
+        Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
       Cloned->setOperand(op, Op);
     }
 
     // Place the cloned scalar in the new loop.
-    Builder->Insert(Cloned);
+    Builder.Insert(Cloned);
 
     // If the original scalar returns a value we need to place it in a vector
     // so that future users will be able to use it.
     if (!IsVoidRetTy)
-      VecResults = Builder->CreateInsertElement(VecResults, Cloned,
-                                               Builder->getInt32(i));
+      VecResults = Builder.CreateInsertElement(VecResults, Cloned,
+                                               Builder.getInt32(i));
   }
 
   if (!IsVoidRetTy)
     WidenMap[Instr] = VecResults;
 }
 
-void SingleBlockLoopVectorizer::createEmptyLoop() {
+void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-   [  ] <-- vector loop bypass.
+    [ ] <-- vector loop bypass.
   /  |
  /   v
 |   [ ]     <-- vector pre header.
@@ -405,7 +533,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 |    |
 |    v
 |   [ ] \
-|   [ ]_|   <-- old scalar loop to handle remainder. ()
+|   [ ]_|   <-- old scalar loop to handle remainder.
  \   |
   \  v
    >[ ]     <-- exit block.
@@ -413,11 +541,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
    */
 
   // This is the original scalar-loop preheader.
-  BasicBlock *BypassBlock = Orig->getLoopPreheader();
-  BasicBlock *ExitBlock = Orig->getExitBlock();
+  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
-  assert(Orig->getNumBlocks() == 1 && "Invalid loop");
+  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
   BasicBlock *VectorPH =
@@ -427,30 +555,26 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
 
   BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
                                                   "middle.block");
-
-
   BasicBlock *ScalarPH =
-          MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                       "scalar.preheader");
-
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
+                                 "scalar.preheader");
   // Find the induction variable.
-  BasicBlock *OldBasicBlock = Orig->getHeader();
-  OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin());
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+  OldInduction = Legal->getInduction();
   assert(OldInduction && "We must have a single phi node.");
   Type *IdxTy = OldInduction->getType();
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
-  Builder = new IRBuilder<>(VecBody);
-  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Generate the induction variable.
-  Induction = Builder->CreatePHI(IdxTy, 2, "index");
+  Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
   // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader());
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
   // Get the total trip count from the count by adding 1.
@@ -496,12 +620,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   MiddleBlock->getTerminator()->eraseFromParent();
 
   // Create i+1 and fill the PHINode.
-  Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next");
+  Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
   Induction->addIncoming(Zero, VectorPH);
   Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown);
-  Builder->CreateCondBr(ICmp, MiddleBlock, VecBody);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
 
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
@@ -511,36 +635,68 @@ void SingleBlockLoopVectorizer::createEmptyLoop() {
   OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
 
   // Get ready to start creating new instructions into the vectorized body.
-  Builder->SetInsertPoint(VecBody->getFirstInsertionPt());
+  Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
   // Register the new loop.
   Loop* Lp = new Loop();
-  LPM->insertLoop(Lp, Orig->getParentLoop());
+  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
 
   Lp->addBasicBlockToLoop(VecBody, LI->getBase());
 
-  Loop *ParentLoop = Orig->getParentLoop();
+  Loop *ParentLoop = OrigLoop->getParentLoop();
   if (ParentLoop) {
     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
   }
+
+  // Save the state.
+  LoopMiddleBlock = MiddleBlock;
+  LoopExitBlock = ExitBlock;
+  LoopVectorBody = VecBody;
+  LoopScalarBody = OldBasicBlock;
+  LoopBypassBlock = BypassBlock;
 }
 
-void SingleBlockLoopVectorizer::vectorizeLoop() {
-  BasicBlock &BB = *Orig->getHeader();
+void
+SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+  typedef SmallVector<PHINode*, 4> PhiVector;
+  BasicBlock &BB = *OrigLoop->getHeader();
+  Constant *Zero = ConstantInt::get(
+    IntegerType::getInt32Ty(BB.getContext()), 0);
+
+  // In order to support reduction variables we need to be able to vectorize
+  // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
+  // steages. First, we create a new vector PHI node with no incoming edges.
+  // We use this value when we vectorize all of the instructions that use the
+  // PHI. Next, after all of the instructions in the block are complete we
+  // add the new incoming edges to the PHI. At this point all of the
+  // instructions in the basic block are vectorized, so we can use them to
+  // construct the PHI.
+  PhiVector PHIsToFix;
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *Inst = it;
 
     switch (Inst->getOpcode()) {
-      case Instruction::PHI:
       case Instruction::Br:
         // Nothing to do for PHIs and BR, since we already took care of the
         // loop control flow instructions.
         continue;
-
+      case Instruction::PHI:{
+        PHINode* P = cast<PHINode>(Inst);
+        // Special handling for the induction var.
+        if (OldInduction == Inst)
+          continue;
+        // This is phase one of vectorizing PHIs.
+        // This has to be a reduction variable.
+        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
+        Type *VecTy = VectorType::get(Inst->getType(), VF);
+        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
+        PHIsToFix.push_back(P);
+        continue;
+      }
       case Instruction::Add:
       case Instruction::FAdd:
       case Instruction::Sub:
@@ -564,15 +720,27 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
         break;
       }
       case Instruction::Select: {
         // Widen selects.
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Value *B = getVectorValue(Inst->getOperand(1));
-        Value *C = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder->CreateSelect(A, B, C);
+        // If the selector is loop invariant we can create a select
+        // instruction with a scalar condition. Otherwise, use vector-select.
+        Value *Cond = Inst->getOperand(0);
+        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+
+        // The condition can be loop invariant  but still defined inside the
+        // loop. This means that we can't just use the original 'cond' value.
+        // We have to take the 'vectorized' value and pick the first lane.
+        // Instcombine will make this a no-op.
+        Cond = getVectorValue(Cond);
+        if (InvariantCond)
+          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+        Value *Op0 = getVectorValue(Inst->getOperand(1));
+        Value *Op1 = getVectorValue(Inst->getOperand(2));
+        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
         break;
       }
 
@@ -584,9 +752,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
         if (FCmp)
-          WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
         else
-          WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B);
+          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
         break;
       }
 
@@ -598,19 +766,24 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         unsigned Alignment = SI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
         // This store does not use GEPs.
-        if (!isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
 
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        unsigned NumOperands = Gep->getNumOperands();
-        Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder->Insert(Gep2);
-        Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo());
+        Gep2->setOperand(NumOperands - 1, LastIndex);
+        Ptr = Builder.Insert(Gep2);
+        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
-        Builder->CreateStore(Val, Ptr)->setAlignment(Alignment);
+        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
         break;
       }
       case Instruction::Load: {
@@ -622,18 +795,23 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
         // We don't have a gep. Scalarize the load.
-        if (!isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
 
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        unsigned NumOperands = Gep->getNumOperands();
-        Gep2->setOperand(NumOperands - 1, Induction);
-        Ptr = Builder->Insert(Gep2);
-        Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder->CreateLoad(Ptr);
+        Gep2->setOperand(NumOperands - 1, LastIndex);
+        Ptr = Builder.Insert(Gep2);
+        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+        LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
         // Use this vector value for all users of the load.
         WidenMap[Inst] = LI;
@@ -655,7 +833,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         CastInst *CI = dyn_cast<CastInst>(Inst);
         Value *A = getVectorValue(Inst->getOperand(0));
         Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy);
+        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
         break;
       }
 
@@ -665,11 +843,122 @@ void SingleBlockLoopVectorizer::vectorizeLoop() {
         break;
     }// end of switch.
   }// end of for_each instr.
+
+  // At this point every instruction in the original loop is widended to
+  // a vector form. We are almost done. Now, we need to fix the PHI nodes
+  // that we vectorized. The PHI nodes are currently empty because we did
+  // not want to introduce cycles. Notice that the remaining PHI nodes
+  // that we need to fix are reduction variables.
+
+  // Create the 'reduced' values for each of the induction vars.
+  // The reduced values are the vector values that we scalarize and combine
+  // after the loop is finished.
+  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+       it != e; ++it) {
+    PHINode *RdxPhi = *it;
+    PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
+    assert(RdxPhi && "Unable to recover vectorized PHI");
+
+    // Find the reduction variable descriptor.
+    assert(Legal->getReductionVars()->count(RdxPhi) &&
+           "Unable to find the reduction variable");
+    LoopVectorizationLegality::ReductionDescriptor RdxDesc =
+      (*Legal->getReductionVars())[RdxPhi];
+
+    // We need to generate a reduction vector from the incoming scalar.
+    // To do so, we need to generate the 'identity' vector and overide
+    // one of the elements with the incoming scalar reduction. We need
+    // to do it in the vector-loop preheader.
+    Builder.SetInsertPoint(LoopBypassBlock->getTerminator());
+
+    // This is the vector-clone of the value that leaves the loop.
+    Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
+    Type *VecTy = VectorExit->getType();
+
+    // Find the reduction identity variable. The value of the enum is the
+    // identity. Zero for addition. One for Multiplication.
+    unsigned IdentitySclr =  RdxDesc.Kind;
+    Constant *Identity = getUniformVector(IdentitySclr,
+                                          VecTy->getScalarType());
+
+    // This vector is the Identity vector where the first element is the
+    // incoming scalar reduction.
+    Value *VectorStart = Builder.CreateInsertElement(Identity,
+                                                    RdxDesc.StartValue, Zero);
+
+
+    // Fix the vector-loop phi.
+    // We created the induction variable so we know that the
+    // preheader is the first entry.
+    BasicBlock *VecPreheader = Induction->getIncomingBlock(0);
+
+    // Reductions do not have to start at zero. They can start with
+    // any loop invariant values.
+    VecRdxPhi->addIncoming(VectorStart, VecPreheader);
+    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+    VecRdxPhi->addIncoming(Val, LoopVectorBody);
+
+    // Before each round, move the insertion point right between
+    // the PHIs and the values we are going to write.
+    // This allows us to write both PHINodes and the extractelement
+    // instructions.
+    Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt());
+
+    // This PHINode contains the vectorized reduction variable, or
+    // the initial value vector, if we bypass the vector loop.
+    PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi");
+    NewPhi->addIncoming(VectorStart, LoopBypassBlock);
+    NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody);
+
+    // Extract the first scalar.
+    Value *Scalar0 =
+      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
+    // Extract and sum the remaining vector elements.
+    for (unsigned i=1; i < VF; ++i) {
+      Value *Scalar1 =
+        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+      if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
+        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+      } else {
+        Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+      }
+    }
+
+    // Now, we need to fix the users of the reduction variable
+    // inside and outside of the scalar remainder loop.
+    // We know that the loop is in LCSSA form. We need to update the
+    // PHI nodes in the exit blocks.
+    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+         LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) {
+      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+      if (!LCSSAPhi) continue;
+
+      // All PHINodes need to have a single entry edge, or two if
+      // we already fixed them.
+      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+      // We found our reduction value exit-PHI. Update it with the
+      // incoming bypass edge.
+      if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) {
+        // Add an edge coming from the bypass.
+        LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock);
+        break;
+      }
+    }// end of the LCSSA phi scan.
+
+    // Fix the scalar loop reduction variable with the incoming reduction sum
+    // from the vector body and from the backedge value.
+    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+    (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
+    (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
+  }// end of for each redux variable.
 }
 
 void SingleBlockLoopVectorizer::cleanup() {
   // The original basic block.
-  SE->forgetLoop(Orig);
+  SE->forgetLoop(OrigLoop);
 }
 
 unsigned LoopVectorizationLegality::getLoopMaxVF() {
@@ -712,55 +1001,100 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
-  // Holds the read and write pointers that we find.
-  typedef SmallVector<Value*, 10> ValueVector;
-  ValueVector Reads;
-  ValueVector Writes;
-
-  unsigned NumPhis = 0;
+  // Scan the instructions in the block and look for hazards.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
     PHINode *Phi = dyn_cast<PHINode>(I);
     if (Phi) {
-      NumPhis++;
+      // This should not happen because the loop should be normalized.
+      if (Phi->getNumIncomingValues() != 2) {
+        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+        return false;
+      }
       // We only look at integer phi nodes.
       if (!Phi->getType()->isIntegerTy()) {
         DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
         return false;
       }
 
-      // If we found an induction variable.
-      if (NumPhis > 1) {
-        DEBUG(dbgs() << "LV: Found more than one PHI.\n");
-        return false;
+      if (isInductionVariable(Phi)) {
+        if (Induction) {
+          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+          return false;
+        }
+        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
+        Induction = Phi;
+        continue;
       }
-
-      // This should not happen because the loop should be normalized.
-      if (Phi->getNumIncomingValues() != 2) {
-        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
-        return false;
+      if (AddReductionVar(Phi, IntegerAdd)) {
+        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+        continue;
       }
-
-      // Check that the PHI is consecutive and starts at zero.
-      const SCEV *PhiScev = SE->getSCEV(Phi);
-      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
-      if (!AR) {
-        DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
-        return false;
+      if (AddReductionVar(Phi, IntegerMult)) {
+        DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
+        continue;
       }
 
-      const SCEV *Step = AR->getStepRecurrence(*SE);
-      const SCEV *Start = AR->getStart();
+      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+      return false;
+    }// end of PHI handling
 
-      if (!Step->isOne() || !Start->isZero()) {
-        DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
-        return false;
-      }
+    // We still don't handle functions.
+    CallInst *CI = dyn_cast<CallInst>(I);