1 files changed, 373 insertions, 186 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index a7ef248e6e..55733f7f8a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -25,6 +25,7 @@
 // 4. LoopVectorizationCostModel - A unit that checks for the profitability
 //    of vectorization. It decides on the optimal vector width, which
 //    can be one, if vectorization is not profitable.
+//
 //===----------------------------------------------------------------------===//
 //
 // The reduction-variable vectorization is based on the paper:
@@ -36,6 +37,9 @@
 // Other ideas/concepts are from:
 //  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
 //
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
 //===----------------------------------------------------------------------===//
 #define LV_NAME "loop-vectorize"
 #define DEBUG_TYPE LV_NAME
@@ -82,6 +86,9 @@ const unsigned TinyTripCountThreshold = 16;
 /// number of pointers. Notice that the check is quadratic!
 const unsigned RuntimeMemoryCheckThreshold = 2;
 
+/// This is the highest vector width that we try to generate.
+const unsigned MaxVectorSize = 8;
+
 namespace {
 
 // Forward declarations.
@@ -106,23 +113,28 @@ class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            DominatorTree *dt, LPPassManager *Lpm,
+                            DominatorTree *Dt, DataLayout *Dl,
+                            LPPassManager *Lpm,
                             unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
+  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
   void vectorize(LoopVectorizationLegality *Legal) {
-    ///Create a new empty loop. Unlink the old loop and connect the new one.
+    // Create a new empty loop. Unlink the old loop and connect the new one.
     createEmptyLoop(Legal);
-    /// Widen each instruction in the old loop to a new one in the new loop.
-    /// Use the Legality module to find the induction and reduction variables.
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
     vectorizeLoop(Legal);
     // Register the new loop and update the analysis passes.
     updateAnalysis();
  }
 
 private:
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comperator value or NULL if no check is needed.
+  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                         Instruction *Loc);
   /// Create an empty loop, based on the loop ranges of the old loop.
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
@@ -167,6 +179,8 @@ private:
   LoopInfo *LI;
   // Dominator Tree.
   DominatorTree *DT;
+  // Data Layout.
+  DataLayout *DL;
   // Loop Pass Manager;
   LPPassManager *LPM;
   // The vectorization factor to use.
@@ -250,16 +264,46 @@ public:
   // This POD struct holds information about the memory runtime legality
   // check that a group of pointers do not overlap.
   struct RuntimePointerCheck {
+    RuntimePointerCheck(): Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
+      const SCEV *Sc = SE->getSCEV(Ptr);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+      assert(AR && "Invalid addrec expression");
+      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
+      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+      Pointers.push_back(Ptr);
+      Starts.push_back(AR->getStart());
+      Ends.push_back(ScEnd);
+    }
+
     /// This flag indicates if we need to add the runtime check.
     bool Need;
     /// Holds the pointers that we need to check.
     SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
   };
 
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
+  /// InductionList saves induction variables and maps them to the initial
+  /// value entring the loop.
+  typedef DenseMap<PHINode*, Value*> InductionList;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -271,11 +315,14 @@ public:
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
 
-  /// Check if the pointer returned by this GEP is consecutive
-  /// when the index is vectorized. This happens when the last
-  /// index of the GEP is consecutive, like the induction variable.
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
+  /// Check if this  pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or that the
+  /// pointer itself is an induction variable.
   /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutiveGep(Value *Ptr);
+  bool isConsecutivePtr(Value *Ptr);
 
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
@@ -317,10 +364,16 @@ private:
 
   //  ---  vectorization state --- //
 
-  /// Holds the induction variable.
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
   PHINode *Induction;
   /// Holds the reduction variables.
   ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
   /// Allowed outside users. This holds the reduction
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value*, 4> AllowedExit;
@@ -350,7 +403,7 @@ public:
   /// Returns the most profitable vectorization factor for the loop that is
   /// smaller or equal to the VF argument. This method checks every power
   /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = 8);
+  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
 
 private:
   /// Returns the expected execution cost. The unit of the cost does
@@ -438,7 +491,7 @@ struct LoopVectorize : public LoopPass {
           "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -459,10 +512,6 @@ struct LoopVectorize : public LoopPass {
 };
 
 Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
-  // Instructions that access the old induction variable
-  // actually want to get the new one.
-  if (V == OldInduction)
-    V = Induction;
   // Create the types.
   LLVMContext &C = V->getContext();
   Type *VTy = VectorType::get(V->getType(), VF);
@@ -502,7 +551,14 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
   return Builder.CreateAdd(Val, Cv, "induction");
 }
 
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+  assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+
+  // If this pointer is an induction variable, return it.
+  PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+  if (Phi && getInductionVars()->count(Phi))
+    return true;
+
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
     return false;
@@ -549,13 +605,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
 
 Constant*
 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
-  SmallVector<Constant*, 8> Indices;
-  // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
-
-  // Add the consecutive indices to the vector value.
-  return ConstantVector::get(Indices);
+  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
@@ -569,7 +619,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
     // If we are accessing the old induction variable, use the new one.
     if (SrcOp == OldInduction) {
-      Params.push_back(getBroadcastInstrs(Induction));
+      Params.push_back(getVectorValue(Induction));
       continue;
     }
 
@@ -628,6 +678,67 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
+Value*
+SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+                                           Instruction *Loc) {
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+    Legal->getRuntimePointerCheck();
+
+  if (!PtrRtCheck->Need)
+    return NULL;
+
+  Value *MemoryRuntimeCheck = 0;
+  unsigned NumPointers = PtrRtCheck->Pointers.size();
+  SmallVector<Value* , 2> Starts;
+  SmallVector<Value* , 2> Ends;
+
+  SCEVExpander Exp(*SE, "induction");
+
+  // Use this type for pointer arithmetic.
+  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+  for (unsigned i=0; i < NumPointers; ++i) {
+    Value *Ptr = PtrRtCheck->Pointers[i];
+    const SCEV *Sc = SE->getSCEV(Ptr);
+
+    if (SE->isLoopInvariant(Sc, OrigLoop)) {
+      DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+            *Ptr <<"\n");
+      Starts.push_back(Ptr);
+      Ends.push_back(Ptr);
+    } else {
+      DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
+                                       PtrArithTy, Loc);
+      Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+      Starts.push_back(Start);
+      Ends.push_back(End);
+    }
+  }
+
+  for (unsigned i = 0; i < NumPointers; ++i) {
+    for (unsigned j = i+1; j < NumPointers; ++j) {
+      Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[i], Ends[j], "bound0", Loc);
+      Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                    Starts[j], Ends[i], "bound1", Loc);
+      Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+                                                 "found.conflict", Loc);
+      if (MemoryRuntimeCheck)
+        MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+                                                    MemoryRuntimeCheck,
+                                                    IsConflict,
+                                                    "conflict.rdx", Loc);
+      else
+        MemoryRuntimeCheck = IsConflict;
+
+    }
+  }
+
+  return MemoryRuntimeCheck;
+}
+
 void
 SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
@@ -659,9 +770,18 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    ...
    */
 
+  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
+  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
+  assert(ExitBlock && "Must have an exit block");
+
+  // Some loops have a single integer induction variable, while other loops
+  // don't. One example is c++ iterators that often have multiple pointer
+  // induction variables. In the code below we also support a case where we
+  // don't have a single induction variable.
   OldInduction = Legal->getInduction();
-  assert(OldInduction && "We must have a single phi node.");
-  Type *IdxTy = OldInduction->getType();
+  Type *IdxTy = OldInduction ? OldInduction->getType() :
+    DL->getIntPtrType(SE->getContext());
 
   // Find the loop boundaries.
   const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
@@ -670,35 +790,42 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Get the total trip count from the count by adding 1.
   ExitCount = SE->getAddExpr(ExitCount,
                              SE->getConstant(ExitCount->getType(), 1));
-  // We may need to extend the index in case there is a type mismatch.
-  // We know that the count starts at zero and does not overflow.
-  // We are using Zext because it should be less expensive.
-  if (ExitCount->getType() != IdxTy)
-    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
 
-  // This is the original scalar-loop preheader.
-  BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
-  BasicBlock *ExitBlock = OrigLoop->getExitBlock();
-  assert(ExitBlock && "Must have an exit block");
+  // Expand the trip count and place the new instructions in the preheader.
+  // Notice that the pre-header does not change, only the loop body.
+  SCEVExpander Exp(*SE, "induction");
 
-  // The loop index does not have to start at Zero. It starts with this value.
-  Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+  // Count holds the overall loop count (N).
+  Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+                                   BypassBlock->getTerminator());
+
+  // The loop index does not have to start at Zero. Find the original start
+  // value from the induction PHI node. If we don't have an induction variable
+  // then we know that it starts at zero.
+  Value *StartIdx = OldInduction ?
+    OldInduction->getIncomingValueForBlock(BypassBlock):
+    ConstantInt::get(IdxTy, 0);
 
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
+  // Generate the code that checks in runtime if arrays overlap.
+  Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
+                                              BypassBlock->getTerminator());
+
+  // Split the single block loop into the two loop structure described above.
   BasicBlock *VectorPH =
       BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
-  BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
-                                                 "vector.body");
-
-  BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
-                                                  "middle.block");
+  BasicBlock *VecBody =
+    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  BasicBlock *MiddleBlock =
+    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
   BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
-                                 "scalar.preheader");
-  // Find the induction variable.
-  BasicBlock *OldBasicBlock = OrigLoop->getHeader();
+    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+  // This is the location in which we add all of the logic for bypassing
+  // the new vector loop.
+  Instruction *Loc = BypassBlock->getTerminator();
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
@@ -708,13 +835,16 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
-  // Expand the trip count and place the new instructions in the preheader.
-  // Notice that the pre-header does not change, only the loop body.
-  SCEVExpander Exp(*SE, "induction");
-  Instruction *Loc = BypassBlock->getTerminator();
-
-  // Count holds the overall loop count (N).
-  Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  if (Count->getType() != IdxTy) {
+    // The exit count can be of pointer type. Convert it to the correct
+    // integer type.
+    if (ExitCount->getType()->isPointerTy())
+      Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+    else
+      Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+  }
 
   // Add the start index to the loop count to get the new end index.
   Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
@@ -727,84 +857,79 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
                                                      "end.idx.rnd.down", Loc);
 
-  // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+  // Now, compare the new count to zero. If it is zero skip the vector loop and
+  // jump to the scalar loop.
   Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
                                IdxEndRoundDown,
                                StartIdx,
                                "cmp.zero", Loc);
 
-  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
-    Legal->getRuntimePointerCheck();
-  Value *MemoryRuntimeCheck = 0;
-  if (PtrRtCheck->Need) {
-    unsigned NumPointers = PtrRtCheck->Pointers.size();
-    SmallVector<Value* , 2> Starts;
-    SmallVector<Value* , 2> Ends;
-
-    // Use this type for pointer arithmetic.
-    Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
-
-    for (unsigned i=0; i < NumPointers; ++i) {
-      Value *Ptr = PtrRtCheck->Pointers[i];
-      const SCEV *Sc = SE->getSCEV(Ptr);
-
-      if (SE->isLoopInvariant(Sc, OrigLoop)) {
-        DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
-              *Ptr <<"\n");
-        Starts.push_back(Ptr);
-        Ends.push_back(Ptr);
-      } else {
-        DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
-        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
-        Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
-        const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
-        const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
-        assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
-        Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
-        Starts.push_back(Start);
-        Ends.push_back(End);
-      }
-    }
-
-    for (unsigned i=0; i < NumPointers; ++i) {
-      for (unsigned j=i+1; j < NumPointers; ++j) {
-        Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                      Starts[0], Ends[1], "bound0", Loc);
-        Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                      Starts[1], Ends[0], "bound1", Loc);
-        Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
-                                                    "found.conflict", Loc);
-        if (MemoryRuntimeCheck) {
-          MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
-                                                      MemoryRuntimeCheck,
-                                                      IsConflict,
-                                                      "conflict.rdx", Loc);
-        } else {
-          MemoryRuntimeCheck = IsConflict;
-        }
-      }
-    }
-  }// end of need-runtime-check code.
-
   // If we are using memory runtime checks, include them in.
-  if (MemoryRuntimeCheck) {
+  if (MemoryRuntimeCheck)
     Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
                                  "CntOrMem", Loc);
-  }
 
   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
   // Remove the old terminator.
   Loc->eraseFromParent();
 
   // We are going to resume the execution of the scalar loop.
-  // This PHI decides on what number to start. If we come from the
-  // vector loop then we need to start with the end index minus the
-  // index modulo VF. If we come from a bypass edge then we need to start
-  // from the real start.
-  PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
-                                         MiddleBlock->getTerminator());
-  ResumeIndex->addIncoming(StartIdx, BypassBlock);
-  ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  // Go over all of the induction variables that we found and fix the
+  // PHIs that are left in the scalar version of the loop.
+  // The starting values of PHI nodes depend on the counter of the last
+  // iteration in the vectorized loop.
+  // If we come from a bypass edge then we need to start from the original start
+  // value.
+
+  // This variable saves the new starting index for the scalar loop.
+  PHINode *ResumeIndex = 0;
+  LoopVectorizationLegality::InductionList::iterator I, E;
+  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  for (I = List->begin(), E = List->end(); I != E; ++I) {
+    PHINode *OrigPhi = I->first;
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+                                           MiddleBlock->getTerminator());
+    Value *EndValue = 0;
+    if (OrigPhi->getType()->isIntegerTy()) {
+      // Handle the integer induction counter:
+      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+    } else {
+      // For pointer induction variables, calculate the offset using
+      // the end index.
+      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
+                                           "ptr.ind.end",
+                                           BypassBlock->getTerminator());
+    }
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(EndValue, VecBody);
+
+    // Fix the scalar body counter (PHI node).
+    unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+  }
+
+  // If we are generating a new induction variable then we also need to
+  // generate the code that calculates the exit value. This value is not
+  // simply the end of the counter because we may skip the vectorized body
+  // in case of a runtime check.
+  if (!OldInduction){
+    assert(!ResumeIndex && "Unexpected resume value found");
+    ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+                                  MiddleBlock->getTerminator());
+    ResumeIndex->addIncoming(StartIdx, BypassBlock);
+    ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+  }
+
+  // Make sure that we found the index where scalar loop needs to continue.
+  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+         "Invalid resume Index");
 
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
@@ -828,10 +953,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
 
-  // Fix the scalar body iteration count.
-  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
-
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
@@ -901,7 +1022,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // add the new incoming edges to the PHI. At this point all of the
   // instructions in the basic block are vectorized, so we can use them to
   // construct the PHI.
-  PhiVector PHIsToFix;
+  PhiVector RdxPHIsToFix;
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -914,15 +1035,53 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         continue;
       case Instruction::PHI:{
         PHINode* P = cast<PHINode>(Inst);
-        // Special handling for the induction var.
-        if (OldInduction == Inst)
+        // Handle reduction variables:
+        if (Legal->getReductionVars()->count(P)) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(Inst->getType(), VF);
+          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
+                                  LoopVectorBody->getFirstInsertionPt());
+          RdxPHIsToFix.push_back(P);
+          continue;
+        }
+
+        // This PHINode must be an induction variable.
+        // Make sure that we know about it.
+        assert(Legal->getInductionVars()->count(P) &&
+               "Not an induction variable");
+
+        if (P->getType()->isIntegerTy()) {
+          assert(P == OldInduction && "Unexpected PHI");
+          WidenMap[Inst] = getBroadcastInstrs(Induction);
           continue;
-        // This is phase one of vectorizing PHIs.
-        // This has to be a reduction variable.
-        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
-        Type *VecTy = VectorType::get(Inst->getType(), VF);
-        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
-        PHIsToFix.push_back(P);
+        }
+
+        // Handle pointer inductions.
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+        Value *StartIdx = OldInduction ?
+          Legal->getInductionVars()->lookup(OldInduction) :
+          ConstantInt::get(Induction->getType(), 0);
+
+        // This is the pointer value coming into the loop.
+        Value *StartPtr = Legal->getInductionVars()->lookup(P);
+
+        // This is the normalized GEP that starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // This is the vector of results. Notice that we don't generate vector
+        // geps because scalar geps result in better code.
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          Constant *Idx = ConstantInt::get(Induction->getType(), i);
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
+          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
+        }
+
+        WidenMap[Inst] = VecVal;
         continue;
       }
       case Instruction::Add:
@@ -1010,21 +1169,27 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
         // This store does not use GEPs.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        if (!Legal->isConsecutivePtr(Ptr)) {
           scalarizeInstruction(Inst);
           break;
         }
 
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
         Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
         Value *Val = getVectorValue(SI->getValueOperand());
         Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
@@ -1038,23 +1203,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         unsigned Alignment = LI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
-        // If we don't have a gep, or that the pointer is loop invariant,
+        // If the pointer is loop invariant or if it is non consecutive,
         // scalarize the load.
-        if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
+        bool Con = Legal->isConsecutivePtr(Ptr);
+        if (Legal->isUniform(Ptr) || !Con) {
           scalarizeInstruction(Inst);
           break;
         }
 
-        // The last index does not have to be the induction. It can be
-        // consecutive and be a function of the index. For example A[I+1];
-        unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+        if (Gep) {
+          // The last index does not have to be the induction. It can be
+          // consecutive and be a function of the index. For example A[I+1];
+          unsigned NumOperands = Gep->getNumOperands();
+          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+          // Create the new GEP with the new induction variable.
+          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+          Gep2->setOperand(NumOperands - 1, LastIndex);
+          Ptr = Builder.Insert(Gep2);
+        } else {
+          // Use the induction element ptr.
+          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+        }
 
-        // Create the new GEP with the new induction variable.
-        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-        Gep2->setOperand(NumOperands - 1, LastIndex);
-        Ptr = Builder.Insert(Gep2);
         Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
         LI = Builder.CreateLoad(Ptr);
         LI->setAlignment(Alignment);
@@ -1098,7 +1271,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // Create the 'reduced' values for each of the induction vars.
   // The reduced values are the vector values that we scalarize and combine
   // after the loop is finished.
-  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
        it != e; ++it) {
     PHINode *RdxPhi = *it;
     PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
@@ -1130,7 +1303,6 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);
 
-
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
     // preheader is the first entry.
@@ -1236,7 +1408,7 @@ bool LoopVectorizationLegality::canVectorize() {
   if (!TheLoop->getLoopPreheader()) {
     assert(false && "No preheader!!");
     DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
-    return  false;
+    return false;
   }
 
   // We can only vectorize single basic block loops.
@@ -1282,23 +1454,34 @@ bool LoopVectorizationLegality::canVectorize() {
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+
+  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+
   // Scan the instructions in the block and look for hazards.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
-    PHINode *Phi = dyn_cast<PHINode>(I);
-    if (Phi) {
+    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
       // This should not happen because the loop should be normalized.
       if (Phi->getNumIncomingValues() != 2) {
         DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
         return false;
       }
-      // We only look at integer phi nodes.
-      if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+
+      // This is the value coming from the preheader.
+      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+      // We only look at integer and pointer phi nodes.
+      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+        Inductions[Phi] = StartValue;
+        continue;
+      } else if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
         return false;
       }
 
+      // Handle integer PHIs:
       if (isInductionVariable(Phi)) {
         if (Induction) {
           DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
@@ -1306,6 +1489,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         }
         DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
         Induction = Phi;
+        Inductions[Phi] = StartValue;
         continue;
       }
       if (AddReductionVar(Phi, IntegerAdd)) {
@@ -1364,8 +1548,8 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   } // next instr.
 
   if (!Induction) {
-      DEBUG(dbgs() << "LV: Did not find an induction var.\n");
-      return false;
+    DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+    assert(getInductionVars()->size() && "No induction variables");
   }
 
   // Don't vectorize if the memory dependencies do not allow vectorization.
@@ -1382,15 +1566,10 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
   while (Worklist.size()) {
     Instruction *I = dyn_cast<Instruction>(Worklist.back());
     Worklist.pop_back();
-    // Look at instructions inside this block.
-    if (!I) continue;
-    if (I->getParent() != &BB) continue;
 
-    // Stop when reaching PHI nodes.
-    if (isa<PHINode>(I)) {
-      assert(I == Induction && "Found a uniform PHI that is not the induction");
-      break;
-    }
+    // Look at instructions inside this block. Stop when reaching PHI nodes.
+    if (!I || I->getParent() != &BB || isa<PHINode>(I))
+      continue;
 
     // This is a known uniform.
     Uniforms.insert(I);
@@ -1493,7 +1672,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     // If the address of i is unknown (for example A[B[i]]) then we may
     // read a few words, modify, and write a few words, and some of the
     // words may be written to the same address.
-    if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+    if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
       Reads.push_back(Ptr);
   }
 
@@ -1509,7 +1688,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   bool RT = true;
   for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
-      PtrRtCheck.Pointers.push_back(*I);
+      PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
       RT = false;
@@ -1517,7 +1696,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     }
   for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
     if (hasComputableBounds(*I)) {
-      PtrRtCheck.Pointers.push_back(*I);
+      PtrRtCheck.insert(SE, TheLoop, *I);
       DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
     } else {
       RT = false;
@@ -1527,7 +1706,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   // Check that we did not collect too many pointers or found a
   // unsizeable pointer.
   if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
-    PtrRtCheck.Pointers.clear();
+    PtrRtCheck.reset();
     RT = false;
   }
 
@@ -1582,8 +1761,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
 
   // It is safe to vectorize and we don't need any runtime checks.
   DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
-  PtrRtCheck.Pointers.clear();
-  PtrRtCheck.Need = false;
+  PtrRtCheck.reset();
   return true;
 }
 
@@ -1677,8 +1855,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     case Instruction::Sub:
       return Kind == IntegerAdd;
     case Instruction::Mul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
       return Kind == IntegerMult;
     case Instruction::And:
       return Kind == IntegerAnd;
@@ -1690,6 +1866,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
 }
 
 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer inductions variables.
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+    return false;
+
   // Check that the PHI is consecutive and starts at zero.
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddR