1 files changed, 311 insertions, 67 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 892808760f..3f1d82cf5b 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -78,6 +78,10 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
 /// We don't vectorize loops with a known constant trip count below this number.
 const unsigned TinyTripCountThreshold = 16;
 
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
 namespace {
 
 // Forward declarations.
@@ -114,7 +118,7 @@ public:
     /// Widen each instruction in the old loop to a new one in the new loop.
     /// Use the Legality module to find the induction and reduction variables.
     vectorizeLoop(Legal);
-    // register the new loop.
+    // Register the new loop and update the analysis passes.
     updateAnalysis();
  }
 
@@ -123,7 +127,8 @@ private:
   void createEmptyLoop(LoopVectorizationLegality *Legal);
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
-  /// Insert the new loop to the loop hierarchy and pass manager.
+  /// Insert the new loop to the loop hierarchy and pass manager
+  /// and update the analysis passes.
   void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
@@ -242,10 +247,23 @@ public:
     ReductionKind Kind;
   };
 
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+  };
+
   /// ReductionList contains the reduction descriptors for all
   /// of the reductions that were found in the loop.
   typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
 
+  /// InductionList saves induction variables and maps them to the initial
+  /// value entring the loop.
+  typedef DenseMap<PHINode*, Value*> InductionList;
+
   /// Returns true if it is legal to vectorize this loop.
   /// This does not mean that it is profitable to vectorize this
   /// loop, only that it is legal to do so.
@@ -257,15 +275,23 @@ public:
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
 
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
   /// Check if the pointer returned by this GEP is consecutive
   /// when the index is vectorized. This happens when the last
   /// index of the GEP is consecutive, like the induction variable.
   /// This check allows us to vectorize A[idx] into a wide load/store.
   bool isConsecutiveGep(Value *Ptr);
 
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
+
   /// Returns true if this instruction will remain scalar after vectorization.
   bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
 
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -286,6 +312,8 @@ private:
   bool isReductionInstr(Instruction *I, ReductionKind Kind);
   /// Returns True, if 'Phi' is an induction variable.
   bool isInductionVariable(PHINode *Phi);
+  /// Return true if can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
 
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -296,16 +324,25 @@ private:
 
   //  ---  vectorization state --- //
 
-  /// Holds the induction variable.
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
   PHINode *Induction;
   /// Holds the reduction variables.
   ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
   /// Allowed outside users. This holds the reduction
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value*, 4> AllowedExit;
   /// This set holds the variables which are known to be uniform after
   /// vectorization.
   SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -506,6 +543,10 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
   return false;
 }
 
+bool LoopVectorizationLegality::isUniform(Value *V) {
+  return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
 Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
@@ -521,13 +562,7 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
 
 Constant*
 SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
-  SmallVector<Constant*, 8> Indices;
-  // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
-
-  // Add the consecutive indices to the vector value.
-  return ConstantVector::get(Indices);
+  return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
 void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
@@ -631,13 +666,29 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
    ...
    */
 
+  OldInduction = Legal->getInduction();
+  assert(OldInduction && "We must have a single phi node.");
+  Type *IdxTy = OldInduction->getType();
+
+  // Find the loop boundaries.
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+  // Get the total trip count from the count by adding 1.
+  ExitCount = SE->getAddExpr(ExitCount,
+                             SE->getConstant(ExitCount->getType(), 1));
+  // We may need to extend the index in case there is a type mismatch.
+  // We know that the count starts at zero and does not overflow.
+  // We are using Zext because it should be less expensive.
+  if (ExitCount->getType() != IdxTy)
+    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
   // This is the original scalar-loop preheader.
   BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
   // The loop index does not have to start at Zero. It starts with this value.
-  OldInduction = Legal->getInduction();
   Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
 
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
@@ -655,8 +706,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                  "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  assert(OldInduction && "We must have a single phi node.");
-  Type *IdxTy = OldInduction->getType();
 
   // Use this IR builder to create the loop instructions (Phi, Br, Cmp)
   // inside the loop.
@@ -666,25 +715,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
-  // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
-  assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
-  // Get the total trip count from the count by adding 1.
-  ExitCount = SE->getAddExpr(ExitCount,
-                             SE->getConstant(ExitCount->getType(), 1));
-
   // Expand the trip count and place the new instructions in the preheader.
   // Notice that the pre-header does not change, only the loop body.
   SCEVExpander Exp(*SE, "induction");
   Instruction *Loc = BypassBlock->getTerminator();
 
-  // We may need to extend the index in case there is a type mismatch.
-  // We know that the count starts at zero and does not overflow.
-  // We are using Zext because it should be less expensive.
-  if (ExitCount->getType() != Induction->getType())
-    ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
-
   // Count holds the overall loop count (N).
   Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
 
@@ -704,15 +739,121 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                IdxEndRoundDown,
                                StartIdx,
                                "cmp.zero", Loc);
+
+  LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+    Legal->getRuntimePointerCheck();
+  Value *MemoryRuntimeCheck = 0;
+  if (PtrRtCheck->Need) {
+    unsigned NumPointers = PtrRtCheck->Pointers.size();
+    SmallVector<Value* , 2> Starts;
+    SmallVector<Value* , 2> Ends;
+
+    // Use this type for pointer arithmetic.
+    Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+    for (unsigned i=0; i < NumPointers; ++i) {
+      Value *Ptr = PtrRtCheck->Pointers[i];
+      const SCEV *Sc = SE->getSCEV(Ptr);
+
+      if (SE->isLoopInvariant(Sc, OrigLoop)) {
+        DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+              *Ptr <<"\n");
+        Starts.push_back(Ptr);
+        Ends.push_back(Ptr);
+      } else {
+        DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+        const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+        Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
+        const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+        const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+        assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
+        Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
+        Starts.push_back(Start);
+        Ends.push_back(End);
+      }
+    }
+
+    for (unsigned i=0; i < NumPointers; ++i) {
+      for (unsigned j=i+1; j < NumPointers; ++j) {
+        Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                      Starts[0], Ends[1], "bound0", Loc);
+        Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+                                      Starts[1], Ends[0], "bound1", Loc);
+        Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+                                                    "found.conflict", Loc);
+        if (MemoryRuntimeCheck) {
+          MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+                                                      MemoryRuntimeCheck,
+                                                      IsConflict,
+                                                      "conflict.rdx", Loc);
+        } else {
+          MemoryRuntimeCheck = IsConflict;
+        }
+      }
+    }
+  }// end of need-runtime-check code.
+
+  // If we are using memory runtime checks, include them in.
+  if (MemoryRuntimeCheck) {
+    Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+                                 "CntOrMem", Loc);
+  }
+
   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
   // Remove the old terminator.
   Loc->eraseFromParent();
 
+  // We are going to resume the execution of the scalar loop.
+  // Go over all of the induction variables that we found and fix the
+  // PHIs that are left in the scalar version of the loop.
+  // The starting values of PHI nodes depend on the counter of the last
+  // iteration in the vectorized loop.
+  // If we come from a bypass edge then we need to start from the original start
+  // value.
+
+  // This variable saves the new starting index for the scalar loop.
+  Value *ResumeIndex = 0;
+  LoopVectorizationLegality::InductionList::iterator I, E;
+  LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+  for (I = List->begin(), E = List->end(); I != E; ++I) {
+    PHINode *OrigPhi = I->first;
+    PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+                                           MiddleBlock->getTerminator());
+    Value *EndValue = 0;
+    if (OrigPhi->getType()->isIntegerTy()) {
+      // Handle the integer induction counter:
+      assert(OrigPhi == OldInduction && "Unknown integer PHI");
+      // We know what the end value is.
+      EndValue = IdxEndRoundDown;
+      // We also know which PHI node holds it.
+      ResumeIndex = ResumeVal;
+    } else {
+      // For pointer induction variables, calculate the offset using
+      // the end index.
+      EndValue = GetElementPtrInst::Create(I->second, IdxEndRoundDown,
+                                           "ptr.ind.end",
+                                           BypassBlock->getTerminator());
+    }
+
+    // The new PHI merges the original incoming value, in case of a bypass,
+    // or the value at the end of the vectorized loop.
+    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(EndValue, VecBody);
+
+    // Fix the scalar body counter (PHI node).
+    unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
+    OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+  }
+
+  // Make sure that we found the index where scalar loop needs to continue.
+  assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+         "Invalid resume Index");
+
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
   Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
-                                IdxEndRoundDown, "cmp.n",
+                                ResumeIndex, "cmp.n",
                                 MiddleBlock->getTerminator());
 
   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -730,10 +871,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Now we have two terminators. Remove the old one from the block.
   VecBody->getTerminator()->eraseFromParent();
 
-  // Fix the scalar body iteration count.
-  unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
-
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
@@ -803,7 +940,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // add the new incoming edges to the PHI. At this point all of the
   // instructions in the basic block are vectorized, so we can use them to
   // construct the PHI.
-  PhiVector PHIsToFix;
+  PhiVector RdxPHIsToFix;
 
   // For each instruction in the old loop.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -819,13 +956,40 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // Special handling for the induction var.
         if (OldInduction == Inst)
           continue;
-        // This is phase one of vectorizing PHIs.
-        // This has to be a reduction variable.
-        assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
-        Type *VecTy = VectorType::get(Inst->getType(), VF);
-        WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
-        PHIsToFix.push_back(P);
-        continue;
+
+        // Handle reduction variables:
+        if (Legal->getReductionVars()->count(P)) {
+          // This is phase one of vectorizing PHIs.
+          Type *VecTy = VectorType::get(Inst->getType(), VF);
+          WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
+          RdxPHIsToFix.push_back(P);
+          continue;
+        }
+
+        // Handle pointer inductions:
+        if (Legal->getInductionVars()->count(P)) {
+          Value *StartIdx = Legal->getInductionVars()->lookup(OldInduction);
+          Value *StartPtr = Legal->getInductionVars()->lookup(P);
+          // This is the normalized GEP that starts counting at zero.
+          Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                   "normalized.idx");
+          // This is the first GEP in the sequence.
+          Value *FirstGep = Builder.CreateGEP(StartPtr, NormalizedIdx,
+                                              "induc.ptr");
+          // This is the vector of results. Notice that we don't generate vector
+          // geps because scalar geps result in better code.
+          Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+          for (unsigned int i = 0; i < VF; ++i) {
+            Value *SclrGep = Builder.CreateGEP(FirstGep, Builder.getInt32(i),
+                                               "next.gep");
+            VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                                 Builder.getInt32(i),
+                                                 "insert.gep");
+          }
+
+          WidenMap[Inst] = VecVal;
+          continue;
+        }
       }
       case Instruction::Add:
       case Instruction::FAdd:
@@ -905,7 +1069,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
         Value *Ptr = SI->getPointerOperand();
         unsigned Alignment = SI->getAlignment();
+
+        assert(!Legal->isUniform(Ptr) &&
+               "We do not allow storing to uniform addresses");
+
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
         // This store does not use GEPs.
         if (!Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
@@ -935,8 +1104,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         unsigned Alignment = LI->getAlignment();
         GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
 
-        // We don't have a gep. Scalarize the load.
-        if (!Legal->isConsecutiveGep(Gep)) {
+        // If we don't have a gep, or that the pointer is loop invariant,
+        // scalarize the load.
+        if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
           scalarizeInstruction(Inst);
           break;
         }
@@ -994,7 +1164,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // Create the 'reduced' values for each of the induction vars.
   // The reduced values are the vector values that we scalarize and combine
   // after the loop is finished.
-  for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+  for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
        it != e; ++it) {
     PHINode *RdxPhi = *it;
     PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
@@ -1026,7 +1196,6 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *VectorStart = Builder.CreateInsertElement(Identity,
                                                     RdxDesc.StartValue, Zero);
 
-
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
     // preheader is the first entry.
@@ -1146,12 +1315,6 @@ bool LoopVectorizationLegality::canVectorize() {
   BasicBlock *BB = TheLoop->getHeader();
   DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
 
-  // Go over each instruction and look at memory deps.
-  if (!canVectorizeBlock(*BB)) {
-    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
-    return false;
-  }
-
   // ScalarEvolution needs to be able to find the exit count.
   const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
   if (ExitCount == SE->getCouldNotCompute()) {
@@ -1167,7 +1330,15 @@ bool LoopVectorizationLegality::canVectorize() {
     return false;
   }
 
-  DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+  // Go over each instruction and look at memory deps.
+  if (!canVectorizeBlock(*BB)) {
+    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+    return false;
+  }
+
+  DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+        (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+        <<"!\n");
 
   // Okay! We can vectorize. At this point we don't have any other mem analysis
   // which may limit our maximum vectorization factor, so just return true with
@@ -1176,23 +1347,33 @@ bool LoopVectorizationLegality::canVectorize() {
 }
 
 bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
+  BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+
   // Scan the instructions in the block and look for hazards.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
     Instruction *I = it;
 
-    PHINode *Phi = dyn_cast<PHINode>(I);
-    if (Phi) {
+    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
       // This should not happen because the loop should be normalized.
       if (Phi->getNumIncomingValues() != 2) {
         DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
         return false;
       }
-      // We only look at integer phi nodes.
-      if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
+
+      // This is the value coming from the preheader.
+      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+
+      // We only look at integer and pointer phi nodes.
+      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
+        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
+        Inductions[Phi] = StartValue;
+        continue;
+      } else if (!Phi->getType()->isIntegerTy()) {
+        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
         return false;
       }
 
+      // Handle integer PHIs:
       if (isInductionVariable(Phi)) {
         if (Induction) {
           DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
@@ -1200,6 +1381,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
         }
         DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
         Induction = Phi;
+        Inductions[Phi] = StartValue;
         continue;
       }
       if (AddReductionVar(Phi, IntegerAdd)) {
@@ -1304,6 +1486,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   // Holds the Load and Store *instructions*.
   ValueVector Loads;
   ValueVector Stores;
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
 
   // Scan the BB and collect legal loads and stores.
   for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -1361,6 +1545,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     StoreInst *ST = dyn_cast<StoreInst>(*I);
     assert(ST && "Bad StoreInst");
     Value* Ptr = ST->getPointerOperand();
+
+    if (isUniform(Ptr)) {
+      DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+      return false;
+    }
+
     // If we did *not* see this pointer before, insert it to
     // the read-write list. At this phase it is only a 'write' list.
     if (Seen.insert(Ptr))
@@ -1390,6 +1580,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
     return true;
   }
 
+  // Find pointers with computable bounds. We are going to use this information
+  // to place a runtime bound check.
+  bool RT = true;
+  for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.Pointers.push_back(*I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+  for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+    if (hasComputableBounds(*I)) {
+      PtrRtCheck.Pointers.push_back(*I);
+      DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+    } else {
+      RT = false;
+      break;
+    }
+
+  // Check that we did not collect too many pointers or found a
+  // unsizeable pointer.
+  if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+    PtrRtCheck.Pointers.clear();
+    RT = false;
+  }
+
+  PtrRtCheck.Need = RT;
+
+  if (RT) {
+    DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+  }
+
   // Now that the pointers are in two lists (Reads and ReadWrites), we
   // can check that there are no conflicts between each of the writes and
   // between the writes to the reads.
@@ -1404,12 +1627,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
-        return false;
+        return RT;
       }
       if (!WriteObjects.insert(*it)) {
         DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
               << **it <<"\n");
-        return false;
+        return RT;
       }
     }
     TempObjects.clear();
@@ -1422,18 +1645,21 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
          it != e; ++it) {
       if (!isIdentifiedObject(*it)) {
         DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
-        return false;
+        return RT;
       }
       if (WriteObjects.count(*it)) {
         DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
               << **it <<"\n");
-        return false;
+        return RT;
       }
     }
     TempObjects.clear();
   }
 
-  // All is okay.
+  // It is safe to vectorize and we don't need any runtime checks.
+  DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+  PtrRtCheck.Pointers.clear();
+  PtrRtCheck.Need = false;
   return true;
 }
 
@@ -1527,8 +1753,6 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
     case Instruction::Sub:
       return Kind == IntegerAdd;
     case Instruction::Mul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
       return Kind == IntegerMult;
     case Instruction::And:
       return Kind == IntegerAnd;
@@ -1540,6 +1764,11 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
 }
 
 bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+  Type *PhiTy = Phi->getType();
+  // We only handle integer and pointer inductions variables.
+  if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+    return false;
+
   // Check that the PHI is consecutive and starts at zero.
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
@@ -1549,11 +1778,26 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
 
-  if (!Step->isOne()) {
-    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
+  // Integer inductions need to have a stride of one.
+  if (PhiTy->isIntegerTy())
+    return Step->isOne();
+
+  // Calculate the pointer stride and check if it is consecutive.
+  const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+  if (!C) return false;
+
+  assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+  uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+  return (C->getValue()->equalsInt(Size));
+}
+
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+  const SCEV *PhiScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+  if (!AR)
     return false;
-  }
-  return true;
+
+  return AR->isAffine();
 }
 
 unsigned