From 3ef9dfa6858e25015c3e36b2f1a0ba5ebdea80d2 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Thu, 25 Oct 2012 21:03:48 +0000
Subject: LoopVectorize: Teach the cost model to query scalar costs as scalar
 types and not vectors of 1.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166715 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 102 +++++++++++++++++------------
 1 file changed, 61 insertions(+), 41 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 423c7a4911..e47baf8908 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -324,6 +324,11 @@ private:
   /// width. Vector width of one means scalar.
   unsigned getInstructionCost(Instruction *I, unsigned VF);
 
+  /// A helper function for converting Scalar types to vector types.
+  /// If the incoming type is void, we return void. If the VF is 1, we return
+  /// the scalar type.
+  static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
   /// The loop that we evaluate.
   Loop *TheLoop;
   /// Scev analysis.
@@ -1478,8 +1483,16 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
 unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   assert(VTTI && "Invalid vector target transformation info");
+
+  Type *RetTy = I->getType();
+  Type *VectorTy = ToVectorTy(RetTy, VF);
+
+  // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
     case Instruction::GetElementPtr:
+      // We mark this instruction as zero-cost because scalar GEPs are usually
+      // lowered to the intruction addressing mode. At the moment we don't
+      // generate vector geps.
       return 0;
     case Instruction::Br: {
       return VTTI->getInstrCost(I->getOpcode());
@@ -1504,74 +1517,76 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      Type *VTy = VectorType::get(I->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), VTy);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(I);
-      Type *VTy = VectorType::get(I->getType(), VF);
       const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
       bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
       Type *CondTy = SI->getCondition()->getType();
       if (ScalarCond)
         CondTy = VectorType::get(CondTy, VF);
 
-      return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy);
     }
     case Instruction::ICmp:
     case Instruction::FCmp: {
-      Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), VTy);
+      Type *ValTy = I->getOperand(0)->getType();
+      VectorTy = ToVectorTy(ValTy, VF);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(I);
-      Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+      Type *ValTy = SI->getValueOperand()->getType();
+      VectorTy = ToVectorTy(ValTy, VF);
+
+      if (VF == 1)
+        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+                              SI->getAlignment(), SI->getPointerAddressSpace());
 
       // Scalarized stores.
       if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
         unsigned Cost = 0;
-        if (VF != 1) {
-          unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                                VTy);
-          // The cost of extracting from the value vector and pointer vector.
-          Cost += VF * (ExtCost * 2);
-        }
+        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                              ValTy);
+        // The cost of extracting from the value vector.
+        Cost += VF * (ExtCost);
         // The cost of the scalar stores.
         Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                           VTy->getScalarType(),
+                                           ValTy->getScalarType(),
                                            SI->getAlignment(),
                                            SI->getPointerAddressSpace());
         return Cost;
       }
 
       // Wide stores.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
                                    SI->getPointerAddressSpace());
     }
     case Instruction::Load: {
       LoadInst *LI = cast<LoadInst>(I);
-      Type *VTy = VectorType::get(I->getType(), VF);
+
+      if (VF == 1)
+        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+                                     LI->getAlignment(),
+                                     LI->getPointerAddressSpace());
 
       // Scalarized loads.
       if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
         unsigned Cost = 0;
-        if (VF != 1) {
-          unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-          unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy);
-
-          // The cost of inserting the loaded value into the result vector, and
-          // extracting from a vector of pointers.
-          Cost += VF * (InCost + ExCost);
-        }
+        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+        // The cost of inserting the loaded value into the result vector.
+        Cost += VF * (InCost);
         // The cost of the scalar stores.
-        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(),
+        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                           RetTy->getScalarType(),
                                            LI->getAlignment(),
                                            LI->getPointerAddressSpace());
         return Cost;
       }
 
       // Wide loads.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
+      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
                                    LI->getPointerAddressSpace());
     }
     case Instruction::ZExt:
@@ -1586,35 +1601,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::Trunc:
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
-      Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
-      Type *DstTy = VectorType::get(I->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
+      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+      return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
     }
     default: {
       // We are scalarizing the instruction. Return the cost of the scalar
       // instruction, plus the cost of insert and extract into vector
       // elements, times the vector width.
       unsigned Cost = 0;
-      Type *Ty = I->getType();
 
-      if (!Ty->isVoidTy()) {
-        Type *VTy = VectorType::get(Ty, VF);
-        unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
-        Cost += VF * (InsCost + ExtCost);
-      }
+      bool IsVoid = RetTy->isVoidTy();
 
-      /// We don't have any information on the scalar instruction, but maybe
-      /// the target has.
-      /// TODO: This may be a target-specific intrinsic.
-      /// Need to add API for that.
-      Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
+      unsigned InsCost = (IsVoid ? 0 :
+                          VTTI->getInstrCost(Instruction::InsertElement,
+                                             VectorTy));
 
+      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                            VectorTy);
+
+      // The cost of inserting the results plus extracting each one of the
+      // operands.
+      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+
+      // The cost of executing VF copies of the scalar instruction.
+      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
       return Cost;
     }
   }// end of switch.
 }
 
+Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
+  if (Scalar->isVoidTy() || VF == 1)
+    return Scalar;
+  return VectorType::get(Scalar, VF);
+}
 
 } // namespace
 
-- 
cgit v1.2.3-70-g09d2


From a5a3a61c5fdcee972791d4e08441ba6edf131b88 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Fri, 26 Oct 2012 23:49:28 +0000
Subject: Refactor the VectorTargetTransformInfo interface.

Add getCostXXX calls for different families of opcodes, such as casts, arithmetic, cmp, etc.

Port the LoopVectorizer to the new API.

The LoopVectorizer now finds instructions which will remain uniform after vectorization. It uses this information when calculating the cost of these instructions.



git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166836 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h            |   7 ++
 include/llvm/Target/TargetTransformImpl.h       |  19 ++-
 include/llvm/TargetTransformInfo.h              |  32 ++++-
 lib/Target/TargetTransformImpl.cpp              | 152 +++++++++++++++++++-----
 lib/Transforms/Vectorize/LoopVectorize.cpp      |  61 ++++++++--
 test/Transforms/LoopVectorize/X86/cost-model.ll |   2 +-
 6 files changed, 234 insertions(+), 39 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 19eb941635..9d0aeaa356 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -411,6 +411,13 @@ public:
        getOperationAction(Op, VT) == Custom);
   }
 
+  /// isOperationExpand - Return true if the specified operation is illegal on
+  /// this target or unlikely to be made legal with custom lowering. This is
+  /// used to help guide high-level lowering decisions.
+  bool isOperationExpand(unsigned Op, EVT VT) const {
+    return (!isTypeLegal(VT) || getOperationAction(Op, VT) == Expand);
+  }
+
   /// isOperationLegal - Return true if the specified operation is legal on this
   /// target.
   bool isOperationLegal(unsigned Op, EVT VT) const {
diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
index 133be87194..fd4b737afd 100644
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -56,15 +56,32 @@ private:
   std::pair<unsigned, EVT>
   getTypeLegalizationCost(LLVMContext &C, EVT Ty) const;
 
+  /// Estimate the overhead of scalarizing an instruction. Insert and Extract
+  /// are set if the result needs to be inserted and/or extracted from vectors.
+  unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+
 public:
   explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {}
-  
+
   virtual ~VectorTargetTransformImpl() {}
 
   virtual unsigned getInstrCost(unsigned Opcode, Type *Ty1, Type *Ty2) const;
 
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const;
+
   virtual unsigned getBroadcastCost(Type *Tp) const;
 
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const;
+
+  virtual unsigned getCFInstrCost(unsigned Opcode) const;
+
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy) const;
+
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index) const;
+
   virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment,
                                    unsigned AddressSpace) const;
diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h
index 71c78ec52e..96761594fb 100644
--- a/include/llvm/TargetTransformInfo.h
+++ b/include/llvm/TargetTransformInfo.h
@@ -143,13 +143,43 @@ public:
     return 1;
   }
 
+  /// Returns the expected cost of arithmetic ops, such as mul, xor, fsub, etc.
+  virtual unsigned getArithmeticInstrCost(unsigned Opcode, Type *Ty) const {
+    return 1;
+  }
+
   /// Returns the cost of a vector broadcast of a scalar at place zero to a
   /// vector of type 'Tp'.
   virtual unsigned getBroadcastCost(Type *Tp) const {
     return 1;
   }
 
-  /// Returns the cost of Load and Store instructions. 
+  /// Returns the expected cost of cast instructions, such as bitcast, trunc,
+  /// zext, etc.
+  virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+                                    Type *Src) const {
+    return 1;
+  }
+
+  /// Returns the expected cost of control-flow related instrutctions such as
+  /// Phi, Ret, Br.
+  virtual unsigned getCFInstrCost(unsigned Opcode) const {
+    return 1;
+  }
+
+  /// Returns the expected cost of compare and select instructions.
+  virtual unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+                                      Type *CondTy = 0) const {
+    return 1;
+  }
+
+  /// Returns the expected cost of vector Insert and Extract.
+  virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
+                                      unsigned Index = 0) const {
+    return 1;
+  }
+
+  /// Returns the cost of Load and Store instructions.
   virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src,
                                    unsigned Alignment,
                                    unsigned AddressSpace) const {
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
index 40184ed78d..d3ab105988 100644
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -126,7 +126,7 @@ static int InstructionOpcodeToISD(unsigned Opcode) {
 
 std::pair<unsigned, EVT>
 VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C,
-                                                         EVT Ty) const {
+                                                   EVT Ty) const {
   unsigned Cost = 1;
   // We keep legalizing the type until we find a legal kind. We assume that
   // the only operation that costs anything is the split. After splitting
@@ -135,7 +135,7 @@ VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C,
     TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, Ty);
 
     if (LK.first == TargetLowering::TypeLegal)
-      return std::make_pair(Cost, LK.second);
+      return std::make_pair(Cost, Ty);
 
     if (LK.first == TargetLowering::TypeSplitVector)
       Cost *= 2;
@@ -146,44 +146,144 @@ VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C,
 }
 
 unsigned
-VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1,
-                                        Type *Ty2) const {
+VectorTargetTransformImpl::getScalarizationOverhead(Type *Ty,
+                                                    bool Insert,
+                                                    bool Extract) const {
+  assert (Ty->isVectorTy() && "Can only scalarize vectors");
+   unsigned Cost = 0;
+
+  for (int i = 0, e = Ty->getVectorNumElements(); i < e; ++i) {
+    if (Insert)
+      Cost += getVectorInstrCost(Instruction::InsertElement, Ty, i);
+    if (Extract)
+      Cost += getVectorInstrCost(Instruction::ExtractElement, Ty, i);
+  }
+
+  return Cost;
+}
+
+unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode,
+                                                           Type *Ty) const {
   // Check if any of the operands are vector operands.
   int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  std::pair<unsigned, EVT> LT =
+  getTypeLegalizationCost(Ty->getContext(), TLI->getValueType(Ty));
+
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
+    // The operation is legal. Assume it costs 1. Multiply
+    // by the type-legalization overhead.
+    return LT.first * 1;
+  }
+
+  // Else, assume that we need to scalarize this op.
+  if (Ty->isVectorTy()) {
+    unsigned Num = Ty->getVectorNumElements();
+    unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+    // return the cost of multiple scalar invocation plus the cost of inserting
+    // and extracting the values.
+    return getScalarizationOverhead(Ty, true, true) + Num * Cost;
+  }
+
+  // We don't know anything about this scalar instruction.
+  return 1;
+}
+
+unsigned VectorTargetTransformImpl::getBroadcastCost(Type *Tp) const {
+  return 1;
+}
+
+unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                  Type *Src) const {
+  assert(Src->isVectorTy() == Dst->isVectorTy() && "Invalid input types");
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
 
-  // If we don't have any information about this instruction assume it costs 1.
-  if (ISD == 0)
-    return 1;
+  std::pair<unsigned, EVT> SrcLT =
+  getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src));
 
+  std::pair<unsigned, EVT> DstLT =
+  getTypeLegalizationCost(Dst->getContext(), TLI->getValueType(Dst));
+
+  // If the cast is between same-sized registers, then the check is simple.
+  if (SrcLT.first == DstLT.first &&
+      SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
+    // Just check the op cost:
+    if (!TLI->isOperationExpand(ISD, DstLT.second)) {
+      // The operation is legal. Assume it costs 1. Multiply
+      // by the type-legalization overhead.
+      return SrcLT.first * 1;
+    }
+  }
+
+  // Otherwise, assume that the cast is scalarized.
+  if (Dst->isVectorTy()) {
+    unsigned Num = Dst->getVectorNumElements();
+    unsigned Cost = getCastInstrCost(Opcode, Src->getScalarType(),
+                                     Dst->getScalarType());
+    // return the cost of multiple scalar invocation plus the cost of inserting
+    // and extracting the values.
+    return getScalarizationOverhead(Dst, true, true) + Num * Cost;
+  }
+
+  // Unknown scalar opcode.
+  return 1;
+}
+
+unsigned VectorTargetTransformImpl::getCFInstrCost(unsigned Opcode) const {
+  return 1;
+}
+
+unsigned VectorTargetTransformImpl::getCmpSelInstrCost(unsigned Opcode,
+                                                       Type *ValTy,
+                                                       Type *CondTy) const {
+  int ISD = InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+  
   // Selects on vectors are actually vector selects.
   if (ISD == ISD::SELECT) {
-    assert(Ty2 && "Ty2 must hold the condition type");
-    if (Ty2->isVectorTy())
-    ISD = ISD::VSELECT;
+    assert(CondTy && "CondTy must exist");
+    if (CondTy->isVectorTy())
+      ISD = ISD::VSELECT;
   }
 
-  assert(Ty1 && "We need to have at least one type");
+  std::pair<unsigned, EVT> LT =
+  getTypeLegalizationCost(ValTy->getContext(), TLI->getValueType(ValTy));
 
-  // From this stage we look at the legalized type.
-  std::pair<unsigned, EVT>  LT =
-  getTypeLegalizationCost(Ty1->getContext(), TLI->getValueType(Ty1));
-
-  if (TLI->isOperationLegalOrCustom(ISD, LT.second)) {
+  if (!TLI->isOperationExpand(ISD, LT.second)) {
     // The operation is legal. Assume it costs 1. Multiply
     // by the type-legalization overhead.
     return LT.first * 1;
   }
 
-  unsigned NumElem =
-    (LT.second.isVector() ? LT.second.getVectorNumElements() : 1);
+  // Otherwise, assume that the cast is scalarized.
+  if (ValTy->isVectorTy()) {
+    unsigned Num = ValTy->getVectorNumElements();
+    if (CondTy)
+      CondTy = CondTy->getScalarType();
+    unsigned Cost = getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
+                                       CondTy);
+
+    // return the cost of multiple scalar invocation plus the cost of inserting
+    // and extracting the values.
+    return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
+  }
 
-  // We will probably scalarize this instruction. Assume that the cost is the
-  // number of the vector elements.
-  return LT.first * NumElem * 1;
+  // Unknown scalar opcode. 
+  return 1;
+}
+
+/// Returns the expected cost of Vector Insert and Extract.
+unsigned VectorTargetTransformImpl::getVectorInstrCost(unsigned Opcode,
+                                                       Type *Val,
+                                                       unsigned Index) const {
+  return 1;
 }
 
 unsigned
-VectorTargetTransformImpl::getBroadcastCost(Type *Tp) const {
+VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1,
+                                        Type *Ty2) const {
   return 1;
 }
 
@@ -191,17 +291,15 @@ unsigned
 VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
                                            unsigned Alignment,
                                            unsigned AddressSpace) const {
-  // From this stage we look at the legalized type.
-  std::pair<unsigned, EVT>  LT =
+  std::pair<unsigned, EVT> LT =
   getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src));
+
   // Assume that all loads of legal types cost 1.
   return LT.first;
 }
 
 unsigned
 VectorTargetTransformImpl::getNumberOfParts(Type *Tp) const {
-  std::pair<unsigned, EVT>  LT =
-  getTypeLegalizationCost(Tp->getContext(), TLI->getValueType(Tp));
-  return LT.first;
+  return TLI->getNumRegisters(Tp->getContext(), TLI->getValueType(Tp));
 }
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e47baf8908..1773812da2 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -108,7 +108,7 @@ public:
     createEmptyLoop(Legal);
     /// Widen each instruction in the old loop to a new one in the new loop.
     /// Use the Legality module to find the induction and reduction variables.
-   vectorizeLoop(Legal);
+    vectorizeLoop(Legal);
     // register the new loop.
     cleanup();
  }
@@ -254,6 +254,9 @@ public:
   /// This check allows us to vectorize A[idx] into a wide load/store.
   bool isConsecutiveGep(Value *Ptr);
 
+  /// Returns true if this instruction will remain scalar after vectorization.
+  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+
 private:
   /// Check if a single basic block loop is vectorizable.
   /// At this point we know that this is a loop with a constant trip count
@@ -291,6 +294,9 @@ private:
   /// Allowed outside users. This holds the reduction
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value*, 4> AllowedExit;
+  /// This set holds the variables which are known to be uniform after
+  /// vectorization.
+  SmallPtrSet<Instruction*, 4> Uniforms;
 };
 
 /// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -1177,9 +1183,40 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
       return false;
   }
 
-  // If the memory dependencies do not prevent us from
-  // vectorizing, then vectorize.
-  return canVectorizeMemory(BB);
+  // Don't vectorize if the memory dependencies do not allow vectorization.
+  if (!canVectorizeMemory(BB))
+    return false;
+
+  // We now know that the loop is vectorizable!
+  // Collect variables that will remain uniform after vectorization.
+  std::vector<Value*> Worklist;
+
+  // Start with the conditional branch and walk up the block.
+  Worklist.push_back(BB.getTerminator()->getOperand(0));
+
+  while (Worklist.size()) {
+    Instruction *I = dyn_cast<Instruction>(Worklist.back());
+    Worklist.pop_back();
+    // Look at instructions inside this block.
+    if (!I) continue;
+    if (I->getParent() != &BB) continue;
+
+    // Stop when reaching PHI nodes.
+    if (isa<PHINode>(I)) {
+      assert(I == Induction && "Found a uniform PHI that is not the induction");
+      break;
+    }
+
+    // This is a known uniform.
+    Uniforms.insert(I);
+
+    // Insert all operands.
+    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+      Worklist.push_back(I->getOperand(i));
+    }
+  }
+
+  return true;
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
@@ -1484,9 +1521,15 @@ unsigned
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   assert(VTTI && "Invalid vector target transformation info");
 
+  // If we know that this instruction will remain uniform, check the cost of
+  // the scalar version.
+  if (Legal->isUniformAfterVectorization(I))
+    VF = 1;
+
   Type *RetTy = I->getType();
   Type *VectorTy = ToVectorTy(RetTy, VF);
 
+
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
     case Instruction::GetElementPtr:
@@ -1495,7 +1538,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       // generate vector geps.
       return 0;
     case Instruction::Br: {
-      return VTTI->getInstrCost(I->getOpcode());
+      return VTTI->getCFInstrCost(I->getOpcode());
     }
     case Instruction::PHI:
       return 0;
@@ -1517,7 +1560,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
+      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Select: {
       SelectInst *SI = cast<SelectInst>(I);
@@ -1527,13 +1570,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
       if (ScalarCond)
         CondTy = VectorType::get(CondTy, VF);
 
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy, CondTy);
+      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
     }
     case Instruction::ICmp:
     case Instruction::FCmp: {
       Type *ValTy = I->getOperand(0)->getType();
       VectorTy = ToVectorTy(ValTy, VF);
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy);
+      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(I);
@@ -1602,7 +1645,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-      return VTTI->getInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
     }
     default: {
       // We are scalarizing the instruction. Return the cost of the scalar
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index 628f9912c8..40e660855b 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 @a = common global [2048 x i32] zeroinitializer, align 16
 
 ;CHECK: cost_model_1
-;CHECK: <4 x i32>
+;CHECK-NOT: <4 x i32>
 ;CHECK: ret void
 define void @cost_model_1() nounwind uwtable noinline ssp {
 entry:
-- 
cgit v1.2.3-70-g09d2


From f065a8467785015336432e3e6e584798d8b48d8e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Sat, 27 Oct 2012 04:11:32 +0000
Subject: 1. Fix a bug in getTypeConversion. When a *simple* type is split, we
 need to return the type of the split result. 2. Change the maximum
 vectorization width from 4 to 8. 3. A test for both.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166864 91177308-0d34-0410-b5e6-96231b3b80d8
---
 include/llvm/Target/TargetLowering.h              |  3 ++
 lib/Transforms/Vectorize/LoopVectorize.cpp        |  2 +-
 test/Transforms/LoopVectorize/X86/gcc-examples.ll | 62 +++++++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/gcc-examples.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 9d0aeaa356..13f80fda3d 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1980,6 +1980,9 @@ public:
          ValueTypeActions.getTypeAction(NVT.getSimpleVT()) != TypePromoteInteger)
          && "Promote may not follow Expand or Promote");
 
+      if (LA == TypeSplitVector)
+        NVT = EVT::getVectorVT(Context, VT.getVectorElementType(),
+                               VT.getVectorNumElements() / 2);
       return LegalizeKind(LA, NVT);
     }
 
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 1773812da2..be197db956 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -317,7 +317,7 @@ public:
   /// Returns the most profitable vectorization factor for the loop that is
   /// smaller or equal to the VF argument. This method checks every power
   /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = 4);
+  unsigned findBestVectorizationFactor(unsigned VF = 8);
 
 private:
   /// Returns the expected execution cost. The unit of the cost does
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
new file mode 100644
index 0000000000..e7a63c9316
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -0,0 +1,62 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+@a = common global [2048 x i32] zeroinitializer, align 16
+
+; Select VF = 8;
+;CHECK: @example1
+;CHECK: load <8 x i32>
+;CHECK: add <8 x i32>
+;CHECK: store <8 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 256
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
+
+; Select VF=4 because sext <8 x i1> to <8 x i32> is expensive. 
+;CHECK: @example10b
+;CHECK: load <4 x i16>
+;CHECK: sext <4 x i16>
+;CHECK: store <4 x i32>
+;CHECK: ret void
+define void @example10b(i16* noalias nocapture %sa, i16* noalias nocapture %sb, i16* noalias nocapture %sc, i32* noalias nocapture %ia, i32* noalias nocapture %ib, i32* noalias nocapture %ic) nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds i16* %sb, i64 %indvars.iv
+  %3 = load i16* %2, align 2
+  %4 = sext i16 %3 to i32
+  %5 = getelementptr inbounds i32* %ia, i64 %indvars.iv
+  store i32 %4, i32* %5, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 1024
+  br i1 %exitcond, label %6, label %1
+
+; <label>:6                                       ; preds = %1
+  ret void
+}
+
-- 
cgit v1.2.3-70-g09d2


From 369ff7b740c154dc4357c6d063f4957498a1e7b9 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Mon, 29 Oct 2012 21:52:38 +0000
Subject: LoopVectorize: Update and preserve the dominator tree info.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166970 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 46 ++++++++++++++++++++++++------
 1 file changed, 37 insertions(+), 9 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index be197db956..dcdf0a358f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -55,6 +55,7 @@
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/LoopInfo.h"
@@ -98,8 +99,9 @@ class SingleBlockLoopVectorizer {
 public:
   /// Ctor.
   SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            LPPassManager *Lpm, unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth),
+                            DominatorTree *dt, LPPassManager *Lpm,
+                            unsigned VecWidth):
+  OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
   Builder(Se->getContext()), Induction(0), OldInduction(0) { }
 
   // Perform the actual loop widening (vectorization).
@@ -110,7 +112,7 @@ public:
     /// Use the Legality module to find the induction and reduction variables.
     vectorizeLoop(Legal);
     // register the new loop.
-    cleanup();
+    updateAnalysis();
  }
 
 private:
@@ -119,7 +121,7 @@ private:
   /// Copy and widen the instructions from the old loop.
   void vectorizeLoop(LoopVectorizationLegality *Legal);
   /// Insert the new loop to the loop hierarchy and pass manager.
-  void cleanup();
+  void updateAnalysis();
 
   /// This instruction is un-vectorizable. Implement it as a sequence
   /// of scalars.
@@ -155,6 +157,8 @@ private:
   ScalarEvolution *SE;
   // Loop Info.
   LoopInfo *LI;
+  // Dominator Tree.
+  DominatorTree *DT;
   // Loop Pass Manager;
   LPPassManager *LPM;
   // The vectorization factor to use.
@@ -165,6 +169,10 @@ private:
 
   // --- Vectorization state ---
 
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
   /// Middle Block between the vector and the scalar.
   BasicBlock *LoopMiddleBlock;
   ///The ExitBlock of the scalar loop.
@@ -357,6 +365,7 @@ struct LoopVectorize : public LoopPass {
   DataLayout *DL;
   LoopInfo *LI;
   TargetTransformInfo *TTI;
+  DominatorTree *DT;
 
   virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
     // We only vectorize innermost loops.
@@ -367,6 +376,7 @@ struct LoopVectorize : public LoopPass {
     DL = getAnalysisIfAvailable<DataLayout>();
     LI = &getAnalysis<LoopInfo>();
     TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+    DT = &getAnalysis<DominatorTree>();
 
     DEBUG(dbgs() << "LV: Checking a loop in \"" <<
           L->getHeader()->getParent()->getName() << "\"\n");
@@ -401,7 +411,7 @@ struct LoopVectorize : public LoopPass {
     DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
+    SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -414,6 +424,9 @@ struct LoopVectorize : public LoopPass {
     AU.addRequiredID(LCSSAID);
     AU.addRequired<LoopInfo>();
     AU.addRequired<ScalarEvolution>();
+    AU.addRequired<DominatorTree>();
+    AU.addPreserved<LoopInfo>();
+    AU.addPreserved<DominatorTree>();
   }
 
 };
@@ -725,6 +738,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
   }
 
   // Save the state.
+  LoopVectorPreHeader = VectorPH;
+  LoopScalarPreHeader = ScalarPH;
   LoopMiddleBlock = MiddleBlock;
   LoopExitBlock = ExitBlock;
   LoopVectorBody = VecBody;
@@ -855,8 +870,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // The last index does not have to be the induction. It can be
         // consecutive and be a function of the index. For example A[I+1];
         unsigned NumOperands = Gep->getNumOperands();
-        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
 
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -885,7 +900,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         // consecutive and be a function of the index. For example A[I+1];
         unsigned NumOperands = Gep->getNumOperands();
         Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-        LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
 
         // Create the new GEP with the new induction variable.
         GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
@@ -1051,9 +1066,22 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   }// end of for each redux variable.
 }
 
-void SingleBlockLoopVectorizer::cleanup() {
+void SingleBlockLoopVectorizer::updateAnalysis() {
   // The original basic block.
   SE->forgetLoop(OrigLoop);
+
+  // Update the dominator tree information.
+  assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) &&
+         "Entry does not dominate exit.");
+
+  DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock);
+  DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader);
+  DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock);
+  DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock);
+  DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader);
+  DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock);
+
+  DEBUG(DT->verifyAnalysis());
 }
 
 bool LoopVectorizationLegality::canVectorize() {
-- 
cgit v1.2.3-70-g09d2


From 803ea1c40aa059adf5b1baef0bf4510483565d07 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 00:40:39 +0000
Subject: LoopVectorizer: change debug prints: Print the module identifier when
 deciding to vectorize. When deciding not to vectorize do not print the called
 function name because it can be null.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@166989 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index dcdf0a358f..0e6103e16a 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -408,7 +408,9 @@ struct LoopVectorize : public LoopPass {
       VF = VectorizationFactor;
     }
 
-    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
+    DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+          L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
+          "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
     SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
@@ -597,7 +599,8 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
     WidenMap[Instr] = VecResults;
 }
 
-void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+void
+SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
@@ -1178,8 +1181,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
     // We still don't handle functions.
     CallInst *CI = dyn_cast<CallInst>(I);
     if (CI) {
-      DEBUG(dbgs() << "LV: Found a call site:"<<
-            CI->getCalledFunction()->getName() << "\n");
+      DEBUG(dbgs() << "LV: Found a call site.\n");
       return false;
     }
 
-- 
cgit v1.2.3-70-g09d2


From cc03331caaeee00b1238654feda8c5a517e48c3a Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 18:12:36 +0000
Subject: LoopVectorize: Fix a bug in the initialization of reduction
 variables. AND needs to start at all-one while XOR, and OR need to start at
 zero.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167032 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 28 +++++++++++++++++++++-------
 test/Transforms/LoopVectorize/reduction.ll |  1 +
 2 files changed, 22 insertions(+), 7 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 0e6103e16a..ac82a66b80 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -211,8 +211,6 @@ public:
   TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
 
   /// This represents the kinds of reductions that we support.
-  /// We use the enum values to hold the 'identity' value for
-  /// each operand. This value does not change the result if applied.
   enum ReductionKind {
     NoReduction = -1, /// Not a reduction.
     IntegerAdd  = 0,  /// Sum of numbers.
@@ -523,7 +521,7 @@ SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
   SmallVector<Constant*, 8> Indices;
   // Create a vector of consecutive numbers from zero to VF.
   for (unsigned i = 0; i < VF; ++i)
-    Indices.push_back(ConstantInt::get(ScalarTy, Val));
+    Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
 
   // Add the consecutive indices to the vector value.
   return ConstantVector::get(Indices);
@@ -750,6 +748,23 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopBypassBlock = BypassBlock;
 }
 
+
+static unsigned
+getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
+  switch (K) {
+  case LoopVectorizationLegality::IntegerXor:
+  case LoopVectorizationLegality::IntegerAdd:
+  case LoopVectorizationLegality::IntegerOr:
+    return 0;
+  case LoopVectorizationLegality::IntegerMult:
+    return 1;
+  case LoopVectorizationLegality::IntegerAnd:
+    return -1;
+  default:
+    llvm_unreachable("Unknown reduction kind");
+  }
+}
+
 void
 SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
@@ -974,10 +989,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr);
     Type *VecTy = VectorExit->getType();
 
-    // Find the reduction identity variable. The value of the enum is the
-    // identity. Zero for addition. One for Multiplication.
-    unsigned IdentitySclr =  RdxDesc.Kind;
-    Constant *Identity = getUniformVector(IdentitySclr,
+    // Find the reduction identity variable. Zero for addition, or, xor,
+    // one for multiplication, -1 for And.
+    Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind),
                                           VecTy->getScalarType());
 
     // This vector is the Identity vector where the first element is the
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
index 746a08c3ea..d19f7c1b0b 100644
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -151,6 +151,7 @@ for.end:                                          ; preds = %for.body, %entry
 
 ;CHECK: @reduction_and
 ;CHECK: and <4 x i32>
+;CHECK: <i32 -1, i32 -1, i32 -1, i32 -1>
 ;CHECK: ret i32
 define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly {
 entry:
-- 
cgit v1.2.3-70-g09d2


From e709f5b600fd630c4f58b5dba14c8069a03093ea Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 18:36:45 +0000
Subject: LoopVectorize: Add support for write-only loops when the write
 destination is a single pointer. Speedup SciMark by 1%

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167035 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp  |  7 +++++++
 test/Transforms/LoopVectorize/write-only.ll | 26 ++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)
 create mode 100644 test/Transforms/LoopVectorize/write-only.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index ac82a66b80..9e05cacbbe 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1348,6 +1348,13 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
       Reads.push_back(Ptr);
   }
 
+  // If we write (or read-write) to a single destination and there are no
+  // other reads in this loop then is it safe to vectorize.
+  if (ReadWrites.size() == 1 && Reads.size() == 0) {
+    DEBUG(dbgs() << "LV: Found a write-only loop!\n");
+    return true;
+  }
+
   // Now that the pointers are in two lists (Reads and ReadWrites), we
   // can check that there are no conflicts between each of the writes and
   // between the writes to the reads.
diff --git a/test/Transforms/LoopVectorize/write-only.ll b/test/Transforms/LoopVectorize/write-only.ll
new file mode 100644
index 0000000000..eb02760413
--- /dev/null
+++ b/test/Transforms/LoopVectorize/write-only.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_mod_write_single_ptr
+;CHECK: load <4 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %3 = load float* %2, align 4
+  %4 = fmul float %3, 3.000000e+00
+  store float %4, float* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
-- 
cgit v1.2.3-70-g09d2


From a368b89f2842530d07c0ac8e3b533882e165f197 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Tue, 30 Oct 2012 22:06:26 +0000
Subject: Add documentation.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167055 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 9e05cacbbe..431a8478a0 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -749,16 +749,21 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 }
 
 
+/// This function returns the identity element (or neutral element) for
+/// the operation K.
 static unsigned
 getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
   switch (K) {
   case LoopVectorizationLegality::IntegerXor:
   case LoopVectorizationLegality::IntegerAdd:
   case LoopVectorizationLegality::IntegerOr:
+    // Adding, Xoring, Oring zero to a number does not change it.
     return 0;
   case LoopVectorizationLegality::IntegerMult:
+    // Multiplying a number by 1 does not change it.
     return 1;
   case LoopVectorizationLegality::IntegerAnd:
+    // AND-ing a number with an all-1 value does not change it.
     return -1;
   default:
     llvm_unreachable("Unknown reduction kind");
-- 
cgit v1.2.3-70-g09d2


From 462d1ca42831df53a3c6435d247776f0b85bd594 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 00:45:26 +0000
Subject: Add support for loops that don't start with Zero. This is important
 for loops in the LAPACK test-suite. These loops start at 1 because they are
 auto-converted from fortran.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167084 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp      | 31 ++++++++++------
 test/Transforms/LoopVectorize/X86/avx1.ll       | 49 +++++++++++++++++++++++++
 test/Transforms/LoopVectorize/start-non-zero.ll | 35 ++++++++++++++++++
 3 files changed, 103 insertions(+), 12 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/X86/avx1.ll
 create mode 100644 test/Transforms/LoopVectorize/start-non-zero.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 431a8478a0..e82dfa2454 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -633,6 +633,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   BasicBlock *ExitBlock = OrigLoop->getExitBlock();
   assert(ExitBlock && "Must have an exit block");
 
+  // The loop index does not have to start at Zero. It starts with this value.
+  OldInduction = Legal->getInduction();
+  Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+
   assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
@@ -648,7 +652,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
                                  "scalar.preheader");
   // Find the induction variable.
   BasicBlock *OldBasicBlock = OrigLoop->getHeader();
-  OldInduction = Legal->getInduction();
   assert(OldInduction && "We must have a single phi node.");
   Type *IdxTy = OldInduction->getType();
 
@@ -658,7 +661,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Generate the induction variable.
   Induction = Builder.CreatePHI(IdxTy, 2, "index");
-  Constant *Zero = ConstantInt::get(IdxTy, 0);
   Constant *Step = ConstantInt::get(IdxTy, VF);
 
   // Find the loop boundaries.
@@ -682,15 +684,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Count holds the overall loop count (N).
   Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+
+  // Add the start index to the loop count to get the new end index.
+  Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
+
   // Now we need to generate the expression for N - (N % VF), which is
   // the part that the vectorized body will execute.
   Constant *CIVF = ConstantInt::get(IdxTy, VF);
   Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc);
   Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
+  Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
+                                                     "end.idx.rnd.down", Loc);
 
   // Now, compare the new count to zero. If it is zero, jump to the scalar part.
   Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
-                               CountRoundDown, ConstantInt::getNullValue(IdxTy),
+                               IdxEndRoundDown,
+                               StartIdx,
                                "cmp.zero", Loc);
   BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
   // Remove the old terminator.
@@ -699,8 +708,8 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Add a check in the middle block to see if we have completed
   // all of the iterations in the first vector loop.
   // If (N - N%VF) == N, then we *don't* need to run the remainder.
-  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count,
-                                CountRoundDown, "cmp.n",
+  Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
+                                IdxEndRoundDown, "cmp.n",
                                 MiddleBlock->getTerminator());
 
   BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -709,10 +718,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Create i+1 and fill the PHINode.
   Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next");
-  Induction->addIncoming(Zero, VectorPH);
+  Induction->addIncoming(StartIdx, VectorPH);
   Induction->addIncoming(NextIdx, VecBody);
   // Create the compare.
-  Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown);
+  Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown);
   Builder.CreateCondBr(ICmp, MiddleBlock, VecBody);
 
   // Now we have two terminators. Remove the old one from the block.
@@ -720,7 +729,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Fix the scalar body iteration count.
   unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
-  OldInduction->setIncomingValue(BlockIdx, CountRoundDown);
+  OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
 
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -748,7 +757,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopBypassBlock = BypassBlock;
 }
 
-
 /// This function returns the identity element (or neutral element) for
 /// the operation K.
 static unsigned
@@ -1518,10 +1526,9 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
     return false;
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
-  const SCEV *Start = AR->getStart();
 
-  if (!Step->isOne() || !Start->isZero()) {
-    DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n");
+  if (!Step->isOne()) {
+    DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
     return false;
   }
   return true;
diff --git a/test/Transforms/LoopVectorize/X86/avx1.ll b/test/Transforms/LoopVectorize/X86/avx1.ll
new file mode 100644
index 0000000000..a2d176a534
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/avx1.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s  -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @read_mod_write_single_ptr
+;CHECK: load <8 x float>
+;CHECK: ret i32
+define i32 @read_mod_write_single_ptr(float* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds float* %a, i64 %indvars.iv
+  %3 = load float* %2, align 4
+  %4 = fmul float %3, 3.000000e+00
+  store float %4, float* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+
+;CHECK: @read_mod_i64
+;CHECK: load <8 x i64>
+;CHECK: ret i32
+define i32 @read_mod_i64(i64* nocapture %a, i32 %n) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 0, %0 ]
+  %2 = getelementptr inbounds i64* %a, i64 %indvars.iv
+  %3 = load i64* %2, align 4
+  %4 = mul i64 %3, 3
+  store i64 %4, i64* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll
new file mode 100644
index 0000000000..257df05ac0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/start-non-zero.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @start_at_nonzero
+;CHECK: mul <4 x i32>
+;CHECK: ret i32
+define i32 @start_at_nonzero(i32* nocapture %a, i32 %start, i32 %end) nounwind uwtable ssp {
+entry:
+  %cmp3 = icmp slt i32 %start, %end
+  br i1 %cmp3, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:                                   ; preds = %entry
+  %0 = sext i32 %start to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body.lr.ph, %for.body
+  %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
+  %1 = load i32* %arrayidx, align 4, !tbaa !0
+  %mul = mul nsw i32 %1, 333
+  store i32 %mul, i32* %arrayidx, align 4, !tbaa !0
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %2 = trunc i64 %indvars.iv.next to i32
+  %cmp = icmp slt i32 %2, %end
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret i32 4
+}
+
+!0 = metadata !{metadata !"int", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
-- 
cgit v1.2.3-70-g09d2


From 5b2c4dc5f8f70352f029d595797089821eb39b3c Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 03:31:07 +0000
Subject: LoopVectorize: Do not vectorize loops with tiny constant trip counts.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167101 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp  |  8 +++++++
 test/Transforms/LoopVectorize/small-loop.ll | 33 +++++++++++++++++++++++++++++
 2 files changed, 41 insertions(+)
 create mode 100644 test/Transforms/LoopVectorize/small-loop.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index e82dfa2454..fd728bd616 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1145,6 +1145,14 @@ bool LoopVectorizationLegality::canVectorize() {
     return false;
   }
 
+  // Do not loop-vectorize loops with a tiny trip count.
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+  if (TC > 0 && TC < 16) {
+    DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
+          "This loop is not worth vectorizing.\n");
+    return false;
+  }
+
   DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
 
   // Okay! We can vectorize. At this point we don't have any other mem analysis
diff --git a/test/Transforms/LoopVectorize/small-loop.ll b/test/Transforms/LoopVectorize/small-loop.ll
new file mode 100644
index 0000000000..4a6e4b231d
--- /dev/null
+++ b/test/Transforms/LoopVectorize/small-loop.ll
@@ -0,0 +1,33 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+@a = common global [2048 x i32] zeroinitializer, align 16
+@b = common global [2048 x i32] zeroinitializer, align 16
+@c = common global [2048 x i32] zeroinitializer, align 16
+
+;CHECK: @example1
+;CHECK-NOT: load <4 x i32>
+;CHECK: ret void
+define void @example1() nounwind uwtable ssp {
+  br label %1
+
+; <label>:1                                       ; preds = %1, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv
+  %5 = load i32* %4, align 4
+  %6 = add nsw i32 %5, %3
+  %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv
+  store i32 %6, i32* %7, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, 8  ;   <-----  A really small trip count.
+  br i1 %exitcond, label %8, label %1
+
+; <label>:8                                       ; preds = %1
+  ret void
+}
+
-- 
cgit v1.2.3-70-g09d2


From 2a5401cc75acf38092a7224cadaf895281d41b29 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 16:14:06 +0000
Subject: Remove enum values since they are not used anymore.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167131 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index fd728bd616..40235ef104 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -212,12 +212,12 @@ public:
 
   /// This represents the kinds of reductions that we support.
   enum ReductionKind {
-    NoReduction = -1, /// Not a reduction.
-    IntegerAdd  = 0,  /// Sum of numbers.
-    IntegerMult = 1,  /// Product of numbers.
-    IntegerOr   = 2,  /// Bitwise or logical OR of numbers.
-    IntegerAnd  = 3,  /// Bitwise or logical AND of numbers.
-    IntegerXor  = 4   /// Bitwise or logical XOR of numbers.
+    NoReduction, /// Not a reduction.
+    IntegerAdd,  /// Sum of numbers.
+    IntegerMult, /// Product of numbers.
+    IntegerOr,   /// Bitwise or logical OR of numbers.
+    IntegerAnd,  /// Bitwise or logical AND of numbers.
+    IntegerXor   /// Bitwise or logical XOR of numbers.
   };
 
   /// This POD struct holds information about reduction variables.
-- 
cgit v1.2.3-70-g09d2


From 4c1b4b1fe794437cbb245b11650d9e4001c9605e Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 16:22:16 +0000
Subject: Put the threshold magic number in a variable.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167134 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 40235ef104..94e56a1713 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -75,6 +75,9 @@ static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
           cl::desc("Set the default vectorization width. Zero is autoselect."));
 
+/// We don't vectorize loops with a known constant trip count below this number.
+const int TinyTripCountThreshold = 16;
+
 namespace {
 
 // Forward declarations.
@@ -1147,7 +1150,7 @@ bool LoopVectorizationLegality::canVectorize() {
 
   // Do not loop-vectorize loops with a tiny trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
-  if (TC > 0 && TC < 16) {
+  if (TC > 0 && TC < TinyTripCountThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
-- 
cgit v1.2.3-70-g09d2


From e57b2cbce652d45ed5516e52ad82991bfa03cfd7 Mon Sep 17 00:00:00 2001
From: Nadav Rotem <nrotem@apple.com>
Date: Wed, 31 Oct 2012 21:40:39 +0000
Subject: LoopVectorize: Preserve NSW, NUW and IsExact flags.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167174 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp        | 13 +++++-
 test/Transforms/LoopVectorize/X86/gcc-examples.ll |  2 +-
 test/Transforms/LoopVectorize/flags.ll            | 53 +++++++++++++++++++++++
 test/Transforms/LoopVectorize/gcc-examples.ll     |  4 +-
 test/Transforms/LoopVectorize/increment.ll        |  2 +-
 test/Transforms/LoopVectorize/non-const-n.ll      |  2 +-
 test/Transforms/LoopVectorize/reduction.ll        |  2 +-
 test/Transforms/LoopVectorize/start-non-zero.ll   |  4 +-
 8 files changed, 74 insertions(+), 8 deletions(-)
 create mode 100644 test/Transforms/LoopVectorize/flags.ll

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 94e56a1713..c9871e25f1 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -849,8 +849,19 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
         BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
         Value *A = getVectorValue(Inst->getOperand(0));
         Value *B = getVectorValue(Inst->getOperand(1));
+
         // Use this vector value for all users of the original instruction.
-        WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+        WidenMap[Inst] = V;
+
+        // Update the NSW, NUW and Exact flags.
+        BinaryOperator *VecOp = cast<BinaryOperator>(V);
+        if (isa<OverflowingBinaryOperator>(BinOp)) {
+          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+        }
+        if (isa<PossiblyExactOperator>(VecOp))
+          VecOp->setIsExact(BinOp->isExact());
         break;
       }
       case Instruction::Select: {
diff --git a/test/Transforms/LoopVectorize/X86/gcc-examples.ll b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
index e7a63c9316..574c529834 100644
--- a/test/Transforms/LoopVectorize/X86/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/X86/gcc-examples.ll
@@ -10,7 +10,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ; Select VF = 8;
 ;CHECK: @example1
 ;CHECK: load <8 x i32>
-;CHECK: add <8 x i32>
+;CHECK: add nsw <8 x i32>
 ;CHECK: store <8 x i32>
 ;CHECK: ret void
 define void @example1() nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/flags.ll b/test/Transforms/LoopVectorize/flags.ll
new file mode 100644
index 0000000000..2f22a76457
--- /dev/null
+++ b/test/Transforms/LoopVectorize/flags.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s  -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;CHECK: @flags1
+;CHECK: load <4 x i32>
+;CHECK: mul nsw <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags1(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = mul nsw i32 %3, 3
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
+
+
+;CHECK: @flags2
+;CHECK: load <4 x i32>
+;CHECK: mul <4 x i32>
+;CHECK: store <4 x i32>
+;CHECK: ret i32
+define i32 @flags2(i32 %n, i32* nocapture %A) nounwind uwtable ssp {
+  %1 = icmp sgt i32 %n, 9
+  br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph:                                           ; preds = %0, %.lr.ph
+  %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ 9, %0 ]
+  %2 = getelementptr inbounds i32* %A, i64 %indvars.iv
+  %3 = load i32* %2, align 4
+  %4 = mul i32 %3, 3
+  store i32 %4, i32* %2, align 4
+  %indvars.iv.next = add i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+
+._crit_edge:                                      ; preds = %.lr.ph, %0
+  ret i32 undef
+}
diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll
index d8942ac861..fce29d2404 100644
--- a/test/Transforms/LoopVectorize/gcc-examples.ll
+++ b/test/Transforms/LoopVectorize/gcc-examples.ll
@@ -21,7 +21,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK: @example1
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 define void @example1() nounwind uwtable ssp {
@@ -227,6 +227,8 @@ define i32 @example9() nounwind uwtable readonly ssp {
 }
 
 ;CHECK: @example10a
+;CHECK: load <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: load <4 x i16>
 ;CHECK: add <4 x i16>
 ;CHECK: store <4 x i16>
diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll
index 069b7ea031..71ea7689fc 100644
--- a/test/Transforms/LoopVectorize/increment.ll
+++ b/test/Transforms/LoopVectorize/increment.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;  }
 ;CHECK: @inc
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 define void @inc(i32 %n) nounwind uwtable noinline ssp {
diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll
index 7727b0a2dc..1a6c15ed96 100644
--- a/test/Transforms/LoopVectorize/non-const-n.ll
+++ b/test/Transforms/LoopVectorize/non-const-n.ll
@@ -11,7 +11,7 @@ target triple = "x86_64-apple-macosx10.8.0"
 ;CHECK: shl i32
 ;CHECK: zext i32
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
+;CHECK: add nsw <4 x i32>
 ;CHECK: store <4 x i32>
 ;CHECK: ret void
 define void @example1(i32 %n) nounwind uwtable ssp {
diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll
index d19f7c1b0b..c1848b35fc 100644
--- a/test/Transforms/LoopVectorize/reduction.ll
+++ b/test/Transforms/LoopVectorize/reduction.ll
@@ -66,7 +66,7 @@ define i32 @reduction_prod(i32 %n, i32* noalias nocapture %A, i32* noalias nocap
 ;CHECK: @reduction_mix
 ;CHECK: phi <4 x i32>
 ;CHECK: load <4 x i32>
-;CHECK: mul <4 x i32>
+;CHECK: mul nsw <4 x i32>
 ;CHECK: ret i32
 define i32 @reduction_mix(i32 %n, i32* noalias nocapture %A, i32* noalias nocapture %B) nounwind uwtable readonly noinline ssp {
   %1 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/LoopVectorize/start-non-zero.ll b/test/Transforms/LoopVectorize/start-non-zero.ll
index 257df05ac0..5aa3bc034d 100644
--- a/test/Transforms/LoopVectorize/start-non-zero.ll
+++ b/test/Transforms/LoopVectorize/start-non-zero.ll
@@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-apple-macosx10.8.0"
 
 ;CHECK: @start_at_nonzero
-;CHECK: mul <4 x i32>
+;CHECK: mul nuw <4 x i32>
 ;CHECK: ret i32
 define i32 @start_at_nonzero(i32* nocapture %a, i32 %start, i32 %end) nounwind uwtable ssp {
 entry:
@@ -19,7 +19,7 @@ for.body:                                         ; preds = %for.body.lr.ph, %fo
   %indvars.iv = phi i64 [ %0, %for.body.lr.ph ], [ %indvars.iv.next, %for.body ]
   %arrayidx = getelementptr inbounds i32* %a, i64 %indvars.iv
   %1 = load i32* %arrayidx, align 4, !tbaa !0
-  %mul = mul nsw i32 %1, 333
+  %mul = mul nuw i32 %1, 333
   store i32 %mul, i32* %arrayidx, align 4, !tbaa !0
   %indvars.iv.next = add i64 %indvars.iv, 1
   %2 = trunc i64 %indvars.iv.next to i32
-- 
cgit v1.2.3-70-g09d2


From 6e48f0307758096d06d0e87875294c76df81dec1 Mon Sep 17 00:00:00 2001
From: Chandler Carruth <chandlerc@gmail.com>
Date: Fri, 2 Nov 2012 05:24:00 +0000
Subject: Fix sign compare warning. Patch by Mahesha HS.

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@167282 91177308-0d34-0410-b5e6-96231b3b80d8
---
 lib/Transforms/Vectorize/LoopVectorize.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')

diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index c9871e25f1..892808760f 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -76,7 +76,7 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
           cl::desc("Set the default vectorization width. Zero is autoselect."));
 
 /// We don't vectorize loops with a known constant trip count below this number.
-const int TinyTripCountThreshold = 16;
+const unsigned TinyTripCountThreshold = 16;
 
 namespace {
 
@@ -1161,7 +1161,7 @@ bool LoopVectorizationLegality::canVectorize() {
 
   // Do not loop-vectorize loops with a tiny trip count.
   unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
-  if (TC > 0 && TC < TinyTripCountThreshold) {
+  if (TC > 0u && TC < TinyTripCountThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
-- 
cgit v1.2.3-70-g09d2