aboutsummaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp1096
1 files changed, 711 insertions, 385 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index bc8e1217be..a696a2ffba 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -9,10 +9,10 @@
//
// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
+// in the codegen. However, the vectorizer uses (will use) the codegen
// interfaces to generate IR that is likely to result in an optimal binary.
//
-// The loop vectorizer combines consecutive loop iteration into a single
+// The loop vectorizer combines consecutive loop iterations into a single
// 'wide' iteration. After this transformation the index is incremented
// by the SIMD vector width, and not by one.
//
@@ -32,7 +32,7 @@
// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
//
// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
//
// Other ideas/concepts are from:
// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
@@ -79,6 +79,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -101,14 +102,16 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
/// We don't vectorize loops with a known constant trip count below this number.
-static const unsigned TinyTripCountVectorThreshold = 16;
+static cl::opt<unsigned>
+TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16),
+ cl::Hidden,
+ cl::desc("Don't vectorize loops with a constant "
+ "trip count that is smaller than this "
+ "value."));
/// We don't unroll loops with a known constant trip count below this number.
static const unsigned TinyTripCountUnrollThreshold = 128;
-/// We don't unroll loops that are larget than this threshold.
-static const unsigned MaxLoopSizeThreshold = 32;
-
/// When performing a runtime memory check, do not check more than this
/// number of pointers. Notice that the check is quadratic!
static const unsigned RuntimeMemoryCheckThreshold = 4;
@@ -136,10 +139,11 @@ class LoopVectorizationCostModel;
class InnerLoopVectorizer {
public:
InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
- DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
+ DominatorTree *DT, DataLayout *DL,
+ const TargetLibraryInfo *TLI, unsigned VecWidth,
unsigned UnrollFactor)
- : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
- UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI),
+ VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
OldInduction(0), WidenMap(UnrollFactor) {}
// Perform the actual loop widening (vectorization).
@@ -163,8 +167,8 @@ private:
/// Add code that checks at runtime if the accessed arrays overlap.
/// Returns the comparator value or NULL if no check is needed.
- Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
- Instruction *Loc);
+ Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc);
/// Create an empty loop, based on the loop ranges of the old loop.
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
@@ -190,6 +194,10 @@ private:
/// of scalars.
void scalarizeInstruction(Instruction *Instr);
+ /// Vectorize Load and Store instructions,
+ void vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal);
+
/// Create a broadcast instruction. This method generates a broadcast
/// instruction (shuffle) for loop invariant values and for the induction
/// value. If this is the induction variable then we extend it to N, N+1, ...
@@ -222,31 +230,34 @@ private:
ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
/// \return True if 'Key' is saved in the Value Map.
- bool has(Value *Key) { return MapStoreage.count(Key); }
+ bool has(Value *Key) const { return MapStorage.count(Key); }
/// Initializes a new entry in the map. Sets all of the vector parts to the
/// save value in 'Val'.
/// \return A reference to a vector with splat values.
VectorParts &splat(Value *Key, Value *Val) {
- MapStoreage[Key].clear();
- MapStoreage[Key].append(UF, Val);
- return MapStoreage[Key];
+ VectorParts &Entry = MapStorage[Key];
+ Entry.assign(UF, Val);
+ return Entry;
}
///\return A reference to the value that is stored at 'Key'.
VectorParts &get(Value *Key) {
- if (!has(Key))
- MapStoreage[Key].resize(UF);
- return MapStoreage[Key];
+ VectorParts &Entry = MapStorage[Key];
+ if (Entry.empty())
+ Entry.resize(UF);
+ assert(Entry.size() == UF);
+ return Entry;
}
+ private:
/// The unroll factor. Each entry in the map stores this number of vector
/// elements.
unsigned UF;
/// Map storage. We use std::map and not DenseMap because insertions to a
/// dense map invalidates its iterators.
- std::map<Value*, VectorParts> MapStoreage;
+ std::map<Value *, VectorParts> MapStorage;
};
/// The original loop.
@@ -259,6 +270,9 @@ private:
DominatorTree *DT;
/// Data Layout.
DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
+
/// The vectorization SIMD factor to use. Each vector will have this many
/// vector elements.
unsigned VF;
@@ -283,8 +297,8 @@ private:
BasicBlock *LoopVectorBody;
///The scalar loop body.
BasicBlock *LoopScalarBody;
- ///The first bypass block.
- BasicBlock *LoopBypassBlock;
+ /// A list of all bypass blocks. The first block is the entry of the loop.
+ SmallVector<BasicBlock *, 4> LoopBypassBlocks;
/// The new Induction variable which was added to the new block.
PHINode *Induction;
@@ -310,8 +324,10 @@ private:
class LoopVectorizationLegality {
public:
LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
- DominatorTree *DT)
- : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {}
+ DominatorTree *DT, TargetTransformInfo* TTI,
+ AliasAnalysis *AA, TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI),
+ Induction(0) {}
/// This enum represents the kinds of reductions that we support.
enum ReductionKind {
@@ -330,7 +346,8 @@ public:
IK_NoInduction, ///< Not an induction variable.
IK_IntInduction, ///< Integer induction variable. Step = 1.
IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
- IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem).
+ IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem).
+ IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem).
};
/// This POD struct holds information about reduction variables.
@@ -394,6 +411,11 @@ public:
/// induction descriptor.
typedef MapVector<PHINode*, InductionInfo> InductionList;
+ /// Alias(Multi)Map stores the values (GEPs or underlying objects and their
+ /// respective Store/Load instruction(s) to calculate aliasing.
+ typedef DenseMap<Value*, Instruction* > AliasMap;
+ typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap;
+
/// Returns true if it is legal to vectorize this loop.
/// This does not mean that it is profitable to vectorize this
/// loop, only that it is legal to do so.
@@ -467,6 +489,14 @@ private:
InductionKind isInductionVariable(PHINode *Phi);
/// Return true if can compute the address bounds of Ptr within the loop.
bool hasComputableBounds(Value *Ptr);
+ /// Return true if there is the chance of write reorder.
+ bool hasPossibleGlobalWriteReorder(Value *Object,
+ Instruction *Inst,
+ AliasMultiMap &WriteObjects,
+ unsigned MaxByteWidth);
+ /// Return the AA location for a load or a store.
+ AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst);
+
/// The loop that we evaluate.
Loop *TheLoop;
@@ -474,8 +504,14 @@ private:
ScalarEvolution *SE;
/// DataLayout analysis.
DataLayout *DL;
- // Dominators.
+ /// Dominators.
DominatorTree *DT;
+ /// Target Info.
+ TargetTransformInfo *TTI;
+ /// Alias Analysis.
+ AliasAnalysis *AA;
+ /// Target Library Info.
+ TargetLibraryInfo *TLI;
// --- vectorization state --- //
@@ -511,16 +547,23 @@ class LoopVectorizationCostModel {
public:
LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
LoopVectorizationLegality *Legal,
- const TargetTransformInfo &TTI)
- : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {}
-
- /// \return The most profitable vectorization factor.
+ const TargetTransformInfo &TTI,
+ DataLayout *DL, const TargetLibraryInfo *TLI)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {}
+
+ /// Information about vectorization costs
+ struct VectorizationFactor {
+ unsigned Width; // Vector width with best cost
+ unsigned Cost; // Cost of the loop with that width
+ };
+ /// \return The most profitable vectorization factor and the cost of that VF.
/// This method checks every power of two up to VF. If UserVF is not ZERO
/// then this vectorization factor will be selected if vectorization is
/// possible.
- unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
+ VectorizationFactor selectVectorizationFactor(bool OptForSize,
+ unsigned UserVF);
- /// \returns The size (in bits) of the widest type in the code that
+ /// \return The size (in bits) of the widest type in the code that
/// needs to be vectorized. We ignore values that remain scalar such as
/// 64 bit loop indices.
unsigned getWidestType();
@@ -528,7 +571,10 @@ public:
/// \return The most profitable unroll factor.
/// If UserUF is non-zero then this method finds the best unroll-factor
/// based on register pressure and other parameters.
- unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
+ /// VF and LoopCost are the selected vectorization factor and the cost of the
+ /// selected VF.
+ unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF,
+ unsigned LoopCost);
/// \brief A struct that represents some properties of the register usage
/// of a loop.
@@ -560,6 +606,10 @@ private:
/// the scalar type.
static Type* ToVectorTy(Type *Scalar, unsigned VF);
+ /// Returns whether the instruction is a load or store and will be a emitted
+ /// as a vector operation.
+ bool isConsecutiveLoadOrStore(Instruction *I);
+
/// The loop that we evaluate.
Loop *TheLoop;
/// Scev analysis.
@@ -570,6 +620,10 @@ private:
LoopVectorizationLegality *Legal;
/// Vector target information.
const TargetTransformInfo &TTI;
+ /// Target data layout information.
+ DataLayout *DL;
+ /// Target Library Info.
+ const TargetLibraryInfo *TLI;
};
/// The LoopVectorize Pass.
@@ -586,6 +640,8 @@ struct LoopVectorize : public LoopPass {
LoopInfo *LI;
TargetTransformInfo *TTI;
DominatorTree *DT;
+ AliasAnalysis *AA;
+ TargetLibraryInfo *TLI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@@ -597,21 +653,23 @@ struct LoopVectorize : public LoopPass {
LI = &getAnalysis<LoopInfo>();
TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
+ AA = getAnalysisIfAvailable<AliasAnalysis>();
+ TLI = getAnalysisIfAvailable<TargetLibraryInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL, DT);
+ LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
// Use the cost model.
- LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI);
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI);
- // Check the function attribues to find out if this function should be
+ // Check the function attributes to find out if this function should be
// optimized for size.
Function *F = L->getHeader()->getParent();
Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
@@ -626,20 +684,24 @@ struct LoopVectorize : public LoopPass {
return false;
}
- unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
- unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
+ // Select the optimal vectorization factor.
+ LoopVectorizationCostModel::VectorizationFactor VF;
+ VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+ // Select the unroll factor.
+ unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll,
+ VF.Width, VF.Cost);
- if (VF == 1) {
+ if (VF.Width == 1) {
DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
return false;
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<<
F->getParent()->getModuleIdentifier()<<"\n");
DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
- // If we decided that it is *legal* to vectorizer the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
+ // If we decided that it is *legal* to vectorize the loop then do it.
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -728,6 +790,9 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+ // Make sure that the pointer does not point to structs.
+ if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType())
+ return 0;
// If this value is a pointer induction variable we know it is consecutive.
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
@@ -735,6 +800,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
InductionInfo II = Inductions[Phi];
if (IK_PtrInduction == II.IK)
return 1;
+ else if (IK_ReversePtrInduction == II.IK)
+ return -1;
}
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
@@ -744,6 +811,29 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
unsigned NumOperands = Gep->getNumOperands();
Value *LastIndex = Gep->getOperand(NumOperands - 1);
+ Value *GpPtr = Gep->getPointerOperand();
+ // If this GEP value is a consecutive pointer induction variable and all of
+ // the indices are constant then we know it is consecutive. We can
+ Phi = dyn_cast<PHINode>(GpPtr);
+ if (Phi && Inductions.count(Phi)) {
+
+ // Make sure that the pointer does not point to structs.
+ PointerType *GepPtrType = cast<PointerType>(GpPtr->getType());
+ if (GepPtrType->getElementType()->isAggregateType())
+ return 0;
+
+ // Make sure that all of the index operands are loop invariant.
+ for (unsigned i = 1; i < NumOperands; ++i)
+ if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
+ return 0;
+
+ InductionInfo II = Inductions[Phi];
+ if (IK_PtrInduction == II.IK)
+ return 1;
+ else if (IK_ReversePtrInduction == II.IK)
+ return -1;
+ }
+
// Check that all of the gep indices are uniform except for the last.
for (unsigned i = 0; i < NumOperands - 1; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
@@ -782,8 +872,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
// If this scalar is unknown, assume that it is a constant or that it is
// loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- WidenMap.splat(V, B);
- return WidenMap.get(V);
+ return WidenMap.splat(V, B);
}
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
@@ -797,6 +886,111 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
"reverse");
}
+
+void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
+ LoopVectorizationLegality *Legal) {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(Instr);
+ StoreInst *SI = dyn_cast<StoreInst>(Instr);
+
+ assert((LI || SI) && "Invalid Load/Store instruction");
+
+ Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+ Type *DataTy = VectorType::get(ScalarDataTy, VF);
+ Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand();
+ unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+
+ // If the pointer is loop invariant or if it is non consecutive,
+ // scalarize the load.
+ int Stride = Legal->isConsecutivePtr(Ptr);
+ bool Reverse = Stride < 0;
+ bool UniformLoad = LI && Legal->isUniform(Ptr);
+ if (Stride == 0 || UniformLoad)
+ return scalarizeInstruction(Instr);
+
+ Constant *Zero = Builder.getInt32(0);
+ VectorParts &Entry = WidenMap.get(Instr);
+
+ // Handle consecutive loads/stores.
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+ if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) {
+ Value *PtrOperand = Gep->getPointerOperand();
+ Value *FirstBasePtr = getVectorValue(PtrOperand)[0];
+ FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(0, FirstBasePtr);
+ Gep2->setName("gep.indvar.base");
+ Ptr = Builder.Insert(Gep2);
+ } else if (Gep) {
+ assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()),
+ OrigLoop) && "Base ptr must be invariant");
+
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+
+ Value *LastGepOperand = Gep->getOperand(NumOperands - 1);
+ VectorParts &GEPParts = getVectorValue(LastGepOperand);
+ Value *LastIndex = GEPParts[0];
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Gep2->setName("gep.indvar.idx");
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ VectorParts &PtrVal = getVectorValue(Ptr);
+ Ptr = Builder.CreateExtractElement(PtrVal[0], Zero);
+ }
+
+ // Handle Stores:
+ if (SI) {
+ assert(!Legal->isUniform(SI->getPointerOperand()) &&
+ "We do not allow storing to uniform addresses");
+
+ VectorParts &StoredVal = getVectorValue(SI->getValueOperand());
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If we store to reverse consecutive memory locations then we need
+ // to reverse the order of elements in the stored value.
+ StoredVal[Part] = reverseVector(StoredVal[Part]);
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment);
+ }
+ }
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ // Calculate the pointer for the specific unroll-part.
+ Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF));
+
+ if (Reverse) {
+ // If the address is consecutive but reversed, then the
+ // wide store needs to start at the last vector element.
+ PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF));
+ PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF));
+ }
+
+ Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo());
+ Value *LI = Builder.CreateLoad(VecPtr, "wide.load");
+ cast<LoadInst>(LI)->setAlignment(Alignment);
+ Entry[Part] = Reverse ? reverseVector(LI) : LI;
+ }
+}
+
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
@@ -868,7 +1062,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
}
}
-Value*
+Instruction *
InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
Instruction *Loc) {
LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
@@ -877,7 +1071,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
if (!PtrRtCheck->Need)
return NULL;
- Value *MemoryRuntimeCheck = 0;
+ Instruction *MemoryRuntimeCheck = 0;
unsigned NumPointers = PtrRtCheck->Pointers.size();
SmallVector<Value* , 2> Starts;
SmallVector<Value* , 2> Ends;
@@ -906,28 +1100,23 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
}
}
+ IRBuilder<> ChkBuilder(Loc);
+
for (unsigned i = 0; i < NumPointers; ++i) {
for (unsigned j = i+1; j < NumPointers; ++j) {
- Instruction::CastOps Op = Instruction::BitCast;
- Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
- Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
- Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc);
- Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc);
-
- Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
- Start0, End1, "bound0", Loc);
- Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
- Start1, End0, "bound1", Loc);
- Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
- "found.conflict", Loc);
+ Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc");
+
+ Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0");
+ Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1");
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
if (MemoryRuntimeCheck)
- MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
- MemoryRuntimeCheck,
- IsConflict,
- "conflict.rdx", Loc);
- else
- MemoryRuntimeCheck = IsConflict;
+ IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+ "conflict.rdx");
+ MemoryRuntimeCheck = cast<Instruction>(IsConflict);
}
}
@@ -941,7 +1130,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass.
+ [ ] <-- vector loop bypass (may consist of multiple blocks).
/ |
/ v
| [ ] <-- vector pre header.
@@ -1002,10 +1191,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
ConstantInt::get(IdxTy, 0);
assert(BypassBlock && "Invalid loop structure");
-
- // Generate the code that checks in runtime if arrays overlap.
- Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
- BypassBlock->getTerminator());
+ LoopBypassBlocks.push_back(BypassBlock);
// Split the single block loop into the two loop structure described above.
BasicBlock *VectorPH =
@@ -1017,10 +1203,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
BasicBlock *ScalarPH =
MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
- // This is the location in which we add all of the logic for bypassing
- // the new vector loop.
- Instruction *Loc = BypassBlock->getTerminator();
-
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -1031,45 +1213,62 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// times the unroll factor (num of SIMD instructions).
Constant *Step = ConstantInt::get(IdxTy, VF * UF);
+ // This is the IR builder that we use to add all of the logic for bypassing
+ // the new vector loop.
+ IRBuilder<> BypassBuilder(BypassBlock->getTerminator());
+
// We may need to extend the index in case there is a type mismatch.
// We know that the count starts at zero and does not overflow.
- unsigned IdxTyBW = IdxTy->getScalarSizeInBits();
if (Count->getType() != IdxTy) {
// The exit count can be of pointer type. Convert it to the correct
// integer type.
if (ExitCount->getType()->isPointerTy())
- Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
- else if (IdxTyBW < Count->getType()->getScalarSizeInBits())
- Count = CastInst::CreateTruncOrBitCast(Count, IdxTy, "tr.cnt", Loc);
+ Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int");
else
- Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+ Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast");
}
// Add the start index to the loop count to get the new end index.
- Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
+ Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx");
// Now we need to generate the expression for N - (N % VF), which is
// the part that the vectorized body will execute.
- Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc);
- Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc);
- Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
- "end.idx.rnd.down", Loc);
+ Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf");
+ Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec");
+ Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx,
+ "end.idx.rnd.down");
// Now, compare the new count to zero. If it is zero skip the vector loop and
// jump to the scalar loop.
- Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
- IdxEndRoundDown,
- StartIdx,
- "cmp.zero", Loc);
-
- // If we are using memory runtime checks, include them in.
- if (MemoryRuntimeCheck)
- Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
- "CntOrMem", Loc);
+ Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx,
+ "cmp.zero");
+
+ BasicBlock *LastBypassBlock = BypassBlock;
+
+ // Generate the code that checks in runtime if arrays overlap. We put the
+ // checks into a separate block to make the more common case of few elements
+ // faster.
+ Instruction *MemRuntimeCheck = addRuntimeCheck(Legal,
+ BypassBlock->getTerminator());
+ if (MemRuntimeCheck) {
+ // Create a new block containing the memory check.
+ BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck,
+ "vector.memcheck");
+ LoopBypassBlocks.push_back(CheckBlock);
+
+ // Replace the branch into the memory check block with a conditional branch
+ // for the "few elements case".
+ Instruction *OldTerm = BypassBlock->getTerminator();
+ BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm);
+ OldTerm->eraseFromParent();
+
+ Cmp = MemRuntimeCheck;
+ LastBypassBlock = CheckBlock;
+ }
- BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
- // Remove the old terminator.
- Loc->eraseFromParent();
+ LastBypassBlock->getTerminator()->eraseFromParent();
+ BranchInst::Create(MiddleBlock, VectorPH, Cmp,
+ LastBypassBlock);
// We are going to resume the execution of the scalar loop.
// Go over all of the induction variables that we found and fix the
@@ -1109,30 +1308,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Value *CRD = CountRoundDown;
if (CRDSize > IISize)
CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
- II.StartValue->getType(),
- "tr.crd", BypassBlock->getTerminator());
+ II.StartValue->getType(), "tr.crd",
+ LoopBypassBlocks.back()->getTerminator());
else if (CRDSize < IISize)
CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
II.StartValue->getType(),
- "sext.crd", BypassBlock->getTerminator());
+ "sext.crd",
+ LoopBypassBlocks.back()->getTerminator());
// Handle reverse integer induction counter:
- EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
- BypassBlock->getTerminator());
+ EndValue =
+ BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
break;
}
case LoopVectorizationLegality::IK_PtrInduction: {
// For pointer induction variables, calculate the offset using
// the end index.
- EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
- "ptr.ind.end",
- BypassBlock->getTerminator());
+ EndValue =
+ GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ break;
+ }
+ case LoopVectorizationLegality::IK_ReversePtrInduction: {
+ // The value at the end of the loop for the reverse pointer is calculated
+ // by creating a GEP with a negative index starting from the start value.
+ Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0);
+ Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown,
+ "rev.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
+ EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx,
+ "rev.ptr.ind.end",
+ LoopBypassBlocks.back()->getTerminator());
break;
}
}// end of case
// The new PHI merges the original incoming value, in case of a bypass,
// or the value at the end of the vectorized loop.
- ResumeVal->addIncoming(II.StartValue, BypassBlock);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]);
ResumeVal->addIncoming(EndValue, VecBody);
// Fix the scalar body counter (PHI node).
@@ -1148,7 +1362,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
assert(!ResumeIndex && "Unexpected resume value found");
ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
MiddleBlock->getTerminator());
- ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+ ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]);
ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
}
@@ -1188,6 +1403,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Insert the new loop into the loop nest and register the new basic blocks.
if (ParentLoop) {
ParentLoop->addChildLoop(Lp);
+ for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I)
+ ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase());
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
@@ -1204,7 +1421,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
LoopExitBlock = ExitBlock;
LoopVectorBody = VecBody;
LoopScalarBody = OldBasicBlock;
- LoopBypassBlock = BypassBlock;
}
/// This function returns the identity element (or neutral element) for
@@ -1234,34 +1450,108 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) {
}
}
-static bool
-isTriviallyVectorizableIntrinsic(Instruction *Inst) {
- IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
- if (!II)
- return false;
- switch (II->getIntrinsicID()) {
- case Intrinsic::sqrt:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::log:
- case Intrinsic::log10:
- case Intrinsic::log2:
- case Intrinsic::fabs:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::pow:
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- return true;
+static Intrinsic::ID
+getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) {
+ // If we have an intrinsic call, check if it is trivially vectorizable.
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::pow:
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
+ return II->getIntrinsicID();
+ default:
+ return Intrinsic::not_intrinsic;
+ }
+ }
+
+ if (!TLI)
+ return Intrinsic::not_intrinsic;
+
+ LibFunc::Func Func;
+ Function *F = CI->getCalledFunction();
+ // We're going to make assumptions on the semantics of the functions, check
+ // that the target knows that it's available in this environment.
+ if (!F || !TLI->getLibFunc(F->getName(), Func))
+ return Intrinsic::not_intrinsic;
+
+ // Otherwise check if we have a call to a function that can be turned into a
+ // vector intrinsic.
+ switch (Func) {
default:
- return false;
+ break;
+ case LibFunc::sin:
+ case LibFunc::sinf:
+ case LibFunc::sinl:
+ return Intrinsic::sin;
+ case LibFunc::cos:
+ case LibFunc::cosf:
+ case LibFunc::cosl:
+ return Intrinsic::cos;
+ case LibFunc::exp:
+ case LibFunc::expf:
+ case LibFunc::expl:
+ return Intrinsic::exp;
+ case LibFunc::exp2:
+ case LibFunc::exp2f:
+ case LibFunc::exp2l:
+ return Intrinsic::exp2;
+ case LibFunc::log:
+ case LibFunc::logf:
+ case LibFunc::logl:
+ return Intrinsic::log;
+ case LibFunc::log10:
+ case LibFunc::log10f:
+ case LibFunc::log10l:
+ return Intrinsic::log10;
+ case LibFunc::log2:
+ case LibFunc::log2f:
+ case LibFunc::log2l:
+ return Intrinsic::log2;
+ case LibFunc::fabs:
+ case LibFunc::fabsf:
+ case LibFunc::fabsl:
+ return Intrinsic::fabs;
+ case LibFunc::floor:
+ case LibFunc::floorf:
+ case LibFunc::floorl:
+ return Intrinsic::floor;
+ case LibFunc::ceil:
+ case LibFunc::ceilf:
+ case LibFunc::ceill:
+ return Intrinsic::ceil;
+ case LibFunc::trunc:
+ case LibFunc::truncf:
+ case LibFunc::truncl:
+ return Intrinsic::trunc;
+ case LibFunc::rint:
+ case LibFunc::rintf:
+ case LibFunc::rintl:
+ return Intrinsic::rint;
+ case LibFunc::nearbyint:
+ case LibFunc::nearbyintf:
+ case LibFunc::nearbyintl:
+ return Intrinsic::nearbyint;
+ case LibFunc::pow:
+ case LibFunc::powf:
+ case LibFunc::powl:
+ return Intrinsic::pow;
}
- return false;
+
+ return Intrinsic::not_intrinsic;
}
/// This func