diff options
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 1096 |
1 files changed, 711 insertions, 385 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index bc8e1217be..a696a2ffba 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9,10 +9,10 @@ // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops // and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizes uses (will use) the codegen +// in the codegen. However, the vectorizer uses (will use) the codegen // interfaces to generate IR that is likely to result in an optimal binary. // -// The loop vectorizer combines consecutive loop iteration into a single +// The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented // by the SIMD vector width, and not by one. // @@ -32,7 +32,7 @@ // D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. // // Variable uniformity checks are inspired by: -// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// Karrenberg, R. and Hack, S. Whole Function Vectorization. // // Other ideas/concepts are from: // A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. @@ -79,6 +79,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" @@ -101,14 +102,16 @@ EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); /// We don't vectorize loops with a known constant trip count below this number. -static const unsigned TinyTripCountVectorThreshold = 16; +static cl::opt<unsigned> +TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), + cl::Hidden, + cl::desc("Don't vectorize loops with a constant " + "trip count that is smaller than this " + "value.")); /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// We don't unroll loops that are larget than this threshold. -static const unsigned MaxLoopSizeThreshold = 32; - /// When performing a runtime memory check, do not check more than this /// number of pointers. Notice that the check is quadratic! static const unsigned RuntimeMemoryCheckThreshold = 4; @@ -136,10 +139,11 @@ class LoopVectorizationCostModel; class InnerLoopVectorizer { public: InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI, - DominatorTree *DT, DataLayout *DL, unsigned VecWidth, + DominatorTree *DT, DataLayout *DL, + const TargetLibraryInfo *TLI, unsigned VecWidth, unsigned UnrollFactor) - : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth), - UF(UnrollFactor), Builder(SE->getContext()), Induction(0), + : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), TLI(TLI), + VF(VecWidth), UF(UnrollFactor), Builder(SE->getContext()), Induction(0), OldInduction(0), WidenMap(UnrollFactor) {} // Perform the actual loop widening (vectorization). @@ -163,8 +167,8 @@ private: /// Add code that checks at runtime if the accessed arrays overlap. /// Returns the comparator value or NULL if no check is needed. - Value *addRuntimeCheck(LoopVectorizationLegality *Legal, - Instruction *Loc); + Instruction *addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc); /// Create an empty loop, based on the loop ranges of the old loop. void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. @@ -190,6 +194,10 @@ private: /// of scalars. void scalarizeInstruction(Instruction *Instr); + /// Vectorize Load and Store instructions, + void vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal); + /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -222,31 +230,34 @@ private: ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {} /// \return True if 'Key' is saved in the Value Map. - bool has(Value *Key) { return MapStoreage.count(Key); } + bool has(Value *Key) const { return MapStorage.count(Key); } /// Initializes a new entry in the map. Sets all of the vector parts to the /// save value in 'Val'. /// \return A reference to a vector with splat values. VectorParts &splat(Value *Key, Value *Val) { - MapStoreage[Key].clear(); - MapStoreage[Key].append(UF, Val); - return MapStoreage[Key]; + VectorParts &Entry = MapStorage[Key]; + Entry.assign(UF, Val); + return Entry; } ///\return A reference to the value that is stored at 'Key'. VectorParts &get(Value *Key) { - if (!has(Key)) - MapStoreage[Key].resize(UF); - return MapStoreage[Key]; + VectorParts &Entry = MapStorage[Key]; + if (Entry.empty()) + Entry.resize(UF); + assert(Entry.size() == UF); + return Entry; } + private: /// The unroll factor. Each entry in the map stores this number of vector /// elements. unsigned UF; /// Map storage. We use std::map and not DenseMap because insertions to a /// dense map invalidates its iterators. - std::map<Value*, VectorParts> MapStoreage; + std::map<Value *, VectorParts> MapStorage; }; /// The original loop. @@ -259,6 +270,9 @@ private: DominatorTree *DT; /// Data Layout. DataLayout *DL; + /// Target Library Info. + const TargetLibraryInfo *TLI; + /// The vectorization SIMD factor to use. Each vector will have this many /// vector elements. unsigned VF; @@ -283,8 +297,8 @@ private: BasicBlock *LoopVectorBody; ///The scalar loop body. BasicBlock *LoopScalarBody; - ///The first bypass block. - BasicBlock *LoopBypassBlock; + /// A list of all bypass blocks. The first block is the entry of the loop. + SmallVector<BasicBlock *, 4> LoopBypassBlocks; /// The new Induction variable which was added to the new block. PHINode *Induction; @@ -310,8 +324,10 @@ private: class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL, - DominatorTree *DT) - : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {} + DominatorTree *DT, TargetTransformInfo* TTI, + AliasAnalysis *AA, TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), + Induction(0) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -330,7 +346,8 @@ public: IK_NoInduction, ///< Not an induction variable. IK_IntInduction, ///< Integer induction variable. Step = 1. IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1. - IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem). + IK_PtrInduction, ///< Pointer induction var. Step = sizeof(elem). + IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; /// This POD struct holds information about reduction variables. @@ -394,6 +411,11 @@ public: /// induction descriptor. typedef MapVector<PHINode*, InductionInfo> InductionList; + /// Alias(Multi)Map stores the values (GEPs or underlying objects and their + /// respective Store/Load instruction(s) to calculate aliasing. + typedef DenseMap<Value*, Instruction* > AliasMap; + typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap; + /// Returns true if it is legal to vectorize this loop. /// This does not mean that it is profitable to vectorize this /// loop, only that it is legal to do so. @@ -467,6 +489,14 @@ private: InductionKind isInductionVariable(PHINode *Phi); /// Return true if can compute the address bounds of Ptr within the loop. bool hasComputableBounds(Value *Ptr); + /// Return true if there is the chance of write reorder. + bool hasPossibleGlobalWriteReorder(Value *Object, + Instruction *Inst, + AliasMultiMap &WriteObjects, + unsigned MaxByteWidth); + /// Return the AA location for a load or a store. + AliasAnalysis::Location getLoadStoreLocation(Instruction *Inst); + /// The loop that we evaluate. Loop *TheLoop; @@ -474,8 +504,14 @@ private: ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; - // Dominators. + /// Dominators. DominatorTree *DT; + /// Target Info. + TargetTransformInfo *TTI; + /// Alias Analysis. + AliasAnalysis *AA; + /// Target Library Info. + TargetLibraryInfo *TLI; // --- vectorization state --- // @@ -511,16 +547,23 @@ class LoopVectorizationCostModel { public: LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI, LoopVectorizationLegality *Legal, - const TargetTransformInfo &TTI) - : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {} - - /// \return The most profitable vectorization factor. + const TargetTransformInfo &TTI, + DataLayout *DL, const TargetLibraryInfo *TLI) + : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI), DL(DL), TLI(TLI) {} + + /// Information about vectorization costs + struct VectorizationFactor { + unsigned Width; // Vector width with best cost + unsigned Cost; // Cost of the loop with that width + }; + /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to VF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); + VectorizationFactor selectVectorizationFactor(bool OptForSize, + unsigned UserVF); - /// \returns The size (in bits) of the widest type in the code that + /// \return The size (in bits) of the widest type in the code that /// needs to be vectorized. We ignore values that remain scalar such as /// 64 bit loop indices. unsigned getWidestType(); @@ -528,7 +571,10 @@ public: /// \return The most profitable unroll factor. /// If UserUF is non-zero then this method finds the best unroll-factor /// based on register pressure and other parameters. - unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF); + /// VF and LoopCost are the selected vectorization factor and the cost of the + /// selected VF. + unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF, unsigned VF, + unsigned LoopCost); /// \brief A struct that represents some properties of the register usage /// of a loop. @@ -560,6 +606,10 @@ private: /// the scalar type. static Type* ToVectorTy(Type *Scalar, unsigned VF); + /// Returns whether the instruction is a load or store and will be a emitted + /// as a vector operation. + bool isConsecutiveLoadOrStore(Instruction *I); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -570,6 +620,10 @@ private: LoopVectorizationLegality *Legal; /// Vector target information. const TargetTransformInfo &TTI; + /// Target data layout information. + DataLayout *DL; + /// Target Library Info. + const TargetLibraryInfo *TLI; }; /// The LoopVectorize Pass. @@ -586,6 +640,8 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; TargetTransformInfo *TTI; DominatorTree *DT; + AliasAnalysis *AA; + TargetLibraryInfo *TLI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -597,21 +653,23 @@ struct LoopVectorize : public LoopPass { LI = &getAnalysis<LoopInfo>(); TTI = &getAnalysis<TargetTransformInfo>(); DT = &getAnalysis<DominatorTree>(); + AA = getAnalysisIfAvailable<AliasAnalysis>(); + TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL, DT); + LoopVectorizationLegality LVL(L, SE, DL, DT, TTI, AA, TLI); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } // Use the cost model. - LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI); + LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI, DL, TLI); - // Check the function attribues to find out if this function should be + // Check the function attributes to find out if this function should be // optimized for size. Function *F = L->getHeader()->getParent(); Attribute::AttrKind SzAttr = Attribute::OptimizeForSize; @@ -626,20 +684,24 @@ struct LoopVectorize : public LoopPass { return false; } - unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); - unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll); + // Select the optimal vectorization factor. + LoopVectorizationCostModel::VectorizationFactor VF; + VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + // Select the unroll factor. + unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll, + VF.Width, VF.Cost); - if (VF == 1) { + if (VF.Width == 1) { DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF.Width << ") in "<< F->getParent()->getModuleIdentifier()<<"\n"); DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n"); - // If we decided that it is *legal* to vectorizer the loop then do it. - InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF); + // If we decided that it is *legal* to vectorize the loop then do it. + InnerLoopVectorizer LB(L, SE, LI, DT, DL, TLI, VF.Width, UF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -728,6 +790,9 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + // Make sure that the pointer does not point to structs. + if (cast<PointerType>(Ptr->getType())->getElementType()->isAggregateType()) + return 0; // If this value is a pointer induction variable we know it is consecutive. PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); @@ -735,6 +800,8 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { InductionInfo II = Inductions[Phi]; if (IK_PtrInduction == II.IK) return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; } GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); @@ -744,6 +811,29 @@ int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = Gep->getOperand(NumOperands - 1); + Value *GpPtr = Gep->getPointerOperand(); + // If this GEP value is a consecutive pointer induction variable and all of + // the indices are constant then we know it is consecutive. We can + Phi = dyn_cast<PHINode>(GpPtr); + if (Phi && Inductions.count(Phi)) { + + // Make sure that the pointer does not point to structs. + PointerType *GepPtrType = cast<PointerType>(GpPtr->getType()); + if (GepPtrType->getElementType()->isAggregateType()) + return 0; + + // Make sure that all of the index operands are loop invariant. + for (unsigned i = 1; i < NumOperands; ++i) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) + return 0; + + InductionInfo II = Inductions[Phi]; + if (IK_PtrInduction == II.IK) + return 1; + else if (IK_ReversePtrInduction == II.IK) + return -1; + } + // Check that all of the gep indices are uniform except for the last. for (unsigned i = 0; i < NumOperands - 1; ++i) if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) @@ -782,8 +872,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) { // If this scalar is unknown, assume that it is a constant or that it is // loop invariant. Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap.splat(V, B); - return WidenMap.get(V); + return WidenMap.splat(V, B); } Value *InnerLoopVectorizer::reverseVector(Value *Vec) { @@ -797,6 +886,111 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) { "reverse"); } + +void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, + LoopVectorizationLegality *Legal) { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(Instr); + StoreInst *SI = dyn_cast<StoreInst>(Instr); + + assert((LI || SI) && "Invalid Load/Store instruction"); + + Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType(); + Type *DataTy = VectorType::get(ScalarDataTy, VF); + Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); + unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + int Stride = Legal->isConsecutivePtr(Ptr); + bool Reverse = Stride < 0; + bool UniformLoad = LI && Legal->isUniform(Ptr); + if (Stride == 0 || UniformLoad) + return scalarizeInstruction(Instr); + + Constant *Zero = Builder.getInt32(0); + VectorParts &Entry = WidenMap.get(Instr); + + // Handle consecutive loads/stores. + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + if (Gep && Legal->isInductionVariable(Gep->getPointerOperand())) { + Value *PtrOperand = Gep->getPointerOperand(); + Value *FirstBasePtr = getVectorValue(PtrOperand)[0]; + FirstBasePtr = Builder.CreateExtractElement(FirstBasePtr, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(0, FirstBasePtr); + Gep2->setName("gep.indvar.base"); + Ptr = Builder.Insert(Gep2); + } else if (Gep) { + assert(SE->isLoopInvariant(SE->getSCEV(Gep->getPointerOperand()), + OrigLoop) && "Base ptr must be invariant"); + + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + + Value *LastGepOperand = Gep->getOperand(NumOperands - 1); + VectorParts &GEPParts = getVectorValue(LastGepOperand); + Value *LastIndex = GEPParts[0]; + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Gep2->setName("gep.indvar.idx"); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + VectorParts &PtrVal = getVectorValue(Ptr); + Ptr = Builder.CreateExtractElement(PtrVal[0], Zero); + } + + // Handle Stores: + if (SI) { + assert(!Legal->isUniform(SI->getPointerOperand()) && + "We do not allow storing to uniform addresses"); + + VectorParts &StoredVal = getVectorValue(SI->getValueOperand()); + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If we store to reverse consecutive memory locations then we need + // to reverse the order of elements in the stored value. + StoredVal[Part] = reverseVector(StoredVal[Part]); + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); + } + } + + for (unsigned Part = 0; Part < UF; ++Part) { + // Calculate the pointer for the specific unroll-part. + Value *PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(Part * VF)); + + if (Reverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + PartPtr = Builder.CreateGEP(Ptr, Builder.getInt32(-Part * VF)); + PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); + } + + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); + cast<LoadInst>(LI)->setAlignment(Alignment); + Entry[Part] = Reverse ? reverseVector(LI) : LI; + } +} + void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. @@ -868,7 +1062,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { } } -Value* +Instruction * InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, Instruction *Loc) { LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = @@ -877,7 +1071,7 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, if (!PtrRtCheck->Need) return NULL; - Value *MemoryRuntimeCheck = 0; + Instruction *MemoryRuntimeCheck = 0; unsigned NumPointers = PtrRtCheck->Pointers.size(); SmallVector<Value* , 2> Starts; SmallVector<Value* , 2> Ends; @@ -906,28 +1100,23 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, } } + IRBuilder<> ChkBuilder(Loc); + for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { - Instruction::CastOps Op = Instruction::BitCast; - Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); - Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); - Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); - Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); - - Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start0, End1, "bound0", Loc); - Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, - Start1, End0, "bound1", Loc); - Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, - "found.conflict", Loc); + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); + Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); + Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); + Value *End1 = ChkBuilder.CreateBitCast(Ends[j], PtrArithTy, "bc"); + + Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "bound0"); + Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "bound1"); + Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict"); if (MemoryRuntimeCheck) - MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, - MemoryRuntimeCheck, - IsConflict, - "conflict.rdx", Loc); - else - MemoryRuntimeCheck = IsConflict; + IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, + "conflict.rdx"); + MemoryRuntimeCheck = cast<Instruction>(IsConflict); } } @@ -941,7 +1130,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. + [ ] <-- vector loop bypass (may consist of multiple blocks). / | / v | [ ] <-- vector pre header. @@ -1002,10 +1191,7 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { ConstantInt::get(IdxTy, 0); assert(BypassBlock && "Invalid loop structure"); - - // Generate the code that checks in runtime if arrays overlap. - Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, - BypassBlock->getTerminator()); + LoopBypassBlocks.push_back(BypassBlock); // Split the single block loop into the two loop structure described above. BasicBlock *VectorPH = @@ -1017,10 +1203,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ScalarPH = MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); - // This is the location in which we add all of the logic for bypassing - // the new vector loop. - Instruction *Loc = BypassBlock->getTerminator(); - // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -1031,45 +1213,62 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // times the unroll factor (num of SIMD instructions). Constant *Step = ConstantInt::get(IdxTy, VF * UF); + // This is the IR builder that we use to add all of the logic for bypassing + // the new vector loop. + IRBuilder<> BypassBuilder(BypassBlock->getTerminator()); + // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. - unsigned IdxTyBW = IdxTy->getScalarSizeInBits(); if (Count->getType() != IdxTy) { // The exit count can be of pointer type. Convert it to the correct // integer type. if (ExitCount->getType()->isPointerTy()) - Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); - else if (IdxTyBW < Count->getType()->getScalarSizeInBits()) - Count = CastInst::CreateTruncOrBitCast(Count, IdxTy, "tr.cnt", Loc); + Count = BypassBuilder.CreatePointerCast(Count, IdxTy, "ptrcnt.to.int"); else - Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + Count = BypassBuilder.CreateZExtOrTrunc(Count, IdxTy, "cnt.cast"); } // Add the start index to the loop count to get the new end index. - Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + Value *IdxEnd = BypassBuilder.CreateAdd(Count, StartIdx, "end.idx"); // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. - Value *R = BinaryOperator::CreateURem(Count, Step, "n.mod.vf", Loc); - Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); - Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, - "end.idx.rnd.down", Loc); + Value *R = BypassBuilder.CreateURem(Count, Step, "n.mod.vf"); + Value *CountRoundDown = BypassBuilder.CreateSub(Count, R, "n.vec"); + Value *IdxEndRoundDown = BypassBuilder.CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down"); // Now, compare the new count to zero. If it is zero skip the vector loop and // jump to the scalar loop. - Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - IdxEndRoundDown, - StartIdx, - "cmp.zero", Loc); - - // If we are using memory runtime checks, include them in. - if (MemoryRuntimeCheck) - Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, - "CntOrMem", Loc); + Value *Cmp = BypassBuilder.CreateICmpEQ(IdxEndRoundDown, StartIdx, + "cmp.zero"); + + BasicBlock *LastBypassBlock = BypassBlock; + + // Generate the code that checks in runtime if arrays overlap. We put the + // checks into a separate block to make the more common case of few elements + // faster. + Instruction *MemRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); + if (MemRuntimeCheck) { + // Create a new block containing the memory check. + BasicBlock *CheckBlock = BypassBlock->splitBasicBlock(MemRuntimeCheck, + "vector.memcheck"); + LoopBypassBlocks.push_back(CheckBlock); + + // Replace the branch into the memory check block with a conditional branch + // for the "few elements case". + Instruction *OldTerm = BypassBlock->getTerminator(); + BranchInst::Create(MiddleBlock, CheckBlock, Cmp, OldTerm); + OldTerm->eraseFromParent(); + + Cmp = MemRuntimeCheck; + LastBypassBlock = CheckBlock; + } - BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); - // Remove the old terminator. - Loc->eraseFromParent(); + LastBypassBlock->getTerminator()->eraseFromParent(); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, + LastBypassBlock); // We are going to resume the execution of the scalar loop. // Go over all of the induction variables that we found and fix the @@ -1109,30 +1308,45 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Value *CRD = CountRoundDown; if (CRDSize > IISize) CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, - II.StartValue->getType(), - "tr.crd", BypassBlock->getTerminator()); + II.StartValue->getType(), "tr.crd", + LoopBypassBlocks.back()->getTerminator()); else if (CRDSize < IISize) CRD = CastInst::Create(Instruction::SExt, CountRoundDown, II.StartValue->getType(), - "sext.crd", BypassBlock->getTerminator()); + "sext.crd", + LoopBypassBlocks.back()->getTerminator()); // Handle reverse integer induction counter: - EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", - BypassBlock->getTerminator()); + EndValue = + BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", + LoopBypassBlocks.back()->getTerminator()); break; } case LoopVectorizationLegality::IK_PtrInduction: { // For pointer induction variables, calculate the offset using // the end index. - EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, - "ptr.ind.end", - BypassBlock->getTerminator()); + EndValue = + GetElementPtrInst::Create(II.StartValue, CountRoundDown, "ptr.ind.end", + LoopBypassBlocks.back()->getTerminator()); + break; + } + case LoopVectorizationLegality::IK_ReversePtrInduction: { + // The value at the end of the loop for the reverse pointer is calculated + // by creating a GEP with a negative index starting from the start value. + Value *Zero = ConstantInt::get(CountRoundDown->getType(), 0); + Value *NegIdx = BinaryOperator::CreateSub(Zero, CountRoundDown, + "rev.ind.end", + LoopBypassBlocks.back()->getTerminator()); + EndValue = GetElementPtrInst::Create(II.StartValue, NegIdx, + "rev.ptr.ind.end", + LoopBypassBlocks.back()->getTerminator()); break; } }// end of case // The new PHI merges the original incoming value, in case of a bypass, // or the value at the end of the vectorized loop. - ResumeVal->addIncoming(II.StartValue, BypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ResumeVal->addIncoming(II.StartValue, LoopBypassBlocks[I]); ResumeVal->addIncoming(EndValue, VecBody); // Fix the scalar body counter (PHI node). @@ -1148,7 +1362,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { assert(!ResumeIndex && "Unexpected resume value found"); ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", MiddleBlock->getTerminator()); - ResumeIndex->addIncoming(StartIdx, BypassBlock); + for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I) + ResumeIndex->addIncoming(StartIdx, LoopBypassBlocks[I]); ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); } @@ -1188,6 +1403,8 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Insert the new loop into the loop nest and register the new basic blocks. if (ParentLoop) { ParentLoop->addChildLoop(Lp); + for (unsigned I = 1, E = LoopBypassBlocks.size(); I != E; ++I) + ParentLoop->addBasicBlockToLoop(LoopBypassBlocks[I], LI->getBase()); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); @@ -1204,7 +1421,6 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; LoopScalarBody = OldBasicBlock; - LoopBypassBlock = BypassBlock; } /// This function returns the identity element (or neutral element) for @@ -1234,34 +1450,108 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { } } -static bool -isTriviallyVectorizableIntrinsic(Instruction *Inst) { - IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); - if (!II) - return false; - switch (II->getIntrinsicID()) { - case Intrinsic::sqrt: - case Intrinsic::sin: - case Intrinsic::cos: - case Intrinsic::exp: - case Intrinsic::exp2: - case Intrinsic::log: - case Intrinsic::log10: - case Intrinsic::log2: - case Intrinsic::fabs: - case Intrinsic::floor: - case Intrinsic::ceil: - case Intrinsic::trunc: - case Intrinsic::rint: - case Intrinsic::nearbyint: - case Intrinsic::pow: - case Intrinsic::fma: - case Intrinsic::fmuladd: - return true; +static Intrinsic::ID +getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { + // If we have an intrinsic call, check if it is trivially vectorizable. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { + switch (II->getIntrinsicID()) { + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::pow: + case Intrinsic::fma: + case Intrinsic::fmuladd: + return II->getIntrinsicID(); + default: + return Intrinsic::not_intrinsic; + } + } + + if (!TLI) + return Intrinsic::not_intrinsic; + + LibFunc::Func Func; + Function *F = CI->getCalledFunction(); + // We're going to make assumptions on the semantics of the functions, check + // that the target knows that it's available in this environment. + if (!F || !TLI->getLibFunc(F->getName(), Func)) + return Intrinsic::not_intrinsic; + + // Otherwise check if we have a call to a function that can be turned into a + // vector intrinsic. + switch (Func) { default: - return false; + break; + case LibFunc::sin: + case LibFunc::sinf: + case LibFunc::sinl: + return Intrinsic::sin; + case LibFunc::cos: + case LibFunc::cosf: + case LibFunc::cosl: + return Intrinsic::cos; + case LibFunc::exp: + case LibFunc::expf: + case LibFunc::expl: + return Intrinsic::exp; + case LibFunc::exp2: + case LibFunc::exp2f: + case LibFunc::exp2l: + return Intrinsic::exp2; + case LibFunc::log: + case LibFunc::logf: + case LibFunc::logl: + return Intrinsic::log; + case LibFunc::log10: + case LibFunc::log10f: + case LibFunc::log10l: + return Intrinsic::log10; + case LibFunc::log2: + case LibFunc::log2f: + case LibFunc::log2l: + return Intrinsic::log2; + case LibFunc::fabs: + case LibFunc::fabsf: + case LibFunc::fabsl: + return Intrinsic::fabs; + case LibFunc::floor: + case LibFunc::floorf: + case LibFunc::floorl: + return Intrinsic::floor; + case LibFunc::ceil: + case LibFunc::ceilf: + case LibFunc::ceill: + return Intrinsic::ceil; + case LibFunc::trunc: + case LibFunc::truncf: + case LibFunc::truncl: + return Intrinsic::trunc; + case LibFunc::rint: + case LibFunc::rintf: + case LibFunc::rintl: + return Intrinsic::rint; + case LibFunc::nearbyint: + case LibFunc::nearbyintf: + case LibFunc::nearbyintl: + return Intrinsic::nearbyint; + case LibFunc::pow: + case LibFunc::powf: + case LibFunc::powl: + return Intrinsic::pow; } - return false; + + return Intrinsic::not_intrinsic; } /// This func |