diff options
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 859 |
1 files changed, 666 insertions, 193 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 9bbd9ab60b..f944d9b4fc 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7,15 +7,31 @@ // //===----------------------------------------------------------------------===// // -// This is a simple loop vectorizer. We currently only support single block -// loops. We have a very simple and restrictive legality check: we need to read -// and write from disjoint memory locations. We still don't have a cost model. +// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops +// and generates target-independent LLVM-IR. Legalization of the IR is done +// in the codegen. However, the vectorizes uses (will use) the codegen +// interfaces to generate IR that is likely to result in an optimal binary. +// +// The loop vectorizer combines consecutive loop iteration into a single +// 'wide' iteration. After this transformation the index is incremented +// by the SIMD vector width, and not by one. +// // This pass has three parts: // 1. The main loop pass that drives the different parts. // 2. LoopVectorizationLegality - A helper class that checks for the legality // of the vectorization. // 3. SingleBlockLoopVectorizer - A helper class that performs the actual // widening of instructions. +//===----------------------------------------------------------------------===// +// +// The reduction-variable vectorization is based on the paper: +// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. +// +// Variable uniformity checks are inspired by: +// Karrenberg, R. and Hack, S. Whole Function Vectorization. +// +// Other ideas/concepts are from: +// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. // //===----------------------------------------------------------------------===// #define LV_NAME "loop-vectorize" @@ -54,40 +70,49 @@ static cl::opt<unsigned> DefaultVectorizationFactor("default-loop-vectorize-width", cl::init(4), cl::Hidden, cl::desc("Set the default loop vectorization width")); - namespace { -/// Vectorize a simple loop. This class performs the widening of simple single -/// basic block loops into vectors. It does not perform any -/// vectorization-legality checks, and just does it. It widens the vectors -/// to a given vectorization factor (VF). +// Forward declaration. +class LoopVectorizationLegality; + +/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic +/// block to a specified vectorization factor (VF). +/// This class performs the widening of scalars into vectors, or multiple +/// scalars. This class also implements the following features: +/// * It inserts an epilogue loop for handling loops that don't have iteration +/// counts that are known to be a multiple of the vectorization factor. +/// * It handles the code generation for reduction variables. +/// * Scalarization (implementation using scalars) of un-vectorizable +/// instructions. +/// SingleBlockLoopVectorizer does not perform any vectorization-legality +/// checks, and relies on the caller to check for the different legality +/// aspects. The SingleBlockLoopVectorizer relies on the +/// LoopVectorizationLegality class to provide information about the induction +/// and reduction variables that were found to a given vectorization factor. class SingleBlockLoopVectorizer { public: /// Ctor. - SingleBlockLoopVectorizer(Loop *OrigLoop, ScalarEvolution *Se, LoopInfo *Li, + SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, LPPassManager *Lpm, unsigned VecWidth): - Orig(OrigLoop), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), - Builder(0), Induction(0), OldInduction(0) { } - - ~SingleBlockLoopVectorizer() { - delete Builder; - } + OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). - void vectorize() { + void vectorize(LoopVectorizationLegality *Legal) { ///Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); + createEmptyLoop(Legal); /// Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); + /// Use the Legality module to find the induction and reduction variables. + vectorizeLoop(Legal); // register the new loop. cleanup(); } private: /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); + void createEmptyLoop(LoopVectorizationLegality *Legal); /// Copy and widen the instructions from the old loop. - void vectorizeLoop(); + void vectorizeLoop(LoopVectorizationLegality *Legal); /// Insert the new loop to the loop hierarchy and pass manager. void cleanup(); @@ -106,10 +131,6 @@ private: /// for each element in the vector. Starting from zero. Value *getConsecutiveVector(Value* Val); - /// Check that the GEP operands are all uniform except for the last index - /// which has to be the induction variable. - bool isConsecutiveGep(GetElementPtrInst *Gep); - /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. /// When we widen (vectorize) values we place them in the map. If the values @@ -117,10 +138,14 @@ private: /// broadcast them into a vector. Value *getVectorValue(Value *V); + /// Get a uniform vector of constant integers. We use this to get + /// vectors of ones and zeros for the reduction code. + Constant* getUniformVector(unsigned Val, Type* ScalarTy); + typedef DenseMap<Value*, Value*> ValueMap; /// The original loop. - Loop *Orig; + Loop *OrigLoop; // Scev analysis to use. ScalarEvolution *SE; // Loop Info. @@ -131,10 +156,21 @@ private: unsigned VF; // The builder that we use - IRBuilder<> *Builder; + IRBuilder<> Builder; // --- Vectorization state --- + /// Middle Block between the vector and the scalar. + BasicBlock *LoopMiddleBlock; + ///The ExitBlock of the scalar loop. + BasicBlock *LoopExitBlock; + ///The vector loop body. + BasicBlock *LoopVectorBody; + ///The scalar loop body. + BasicBlock *LoopScalarBody; + ///The first bypass block. + BasicBlock *LoopBypassBlock; + /// The new Induction variable which was added to the new block. PHINode *Induction; /// The induction variable of the old basic block. @@ -143,14 +179,55 @@ private: ValueMap WidenMap; }; -/// Perform the vectorization legality check. This class does not look at the -/// profitability of vectorization, only the legality. At the moment the checks -/// are very simple and focus on single basic block loops with a constant -/// iteration count and no reductions. +/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and +/// to what vectorization factor. +/// This class does not look at the profitability of vectorization, only the +/// legality. This class has two main kinds of checks: +/// * Memory checks - The code in canVectorizeMemory checks if vectorization +/// will change the order of memory accesses in a way that will change the +/// correctness of the program. +/// * Scalars checks - The code in canVectorizeBlock checks for a number +/// of different conditions, such as the availability of a single induction +/// variable, that all types are supported and vectorize-able, etc. +/// This code reflects the capabilities of SingleBlockLoopVectorizer. +/// This class is also used by SingleBlockLoopVectorizer for identifying +/// induction variable and the different reduction variables. class LoopVectorizationLegality { public: LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): - TheLoop(Lp), SE(Se), DL(Dl) { } + TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } + + /// This represents the kinds of reductions that we support. + /// We use the enum values to hold the 'identity' value for + /// each operand. This value does not change the result if applied. + enum ReductionKind { + NoReduction = -1, /// Not a reduction. + IntegerAdd = 0, /// Sum of numbers. + IntegerMult = 1 /// Product of numbers. + }; + + /// This POD struct holds information about reduction variables. + struct ReductionDescriptor { + // Default C'tor + ReductionDescriptor(): + StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} + + // C'tor. + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): + StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + + // The starting value of the reduction. + // It does not have to be zero! + Value *StartValue; + // The instruction who's value is used outside the loop. + Instruction *LoopExitInstr; + // The kind of the reduction. + ReductionKind Kind; + }; + + /// ReductionList contains the reduction descriptors for all + /// of the reductions that were found in the loop. + typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; /// Returns the maximum vectorization factor that we *can* use to vectorize /// this loop. This does not mean that it is profitable to vectorize this @@ -158,15 +235,42 @@ public: /// can vectorize to any SIMD width below this number. unsigned getLoopMaxVF(); + /// Returns the Induction variable. + PHINode *getInduction() {return Induction;} + + /// Returns the reduction variables found in the loop. + ReductionList *getReductionVars() { return &Reductions; } + + /// Check if the pointer returned by this GEP is consecutive + /// when the index is vectorized. This happens when the last + /// index of the GEP is consecutive, like the induction variable. + /// This check allows us to vectorize A[idx] into a wide load/store. + bool isConsecutiveGep(Value *Ptr); + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count /// and we only need to check individual instructions. bool canVectorizeBlock(BasicBlock &BB); + /// When we vectorize loops we may change the order in which + /// we read and write from memory. This method checks if it is + /// legal to vectorize the code, considering only memory constrains. + /// Returns true if BB is vectorizable + bool canVectorizeMemory(BasicBlock &BB); + // Check if a pointer value is known to be disjoint. // Example: Alloca, Global, NoAlias. - bool isidentifiedSafeObject(Value* Val); + bool isIdentifiedSafeObject(Value* Val); + + /// Returns True, if 'Phi' is the kind of reduction variable for type + /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. + bool AddReductionVar(PHINode *Phi, ReductionKind Kind); + /// Returns true if the instruction I can be a reduction variable of type + /// 'Kind'. + bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns True, if 'Phi' is an induction variable. + bool isInductionVariable(PHINode *Phi); /// The loop that we evaluate. Loop *TheLoop; @@ -174,6 +278,16 @@ private: ScalarEvolution *SE; /// DataLayout analysis. DataLayout *DL; + + // --- vectorization state --- // + + /// Holds the induction variable. + PHINode *Induction; + /// Holds the reduction variables. + ReductionList Reductions; + /// Allowed outside users. This holds the reduction + /// vars which can be accessed from outside the loop. + SmallPtrSet<Value*, 4> AllowedExit; }; struct LoopVectorize : public LoopPass { @@ -188,7 +302,7 @@ struct LoopVectorize : public LoopPass { LoopInfo *LI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { - // Only vectorize innermost loops. + // We only vectorize innermost loops. if (!L->empty()) return false; @@ -203,7 +317,8 @@ struct LoopVectorize : public LoopPass { LoopVectorizationLegality LVL(L, SE, DL); unsigned MaxVF = LVL.getLoopMaxVF(); - // Check that we can vectorize using the chosen vectorization width. + // Check that we can vectorize this loop using the chosen vectorization + // width. if (MaxVF < DefaultVectorizationFactor) { DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); return false; @@ -211,9 +326,9 @@ struct LoopVectorize : public LoopPass { DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); - // If we decided that is is *legal* to vectorizer the loop. Do it. + // If we decided that it is *legal* to vectorizer the loop then do it. SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); - LB.vectorize(); + LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); return true; @@ -222,6 +337,7 @@ struct LoopVectorize : public LoopPass { virtual void getAnalysisUsage(AnalysisUsage &AU) const { LoopPass::getAnalysisUsage(AU); AU.addRequiredID(LoopSimplifyID); + AU.addRequiredID(LCSSAID); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); } @@ -241,9 +357,9 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); // Insert the value into a new vector. - Value *SingleElem = Builder->CreateInsertElement(UndefVal, V, Zero); + Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. - Value *Shuf = Builder->CreateShuffleVector(SingleElem, UndefVal, Zeros, + Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, "broadcast"); // We are accessing the induction variable. Make sure to promote the // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. @@ -269,11 +385,11 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); assert(Cv->getType() == Val->getType() && "Invalid consecutive vec"); - return Builder->CreateAdd(Val, Cv, "induction"); + return Builder.CreateAdd(Val, Cv, "induction"); } - -bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { +bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { + GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); if (!Gep) return false; @@ -282,7 +398,7 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { // Check that all of the gep indices are uniform except for the last. for (unsigned i = 0; i < NumOperands - 1; ++i) - if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), Orig)) + if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return false; // We can emit wide load/stores only of the last index is the induction @@ -301,17 +417,29 @@ bool SingleBlockLoopVectorizer::isConsecutiveGep(GetElementPtrInst *Gep) { } Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { + assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. - ValueMap::iterator it = WidenMap.find(V); - if (it != WidenMap.end()) - return it->second; + Value *&MapEntry = WidenMap[V]; + if (MapEntry) + return MapEntry; // Broadcast V and save the value for future uses. Value *B = getBroadcastInstrs(V); - WidenMap[V] = B; + MapEntry = B; return B; } +Constant* +SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { + SmallVector<Constant*, 8> Indices; + // Create a vector of consecutive numbers from zero to VF. + for (unsigned i = 0; i < VF; ++i) + Indices.push_back(ConstantInt::get(ScalarTy, Val)); + + // Add the consecutive indices to the vector value. + return ConstantVector::get(Indices); +} + void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. @@ -354,7 +482,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { if (!IsVoidRetTy) VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF)); - // For each scalar that we create. + // For each scalar that we create: for (unsigned i = 0; i < VF; ++i) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) @@ -364,31 +492,31 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { Value *Op = Params[op]; // Param is a vector. Need to extract the right lane. if (Op->getType()->isVectorTy()) - Op = Builder->CreateExtractElement(Op, Builder->getInt32(i)); + Op = Builder.CreateExtractElement(Op, Builder.getInt32(i)); Cloned->setOperand(op, Op); } // Place the cloned scalar in the new loop. - Builder->Insert(Cloned); + Builder.Insert(Cloned); // If the original scalar returns a value we need to place it in a vector // so that future users will be able to use it. if (!IsVoidRetTy) - VecResults = Builder->CreateInsertElement(VecResults, Cloned, - Builder->getInt32(i)); + VecResults = Builder.CreateInsertElement(VecResults, Cloned, + Builder.getInt32(i)); } if (!IsVoidRetTy) WidenMap[Instr] = VecResults; } -void SingleBlockLoopVectorizer::createEmptyLoop() { +void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. + [ ] <-- vector loop bypass. / | / v | [ ] <-- vector pre header. @@ -405,7 +533,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { | | | v | [ ] \ -| [ ]_| <-- old scalar loop to handle remainder. () +| [ ]_| <-- old scalar loop to handle remainder. \ | \ v >[ ] <-- exit block. @@ -413,11 +541,11 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { */ // This is the original scalar-loop preheader. - BasicBlock *BypassBlock = Orig->getLoopPreheader(); - BasicBlock *ExitBlock = Orig->getExitBlock(); + BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); + BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - assert(Orig->getNumBlocks() == 1 && "Invalid loop"); + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); BasicBlock *VectorPH = @@ -427,30 +555,26 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); - - BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); - + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), + "scalar.preheader"); // Find the induction variable. - BasicBlock *OldBasicBlock = Orig->getHeader(); - OldInduction = dyn_cast<PHINode>(OldBasicBlock->begin()); + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); + OldInduction = Legal->getInduction(); assert(OldInduction && "We must have a single phi node."); Type *IdxTy = OldInduction->getType(); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. - Builder = new IRBuilder<>(VecBody); - Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Generate the induction variable. - Induction = Builder->CreatePHI(IdxTy, 2, "index"); + Induction = Builder.CreatePHI(IdxTy, 2, "index"); Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(Orig, Orig->getHeader()); + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); // Get the total trip count from the count by adding 1. @@ -496,12 +620,12 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { MiddleBlock->getTerminator()->eraseFromParent(); // Create i+1 and fill the PHINode. - Value *NextIdx = Builder->CreateAdd(Induction, Step, "index.next"); + Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); Induction->addIncoming(Zero, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder->CreateICmpEQ(NextIdx, CountRoundDown); - Builder->CreateCondBr(ICmp, MiddleBlock, VecBody); + Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); @@ -511,36 +635,68 @@ void SingleBlockLoopVectorizer::createEmptyLoop() { OldInduction->setIncomingValue(BlockIdx, CountRoundDown); // Get ready to start creating new instructions into the vectorized body. - Builder->SetInsertPoint(VecBody->getFirstInsertionPt()); + Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); // Register the new loop. Loop* Lp = new Loop(); - LPM->insertLoop(Lp, Orig->getParentLoop()); + LPM->insertLoop(Lp, OrigLoop->getParentLoop()); Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - Loop *ParentLoop = Orig->getParentLoop(); + Loop *ParentLoop = OrigLoop->getParentLoop(); if (ParentLoop) { ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); } + + // Save the state. + LoopMiddleBlock = MiddleBlock; + LoopExitBlock = ExitBlock; + LoopVectorBody = VecBody; + LoopScalarBody = OldBasicBlock; + LoopBypassBlock = BypassBlock; } -void SingleBlockLoopVectorizer::vectorizeLoop() { - BasicBlock &BB = *Orig->getHeader(); +void +SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + typedef SmallVector<PHINode*, 4> PhiVector; + BasicBlock &BB = *OrigLoop->getHeader(); + Constant *Zero = ConstantInt::get( + IntegerType::getInt32Ty(BB.getContext()), 0); + + // In order to support reduction variables we need to be able to vectorize + // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two + // steages. First, we create a new vector PHI node with no incoming edges. + // We use this value when we vectorize all of the instructions that use the + // PHI. Next, after all of the instructions in the block are complete we + // add the new incoming edges to the PHI. At this point all of the + // instructions in the basic block are vectorized, so we can use them to + // construct the PHI. + PhiVector PHIsToFix; // For each instruction in the old loop. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *Inst = it; switch (Inst->getOpcode()) { - case Instruction::PHI: case Instruction::Br: // Nothing to do for PHIs and BR, since we already took care of the // loop control flow instructions. continue; - + case Instruction::PHI:{ + PHINode* P = cast<PHINode>(Inst); + // Special handling for the induction var. + if (OldInduction == Inst) + continue; + // This is phase one of vectorizing PHIs. + // This has to be a reduction variable. + assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); + Type *VecTy = VectorType::get(Inst->getType(), VF); + WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); + PHIsToFix.push_back(P); + continue; + } case Instruction::Add: case Instruction::FAdd: case Instruction::Sub: @@ -564,15 +720,27 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder->CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); break; } case Instruction::Select: { // Widen selects. - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); - Value *C = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder->CreateSelect(A, B, C); + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + Value *Cond = Inst->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(Inst->getOperand(1)); + Value *Op1 = getVectorValue(Inst->getOperand(2)); + WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); break; } @@ -584,9 +752,9 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); if (FCmp) - WidenMap[Inst] = Builder->CreateFCmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); else - WidenMap[Inst] = Builder->CreateICmp(Cmp->getPredicate(), A, B); + WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); break; } @@ -598,19 +766,24 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { unsigned Alignment = SI->getAlignment(); GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); // This store does not use GEPs. - if (!isConsecutiveGep(Gep)) { + if (!Legal->isConsecutiveGep(Gep)) { scalarizeInstruction(Inst); break; } + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - unsigned NumOperands = Gep->getNumOperands(); - Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder->Insert(Gep2); - Ptr = Builder->CreateBitCast(Ptr, StTy->getPointerTo()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); Value *Val = getVectorValue(SI->getValueOperand()); - Builder->CreateStore(Val, Ptr)->setAlignment(Alignment); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); break; } case Instruction::Load: { @@ -622,18 +795,23 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); // We don't have a gep. Scalarize the load. - if (!isConsecutiveGep(Gep)) { + if (!Legal->isConsecutiveGep(Gep)) { scalarizeInstruction(Inst); break; } + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - unsigned NumOperands = Gep->getNumOperands(); - Gep2->setOperand(NumOperands - 1, Induction); - Ptr = Builder->Insert(Gep2); - Ptr = Builder->CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder->CreateLoad(Ptr); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder.CreateLoad(Ptr); LI->setAlignment(Alignment); // Use this vector value for all users of the load. WidenMap[Inst] = LI; @@ -655,7 +833,7 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { CastInst *CI = dyn_cast<CastInst>(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder->CreateCast(CI->getOpcode(), A, DestTy); + WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); break; } @@ -665,11 +843,122 @@ void SingleBlockLoopVectorizer::vectorizeLoop() { break; }// end of switch. }// end of for_each instr. + + // At this point every instruction in the original loop is widended to + // a vector form. We are almost done. Now, we need to fix the PHI nodes + // that we vectorized. The PHI nodes are currently empty because we did + // not want to introduce cycles. Notice that the remaining PHI nodes + // that we need to fix are reduction variables. + + // Create the 'reduced' values for each of the induction vars. + // The reduced values are the vector values that we scalarize and combine + // after the loop is finished. + for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); + it != e; ++it) { + PHINode *RdxPhi = *it; + PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); + assert(RdxPhi && "Unable to recover vectorized PHI"); + + // Find the reduction variable descriptor. + assert(Legal->getReductionVars()->count(RdxPhi) && + "Unable to find the reduction variable"); + LoopVectorizationLegality::ReductionDescriptor RdxDesc = + (*Legal->getReductionVars())[RdxPhi]; + + // We need to generate a reduction vector from the incoming scalar. + // To do so, we need to generate the 'identity' vector and overide + // one of the elements with the incoming scalar reduction. We need + // to do it in the vector-loop preheader. + Builder.SetInsertPoint(LoopBypassBlock->getTerminator()); + + // This is the vector-clone of the value that leaves the loop. + Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); + Type *VecTy = VectorExit->getType(); + + // Find the reduction identity variable. The value of the enum is the + // identity. Zero for addition. One for Multiplication. + unsigned IdentitySclr = RdxDesc.Kind; + Constant *Identity = getUniformVector(IdentitySclr, + VecTy->getScalarType()); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + Value *VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + + + // Fix the vector-loop phi. + // We created the induction variable so we know that the + // preheader is the first entry. + BasicBlock *VecPreheader = Induction->getIncomingBlock(0); + + // Reductions do not have to start at zero. They can start with + // any loop invariant values. + VecRdxPhi->addIncoming(VectorStart, VecPreheader); + unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + VecRdxPhi->addIncoming(Val, LoopVectorBody); + + // Before each round, move the insertion point right between + // the PHIs and the values we are going to write. + // This allows us to write both PHINodes and the extractelement + // instructions. + Builder.SetInsertPoint(LoopMiddleBlock->getFirstInsertionPt()); + + // This PHINode contains the vectorized reduction variable, or + // the initial value vector, if we bypass the vector loop. + PHINode *NewPhi = Builder.CreatePHI(VecTy, 2, "rdx.vec.exit.phi"); + NewPhi->addIncoming(VectorStart, LoopBypassBlock); + NewPhi->addIncoming(getVectorValue(RdxDesc.LoopExitInstr), LoopVectorBody); + + // Extract the first scalar. + Value *Scalar0 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); + // Extract and sum the remaining vector elements. + for (unsigned i=1; i < VF; ++i) { + Value *Scalar1 = + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + } else { + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + } + } + + // Now, we need to fix the users of the reduction variable + // inside and outside of the scalar remainder loop. + // We know that the loop is in LCSSA form. We need to update the + // PHI nodes in the exit blocks. + for (BasicBlock::iterator LEI = LoopExitBlock->begin(), + LEE = LoopExitBlock->end(); LEI != LEE; ++LEI) { + PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI); + if (!LCSSAPhi) continue; + + // All PHINodes need to have a single entry edge, or two if + // we already fixed them. + assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI"); + + // We found our reduction value exit-PHI. Update it with the + // incoming bypass edge. + if (LCSSAPhi->getIncomingValue(0) == RdxDesc.LoopExitInstr) { + // Add an edge coming from the bypass. + LCSSAPhi->addIncoming(Scalar0, LoopMiddleBlock); + break; + } + }// end of the LCSSA phi scan. + + // Fix the scalar loop reduction variable with the incoming reduction sum + // from the vector body and from the backedge value. + int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); + (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); + }// end of for each redux variable. } void SingleBlockLoopVectorizer::cleanup() { // The original basic block. - SE->forgetLoop(Orig); + SE->forgetLoop(OrigLoop); } unsigned LoopVectorizationLegality::getLoopMaxVF() { @@ -712,55 +1001,100 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() { } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { - // Holds the read and write pointers that we find. - typedef SmallVector<Value*, 10> ValueVector; - ValueVector Reads; - ValueVector Writes; - - unsigned NumPhis = 0; + // Scan the instructions in the block and look for hazards. for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { Instruction *I = it; PHINode *Phi = dyn_cast<PHINode>(I); if (Phi) { - NumPhis++; + // This should not happen because the loop should be normalized. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // We only look at integer phi nodes. if (!Phi->getType()->isIntegerTy()) { DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); return false; } - // If we found an induction variable. - if (NumPhis > 1) { - DEBUG(dbgs() << "LV: Found more than one PHI.\n"); - return false; + if (isInductionVariable(Phi)) { + if (Induction) { + DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + return false; + } + DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); + Induction = Phi; + continue; } - - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; + if (AddReductionVar(Phi, IntegerAdd)) { + DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); + continue; } - - // Check that the PHI is consecutive and starts at zero. - const SCEV *PhiScev = SE->getSCEV(Phi); - const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); - if (!AR) { - DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return false; + if (AddReductionVar(Phi, IntegerMult)) { + DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + continue; } - const SCEV *Step = AR->getStepRecurrence(*SE); - const SCEV *Start = AR->getStart(); + DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); + return false; + }// end of PHI handling - if (!Step->isOne() || !Start->isZero()) { - DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); - return false; - } + // We still don't handle functions. + CallInst *CI = dyn_cast<CallInst>(I); |