diff options
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 318 |
1 files changed, 233 insertions, 85 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 423c7a4911..892808760f 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -55,6 +55,7 @@ #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" @@ -74,6 +75,9 @@ static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Set the default vectorization width. Zero is autoselect.")); +/// We don't vectorize loops with a known constant trip count below this number. +const unsigned TinyTripCountThreshold = 16; + namespace { // Forward declarations. @@ -98,8 +102,9 @@ class SingleBlockLoopVectorizer { public: /// Ctor. SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - LPPassManager *Lpm, unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), LPM(Lpm), VF(VecWidth), + DominatorTree *dt, LPPassManager *Lpm, + unsigned VecWidth): + OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), Builder(Se->getContext()), Induction(0), OldInduction(0) { } // Perform the actual loop widening (vectorization). @@ -108,9 +113,9 @@ public: createEmptyLoop(Legal); /// Widen each instruction in the old loop to a new one in the new loop. /// Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); + vectorizeLoop(Legal); // register the new loop. - cleanup(); + updateAnalysis(); } private: @@ -119,7 +124,7 @@ private: /// Copy and widen the instructions from the old loop. void vectorizeLoop(LoopVectorizationLegality *Legal); /// Insert the new loop to the loop hierarchy and pass manager. - void cleanup(); + void updateAnalysis(); /// This instruction is un-vectorizable. Implement it as a sequence /// of scalars. @@ -155,6 +160,8 @@ private: ScalarEvolution *SE; // Loop Info. LoopInfo *LI; + // Dominator Tree. + DominatorTree *DT; // Loop Pass Manager; LPPassManager *LPM; // The vectorization factor to use. @@ -165,6 +172,10 @@ private: // --- Vectorization state --- + /// The vector-loop preheader. + BasicBlock *LoopVectorPreHeader; + /// The scalar-loop preheader. + BasicBlock *LoopScalarPreHeader; /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock; ///The ExitBlock of the scalar loop. @@ -203,15 +214,13 @@ public: TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } /// This represents the kinds of reductions that we support. - /// We use the enum values to hold the 'identity' value for - /// each operand. This value does not change the result if applied. enum ReductionKind { - NoReduction = -1, /// Not a reduction. - IntegerAdd = 0, /// Sum of numbers. - IntegerMult = 1, /// Product of numbers. - IntegerOr = 2, /// Bitwise or logical OR of numbers. - IntegerAnd = 3, /// Bitwise or logical AND of numbers. - IntegerXor = 4 /// Bitwise or logical XOR of numbers. + NoReduction, /// Not a reduction. + IntegerAdd, /// Sum of numbers. + IntegerMult, /// Product of numbers. + IntegerOr, /// Bitwise or logical OR of numbers. + IntegerAnd, /// Bitwise or logical AND of numbers. + IntegerXor /// Bitwise or logical XOR of numbers. }; /// This POD struct holds information about reduction variables. @@ -254,6 +263,9 @@ public: /// This check allows us to vectorize A[idx] into a wide load/store. bool isConsecutiveGep(Value *Ptr); + /// Returns true if this instruction will remain scalar after vectorization. + bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -291,6 +303,9 @@ private: /// Allowed outside users. This holds the reduction /// vars which can be accessed from outside the loop. SmallPtrSet<Value*, 4> AllowedExit; + /// This set holds the variables which are known to be uniform after + /// vectorization. + SmallPtrSet<Instruction*, 4> Uniforms; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -311,7 +326,7 @@ public: /// Returns the most profitable vectorization factor for the loop that is /// smaller or equal to the VF argument. This method checks every power /// of two up to VF. - unsigned findBestVectorizationFactor(unsigned VF = 4); + unsigned findBestVectorizationFactor(unsigned VF = 8); private: /// Returns the expected execution cost. The unit of the cost does @@ -324,6 +339,11 @@ private: /// width. Vector width of one means scalar. unsigned getInstructionCost(Instruction *I, unsigned VF); + /// A helper function for converting Scalar types to vector types. + /// If the incoming type is void, we return void. If the VF is 1, we return + /// the scalar type. + static Type* ToVectorTy(Type *Scalar, unsigned VF); + /// The loop that we evaluate. Loop *TheLoop; /// Scev analysis. @@ -346,6 +366,7 @@ struct LoopVectorize : public LoopPass { DataLayout *DL; LoopInfo *LI; TargetTransformInfo *TTI; + DominatorTree *DT; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -356,6 +377,7 @@ struct LoopVectorize : public LoopPass { DL = getAnalysisIfAvailable<DataLayout>(); LI = &getAnalysis<LoopInfo>(); TTI = getAnalysisIfAvailable<TargetTransformInfo>(); + DT = &getAnalysis<DominatorTree>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -387,10 +409,12 @@ struct LoopVectorize : public LoopPass { VF = VectorizationFactor; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n"); + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< + L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< + "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF); + SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -403,6 +427,9 @@ struct LoopVectorize : public LoopPass { AU.addRequiredID(LCSSAID); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); + AU.addRequired<DominatorTree>(); + AU.addPreserved<LoopInfo>(); + AU.addPreserved<DominatorTree>(); } }; @@ -497,7 +524,7 @@ SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { SmallVector<Constant*, 8> Indices; // Create a vector of consecutive numbers from zero to VF. for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(ScalarTy, Val)); + Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); // Add the consecutive indices to the vector value. return ConstantVector::get(Indices); @@ -573,7 +600,8 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } -void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { +void +SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -608,6 +636,10 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); + // The loop index does not have to start at Zero. It starts with this value. + OldInduction = Legal->getInduction(); + Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); + assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); @@ -623,7 +655,6 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal "scalar.preheader"); // Find the induction variable. BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - OldInduction = Legal->getInduction(); assert(OldInduction && "We must have a single phi node."); Type *IdxTy = OldInduction->getType(); @@ -633,7 +664,6 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Generate the induction variable. Induction = Builder.CreatePHI(IdxTy, 2, "index"); - Constant *Zero = ConstantInt::get(IdxTy, 0); Constant *Step = ConstantInt::get(IdxTy, VF); // Find the loop boundaries. @@ -657,15 +687,22 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Count holds the overall loop count (N). Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + + // Add the start index to the loop count to get the new end index. + Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); + // Now we need to generate the expression for N - (N % VF), which is // the part that the vectorized body will execute. Constant *CIVF = ConstantInt::get(IdxTy, VF); Value *R = BinaryOperator::CreateURem(Count, CIVF, "n.mod.vf", Loc); Value *CountRoundDown = BinaryOperator::CreateSub(Count, R, "n.vec", Loc); + Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, + "end.idx.rnd.down", Loc); // Now, compare the new count to zero. If it is zero, jump to the scalar part. Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, - CountRoundDown, ConstantInt::getNullValue(IdxTy), + IdxEndRoundDown, + StartIdx, "cmp.zero", Loc); BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); // Remove the old terminator. @@ -674,8 +711,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. - Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, - CountRoundDown, "cmp.n", + Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, + IdxEndRoundDown, "cmp.n", MiddleBlock->getTerminator()); BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); @@ -684,10 +721,10 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Create i+1 and fill the PHINode. Value *NextIdx = Builder.CreateAdd(Induction, Step, "index.next"); - Induction->addIncoming(Zero, VectorPH); + Induction->addIncoming(StartIdx, VectorPH); Induction->addIncoming(NextIdx, VecBody); // Create the compare. - Value *ICmp = Builder.CreateICmpEQ(NextIdx, CountRoundDown); + Value *ICmp = Builder.CreateICmpEQ(NextIdx, IdxEndRoundDown); Builder.CreateCondBr(ICmp, MiddleBlock, VecBody); // Now we have two terminators. Remove the old one from the block. @@ -695,7 +732,7 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal // Fix the scalar body iteration count. unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); - OldInduction->setIncomingValue(BlockIdx, CountRoundDown); + OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown); // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); @@ -714,6 +751,8 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal } // Save the state. + LoopVectorPreHeader = VectorPH; + LoopScalarPreHeader = ScalarPH; LoopMiddleBlock = MiddleBlock; LoopExitBlock = ExitBlock; LoopVectorBody = VecBody; @@ -721,6 +760,27 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal LoopBypassBlock = BypassBlock; } +/// This function returns the identity element (or neutral element) for +/// the operation K. +static unsigned +getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { + switch (K) { + case LoopVectorizationLegality::IntegerXor: + case LoopVectorizationLegality::IntegerAdd: + case LoopVectorizationLegality::IntegerOr: + // Adding, Xoring, Oring zero to a number does not change it. + return 0; + case LoopVectorizationLegality::IntegerMult: + // Multiplying a number by 1 does not change it. + return 1; + case LoopVectorizationLegality::IntegerAnd: + // AND-ing a number with an all-1 value does not change it. + return -1; + default: + llvm_unreachable("Unknown reduction kind"); + } +} + void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -789,8 +849,19 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); Value *A = getVectorValue(Inst->getOperand(0)); Value *B = getVectorValue(Inst->getOperand(1)); + // Use this vector value for all users of the original instruction. - WidenMap[Inst] = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[Inst] = V; + + // Update the NSW, NUW and Exact flags. + BinaryOperator *VecOp = cast<BinaryOperator>(V); + if (isa<OverflowingBinaryOperator>(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + } + if (isa<PossiblyExactOperator>(VecOp)) + VecOp->setIsExact(BinOp->isExact()); break; } case Instruction::Select: { @@ -844,8 +915,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // The last index does not have to be the induction. It can be // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); @@ -874,7 +945,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // consecutive and be a function of the index. For example A[I+1]; unsigned NumOperands = Gep->getNumOperands(); Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Builder.getInt32(0)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); // Create the new GEP with the new induction variable. GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); @@ -945,10 +1016,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { Value *VectorExit = getVectorValue(RdxDesc.LoopExitInstr); Type *VecTy = VectorExit->getType(); - // Find the reduction identity variable. The value of the enum is the - // identity. Zero for addition. One for Multiplication. - unsigned IdentitySclr = RdxDesc.Kind; - Constant *Identity = getUniformVector(IdentitySclr, + // Find the reduction identity variable. Zero for addition, or, xor, + // one for multiplication, -1 for And. + Constant *Identity = getUniformVector(getReductionIdentity(RdxDesc.Kind), VecTy->getScalarType()); // This vector is the Identity vector where the first element is the @@ -1040,9 +1110,22 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { }// end of for each redux variable. } -void SingleBlockLoopVectorizer::cleanup() { +void SingleBlockLoopVectorizer::updateAnalysis() { // The original basic block. SE->forgetLoop(OrigLoop); + + // Update the dominator tree information. + assert(DT->properlyDominates(LoopBypassBlock, LoopExitBlock) && + "Entry does not dominate exit."); + + DT->addNewBlock(LoopVectorPreHeader, LoopBypassBlock); + DT->addNewBlock(LoopVectorBody, LoopVectorPreHeader); + DT->addNewBlock(LoopMiddleBlock, LoopBypassBlock); + DT->addNewBlock(LoopScalarPreHeader, LoopMiddleBlock); + DT->changeImmediateDominator(LoopScalarBody, LoopScalarPreHeader); + DT->changeImmediateDominator(LoopExitBlock, LoopMiddleBlock); + + DEBUG(DT->verifyAnalysis()); } bool LoopVectorizationLegality::canVectorize() { @@ -1076,6 +1159,14 @@ bool LoopVectorizationLegality::canVectorize() { return false; } + // Do not loop-vectorize loops with a tiny trip count. + unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); + if (TC > 0u && TC < TinyTripCountThreshold) { + DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << + "This loop is not worth vectorizing.\n"); + return false; + } + DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis @@ -1139,8 +1230,7 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { // We still don't handle functions. CallInst *CI = dyn_cast<CallInst>(I); if (CI) { - DEBUG(dbgs() << "LV: Found a call site:"<< - CI->getCalledFunction()->getName() << "\n"); + DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } @@ -1172,9 +1262,40 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { return false; } - // If the memory dependencies do not prevent us from - // vectorizing, then vectorize. - return canVectorizeMemory(BB); + // Don't vectorize if the memory dependencies do not allow vectorization. + if (!canVectorizeMemory(BB)) + return false; + + // We now know that the loop is vectorizable! + // Collect variables that will remain uniform after vectorization. + std::vector<Value*> Worklist; + + // Start with the conditional branch and walk up the block. + Worklist.push_back(BB.getTerminator()->getOperand(0)); + + while (Worklist.size()) { + Instruction *I = dyn_cast<Instruction>(Worklist.back()); + Worklist.pop_back(); + // Look at instructions inside this block. + if (!I) continue; + if (I->getParent() != &BB) continue; + + // Stop when reaching PHI nodes. + if (isa<PHINode>(I)) { + assert(I == Induction && "Found a uniform PHI that is not the induction"); + break; + } + + // This is a known uniform. + Uniforms.insert(I); + + // Insert all operands. + for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { + Worklist.push_back(I->getOperand(i)); + } + } + + return true; } bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { @@ -1262,6 +1383,13 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { Reads.push_back(Ptr); } + // If we write (or read-write) to a single destination and there are no + // other reads in this loop then is it safe to vectorize. + if (ReadWrites.size() == 1 && Reads.size() == 0) { + DEBUG(dbgs() << "LV: Found a write-only loop!\n"); + return true; + } + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. @@ -1420,10 +1548,9 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return false; } const SCEV *Step = AR->getStepRecurrence(*SE); - const SCEV *Start = AR->getStart(); - if (!Step->isOne() || !Start->isZero()) { - DEBUG(dbgs() << "LV: PHI does not start at zero or steps by one.\n"); + if (!Step->isOne()) { + DEBUG(dbgs() << "LV: PHI stride does not equal one.\n"); return false; } return true; @@ -1478,11 +1605,25 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { assert(VTTI && "Invalid vector target transformation info"); + + // If we know that this instruction will remain uniform, check the cost of + // the scalar version. + if (Legal->isUniformAfterVectorization(I)) + VF = 1; + + Type *RetTy = I->getType(); + Type *VectorTy = ToVectorTy(RetTy, VF); + + + // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector geps. return 0; case Instruction::Br: { - return VTTI->getInstrCost(I->getOpcode()); + return VTTI->getCFInstrCost(I->getOpcode()); } case Instruction::PHI: return 0; @@ -1504,74 +1645,76 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::And: case Instruction::Or: case Instruction::Xor: { - Type *VTy = VectorType::get(I->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VTy); + return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); - Type *VTy = VectorType::get(I->getType(), VF); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); if (ScalarCond) CondTy = VectorType::get(CondTy, VF); - return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); } case Instruction::ICmp: case Instruction::FCmp: { - Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), VTy); + Type *ValTy = I->getOperand(0)->getType(); + VectorTy = ToVectorTy(ValTy, VF); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); } case Instruction::Store: { StoreInst *SI = cast<StoreInst>(I); - Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Type *ValTy = SI->getValueOperand()->getType(); + VectorTy = ToVectorTy(ValTy, VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + SI->getAlignment(), SI->getPointerAddressSpace()); // Scalarized stores. if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { unsigned Cost = 0; - if (VF != 1) { - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - VTy); - // The cost of extracting from the value vector and pointer vector. - Cost += VF * (ExtCost * 2); - } + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + ValTy); + // The cost of extracting from the value vector. + Cost += VF * (ExtCost); // The cost of the scalar stores. Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - VTy->getScalarType(), + ValTy->getScalarType(), SI->getAlignment(), SI->getPointerAddressSpace()); return Cost; } // Wide stores. - return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(), + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), SI->getPointerAddressSpace()); } case Instruction::Load: { LoadInst *LI = cast<LoadInst>(I); - Type *VTy = VectorType::get(I->getType(), VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + LI->getAlignment(), + LI->getPointerAddressSpace()); // Scalarized loads. if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { unsigned Cost = 0; - if (VF != 1) { - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); - unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy); - - // The cost of inserting the loaded value into the result vector, and - // extracting from a vector of pointers. - Cost += VF * (InCost + ExCost); - } + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); + // The cost of inserting the loaded value into the result vector. + Cost += VF * (InCost); // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(), + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + RetTy->getScalarType(), LI->getAlignment(), LI->getPointerAddressSpace()); return Cost; } // Wide loads. - return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(), + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), LI->getPointerAddressSpace()); } case Instruction::ZExt: @@ -1586,35 +1729,40 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF); - Type *DstTy = VectorType::get(I->getType(), VF); - return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy); + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } default: { // We are scalarizing the instruction. Return the cost of the scalar // instruction, plus the cost of insert and extract into vector // elements, times the vector width. unsigned Cost = 0; - Type *Ty = I->getType(); - if (!Ty->isVoidTy()) { - Type *VTy = VectorType::get(Ty, VF); - unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); - Cost += VF * (InsCost + ExtCost); - } + bool IsVoid = RetTy->isVoidTy(); - /// We don't have any information on the scalar instruction, but maybe - /// the target has. - /// TODO: This may be a target-specific intrinsic. - /// Need to add API for that. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty); + unsigned InsCost = (IsVoid ? 0 : + VTTI->getInstrCost(Instruction::InsertElement, + VectorTy)); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VectorTy); + + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + + // The cost of executing VF copies of the scalar instruction. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); return Cost; } }// end of switch. } +Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { + if (Scalar->isVoidTy() || VF == 1) + return Scalar; + return VectorType::get(Scalar, VF); +} } // namespace |