diff options
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 388 |
1 files changed, 332 insertions, 56 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f944d9b4fc..423c7a4911 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -18,10 +18,13 @@ // // This pass has three parts: // 1. The main loop pass that drives the different parts. -// 2. LoopVectorizationLegality - A helper class that checks for the legality +// 2. LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. -// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// 3. SingleBlockLoopVectorizer - A unit that performs the actual // widening of instructions. +// 4. LoopVectorizationCostModel - A unit that checks for the profitability +// of vectorization. It decides on the optimal vector width, which +// can be one, if vectorization is not profitable. //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: @@ -51,13 +54,14 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -67,13 +71,14 @@ using namespace llvm; static cl::opt<unsigned> -DefaultVectorizationFactor("default-loop-vectorize-width", - cl::init(4), cl::Hidden, - cl::desc("Set the default loop vectorization width")); +VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, + cl::desc("Set the default vectorization width. Zero is autoselect.")); + namespace { -// Forward declaration. +// Forward declarations. class LoopVectorizationLegality; +class LoopVectorizationCostModel; /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). @@ -203,7 +208,10 @@ public: enum ReductionKind { NoReduction = -1, /// Not a reduction. IntegerAdd = 0, /// Sum of numbers. - IntegerMult = 1 /// Product of numbers. + IntegerMult = 1, /// Product of numbers. + IntegerOr = 2, /// Bitwise or logical OR of numbers. + IntegerAnd = 3, /// Bitwise or logical AND of numbers. + IntegerXor = 4 /// Bitwise or logical XOR of numbers. }; /// This POD struct holds information about reduction variables. @@ -229,11 +237,10 @@ public: /// of the reductions that were found in the loop. typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; - /// Returns the maximum vectorization factor that we *can* use to vectorize - /// this loop. This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. This may be a large number. We - /// can vectorize to any SIMD width below this number. - unsigned getLoopMaxVF(); + /// Returns true if it is legal to vectorize this loop. + /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + bool canVectorize(); /// Returns the Induction variable. PHINode *getInduction() {return Induction;} @@ -259,10 +266,6 @@ private: /// Returns true if BB is vectorizable bool canVectorizeMemory(BasicBlock &BB); - // Check if a pointer value is known to be disjoint. - // Example: Alloca, Global, NoAlias. - bool isIdentifiedSafeObject(Value* Val); - /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); @@ -290,6 +293,48 @@ private: SmallPtrSet<Value*, 4> AllowedExit; }; +/// LoopVectorizationCostModel - estimates the expected speedups due to +/// vectorization. +/// In many cases vectorization is not profitable. This can happen because +/// of a number of reasons. In this class we mainly attempt to predict +/// the expected speedup/slowdowns due to the supported instruction set. +/// We use the VectorTargetTransformInfo to query the different backends +/// for the cost of different operations. +class LoopVectorizationCostModel { +public: + /// C'tor. + LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, + LoopVectorizationLegality *Leg, + const VectorTargetTransformInfo *Vtti): + TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } + + /// Returns the most profitable vectorization factor for the loop that is + /// smaller or equal to the VF argument. This method checks every power + /// of two up to VF. + unsigned findBestVectorizationFactor(unsigned VF = 4); + +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. + const VectorTargetTransformInfo *VTTI; +}; + struct LoopVectorize : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -300,6 +345,7 @@ struct LoopVectorize : public LoopPass { ScalarEvolution *SE; DataLayout *DL; LoopInfo *LI; + TargetTransformInfo *TTI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -309,25 +355,42 @@ struct LoopVectorize : public LoopPass { SE = &getAnalysis<ScalarEvolution>(); DL = getAnalysisIfAvailable<DataLayout>(); LI = &getAnalysis<LoopInfo>(); + TTI = getAnalysisIfAvailable<TargetTransformInfo>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. LoopVectorizationLegality LVL(L, SE, DL); - unsigned MaxVF = LVL.getLoopMaxVF(); - - // Check that we can vectorize this loop using the chosen vectorization - // width. - if (MaxVF < DefaultVectorizationFactor) { - DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + // Select the preffered vectorization factor. + unsigned VF = 1; + if (VectorizationFactor == 0) { + const VectorTargetTransformInfo *VTTI = 0; + if (TTI) + VTTI = TTI->getVectorTargetTransformInfo(); + // Use the cost model. + LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + VF = CM.findBestVectorizationFactor(); + + if (VF == 1) { + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + return false; + } + + } else { + // Use the user command flag. + VF = VectorizationFactor; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); + SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -660,6 +723,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should be also be implemented in + // the cost-model. + // + //===------------------------------------------------===// typedef SmallVector<PHINode*, 4> PhiVector; BasicBlock &BB = *OrigLoop->getHeader(); Constant *Zero = ConstantInt::get( @@ -914,14 +984,28 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Extract the first scalar. Value *Scalar0 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and sum the remaining vector elements. + // Extract and reduce the remaining vector elements. for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); - } else { - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + switch (RdxDesc.Kind) { + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerAnd: + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1); + break; + default: + llvm_unreachable("Unknown reduction operation"); } } @@ -961,18 +1045,18 @@ void SingleBlockLoopVectorizer::cleanup() { SE->forgetLoop(OrigLoop); } -unsigned LoopVectorizationLegality::getLoopMaxVF() { +bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getLoopPreheader()) { assert(false && "No preheader!!"); DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); - return 1; + return false; } // We can only vectorize single basic block loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1) { DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); - return 1; + return false; } // We need to have a loop header. @@ -982,22 +1066,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() { // Go over each instruction and look at memory deps. if (!canVectorizeBlock(*BB)) { DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); - return 1; + return false; } // ScalarEvolution needs to be able to find the exit count. const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return 1; + return false; } DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return the - // maximum SIMD size. - return DefaultVectorizationFactor; + // which may limit our maximum vectorization factor, so just return true with + // no restrictions. + return true; } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { @@ -1032,7 +1116,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { continue; } if (AddReductionVar(Phi, IntegerMult)) { - DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } @@ -1178,7 +1274,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); return false; } @@ -1196,7 +1292,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); return false; } @@ -1213,19 +1309,6 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { return true; } -/// Checks if the value is a Global variable or if it is an Arguments -/// marked with the NoAlias attribute. -bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { - assert(Val && "Invalid value"); - if (isa<GlobalValue>(Val)) - return true; - if (isa<AllocaInst>(Val)) - return true; - if (Argument *A = dyn_cast<Argument>(Val)) - return A->hasNoAliasAttr(); - return false; -} - bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) @@ -1319,6 +1402,12 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, case Instruction::UDiv: case Instruction::SDiv: return Kind == IntegerMult; + case Instruction::And: + return Kind == IntegerAnd; + case Instruction::Or: + return Kind == IntegerOr; + case Instruction::Xor: + return Kind == IntegerXor; } } @@ -1340,6 +1429,193 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return true; } +unsigned +LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { + if (!VTTI) { + DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); + return 1; + } + + float Cost = expectedCost(1); + unsigned Width = 1; + DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + for (unsigned i=2; i <= VF; i*=2) { + // Notice that the vector loop needs to be executed less times, so + // we need to divide the cost of the vector loops by the width of + // the vector elements. + float VectorCost = expectedCost(i) / (float)i; + DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + (int)VectorCost << ".\n"); + if (VectorCost < Cost) { + Cost = VectorCost; + Width = i; + } + } + + DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + return Width; +} + +unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { + // We can only estimate the cost of single basic block loops. + assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); + + BasicBlock *BB = TheLoop->getHeader(); + unsigned Cost = 0; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Instruction *Inst = it; + unsigned C = getInstructionCost(Inst, VF); + Cost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << + " For instruction: "<< *Inst << "\n"); + } + + return Cost; +} + +unsigned +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { + assert(VTTI && "Invalid vector target transformation info"); + switch (I->getOpcode()) { + case Instruction::GetElementPtr: + return 0; + case Instruction::Br: { + return VTTI->getInstrCost(I->getOpcode()); + } + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *VTy = VectorType::get(I->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VTy); + } + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + Type *VTy = VectorType::get(I->getType(), VF); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VTy); + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF); + + // Scalarized stores. + if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { + unsigned Cost = 0; + if (VF != 1) { + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VTy); + // The cost of extracting from the value vector and pointer vector. + Cost += VF * (ExtCost * 2); + } + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + VTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); + return Cost; + } + + // Wide stores. + return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(), + SI->getPointerAddressSpace()); + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(I); + Type *VTy = VectorType::get(I->getType(), VF); + + // Scalarized loads. + if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { + unsigned Cost = 0; + if (VF != 1) { + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy); + + // The cost of inserting the loaded value into the result vector, and + // extracting from a vector of pointers. + Cost += VF * (InCost + ExCost); + } + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); + return Cost; + } + + // Wide loads. + return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF); + Type *DstTy = VectorType::get(I->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; + Type *Ty = I->getType(); + + if (!Ty->isVoidTy()) { + Type *VTy = VectorType::get(Ty, VF); + unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); + Cost += VF * (InsCost + ExtCost); + } + + /// We don't have any information on the scalar instruction, but maybe + /// the target has. + /// TODO: This may be a target-specific intrinsic. + /// Need to add API for that. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty); + + return Cost; + } + }// end of switch. +} + + } // namespace char LoopVectorize::ID = 0; |