aboutsummaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp388
1 files changed, 332 insertions, 56 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index f944d9b4fc..423c7a4911 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -18,10 +18,13 @@
//
// This pass has three parts:
// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A helper class that checks for the legality
+// 2. LoopVectorizationLegality - A unit that checks for the legality
// of the vectorization.
-// 3. SingleBlockLoopVectorizer - A helper class that performs the actual
+// 3. SingleBlockLoopVectorizer - A unit that performs the actual
// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
//===----------------------------------------------------------------------===//
//
// The reduction-variable vectorization is based on the paper:
@@ -51,13 +54,14 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Transforms/Scalar.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/TargetTransformInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -67,13 +71,14 @@
using namespace llvm;
static cl::opt<unsigned>
-DefaultVectorizationFactor("default-loop-vectorize-width",
- cl::init(4), cl::Hidden,
- cl::desc("Set the default loop vectorization width"));
+VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
+ cl::desc("Set the default vectorization width. Zero is autoselect."));
+
namespace {
-// Forward declaration.
+// Forward declarations.
class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
/// block to a specified vectorization factor (VF).
@@ -203,7 +208,10 @@ public:
enum ReductionKind {
NoReduction = -1, /// Not a reduction.
IntegerAdd = 0, /// Sum of numbers.
- IntegerMult = 1 /// Product of numbers.
+ IntegerMult = 1, /// Product of numbers.
+ IntegerOr = 2, /// Bitwise or logical OR of numbers.
+ IntegerAnd = 3, /// Bitwise or logical AND of numbers.
+ IntegerXor = 4 /// Bitwise or logical XOR of numbers.
};
/// This POD struct holds information about reduction variables.
@@ -229,11 +237,10 @@ public:
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
- /// Returns the maximum vectorization factor that we *can* use to vectorize
- /// this loop. This does not mean that it is profitable to vectorize this
- /// loop, only that it is legal to do so. This may be a large number. We
- /// can vectorize to any SIMD width below this number.
- unsigned getLoopMaxVF();
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ bool canVectorize();
/// Returns the Induction variable.
PHINode *getInduction() {return Induction;}
@@ -259,10 +266,6 @@ private:
/// Returns true if BB is vectorizable
bool canVectorizeMemory(BasicBlock &BB);
- // Check if a pointer value is known to be disjoint.
- // Example: Alloca, Global, NoAlias.
- bool isIdentifiedSafeObject(Value* Val);
-
/// Returns True, if 'Phi' is the kind of reduction variable for type
/// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
@@ -290,6 +293,48 @@ private:
SmallPtrSet<Value*, 4> AllowedExit;
};
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because
+/// of a number of reasons. In this class we mainly attempt to predict
+/// the expected speedup/slowdowns due to the supported instruction set.
+/// We use the VectorTargetTransformInfo to query the different backends
+/// for the cost of different operations.
+class LoopVectorizationCostModel {
+public:
+ /// C'tor.
+ LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
+ LoopVectorizationLegality *Leg,
+ const VectorTargetTransformInfo *Vtti):
+ TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
+
+ /// Returns the most profitable vectorization factor for the loop that is
+ /// smaller or equal to the VF argument. This method checks every power
+ /// of two up to VF.
+ unsigned findBestVectorizationFactor(unsigned VF = 4);
+
+private:
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ unsigned expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Scev analysis.
+ ScalarEvolution *SE;
+
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+ /// Vector target information.
+ const VectorTargetTransformInfo *VTTI;
+};
+
struct LoopVectorize : public LoopPass {
static char ID; // Pass identification, replacement for typeid
@@ -300,6 +345,7 @@ struct LoopVectorize : public LoopPass {
ScalarEvolution *SE;
DataLayout *DL;
LoopInfo *LI;
+ TargetTransformInfo *TTI;
virtual bool runOnLoop(Loop *L, LPPassManager &LPM) {
// We only vectorize innermost loops.
@@ -309,25 +355,42 @@ struct LoopVectorize : public LoopPass {
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
+ TTI = getAnalysisIfAvailable<TargetTransformInfo>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
LoopVectorizationLegality LVL(L, SE, DL);
- unsigned MaxVF = LVL.getLoopMaxVF();
-
- // Check that we can vectorize this loop using the chosen vectorization
- // width.
- if (MaxVF < DefaultVectorizationFactor) {
- DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n");
+ if (!LVL.canVectorize()) {
+ DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
}
- DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n");
+ // Select the preffered vectorization factor.
+ unsigned VF = 1;
+ if (VectorizationFactor == 0) {
+ const VectorTargetTransformInfo *VTTI = 0;
+ if (TTI)
+ VTTI = TTI->getVectorTargetTransformInfo();
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
+ VF = CM.findBestVectorizationFactor();
+
+ if (VF == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ return false;
+ }
+
+ } else {
+ // Use the user command flag.
+ VF = VectorizationFactor;
+ }
+
+ DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n");
// If we decided that it is *legal* to vectorizer the loop then do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor);
+ SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -660,6 +723,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal
void
SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+ //===------------------------------------------------===//
+ //
+ // Notice: any optimization or new instruction that go
+ // into the code below should be also be implemented in
+ // the cost-model.
+ //
+ //===------------------------------------------------===//
typedef SmallVector<PHINode*, 4> PhiVector;
BasicBlock &BB = *OrigLoop->getHeader();
Constant *Zero = ConstantInt::get(
@@ -914,14 +984,28 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Extract the first scalar.
Value *Scalar0 =
Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
- // Extract and sum the remaining vector elements.
+ // Extract and reduce the remaining vector elements.
for (unsigned i=1; i < VF; ++i) {
Value *Scalar1 =
Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
- if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) {
- Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
- } else {
- Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+ switch (RdxDesc.Kind) {
+ case LoopVectorizationLegality::IntegerAdd:
+ Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerMult:
+ Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerOr:
+ Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerAnd:
+ Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
+ break;
+ case LoopVectorizationLegality::IntegerXor:
+ Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
+ break;
+ default:
+ llvm_unreachable("Unknown reduction operation");
}
}
@@ -961,18 +1045,18 @@ void SingleBlockLoopVectorizer::cleanup() {
SE->forgetLoop(OrigLoop);
}
-unsigned LoopVectorizationLegality::getLoopMaxVF() {
+bool LoopVectorizationLegality::canVectorize() {
if (!TheLoop->getLoopPreheader()) {
assert(false && "No preheader!!");
DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
- return 1;
+ return false;
}
// We can only vectorize single basic block loops.
unsigned NumBlocks = TheLoop->getNumBlocks();
if (NumBlocks != 1) {
DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
- return 1;
+ return false;
}
// We need to have a loop header.
@@ -982,22 +1066,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() {
// Go over each instruction and look at memory deps.
if (!canVectorizeBlock(*BB)) {
DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
- return 1;
+ return false;
}
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
- return 1;
+ return false;
}
DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
- // which may limit our maximum vectorization factor, so just return the
- // maximum SIMD size.
- return DefaultVectorizationFactor;
+ // which may limit our maximum vectorization factor, so just return true with
+ // no restrictions.
+ return true;
}
bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
@@ -1032,7 +1116,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
continue;
}
if (AddReductionVar(Phi, IntegerMult)) {
- DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n");
+ DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerOr)) {
+ DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerAnd)) {
+ DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerXor)) {
+ DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
continue;
}
@@ -1178,7 +1274,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
GetUnderlyingObjects(*I, TempObjects, DL);
for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
it != e; ++it) {
- if (!isIdentifiedSafeObject(*it)) {
+ if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
return false;
}
@@ -1196,7 +1292,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
GetUnderlyingObjects(*I, TempObjects, DL);
for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end();
it != e; ++it) {
- if (!isIdentifiedSafeObject(*it)) {
+ if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
return false;
}
@@ -1213,19 +1309,6 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
return true;
}
-/// Checks if the value is a Global variable or if it is an Arguments
-/// marked with the NoAlias attribute.
-bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) {
- assert(Val && "Invalid value");
- if (isa<GlobalValue>(Val))
- return true;
- if (isa<AllocaInst>(Val))
- return true;
- if (Argument *A = dyn_cast<Argument>(Val))
- return A->hasNoAliasAttr();
- return false;
-}
-
bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
ReductionKind Kind) {
if (Phi->getNumIncomingValues() != 2)
@@ -1319,6 +1402,12 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I,
case Instruction::UDiv:
case Instruction::SDiv:
return Kind == IntegerMult;
+ case Instruction::And:
+ return Kind == IntegerAnd;
+ case Instruction::Or:
+ return Kind == IntegerOr;
+ case Instruction::Xor:
+ return Kind == IntegerXor;
}
}
@@ -1340,6 +1429,193 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
return true;
}
+unsigned
+LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
+ if (!VTTI) {
+ DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n");
+ return 1;
+ }
+
+ float Cost = expectedCost(1);
+ unsigned Width = 1;
+ DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n");
+ for (unsigned i=2; i <= VF; i*=2) {
+ // Notice that the vector loop needs to be executed less times, so
+ // we need to divide the cost of the vector loops by the width of
+ // the vector elements.
+ float VectorCost = expectedCost(i) / (float)i;
+ DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " <<
+ (int)VectorCost << ".\n");
+ if (VectorCost < Cost) {
+ Cost = VectorCost;
+ Width = i;
+ }
+ }
+
+ DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n");
+ return Width;
+}
+
+unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
+ // We can only estimate the cost of single basic block loops.
+ assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
+
+ BasicBlock *BB = TheLoop->getHeader();
+ unsigned Cost = 0;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ Instruction *Inst = it;
+ unsigned C = getInstructionCost(Inst, VF);
+ Cost += C;
+ DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
+ " For instruction: "<< *Inst << "\n");
+ }
+
+ return Cost;
+}
+
+unsigned
+LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
+ assert(VTTI && "Invalid vector target transformation info");
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr:
+ return 0;
+ case Instruction::Br: {
+ return VTTI->getInstrCost(I->getOpcode());
+ }
+ case Instruction::PHI:
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ Type *VTy = VectorType::get(I->getType(), VF);
+ return VTTI->getInstrCost(I->getOpcode(), VTy);
+ }
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ Type *VTy = VectorType::get(I->getType(), VF);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF);
+ return VTTI->getInstrCost(I->getOpcode(), VTy);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+
+ // Scalarized stores.
+ if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
+ unsigned Cost = 0;
+ if (VF != 1) {
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+ VTy);
+ // The cost of extracting from the value vector and pointer vector.
+ Cost += VF * (ExtCost * 2);
+ }
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+ VTy->getScalarType(),
+ SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ return Cost;
+ }
+
+ // Wide stores.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ }
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(I);
+ Type *VTy = VectorType::get(I->getType(), VF);
+
+ // Scalarized loads.
+ if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
+ unsigned Cost = 0;
+ if (VF != 1) {
+ unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+ unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy);
+
+ // The cost of inserting the loaded value into the result vector, and
+ // extracting from a vector of pointers.
+ Cost += VF * (InCost + ExCost);
+ }
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(),
+ LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ return Cost;
+ }
+
+ // Wide loads.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF);
+ Type *DstTy = VectorType::get(I->getType(), VF);
+ return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy);
+ }
+ default: {
+ // We are scalarizing the instruction. Return the cost of the scalar
+ // instruction, plus the cost of insert and extract into vector
+ // elements, times the vector width.
+ unsigned Cost = 0;
+ Type *Ty = I->getType();
+
+ if (!Ty->isVoidTy()) {
+ Type *VTy = VectorType::get(Ty, VF);
+ unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy);
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy);
+ Cost += VF * (InsCost + ExtCost);
+ }
+
+ /// We don't have any information on the scalar instruction, but maybe
+ /// the target has.
+ /// TODO: This may be a target-specific intrinsic.
+ /// Need to add API for that.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty);
+
+ return Cost;
+ }
+ }// end of switch.
+}
+
+
} // namespace
char LoopVectorize::ID = 0;