aboutsummaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
authorMatt Beaumont-Gay <matthewbg@google.com>2012-12-14 17:55:15 +0000
committerMatt Beaumont-Gay <matthewbg@google.com>2012-12-14 17:55:15 +0000
commit6aed25d93d1cfcde5809a73ffa7dc1b0d6396f66 (patch)
tree57e2fdf1caf960d8d878e0289f32af6759832b49 /lib/Transforms/Vectorize/LoopVectorize.cpp
parent7139cfb19b1cc28dfd5e274c07ec68835bc6d6d6 (diff)
parent1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410 (diff)
Updating branches/google/stable to r169803
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/stable@170212 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp2277
1 files changed, 1304 insertions, 973 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 892808760f..feeececedb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6,355 +6,50 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-//
-// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
-//
-// The loop vectorizer combines consecutive loop iteration into a single
-// 'wide' iteration. After this transformation the index is incremented
-// by the SIMD vector width, and not by one.
-//
-// This pass has three parts:
-// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A unit that checks for the legality
-// of the vectorization.
-// 3. SingleBlockLoopVectorizer - A unit that performs the actual
-// widening of instructions.
-// 4. LoopVectorizationCostModel - A unit that checks for the profitability
-// of vectorization. It decides on the optimal vector width, which
-// can be one, if vectorization is not profitable.
-//===----------------------------------------------------------------------===//
-//
-// The reduction-variable vectorization is based on the paper:
-// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
-//
-// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
-//
-// Other ideas/concepts are from:
-// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
-//
-//===----------------------------------------------------------------------===//
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Value.h"
-#include "llvm/Function.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/SmallVector.h"
+#include "LoopVectorize.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-using namespace llvm;
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Type.h"
+#include "llvm/Value.h"
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
- cl::desc("Set the default vectorization width. Zero is autoselect."));
+ cl::desc("Sets the SIMD width. Zero is autoselect."));
-/// We don't vectorize loops with a known constant trip count below this number.
-const unsigned TinyTripCountThreshold = 16;
+static cl::opt<bool>
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+ cl::desc("Enable if-conversion during vectorization."));
namespace {
-// Forward declarations.
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-
-/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
-/// block to a specified vectorization factor (VF).
-/// This class performs the widening of scalars into vectors, or multiple
-/// scalars. This class also implements the following features:
-/// * It inserts an epilogue loop for handling loops that don't have iteration
-/// counts that are known to be a multiple of the vectorization factor.
-/// * It handles the code generation for reduction variables.
-/// * Scalarization (implementation using scalars) of un-vectorizable
-/// instructions.
-/// SingleBlockLoopVectorizer does not perform any vectorization-legality
-/// checks, and relies on the caller to check for the different legality
-/// aspects. The SingleBlockLoopVectorizer relies on the
-/// LoopVectorizationLegality class to provide information about the induction
-/// and reduction variables that were found to a given vectorization factor.
-class SingleBlockLoopVectorizer {
-public:
- /// Ctor.
- SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
- DominatorTree *dt, LPPassManager *Lpm,
- unsigned VecWidth):
- OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth),
- Builder(Se->getContext()), Induction(0), OldInduction(0) { }
-
- // Perform the actual loop widening (vectorization).
- void vectorize(LoopVectorizationLegality *Legal) {
- ///Create a new empty loop. Unlink the old loop and connect the new one.
- createEmptyLoop(Legal);
- /// Widen each instruction in the old loop to a new one in the new loop.
- /// Use the Legality module to find the induction and reduction variables.
- vectorizeLoop(Legal);
- // register the new loop.
- updateAnalysis();
- }
-
-private:
- /// Create an empty loop, based on the loop ranges of the old loop.
- void createEmptyLoop(LoopVectorizationLegality *Legal);
- /// Copy and widen the instructions from the old loop.
- void vectorizeLoop(LoopVectorizationLegality *Legal);
- /// Insert the new loop to the loop hierarchy and pass manager.
- void updateAnalysis();
-
- /// This instruction is un-vectorizable. Implement it as a sequence
- /// of scalars.
- void scalarizeInstruction(Instruction *Instr);
-
- /// Create a broadcast instruction. This method generates a broadcast
- /// instruction (shuffle) for loop invariant values and for the induction
- /// value. If this is the induction variable then we extend it to N, N+1, ...
- /// this is needed because each iteration in the loop corresponds to a SIMD
- /// element.
- Value *getBroadcastInstrs(Value *V);
-
- /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
- /// for each element in the vector. Starting from zero.
- Value *getConsecutiveVector(Value* Val);
-
- /// When we go over instructions in the basic block we rely on previous
- /// values within the current basic block or on loop invariant values.
- /// When we widen (vectorize) values we place them in the map. If the values
- /// are not within the map, they have to be loop invariant, so we simply
- /// broadcast them into a vector.
- Value *getVectorValue(Value *V);
-
- /// Get a uniform vector of constant integers. We use this to get
- /// vectors of ones and zeros for the reduction code.
- Constant* getUniformVector(unsigned Val, Type* ScalarTy);
-
- typedef DenseMap<Value*, Value*> ValueMap;
-
- /// The original loop.
- Loop *OrigLoop;
- // Scev analysis to use.
- ScalarEvolution *SE;
- // Loop Info.
- LoopInfo *LI;
- // Dominator Tree.
- DominatorTree *DT;
- // Loop Pass Manager;
- LPPassManager *LPM;
- // The vectorization factor to use.
- unsigned VF;
-
- // The builder that we use
- IRBuilder<> Builder;
-
- // --- Vectorization state ---
-
- /// The vector-loop preheader.
- BasicBlock *LoopVectorPreHeader;
- /// The scalar-loop preheader.
- BasicBlock *LoopScalarPreHeader;
- /// Middle Block between the vector and the scalar.
- BasicBlock *LoopMiddleBlock;
- ///The ExitBlock of the scalar loop.
- BasicBlock *LoopExitBlock;
- ///The vector loop body.
- BasicBlock *LoopVectorBody;
- ///The scalar loop body.
- BasicBlock *LoopScalarBody;
- ///The first bypass block.
- BasicBlock *LoopBypassBlock;
-
- /// The new Induction variable which was added to the new block.
- PHINode *Induction;
- /// The induction variable of the old basic block.
- PHINode *OldInduction;
- // Maps scalars to widened vectors.
- ValueMap WidenMap;
-};
-
-/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
-/// to what vectorization factor.
-/// This class does not look at the profitability of vectorization, only the
-/// legality. This class has two main kinds of checks:
-/// * Memory checks - The code in canVectorizeMemory checks if vectorization
-/// will change the order of memory accesses in a way that will change the
-/// correctness of the program.
-/// * Scalars checks - The code in canVectorizeBlock checks for a number
-/// of different conditions, such as the availability of a single induction
-/// variable, that all types are supported and vectorize-able, etc.
-/// This code reflects the capabilities of SingleBlockLoopVectorizer.
-/// This class is also used by SingleBlockLoopVectorizer for identifying
-/// induction variable and the different reduction variables.
-class LoopVectorizationLegality {
-public:
- LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
- TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
-
- /// This represents the kinds of reductions that we support.
- enum ReductionKind {
- NoReduction, /// Not a reduction.
- IntegerAdd, /// Sum of numbers.
- IntegerMult, /// Product of numbers.
- IntegerOr, /// Bitwise or logical OR of numbers.
- IntegerAnd, /// Bitwise or logical AND of numbers.
- IntegerXor /// Bitwise or logical XOR of numbers.
- };
-
- /// This POD struct holds information about reduction variables.
- struct ReductionDescriptor {
- // Default C'tor
- ReductionDescriptor():
- StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
-
- // C'tor.
- ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
- StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
-
- // The starting value of the reduction.
- // It does not have to be zero!
- Value *StartValue;
- // The instruction who's value is used outside the loop.
- Instruction *LoopExitInstr;
- // The kind of the reduction.
- ReductionKind Kind;
- };
-
- /// ReductionList contains the reduction descriptors for all
- /// of the reductions that were found in the loop.
- typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
-
- /// Returns true if it is legal to vectorize this loop.
- /// This does not mean that it is profitable to vectorize this
- /// loop, only that it is legal to do so.
- bool canVectorize();
-
- /// Returns the Induction variable.
- PHINode *getInduction() {return Induction;}
-
- /// Returns the reduction variables found in the loop.
- ReductionList *getReductionVars() { return &Reductions; }
-
- /// Check if the pointer returned by this GEP is consecutive
- /// when the index is vectorized. This happens when the last
- /// index of the GEP is consecutive, like the induction variable.
- /// This check allows us to vectorize A[idx] into a wide load/store.
- bool isConsecutiveGep(Value *Ptr);
-
- /// Returns true if this instruction will remain scalar after vectorization.
- bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
-
-private:
- /// Check if a single basic block loop is vectorizable.
- /// At this point we know that this is a loop with a constant trip count
- /// and we only need to check individual instructions.
- bool canVectorizeBlock(BasicBlock &BB);
-
- /// When we vectorize loops we may change the order in which
- /// we read and write from memory. This method checks if it is
- /// legal to vectorize the code, considering only memory constrains.
- /// Returns true if BB is vectorizable
- bool canVectorizeMemory(BasicBlock &BB);
-
- /// Returns True, if 'Phi' is the kind of reduction variable for type
- /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
- bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
- /// Returns true if the instruction I can be a reduction variable of type
- /// 'Kind'.
- bool isReductionInstr(Instruction *I, ReductionKind Kind);
- /// Returns True, if 'Phi' is an induction variable.
- bool isInductionVariable(PHINode *Phi);
-
- /// The loop that we evaluate.
- Loop *TheLoop;
- /// Scev analysis.
- ScalarEvolution *SE;
- /// DataLayout analysis.
- DataLayout *DL;
-
- // --- vectorization state --- //
-
- /// Holds the induction variable.
- PHINode *Induction;
- /// Holds the reduction variables.
- ReductionList Reductions;
- /// Allowed outside users. This holds the reduction
- /// vars which can be accessed from outside the loop.
- SmallPtrSet<Value*, 4> AllowedExit;
- /// This set holds the variables which are known to be uniform after
- /// vectorization.
- SmallPtrSet<Instruction*, 4> Uniforms;
-};
-
-/// LoopVectorizationCostModel - estimates the expected speedups due to
-/// vectorization.
-/// In many cases vectorization is not profitable. This can happen because
-/// of a number of reasons. In this class we mainly attempt to predict
-/// the expected speedup/slowdowns due to the supported instruction set.
-/// We use the VectorTargetTransformInfo to query the different backends
-/// for the cost of different operations.
-class LoopVectorizationCostModel {
-public:
- /// C'tor.
- LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
- LoopVectorizationLegality *Leg,
- const VectorTargetTransformInfo *Vtti):
- TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
-
- /// Returns the most profitable vectorization factor for the loop that is
- /// smaller or equal to the VF argument. This method checks every power
- /// of two up to VF.
- unsigned findBestVectorizationFactor(unsigned VF = 8);
-
-private:
- /// Returns the expected execution cost. The unit of the cost does
- /// not matter because we use the 'cost' units to compare different
- /// vector widths. The cost that is returned is *not* normalized by
- /// the factor width.
- unsigned expectedCost(unsigned VF);
-
- /// Returns the execution time cost of an instruction for a given vector
- /// width. Vector width of one means scalar.
- unsigned getInstructionCost(Instruction *I, unsigned VF);
-
- /// A helper function for converting Scalar types to vector types.
- /// If the incoming type is void, we return void. If the VF is 1, we return
- /// the scalar type.
- static Type* ToVectorTy(Type *Scalar, unsigned VF);
-
- /// The loop that we evaluate.
- Loop *TheLoop;
- /// Scev analysis.
- ScalarEvolution *SE;
-
- /// Vectorization legality.
- LoopVectorizationLegality *Legal;
- /// Vector target information.
- const VectorTargetTransformInfo *VTTI;
-};
-
+/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
static char ID; // Pass identification, replacement for typeid
@@ -383,7 +78,7 @@ struct LoopVectorize : public LoopPass {
L->getHeader()->getParent()->getName() << "\"\n");
// Check if it is legal to vectorize the loop.
- LoopVectorizationLegality LVL(L, SE, DL);
+ LoopVectorizationLegality LVL(L, SE, DL, DT);
if (!LVL.canVectorize()) {
DEBUG(dbgs() << "LV: Not vectorizing.\n");
return false;
@@ -414,7 +109,7 @@ struct LoopVectorize : public LoopPass {
"\n");
// If we decided that it is *legal* to vectorizer the loop then do it.
- SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -434,15 +129,44 @@ struct LoopVectorize : public LoopPass {
};
-Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
- // Instructions that access the old induction variable
- // actually want to get the new one.
- if (V == OldInduction)
- V = Induction;
+}// namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
+//===----------------------------------------------------------------------===//
+
+void
+LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
+ Loop *Lp, Value *Ptr) {
+ const SCEV *Sc = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ assert(AR && "Invalid addrec expression");
+ const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+ const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ Pointers.push_back(Ptr);
+ Starts.push_back(AR->getStart());
+ Ends.push_back(ScEnd);
+}
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
// Create the types.
LLVMContext &C = V->getContext();
Type *VTy = VectorType::get(V->getType(), VF);
Type *I32 = IntegerType::getInt32Ty(C);
+
+ // Save the current insertion location.
+ Instruction *Loc = Builder.GetInsertPoint();
+
+ // We need to place the broadcast of invariant variables outside the loop.
+ Instruction *Instr = dyn_cast<Instruction>(V);
+ bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+ bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
+
+ // Place the code for broadcasting invariant variables in the new preheader.
+ if (Invariant)
+ Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
Constant *Zero = ConstantInt::get(I32, 0);
Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
Value *UndefVal = UndefValue::get(VTy);
@@ -450,27 +174,28 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
// Broadcast the scalar into all locations in the vector.
Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
- "broadcast");
- // We are accessing the induction variable. Make sure to promote the
- // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
- if (V == Induction)
- return getConsecutiveVector(Shuf);
+ "broadcast");
+
+ // Restore the builder insertion point.
+ if (Invariant)
+ Builder.SetInsertPoint(Loc);
+
return Shuf;
}
-Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
// Create the types.
Type *ITy = Val->getType()->getScalarType();
VectorType *Ty = cast<VectorType>(Val->getType());
- unsigned VLen = Ty->getNumElements();
+ int VLen = Ty->getNumElements();
SmallVector<Constant*, 8> Indices;
// Create a vector of consecutive numbers from zero to VF.
- for (unsigned i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(ITy, i));
+ for (int i = 0; i < VLen; ++i)
+ Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
@@ -478,7 +203,17 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
return Builder.CreateAdd(Val, Cv, "induction");
}
-bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
+bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+ assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
+
+ // If this value is a pointer induction variable we know it is consecutive.
+ PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
+ if (Phi && Inductions.count(Phi)) {
+ InductionInfo II = Inductions[Phi];
+ if (PtrInduction == II.IK)
+ return true;
+ }
+
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
return false;
@@ -491,7 +226,7 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
return false;
- // We can emit wide load/stores only of the last index is the induction
+ // We can emit wide load/stores only if the last index is the induction
// variable.
const SCEV *Last = SE->getSCEV(LastIndex);
if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
@@ -506,7 +241,12 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
return false;
}
-Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
+Value *InnerLoopVectorizer::getVectorValue(Value *V) {
+ assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
// If we saved a vectorized copy of V, use it.
Value *&MapEntry = WidenMap[V];
@@ -520,17 +260,11 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
}
Constant*
-SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
- SmallVector<Constant*, 8> Indices;
- // Create a vector of consecutive numbers from zero to VF.
- for (unsigned i = 0; i < VF; ++i)
- Indices.push_back(ConstantInt::get(ScalarTy, Val, true));
-
- // Add the consecutive indices to the vector value.
- return ConstantVector::get(Indices);
+InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+ return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
}
-void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
SmallVector<Value*, 8> Params;
@@ -541,7 +275,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// If we are accessing the old induction variable, use the new one.
if (SrcOp == OldInduction) {
- Params.push_back(getBroadcastInstrs(Induction));
+ Params.push_back(getVectorValue(SrcOp));
continue;
}
@@ -600,63 +334,158 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
WidenMap[Instr] = VecResults;
}
+Value*
+InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc) {
+ LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+ Legal->getRuntimePointerCheck();
+
+ if (!PtrRtCheck->Need)
+ return NULL;
+
+ Value *MemoryRuntimeCheck = 0;
+ unsigned NumPointers = PtrRtCheck->Pointers.size();
+ SmallVector<Value* , 2> Starts;
+ SmallVector<Value* , 2> Ends;
+
+ SCEVExpander Exp(*SE, "induction");
+
+ // Use this type for pointer arithmetic.
+ Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
+
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ Value *Ptr = PtrRtCheck->Pointers[i];
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, OrigLoop)) {
+ DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
+ *Ptr <<"\n");
+ Starts.push_back(Ptr);
+ Ends.push_back(Ptr);
+ } else {
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+
+ Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
+ Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
+ Starts.push_back(Start);
+ Ends.push_back(End);
+ }
+ }
+
+ for (unsigned i = 0; i < NumPointers; ++i) {
+ for (unsigned j = i+1; j < NumPointers; ++j) {
+ Instruction::CastOps Op = Instruction::BitCast;
+ Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
+ Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
+ Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc);
+ Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc);
+
+ Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Start0, End1, "bound0", Loc);
+ Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Start1, End0, "bound1", Loc);
+ Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+ "found.conflict", Loc);
+ if (MemoryRuntimeCheck)
+ MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+ MemoryRuntimeCheck,
+ IsConflict,
+ "conflict.rdx", Loc);
+ else
+ MemoryRuntimeCheck = IsConflict;
+
+ }
+ }
+
+ return MemoryRuntimeCheck;
+}
+
void
-SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
/*
In this function we generate a new loop. The new loop will contain
the vectorized instructions while the old loop will continue to run the
scalar remainder.
- [ ] <-- vector loop bypass.
- / |
- / v
-| [ ] <-- vector pre header.
-| |
-| v
-| [ ] \
-| [ ]_| <-- vector loop.
-| |
- \ v
+ [ ] <-- vector loop bypass.
+ / |
+ / v
+ | [ ] <-- vector pre header.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- vector loop.
+ | |
+ \ v
>[ ] <--- middle-block.
- / |
- / v
-| [ ] <--- new preheader.
-| |
-| v
-| [ ] \
-| [ ]_| <-- old scalar loop to handle remainder.
- \ |
- \ v
+ / |
+ / v
+ | [ ] <--- new preheader.
+ | |
+ | v
+ | [ ] \
+ | [ ]_| <-- old scalar loop to handle remainder.
+ \ |
+ \ v
>[ ] <-- exit block.
...
*/
- // This is the original scalar-loop preheader.
+ BasicBlock *OldBasicBlock = OrigLoop->getHeader();
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
- // The loop index does not have to start at Zero. It starts with this value.
+ // Some loops have a single integer induction variable, while other loops
+ // don't. One example is c++ iterators that often have multiple pointer
+ // induction variables. In the code below we also support a case where we
+ // don't have a single induction variable.
OldInduction = Legal->getInduction();
- Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
+ Type *IdxTy = OldInduction ? OldInduction->getType() :
+ DL->getIntPtrType(SE->getContext());
+
+ // Find the loop boundaries.
+ const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
+ assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+ // Get the total trip count from the count by adding 1.
+ ExitCount = SE->getAddExpr(ExitCount,
+ SE->getConstant(ExitCount->getType(), 1));
+
+ // Expand the trip count and place the new instructions in the preheader.
+ // Notice that the pre-header does not change, only the loop body.
+ SCEVExpander Exp(*SE, "induction");
+
+ // Count holds the overall loop count (N).
+ Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(),
+ BypassBlock->getTerminator());
+
+ // The loop index does not have to start at Zero. Find the original start
+ // value from the induction PHI node. If we don't have an induction variable
+ // then we know that it starts at zero.
+ Value *StartIdx = OldInduction ?
+ OldInduction->getIncomingValueForBlock(BypassBlock):
+ ConstantInt::get(IdxTy, 0);
- assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
assert(BypassBlock && "Invalid loop structure");
- BasicBlock *VectorPH =
- BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
- BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(),
- "vector.body");
+ // Generate the code that checks in runtime if arrays overlap.
+ Value *MemoryRuntimeCheck = addRuntimeCheck(Legal,
+ BypassBlock->getTerminator());
- BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(),
- "middle.block");
+ // Split the single block loop into the two loop structure described above.
+ BasicBlock *VectorPH =
+ BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+ BasicBlock *VecBody =
+ VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+ BasicBlock *MiddleBlock =
+ VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
BasicBlock *ScalarPH =
- MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(),
- "scalar.preheader");
- // Find the induction variable.
- BasicBlock *OldBasicBlock = OrigLoop->getHeader();
- assert(OldInduction && "We must have a single phi node.");
- Type *IdxTy = OldInduction->getType();
+ MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+
+ // This is the location in which we add all of the logic for bypassing
+ // the new vector loop.
+ Instruction *Loc = BypassBlock->getTerminator();
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
@@ -666,27 +495,16 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Induction = Builder.CreatePHI(IdxTy, 2, "index");
Constant *Step = ConstantInt::get(IdxTy, VF);
- // Find the loop boundaries.
- const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
- assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
- // Get the total trip count from the count by adding 1.
- ExitCount = SE->getAddExpr(ExitCount,
- SE->getConstant(ExitCount->getType(), 1));
-
- // Expand the trip count and place the new instructions in the preheader.
- // Notice that the pre-header does not change, only the loop body.
- SCEVExpander Exp(*SE, "induction");
- Instruction *Loc = BypassBlock->getTerminator();
-
// We may need to extend the index in case there is a type mismatch.
// We know that the count starts at zero and does not overflow.
- // We are using Zext because it should be less expensive.
- if (ExitCount->getType() != Induction->getType())
- ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
-
- // Count holds the overall loop count (N).
- Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
+ if (Count->getType() != IdxTy) {
+ // The exit count can be of pointer type. Convert it to the correct
+ // integer type.
+ if (ExitCount->getType()->isPointerTy())
+ Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc);
+ else
+ Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc);
+ }
// Add the start index to the loop count to get the new end index.
Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc);
@@ -699,20 +517,112 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx,
"end.idx.rnd.down", Loc);
- // Now, compare the new count to zero. If it is zero, jump to the scalar part.
+ // Now, compare the new count to zero. If it is zero skip the vector loop and
+ // jump to the scalar loop.
Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ,
IdxEndRoundDown,
StartIdx,
"cmp.zero", Loc);
+
+ // If we are using memory runtime checks, include them in.
+ if (MemoryRuntimeCheck)
+ Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+ "CntOrMem", Loc);
+
BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
// Remove the old terminator.
Loc->eraseFromParent();
+ // We are going to resume the execution of the scalar loop.
+ // Go over all of the induction variables that we found and fix the
+ // PHIs that are left in the scalar version of the loop.
+ // The starting values of PHI nodes depend on the counter of the last
+ // iteration in the vectorized loop.
+ // If we come from a bypass edge then we need to start from the original
+ // start value.
+
+ // This variable saves the new starting index for the scalar loop.
+ PHINode *ResumeIndex = 0;
+ LoopVectorizationLegality::InductionList::iterator I, E;
+ LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
+ for (I = List->begin(), E = List->end(); I != E; ++I) {
+ PHINode *OrigPhi = I->first;
+ LoopVectorizationLegality::InductionInfo II = I->second;
+ PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
+ MiddleBlock->getTerminator());
+ Value *EndValue = 0;
+ switch (II.IK) {
+ case LoopVectorizationLegality::NoInduction:
+ llvm_unreachable("Unknown induction");
+ case LoopVectorizationLegality::IntInduction: {
+ // Handle the integer induction counter:
+ assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
+ assert(OrigPhi == OldInduction && "Unknown integer PHI");
+ // We know what the end value is.
+ EndValue = IdxEndRoundDown;
+ // We also know which PHI node holds it.
+ ResumeIndex = ResumeVal;
+ break;
+ }
+ case LoopVectorizationLegality::ReverseIntInduction: {
+ // Convert the CountRoundDown variable to the PHI size.
+ unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
+ unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
+ Value *CRD = CountRoundDown;
+ if (CRDSize > IISize)
+ CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
+ II.StartValue->getType(),
+ "tr.crd", BypassBlock->getTerminator());
+ else if (CRDSize < IISize)
+ CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
+ II.StartValue->getType(),
+ "sext.crd", BypassBlock->getTerminator());
+ // Handle reverse integer induction counter:
+ EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+ BypassBlock->getTerminator());
+ break;
+ }
+ case LoopVectorizationLegality::PtrInduction: {
+ // For pointer induction variables, calculate the offset using
+ // the end index.
+ EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
+ "ptr.ind.end",
+ BypassBlock->getTerminator());
+ break;
+ }
+ }// end of case
+
+ // The new PHI merges the original incoming value, in case of a bypass,
+ // or the value at the end of the vectorized loop.
+ ResumeVal->addIncoming(II.StartValue, BypassBlock);
+ ResumeVal->addIncoming(EndValue, VecBody);
+
+ // Fix the scalar body counter (PHI node).
+ unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH);
+ OrigPhi->setIncomingValue(BlockIdx, ResumeVal);
+ }
+
+ // If we are generating a new induction variable then we also need to
+ // generate the code that calculates the exit value. This value is not
+ // simply the end of the counter because we may skip the vectorized body
+ // in case of a runtime check.
+ if (!OldInduction){
+ assert(!ResumeIndex && "Unexpected resume value found");
+ ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val",
+ MiddleBlock->getTerminator());
+ ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+ }
+
+ // Make sure that we found the index where scalar loop needs to continue.
+ assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() &&
+ "Invalid resume Index");
+
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
- IdxEndRoundDown, "cmp.n",
+ ResumeIndex, "cmp.n",
MiddleBlock->getTerminator());
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -730,26 +640,25 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Now we have two terminators. Remove the old one from the block.
VecBody->getTerminator()->eraseFromParent();
- // Fix the scalar body iteration count.
- unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
- OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
-
// Get ready to start creating new instructions into the vectorized body.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
- // Register the new loop.
+ // Create and register the new vector loop.
Loop* Lp = new Loop();
- LPM->insertLoop(Lp, OrigLoop->getParentLoop());
-
- Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
Loop *ParentLoop = OrigLoop->getParentLoop();
+
+ // Insert the new loop into the loop nest and register the new basic blocks.
if (ParentLoop) {
+ ParentLoop->addChildLoop(Lp);
ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+ } else {
+ LI->addTopLevelLoop(Lp);
}
+ Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
// Save the state.
LoopVectorPreHeader = VectorPH;
LoopScalarPreHeader = ScalarPH;
@@ -781,8 +690,37 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
}
}
+static bool
+isTriviallyVectorizableIntrinsic(Instruction *Inst) {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+ if (!II)
+ return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::log:
+ case Intrinsic::log10:
+ case Intrinsic::log2:
+ case Intrinsic::fabs:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::nearbyint:
+ case Intrinsic::pow:
+ case Intrinsic::fma:
+ return true;
+ default:
+ return false;
+ }
+ return false;
+}
+
void
-SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
//===------------------------------------------------===//
//
// Notice: any optimization or new instruction that go
@@ -790,202 +728,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// the cost-model.
//
//===------------------------------------------------===//
- typedef SmallVector<PHINode*, 4> PhiVector;
BasicBlock &BB = *OrigLoop->getHeader();
- Constant *Zero = ConstantInt::get(
- IntegerType::getInt32Ty(BB.getContext()), 0);
+ Constant *Zero =
+ ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);
// In order to support reduction variables we need to be able to vectorize
// Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
- // steages. First, we create a new vector PHI node with no incoming edges.
+ // stages. First, we create a new vector PHI node with no incoming edges.
// We use this value when we vectorize all of the instructions that use the
// PHI. Next, after all of the instructions in the block are complete we
// add the new incoming edges to the PHI. At this point all of the
// instructions in the basic block are vectorized, so we can use them to
// construct the PHI.
- PhiVector PHIsToFix;
+ PhiVector RdxPHIsToFix;
- // For each instruction in the old loop.
- for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
- Instruction *Inst = it;
+ // Scan the loop in a topological order to ensure that defs are vectorized
+ // before users.
+ LoopBlocksDFS DFS(OrigLoop);
+ DFS.perform(LI);
- switch (Inst->getOpcode()) {
- case Instruction::Br:
- // Nothing to do for PHIs and BR, since we already took care of the
- // loop control flow instructions.
- continue;
- case Instruction::PHI:{
- PHINode* P = cast<PHINode>(Inst);
- // Special handling for the induction var.
- if (OldInduction == Inst)
- continue;
- // This is phase one of vectorizing PHIs.
- // This has to be a reduction variable.
- assert(Legal->getReductionVars()->count(P) && "Not a Reduction");
- Type *VecTy = VectorType::get(Inst->getType(), VF);
- WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi");
- PHIsToFix.push_back(P);
- continue;
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // Just widen binops.
- BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- Value *A = getVectorValue(Inst->getOperand(0));
- Value *B = getVectorValue(Inst->getOperand(1));
-
- // Use this vector value for all users of the original instruction.
- Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
- WidenMap[Inst] = V;
-
- // Update the NSW, NUW and Exact flags.
- BinaryOperator *VecOp = cast<BinaryOperator>(V);
- if (isa<OverflowingBinaryOperator>(BinOp)) {
- VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
- VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
- }
- if (isa<PossiblyExactOperator>(VecOp))
- VecOp->setIsExact(BinOp->isExact());
- break;
- }
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- Value *Cond = Inst->getOperand(0);
- bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
- Cond = getVectorValue(Cond);
- if (InvariantCond)
- Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
-
- Value *Op0 = getVectorValue(Inst->getOperand(1));
- Value *Op1 = getVectorValue(Inst->getOperand(2));
- WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
- break;
- }
-
- case Instruction::ICmp:
- case Instruction::FCmp: {
- // Widen compares. Generate vector compares.
- bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
- CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
- Value *A = getVectorValue(Inst->getOperand(0));
- Value *B = getVectorValue(Inst->getOperand(1));
- if (FCmp)
- WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
- else
- WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
- break;
- }
-
- case Instruction::Store: {
- // Attempt to issue a wide store.
- StoreInst *SI = dyn_cast<StoreInst>(Inst);
- Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
- Value *Ptr = SI->getPointerOperand();
- unsigned Alignment = SI->getAlignment();
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- // This store does not use GEPs.
- if (!Legal->isConsecutiveGep(Gep)) {
- scalarizeInstruction(Inst);
- break;
- }
+ // Vectorize all of the blocks in the original loop.
+ for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+ be = DFS.endRPO(); bb != be; ++bb)
+ vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
- LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
- Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
- Value *Val = getVectorValue(SI->getValueOperand());
- Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
- break;
- }
- case Instruction::Load: {
- // Attempt to issue a wide load.
- LoadInst *LI = dyn_cast<LoadInst>(Inst);
- Type *RetTy = VectorType::get(LI->getType(), VF);
- Value *Ptr = LI->getPointerOperand();
- unsigned Alignment = LI->getAlignment();
- GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
- // We don't have a gep. Scalarize the load.
- if (!Legal->isConsecutiveGep(Gep)) {
- scalarizeInstruction(Inst);
- break;
- }
-
- // The last index does not have to be the induction. It can be
- // consecutive and be a function of the index. For example A[I+1];
- unsigned NumOperands = Gep->getNumOperands();
- Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
- LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
- // Create the new GEP with the new induction variable.
- GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
- Gep2->setOperand(NumOperands - 1, LastIndex);
- Ptr = Builder.Insert(Gep2);
- Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
- LI = Builder.CreateLoad(Ptr);
- LI->setAlignment(Alignment);
- // Use this vector value for all users of the load.
- WidenMap[Inst] = LI;
- break;
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- /// Vectorize bitcasts.
- CastInst *CI = dyn_cast<CastInst>(Inst);
- Value *A = getVectorValue(Inst->getOperand(0));
- Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
- WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
- break;
- }
-
- default:
- /// All other instructions are unsupported. Scalarize them.
- scalarizeInstruction(Inst);
- break;
- }// end of switch.
- }// end of for_each instr.
-
- // At this point every instruction in the original loop is widended to
+ // At this point every instruction in the original loop is widened to
// a vector form. We are almost done. Now, we need to fix the PHI nodes
// that we vectorized. The PHI nodes are currently empty because we did
// not want to introduce cycles. Notice that the remaining PHI nodes
@@ -994,7 +761,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Create the 'reduced' values for each of the induction vars.
// The reduced values are the vector values that we scalarize and combine
// after the loop is finished.
- for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end();
+ for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end();
it != e; ++it) {
PHINode *RdxPhi = *it;
PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]);
@@ -1004,7 +771,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
assert(Legal->getReductionVars()->count(RdxPhi) &&
"Unable to find the reduction variable");
LoopVectorizationLegality::ReductionDescriptor RdxDesc =
- (*Legal->getReductionVars())[RdxPhi];
+ (*Legal->getReductionVars())[RdxPhi];
// We need to generate a reduction vector from the incoming scalar.
// To do so, we need to generate the 'identity' vector and overide
@@ -1024,8 +791,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
Value *VectorStart = Builder.CreateInsertElement(Identity,
- RdxDesc.StartValue, Zero);
-
+ RdxDesc.StartValue, Zero);
// Fix the vector-loop phi.
// We created the induction variable so we know that the
@@ -1035,8 +801,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Reductions do not have to start at zero. They can start with
// any loop invariant values.
VecRdxPhi->addIncoming(VectorStart, VecPreheader);
- unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
- Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+ Value *Val =
+ getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
VecRdxPhi->addIncoming(Val, LoopVectorBody);
// Before each round, move the insertion point right between
@@ -1053,29 +819,29 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Extract the first scalar.
Value *Scalar0 =
- Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
+ Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
// Extract and reduce the remaining vector elements.
for (unsigned i=1; i < VF; ++i) {
Value *Scalar1 =
- Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+ Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
switch (RdxDesc.Kind) {
- case LoopVectorizationLegality::IntegerAdd:
- Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerMult:
- Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerOr:
- Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerAnd:
- Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
- break;
- case LoopVectorizationLegality::IntegerXor:
- Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
- break;
- default:
- llvm_unreachable("Unknown reduction operation");
+ case LoopVectorizationLegality::IntegerAdd:
+ Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
+ break;
+ case LoopVectorizationLegality::IntegerMult:
+ Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
+ break;
+ case LoopVectorizationLegality::IntegerOr:
+ Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
+ break;
+ case LoopVectorizationLegality::IntegerAnd:
+ Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
+ break;
+ case LoopVectorizationLegality::IntegerXor:
+ Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
+ break;
+ default:
+ llvm_unreachable("Unknown reduction operation");
}
}
@@ -1103,15 +869,373 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
// Fix the scalar loop reduction variable with the incoming reduction sum
// from the vector body and from the backedge value.
- int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
- int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+ int IncomingEdgeBlockIdx =
+ (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+ assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+ // Pick the other block.
+ int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
(RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
(RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
}// end of for each redux variable.
}
-void SingleBlockLoopVectorizer::updateAnalysis() {
- // The original basic block.
+Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+ assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
+ "Invalid edge");
+
+ Value *SrcMask = createBlockInMask(Src);
+
+ // The terminator has to be a branch inst!
+ BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+ assert(BI && "Unexpected terminator found");
+
+ Value *EdgeMask = SrcMask;
+ if (BI->isConditional()) {
+ EdgeMask = getVectorValue(BI->getCondition());
+ if (BI->getSuccessor(0) != Dst)
+ EdgeMask = Builder.CreateNot(EdgeMask);
+ }
+
+ return Builder.CreateAnd(EdgeMask, SrcMask);
+}
+
+Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+ assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+ // Loop incoming mask is all-one.
+ if (OrigLoop->getHeader() == BB) {
+ Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+ return getVectorValue(C);
+ }
+
+ // This is the block mask. We OR all incoming edges, and with zero.
+ Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+ Value *BlockMask = getVectorValue(Zero);
+
+ // For each pred:
+ for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
+ BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB));
+
+ return BlockMask;
+}
+
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+ BasicBlock *BB, PhiVector *PV) {
+ Constant *Zero =
+ ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ switch (it->getOpcode()) {
+ case Instruction::Br:
+ // Nothing to do for PHIs and BR, since we already took care of the
+ // loop control flow instructions.
+ continue;
+ case Instruction::PHI:{
+ PHINode* P = cast<PHINode>(it);
+ // Handle reduction variables:
+ if (Legal->getReductionVars()->count(P)) {
+ // This is phase one of vectorizing PHIs.
+ Type *VecTy = VectorType::get(it->getType(), VF);
+ WidenMap[it] =
+ PHINode::Create(VecTy, 2, "vec.phi",
+ LoopVectorBody->getFirstInsertionPt());
+ PV->push_back(P);
+ continue;
+ }
+
+ // Check for PHI nodes that are lowered to vector selects.
+ if (P->getParent() != OrigLoop->getHeader()) {
+ // We know that all PHIs in non header blocks are converted into
+ // selects, so we don't have to worry about the insertion order and we
+ // can just use the builder.
+
+ // At this point we generate the predication tree. There may be
+ // duplications since this is a simple recursive scan, but future
+ // optimizations will clean it up.
+ Value *Cond = createBlockInMask(P->getIncomingBlock(0));
+ WidenMap[P] =
+ Builder.CreateSelect(Cond,
+ getVectorValue(P->getIncomingValue(0)),
+ getVectorValue(P->getIncomingValue(1)),
+ "predphi");
+ continue;
+ }
+
+ // This PHINode must be an induction variable.
+ // Make sure that we know about it.
+ assert(Legal->getInductionVars()->count(P) &&
+ "Not an induction variable");
+
+ LoopVectorizationLegality::InductionInfo II =
+ Legal->getInductionVars()->lookup(P);
+
+ switch (II.IK) {
+ case LoopVectorizationLegality::NoInduction:
+ llvm_unreachable("Unknown induction");
+ case LoopVectorizationLegality::IntInduction: {
+ assert(P == OldInduction && "Unexpected PHI");
+ Value *Broadcasted = getBroadcastInstrs(Induction);
+ // After broadcasting the induction variable we need to make the
+ // vector consecutive by adding 0, 1, 2 ...
+ Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+ WidenMap[OldInduction] = ConsecutiveInduction;
+ continue;
+ }
+ case LoopVectorizationLegality::ReverseIntInduction:
+ case LoopVectorizationLegality::PtrInduction:
+ // Handle reverse integer and pointer inductions.
+ Value *StartIdx = 0;
+ // If we have a single integer induction variable then use it.
+ // Otherwise, start counting at zero.
+ if (OldInduction) {
+ LoopVectorizationLegality::InductionInfo OldII =
+ Legal->getInductionVars()->lookup(OldInduction);
+ StartIdx = OldII.StartValue;
+ } else {
+ StartIdx = ConstantInt::get(Induction->getType(), 0);
+ }
+ // This is the normalized GEP that starts counting at zero.
+ Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+ "normalized.idx");
+
+ // Handle the reverse integer induction variable case.
+ if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
+ IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+ Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+ "resize.norm.idx");
+ Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI,
+ "reverse.idx");
+
+ // This is a new value so do not hoist it out.
+ Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+ // After broadcasting the induction variable we need to make the
+ // vector consecutive by adding ... -3, -2, -1, 0.
+ Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
+ true);
+ WidenMap[it] = ConsecutiveInduction;
+ continue;
+ }
+
+ // Handle the pointer induction variable case.
+ assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+ // This is the vector of results. Notice that we don't generate
+ // vector geps because scalar geps result in better code.
+ Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+ for (unsigned int i = 0; i < VF; ++i) {
+ Constant *Idx = ConstantInt::get(Induction->getType(), i);
+ Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
+ "gep.idx");
+ Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+ "next.gep");
+ VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+ Builder.getInt32(i),
+ "insert.gep");
+ }
+
+ WidenMap[it] = VecVal;
+ continue;
+ }
+
+ }// End of PHI.
+
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor: {
+ // Just widen binops.
+ BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+ Value *A = getVectorValue(it->getOperand(0));
+ Value *B = getVectorValue(it->getOperand(1));
+
+ // Use this vector value for all users of the original instruction.
+ Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+ WidenMap[it] = V;
+
+ // Update the NSW, NUW and Exact flags.
+ BinaryOperator *VecOp = cast<BinaryOperator>(V);
+ if (isa<OverflowingBinaryOperator>(BinOp)) {
+ VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+ VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+ }
+ if (isa<PossiblyExactOperator>(VecOp))
+ VecOp->setIsExact(BinOp->isExact());
+ break;
+ }
+ case Instruction::Select: {
+ // Widen selects.
+ // If the selector is loop invariant we can create a select
+ // instruction with a scalar condition. Otherwise, use vector-select.
+ Value *Cond = it->getOperand(0);
+ bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ Cond = getVectorValue(Cond);
+ if (InvariantCond)
+ Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+ Value *Op0 = getVectorValue(it->getOperand(1));
+ Value *Op1 = getVectorValue(it->getOperand(2));
+ WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
+ break;
+ }
+
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ // Widen compares. Generate vector compares.
+ bool FCmp = (it->getOpcode() == Instruction::FCmp);
+ CmpInst *Cmp = dyn_cast<CmpInst>(it);
+ Value *A = getVectorValue(it->getOperand(0));
+ Value *B = getVectorValue(it->getOperand(1));
+ if (FCmp)
+ WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+ else
+ WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+ break;
+ }
+
+ case Instruction::Store: {
+ // Attempt to issue a wide store.
+ StoreInst *SI = dyn_cast<StoreInst>(it);
+ Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+ Value *Ptr = SI->getPointerOperand();
+ unsigned Alignment = SI->getAlignment();
+
+ assert(!Legal->isUniform(Ptr) &&
+ "We do not allow storing to uniform addresses");
+
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+ // This store does not use GEPs.
+ if (!Legal->isConsecutivePtr(Ptr)) {
+ scalarizeInstruction(it);
+ break;
+ }
+
+ if (Gep) {
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+ }
+ Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+ Value *Val = getVectorValue(SI->getValueOperand());
+ Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+ break;
+ }
+ case Instruction::Load: {
+ // Attempt to issue a wide load.
+ LoadInst *LI = dyn_cast<LoadInst>(it);
+ Type *RetTy = VectorType::get(LI->getType(), VF);
+ Value *Ptr = LI->getPointerOperand();
+ unsigned Alignment = LI->getAlignment();
+ GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+ // If the pointer is loop invariant or if it is non consecutive,
+ // scalarize the load.
+ bool Con = Legal->isConsecutivePtr(Ptr);
+ if (Legal->isUniform(Ptr) || !Con) {
+ scalarizeInstruction(it);
+ break;
+ }
+
+ if (Gep) {
+ // The last index does not have to be the induction. It can be
+ // consecutive and be a function of the index. For example A[I+1];
+ unsigned NumOperands = Gep->getNumOperands();
+ Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+ LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+ // Create the new GEP with the new induction variable.
+ GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+ Gep2->setOperand(NumOperands - 1, LastIndex);
+ Ptr = Builder.Insert(Gep2);
+ } else {
+ // Use the induction element ptr.
+ assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+ Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+ }
+
+ Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+ LI = Builder.CreateLoad(Ptr);
+ LI->setAlignment(Alignment);
+ // Use this vector value for all users of the load.
+ WidenMap[it] = LI;
+ break;
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ /// Vectorize bitcasts.
+ CastInst *CI = dyn_cast<CastInst>(it);
+ Value *A = getVectorValue(it->getOperand(0));
+ Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+ WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+ break;
+ }
+
+ case Instruction::Call: {
+ assert(isTriviallyVectorizableIntrinsic(it));
+ Module *M = BB->getParent()->getParent();
+ IntrinsicInst *II = cast<IntrinsicInst>(it);
+ Intrinsic::ID ID = II->getIntrinsicID();
+ SmallVector<Value*, 4> Args;
+ for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
+ Args.push_back(getVectorValue(II->getArgOperand(i)));
+ Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+ Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+ WidenMap[it] = Builder.CreateCall(F, Args);
+ break;
+ }
+
+ default:
+ // All other instructions are unsupported. Scalarize them.
+ scalarizeInstruction(it);
+ break;
+ }// end of switch.
+ }// end of for_each instr.
+}
+
+void InnerLoopVectorizer::updateAnalysis() {
+ // Forget the original basic block.
SE->forgetLoop(OrigLoop);
// Update the dominator tree information.
@@ -1128,46 +1252,93 @@ void SingleBlockLoopVectorizer::updateAnalysis() {
DEBUG(DT->verifyAnalysis());
}
-bool LoopVectorizationLegality::canVectorize() {
- if (!TheLoop->getLoopPreheader()) {
- assert(false && "No preheader!!");
- DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
- return false;
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+ if (!EnableIfConversion)
+ return false;
+
+ assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+ std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+
+ // Collect the blocks that need predication.
+ for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+ BasicBlock *BB = LoopBlocks[i];
+
+ // We must have at most two predecessors because we need to convert
+ // all PHIs to selects.
+ unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
+ if (Preds > 2)
+ return false;
+
+ // We must be able to predicate all blocks that need to be predicated.
+ if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
+ return false;
}
- // We can only vectorize single basic block loops.
- unsigned NumBlocks = TheLoop->getNumBlocks();
- if (NumBlocks != 1) {
- DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+ // We can if-convert this loop.
+ return true;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+ assert(TheLoop->getLoopPreheader() && "No preheader!!");
+
+ // We can only vectorize innermost loops.
+ if (TheLoop->getSubLoopsVector().size())
return false;
- }
- // We need to have a loop header.
- BasicBlock *BB = TheLoop->getHeader();
- DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+ // We must have a single backedge.
+ if (TheLoop->getNumBackEdges() != 1)
+ return false;
- // Go over each instruction and look at memory deps.
- if (!canVectorizeBlock(*BB)) {
- DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+ // We must have a single exiting block.
+ if (!TheLoop->getExitingBlock())
+ return false;
+
+ unsigned NumBlocks = TheLoop->getNumBlocks();
+
+ // Check if we can if-convert non single-bb loops.
+ if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+ DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
return false;
}
+ // We need to have a loop header.
+ BasicBlock *Latch = TheLoop->getLoopLatch();
+ DEBUG(dbgs() << "LV: Found a loop: " <<
+ TheLoop->getHeader()->getName() << "\n");
+
// ScalarEvolution needs to be able to find the exit count.
- const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+ const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
if (ExitCount == SE->getCouldNotCompute()) {
DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
return false;
}
// Do not loop-vectorize loops with a tiny trip count.
- unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
if (TC > 0u && TC < TinyTripCountThreshold) {
DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
"This loop is not worth vectorizing.\n");
return false;
}
- DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+ // Check if we can vectorize the instructions and CFG in this loop.
+ if (!canVectorizeInstrs()) {
+ DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+ return false;
+ }
+
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeMemory()) {
+ DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
+ return false;
+ }
+
+ // Collect all of the variables that remain uniform after vectorization.
+ collectLoopUniforms();
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+ (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+ <<"!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return true with
@@ -1175,165 +1346,194 @@ bool LoopVectorizationLegality::canVectorize() {
return true;
}
-bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
- // Scan the instructions in the block and look for hazards.
- for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
- Instruction *I = it;
+bool LoopVectorizationLegality::canVectorizeInstrs() {
+ BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+ BasicBlock *Header = TheLoop->getHeader();
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (Phi) {
- // This should not happen because the loop should be normalized.
- if (Phi->getNumIncomingValues() != 2) {
- DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
- return false;
- }
- // We only look at integer phi nodes.
- if (!Phi->getType()->isIntegerTy()) {
- DEBUG(dbgs() << "LV: Found an non-int PHI.\n");
- return false;
- }
+ // For each block in the loop.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
- if (isInductionVariable(Phi)) {
- if (Induction) {
- DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+ // Scan the instructions in the block and look for hazards.
+ for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+ ++it) {
+
+ if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+ // This should not happen because the loop should be normalized.
+ if (Phi->getNumIncomingValues() != 2) {
+ DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
return false;
}
- DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
- Induction = Phi;
- continue;
- }
- if (AddReductionVar(Phi, IntegerAdd)) {
- DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerMult)) {
- DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerOr)) {
- DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerAnd)) {
- DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
- continue;
- }
- if (AddReductionVar(Phi, IntegerXor)) {
- DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
- continue;
- }
- DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
- return false;
- }// end of PHI handling
+ // Check that this PHI type is allowed.
+ if (!Phi->getType()->isIntegerTy() &&
+ !Phi->getType()->isPointerTy()) {
+ DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
+ return false;
+ }
- // We still don't handle functions.
- CallInst *CI = dyn_cast<CallInst>(I);
- if (CI) {
- DEBUG(dbgs() << "LV: Found a call site.\n");
- return false;
- }
+ // If this PHINode is not in the header block, then we know that we
+ // can convert it to select during if-conversion. No need to check if
+ // the PHIs in this block are induction or reduction variables.
+ if (*bb != Header)
+ continue;
- // We do not re-vectorize vectors.
- if (!VectorType::isValidElementType(I->getType()) &&
- !I->getType()->isVoidTy()) {
- DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
- return false;
- }
+ // This is the value coming from the preheader.
+ Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+ // Check if this is an induction variable.
+ InductionKind IK = isInductionVariable(Phi);
+
+ if (NoInduction != IK) {
+ // Int inductions are special because we only allow one IV.
+ if (IK == IntInduction) {
+ if (Induction) {
+ DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+ return false;
+ }
+ Induction = Phi;
+ }
+
+ DEBUG(dbgs() << "LV: Found an induction variable.\n");
+ Inductions[Phi] = InductionInfo(StartValue, IK);
+ continue;
+ }
- // Reduction instructions are allowed to have exit users.
- // All other instructions must not have external users.
- if (!AllowedExit.count(I))
- //Check that all of the users of the loop are inside the BB.
- for (Value::use_iterator it = I->use_begin(), e = I->use_end();
- it != e; ++it) {
- Instruction *U = cast<Instruction>(*it);
- // This user may be a reduction exit value.
- BasicBlock *Parent = U->getParent();
- if (Parent != &BB) {
- DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
- return false;
+ if (AddReductionVar(Phi, IntegerAdd)) {
+ DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+ continue;
}
- }
- } // next instr.
+ if (AddReductionVar(Phi, IntegerMult)) {
+ DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerOr)) {
+ DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerAnd)) {
+ DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+ if (AddReductionVar(Phi, IntegerXor)) {
+ DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
+ continue;
+ }
+
+ DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+ return false;
+ }// end of PHI handling
+
+ // We still don't handle functions.
+ CallInst *CI = dyn_cast<CallInst>(it);
+ if (CI && !isTriviallyVectorizableIntrinsic(it)) {
+ DEBUG(dbgs() << "LV: Found a call site.\n");
+ return false;
+ }
+
+ // We do not re-vectorize vectors.
+ if (!VectorType::isValidElementType(it->getType()) &&
+ !it->getType()->isVoidTy()) {
+ DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+ return false;
+ }
+
+ // Reduction instructions are allowed to have exit users.
+ // All other instructions must not have external users.
+ if (!AllowedExit.count(it))
+ //Check that all of the users of the loop are inside the BB.
+ for (Value::use_iterator I = it->use_begin(), E = it->use_end();
+ I != E; ++I) {
+ Instruction *U = cast<Instruction>(*I);
+ // This user may be a reduction exit value.
+ if (!TheLoop->contains(U)) {
+ DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+ return false;
+ }
+ }
+ } // next instr.
+
+ }
if (!Induction) {
- DEBUG(dbgs() << "LV: Did not find an induction var.\n");
- return false;
+ DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
+ assert(getInductionVars()->size() && "No induction variables");
}
- // Don't vectorize if the memory dependencies do not allow vectorization.
- if (!canVectorizeMemory(BB))
- return false;
+ return true;
+}
+void LoopVectorizationLegality::collectLoopUniforms() {
// We now know that the loop is vectorizable!
// Collect variables that will remain uniform after vectorization.
std::vector<Value*> Worklist;
+ BasicBlock *Latch = TheLoop->getLoopLatch();
// Start with the conditional branch and walk up the block.
- Worklist.push_back(BB.getTerminator()->getOperand(0));
+ Worklist.push_back(Latch->getTerminator()->getOperand(0));
while (Worklist.size()) {
Instruction *I = dyn_cast<Instruction>(Worklist.back());
Worklist.pop_back();
- // Look at instructions inside this block.
- if (!I) continue;
- if (I->getParent() != &BB) continue;
+ // Look at instructions inside this loop.
// Stop when reaching PHI nodes.
- if (isa<PHINode>(I)) {
- assert(I == Induction && "Found a uniform PHI that is not the induction");
- break;
- }
+ // TODO: we need to follow values all over the loop, not only in this block.
+ if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
+ continue;
// This is a known uniform.
Uniforms.insert(I);
// Insert all operands.
- for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+ for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
Worklist.push_back(I->getOperand(i));
}
}
-
- return true;
}
-bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+bool LoopVectorizationLegality::canVectorizeMemory() {
typedef SmallVector<Value*, 16> ValueVector;
typedef SmallPtrSet<Value*, 16> ValueSet;
// Holds the Load and Store *instructions*.
ValueVector Loads;
ValueVector Stores;
-
- // Scan the BB and collect legal loads and stores.
- for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
- Instruction *I = it;
-
- // If this is a load, save it. If this instruction can read from memory
- // but is not a load, then we quit. Notice that we don't handle function
- // calls that read or write.
- if (I->mayReadFromMemory()) {
- LoadInst *Ld = dyn_cast<LoadInst>(I);
- if (!Ld) return false;
- if (!Ld->isSimple()) {
- DEBUG(dbgs() << "LV: Found a non-simple load.\n");
- return false;
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
+
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+
+ // Scan the BB and collect legal loads and stores.
+ for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+ ++it) {
+
+ // If this is a load, save it. If this instruction can read from memory
+ // but is not a load, then we quit. Notice that we don't handle function
+ // calls that read or write.
+ if (it->mayReadFromMemory()) {
+ LoadInst *Ld = dyn_cast<LoadInst>(it);
+ if (!Ld) return false;
+ if (!Ld->isSimple()) {
+ DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+ return false;
+ }
+ Loads.push_back(Ld);
+ continue;
}
- Loads.push_back(Ld);
- continue;
- }
- // Save store instructions. Abort if other instructions write to memory.
- if (I->mayWriteToMemory()) {
- StoreInst *St = dyn_cast<StoreInst>(I);
- if (!St) return false;
- if (!St->isSimple()) {
- DEBUG(dbgs() << "LV: Found a non-simple store.\n");
- return false;
+ // Save 'store' instructions. Abort if other instructions write to memory.
+ if (it->mayWriteToMemory()) {
+ StoreInst *St = dyn_cast<StoreInst>(it);
+ if (!St) return false;
+ if (!St->isSimple()) {
+ DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+ return false;
+ }
+ Stores.push_back(St);
}
- Stores.push_back(St);
- }
- } // next instr.
+ } // next instr.
+ } // next block.
// Now we have two lists that hold the loads and the stores.
// Next, we find the pointers that they use.
@@ -1341,8 +1541,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// Check if we see any stores. If there are no stores, then we don't
// care if the pointers are *restrict*.
if (!Stores.size()) {
- DEBUG(dbgs() << "LV: Found a read-only loop!\n");
- return true;
+ DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+ return true;
}
// Holds the read and read-write *pointers* that we find.
@@ -1361,6 +1561,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
StoreInst *ST = dyn_cast<StoreInst>(*I);
assert(ST && "Bad StoreInst");
Value* Ptr = ST->getPointerOperand();
+
+ if (isUniform(Ptr)) {
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
@@ -1379,7 +1585,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// If the address of i is unknown (for example A[B[i]]) then we may
// read a few words, modify, and write a few words, and some of the
// words may be written to the same address.
- if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr))
+ if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr))
Reads.push_back(Ptr);
}
@@ -1390,6 +1596,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
return true;
}
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ bool RT = true;
+ for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.insert(SE, TheLoop, *I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+ for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.insert(SE, TheLoop, *I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+
+ // Check that we did not collect too many pointers or found a
+ // unsizeable pointer.
+ if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ PtrRtCheck.reset();
+ RT = false;
+ }
+
+ PtrRtCheck.Need = RT;
+
+ if (RT) {
+ DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+ }
+
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
@@ -1404,12 +1643,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (!WriteObjects.insert(*it)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
@@ -1422,18 +1661,20 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (WriteObjects.count(*it)) {
DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
}
- // All is okay.
+ // It is safe to vectorize and we don't need any runtime checks.
+ DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+ PtrRtCheck.reset();
return true;
}
@@ -1442,11 +1683,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
if (Phi->getNumIncomingValues() != 2)
return false;
- // Find the possible incoming reduction variable.
- BasicBlock *BB = Phi->getParent();
- int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
- int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
- Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+ // Reduction variables are only found in the loop header block.
+ if (Phi->getParent() != TheLoop->getHeader())
+ return false;
+
+ // Obtain the reduction start value from the value that comes from the loop
+ // preheader.
+ Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
// ExitInstruction is the single value which is used outside the loop.
// We only allow for a single reduction value to be used outside the loop.
@@ -1461,20 +1704,20 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
// Also, we can't have multiple block-local users.
Instruction *Iter = Phi;
while (true) {
+ // If the instruction has no users then this is a broken
+ // chain and can't be a reduction variable.
+ if (Iter->use_empty())
+ return false;
+
// Any reduction instr must be of one of the allowed kinds.
if (!isReductionInstr(Iter, Kind))
return false;
- // Did we found a user inside this block ?
+ // Did we find a user inside this block ?
bool FoundInBlockUser = false;
// Did we reach the initial PHI node ?
bool FoundStartPHI = false;
- // If the instruction has no users then this is a broken
- // chain and can't be a reduction variable.
- if (Iter->use_empty())
- return false;
-
// For each of the *users* of iter.
for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
it != e; ++it) {
@@ -1484,14 +1727,23 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
FoundStartPHI = true;
continue;
}
+
// Check if we found the exit user.
BasicBlock *Parent = U->getParent();
- if (Parent != BB) {
- // We must have a single exit instruction.
+ if (!TheLoop->contains(Parent)) {
+ // Exit if you find multiple outside users.
if (ExitInstruction != 0)
return false;
ExitInstruction = Iter;
}
+
+ // We allow in-loop PHINodes which are not the original reduction PHI
+ // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
+ // structure) then don't skip this PHI.
+ if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
+ TheLoop->contains(U) && Iter->getNumUses() > 1)
+ continue;
+
// We can't have multiple inside users.
if (FoundInBlockUser)
return false;
@@ -1502,60 +1754,121 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
// We found a reduction var if we have reached the original
// phi node and we only have a single instruction with out-of-loop
// users.
- if (FoundStartPHI && ExitInstruction) {
- // This instruction is allowed to have out-of-loop users.
- AllowedExit.insert(ExitInstruction);
-
- // Save the description of this reduction variable.
- ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
- Reductions[Phi] = RD;
- return true;
- }
+ if (FoundStartPHI && ExitInstruction) {
+ // This instruction is allowed to have out-of-loop users.
+ AllowedExit.insert(ExitInstruction);
+
+ // Save the description of this reduction variable.
+ ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+ Reductions[Phi] = RD;
+ return true;
+ }
+
+ // If we've reached the start PHI but did not find an outside user then
+ // this is dead code. Abort.
+ if (FoundStartPHI)
+ return false;
}
}
bool
LoopVectorizationLegality::isReductionInstr(Instruction *I,
ReductionKind Kind) {
- switch (I->getOpcode()) {
- default:
- return false;
- case Instruction::PHI:
- // possibly.
- return true;
- case Instruction::Add:
- case Instruction::Sub:
- return Kind == IntegerAdd;
- case Instruction::Mul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- return Kind == IntegerMult;
- case Instruction::And:
- return Kind == IntegerAnd;
- case Instruction::Or:
- return Kind == IntegerOr;
- case Instruction::Xor:
- return Kind == IntegerXor;
- }
+ switch (I->getOpcode()) {
+ default:
+ return false;
+ case Instruction::PHI:
+ // possibly.
+ return true;
+ case Instruction::Add:
+ case Instruction::Sub:
+ return Kind == IntegerAdd;
+ case Instruction::Mul:
+ return Kind == IntegerMult;
+ case Instruction::And:
+ return Kind == IntegerAnd;
+ case Instruction::Or:
+ return Kind == IntegerOr;
+ case Instruction::Xor:
+ return Kind == IntegerXor;
+ }
}
-bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+LoopVectorizationLegality::InductionKind
+LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+ Type *PhiTy = Phi->getType();
+ // We only handle integer and pointer inductions variables.
+ if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
+ return NoInduction;
+
// Check that the PHI is consecutive and starts at zero.
const SCEV *PhiScev = SE->getSCEV(Phi);
const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
if (!AR) {
DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
- return false;
+ return NoInduction;
}
const SCEV *Step = AR->getStepRecurrence(*SE);
- if (!Step->isOne()) {
- DEBUG(dbgs() << "LV: PHI stride does not equal one.\n");
- return false;
+ // Integer inductions need to have a stride of one.
+ if (PhiTy->isIntegerTy()) {
+ if (Step->isOne())
+ return IntInduction;
+ if (Step->isAllOnesValue())
+ return ReverseIntInduction;
+ return NoInduction;
+ }
+
+ // Calculate the pointer stride and check if it is consecutive.
+ const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
+ if (!C)
+ return NoInduction;
+
+ assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
+ uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
+ if (C->getValue()->equalsInt(Size))
+ return PtrInduction;
+
+ return NoInduction;
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) {
+ assert(TheLoop->contains(BB) && "Unknown block used");
+
+ // Blocks that do not dominate the latch need predication.
+ BasicBlock* Latch = TheLoop->getLoopLatch();
+ return !DT->dominates(BB, Latch);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ // We don't predicate loads/stores at the moment.
+ if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+ return false;
+
+ // The isntructions below can trap.
+ switch (it->getOpcode()) {
+ default: continue;
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ return false;
+ }
}
+
return true;
}
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+ const SCEV *PhiScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+ if (!AR)
+ return false;
+
+ return AR->isAffine();
+}
+
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
if (!VTTI) {
@@ -1584,19 +1897,29 @@ LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
}
unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
- // We can only estimate the cost of single basic block loops.
- assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
-
- BasicBlock *BB = TheLoop->getHeader();
unsigned Cost = 0;
- // For each instruction in the old loop.
- for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
- Instruction *Inst = it;
- unsigned C = getInstructionCost(Inst, VF);
- Cost += C;
- DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
- " For instruction: "<< *Inst << "\n");
+ // For each block.
+ for (Loop::block_iterator bb = TheLoop->block_begin(),
+ be = TheLoop->block_end(); bb != be; ++bb) {
+ unsigned BlockCost = 0;
+ BasicBlock *BB = *bb;
+
+ // For each instruction in the old loop.
+ for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+ unsigned C = getInstructionCost(it, VF);
+ Cost += C;
+ DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
+ VF << " For instruction: "<< *it << "\n");
+ }
+
+ // We assume that if-converted blocks have a 50% chance of being executed.
+ // When the code is scalar then some of the blocks are avoided due to CF.
+ // When the code is vectorized we execute all code paths.
+ if (Legal->blockNeedsPredication(*bb) && VF == 1)
+ BlockCost /= 2;
+
+ Cost += BlockCost;
}
return Cost;
@@ -1614,147 +1937,156 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
Type *RetTy = I->getType();
Type *VectorTy = ToVectorTy(RetTy, VF);
-
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
- case Instruction::GetElementPtr:
- // We mark this instruction as zero-cost because scalar GEPs are usually
- // lowered to the intruction addressing mode. At the moment we don't
- // generate vector geps.
- return 0;
- case Instruction::Br: {
- return VTTI->getCFInstrCost(I->getOpcode());
- }
- case Instruction::PHI:
- return 0;
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
- }
- case Instruction::Select: {
- SelectInst *SI = cast<SelectInst>(I);
- const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
- bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
- Type *CondTy = SI->getCondition()->getType();
- if (ScalarCond)
- CondTy = VectorType::get(CondTy, VF);
-
- return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
- }
- case Instruction::ICmp:
- case Instruction::FCmp: {
- Type *ValTy = I->getOperand(0)->getType();
- VectorTy = ToVectorTy(ValTy, VF);
- return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
- }
- case Instruction::Store: {
- StoreInst *SI = cast<StoreInst>(I);
- Type *ValTy = SI->getValueOperand()->getType();
- VectorTy = ToVectorTy(ValTy, VF);
-
- if (VF == 1)
- return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
- SI->getAlignment(), SI->getPointerAddressSpace());
-
- // Scalarized stores.
- if (!Legal->isConsecutiveGep(SI->getPointerOperand())) {
- unsigned Cost = 0;
- unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
- ValTy);
- // The cost of extracting from the value vector.
- Cost += VF * (ExtCost);
- // The cost of the scalar stores.
- Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
- ValTy->getScalarType(),
- SI->getAlignment(),
- SI->getPointerAddressSpace());
- return Cost;
- }
-
- // Wide stores.
- return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+ case Instruction::GetElementPtr:
+ // We mark this instruction as zero-cost because scalar GEPs are usually
+ // lowered to the intruction addressing mode. At the moment we don't
+ // generate vector geps.
+ return 0;
+ case Instruction::Br: {
+ return VTTI->getCFInstrCost(I->getOpcode());
+ }
+ case Instruction::PHI:
+ //TODO: IF-converted IFs become selects.
+ return 0;
+ case Instruction::Add:
+ case Instruction::FAdd:
+ case Instruction::Sub:
+ case Instruction::FSub:
+ case Instruction::Mul:
+ case Instruction::FMul:
+ case Instruction::UDiv:
+ case Instruction::SDiv:
+ case Instruction::FDiv:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::FRem:
+ case Instruction::Shl:
+ case Instruction::LShr:
+ case Instruction::AShr:
+ case Instruction::And:
+ case Instruction::Or:
+ case Instruction::Xor:
+ return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+ case Instruction::Select: {
+ SelectInst *SI = cast<SelectInst>(I);
+ const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+ bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+ Type *CondTy = SI->getCondition()->getType();
+ if (ScalarCond)
+ CondTy = VectorType::get(CondTy, VF);
+
+ return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+ }
+ case Instruction::ICmp:
+ case Instruction::FCmp: {
+ Type *ValTy = I->getOperand(0)->getType();
+ VectorTy = ToVectorTy(ValTy, VF);
+ return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+ }
+ case Instruction::Store: {
+ StoreInst *SI = cast<StoreInst>(I);
+ Type *ValTy = SI->getValueOperand()->getType();
+ VectorTy = ToVectorTy(ValTy, VF);
+
+ if (VF == 1)
+ return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+ SI->getAlignment(),
SI->getPointerAddressSpace());
+
+ // Scalarized stores.
+ if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
+ unsigned Cost = 0;
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+ ValTy);
+ // The cost of extracting from the value vector.
+ Cost += VF * (ExtCost);
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+ ValTy->getScalarType(),
+ SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ return Cost;
}
- case Instruction::Load: {
- LoadInst *LI = cast<LoadInst>(I);
-
- if (VF == 1)
- return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
- LI->getAlignment(),
- LI->getPointerAddressSpace());
-
- // Scalarized loads.
- if (!Legal->isConsecutiveGep(LI->getPointerOperand())) {
- unsigned Cost = 0;
- unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
- // The cost of inserting the loaded value into the result vector.
- Cost += VF * (InCost);
- // The cost of the scalar stores.
- Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
- RetTy->getScalarType(),
- LI->getAlignment(),
- LI->getPointerAddressSpace());
- return Cost;
- }
- // Wide loads.
- return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+ // Wide stores.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+ SI->getPointerAddressSpace());
+ }
+ case Instruction::Load: {
+ LoadInst *LI = cast<LoadInst>(I);
+
+ if (VF == 1)
+ return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+ LI->getAlignment(),
LI->getPointerAddressSpace());
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast: {
- Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
- return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
- }
- default: {
- // We are scalarizing the instruction. Return the cost of the scalar
- // instruction, plus the cost of insert and extract into vector
- // elements, times the vector width.
+
+ // Scalarized loads.
+ if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
unsigned Cost = 0;
+ unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+ // The cost of inserting the loaded value into the result vector.
+ Cost += VF * (InCost);
+ // The cost of the scalar stores.
+ Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+ RetTy->getScalarType(),
+ LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ return Cost;
+ }
- bool IsVoid = RetTy->isVoidTy();
+ // Wide loads.
+ return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+ LI->getPointerAddressSpace());
+ }
+ case Instruction::ZExt:
+ case Instruction::SExt:
+ case Instruction::FPToUI:
+ case Instruction::FPToSI:
+ case Instruction::FPExt:
+ case Instruction::PtrToInt:
+ case Instruction::IntToPtr:
+ case Instruction::SIToFP:
+ case Instruction::UIToFP:
+ case Instruction::Trunc:
+ case Instruction::FPTrunc:
+ case Instruction::BitCast: {
+ Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+ return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+ }
+ case Instruction::Call: {
+ assert(isTriviallyVectorizableIntrinsic(I));
+ IntrinsicInst *II = cast<IntrinsicInst>(I);
+ Type *RetTy = ToVectorTy(II->getType(), VF);
+ SmallVector<Type*, 4> Tys;
+ for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
+ Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
+ return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+ }
+ default: {
+ // We are scalarizing the instruction. Return the cost of the scalar
+ // instruction, plus the cost of insert and extract into vector
+ // elements, times the vector width.
+ unsigned Cost = 0;
- unsigned InsCost = (IsVoid ? 0 :
- VTTI->getInstrCost(Instruction::InsertElement,
- VectorTy));
+ bool IsVoid = RetTy->isVoidTy();
- unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
- VectorTy);
+ unsigned InsCost = (IsVoid ? 0 :
+ VTTI->getInstrCost(Instruction::InsertElement,
+ VectorTy));
- // The cost of inserting the results plus extracting each one of the
- // operands.
- Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+ unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+ VectorTy);
- // The cost of executing VF copies of the scalar instruction.
- Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
- return Cost;
- }
+ // The cost of inserting the results plus extracting each one of the
+ // operands.
+ Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+
+ // The cost of executing VF copies of the scalar instruction.
+ Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
+ return Cost;
+ }
}// end of switch.
}
@@ -1764,8 +2096,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
return VectorType::get(Scalar, VF);
}
-} // namespace
-
char LoopVectorize::ID = 0;
static const char lv_name[] = "Loop Vectorization";
INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
@@ -1780,3 +2110,4 @@ namespace llvm {
}
}
+