diff options
author | Matt Beaumont-Gay <matthewbg@google.com> | 2012-12-14 17:55:15 +0000 |
---|---|---|
committer | Matt Beaumont-Gay <matthewbg@google.com> | 2012-12-14 17:55:15 +0000 |
commit | 6aed25d93d1cfcde5809a73ffa7dc1b0d6396f66 (patch) | |
tree | 57e2fdf1caf960d8d878e0289f32af6759832b49 /lib/Transforms/Vectorize/LoopVectorize.cpp | |
parent | 7139cfb19b1cc28dfd5e274c07ec68835bc6d6d6 (diff) | |
parent | 1ad9253c9d34ccbce3e7e4ea5d87c266cbf93410 (diff) |
Updating branches/google/stable to r169803
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/stable@170212 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 2277 |
1 files changed, 1304 insertions, 973 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 892808760f..feeececedb 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6,355 +6,50 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops -// and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizes uses (will use) the codegen -// interfaces to generate IR that is likely to result in an optimal binary. -// -// The loop vectorizer combines consecutive loop iteration into a single -// 'wide' iteration. After this transformation the index is incremented -// by the SIMD vector width, and not by one. -// -// This pass has three parts: -// 1. The main loop pass that drives the different parts. -// 2. LoopVectorizationLegality - A unit that checks for the legality -// of the vectorization. -// 3. SingleBlockLoopVectorizer - A unit that performs the actual -// widening of instructions. -// 4. LoopVectorizationCostModel - A unit that checks for the profitability -// of vectorization. It decides on the optimal vector width, which -// can be one, if vectorization is not profitable. -//===----------------------------------------------------------------------===// -// -// The reduction-variable vectorization is based on the paper: -// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization. -// -// Variable uniformity checks are inspired by: -// Karrenberg, R. and Hack, S. Whole Function Vectorization. -// -// Other ideas/concepts are from: -// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later. -// -//===----------------------------------------------------------------------===// -#define LV_NAME "loop-vectorize" -#define DEBUG_TYPE LV_NAME -#include "llvm/Constants.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Instructions.h" -#include "llvm/LLVMContext.h" -#include "llvm/Pass.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Value.h" -#include "llvm/Function.h" -#include "llvm/Analysis/Verifier.h" -#include "llvm/Module.h" -#include "llvm/Type.h" -#include "llvm/ADT/SmallVector.h" +#include "LoopVectorize.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/Dominators.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/ScalarEvolutionExpander.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/LoopIterator.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Constants.h" +#include "llvm/DataLayout.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/DataLayout.h" +#include "llvm/TargetTransformInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include <algorithm> -using namespace llvm; +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Type.h" +#include "llvm/Value.h" static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, - cl::desc("Set the default vectorization width. Zero is autoselect.")); + cl::desc("Sets the SIMD width. Zero is autoselect.")); -/// We don't vectorize loops with a known constant trip count below this number. -const unsigned TinyTripCountThreshold = 16; +static cl::opt<bool> +EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, + cl::desc("Enable if-conversion during vectorization.")); namespace { -// Forward declarations. -class LoopVectorizationLegality; -class LoopVectorizationCostModel; - -/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic -/// block to a specified vectorization factor (VF). -/// This class performs the widening of scalars into vectors, or multiple -/// scalars. This class also implements the following features: -/// * It inserts an epilogue loop for handling loops that don't have iteration -/// counts that are known to be a multiple of the vectorization factor. -/// * It handles the code generation for reduction variables. -/// * Scalarization (implementation using scalars) of un-vectorizable -/// instructions. -/// SingleBlockLoopVectorizer does not perform any vectorization-legality -/// checks, and relies on the caller to check for the different legality -/// aspects. The SingleBlockLoopVectorizer relies on the -/// LoopVectorizationLegality class to provide information about the induction -/// and reduction variables that were found to a given vectorization factor. -class SingleBlockLoopVectorizer { -public: - /// Ctor. - SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li, - DominatorTree *dt, LPPassManager *Lpm, - unsigned VecWidth): - OrigLoop(Orig), SE(Se), LI(Li), DT(dt), LPM(Lpm), VF(VecWidth), - Builder(Se->getContext()), Induction(0), OldInduction(0) { } - - // Perform the actual loop widening (vectorization). - void vectorize(LoopVectorizationLegality *Legal) { - ///Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(Legal); - /// Widen each instruction in the old loop to a new one in the new loop. - /// Use the Legality module to find the induction and reduction variables. - vectorizeLoop(Legal); - // register the new loop. - updateAnalysis(); - } - -private: - /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(LoopVectorizationLegality *Legal); - /// Copy and widen the instructions from the old loop. - void vectorizeLoop(LoopVectorizationLegality *Legal); - /// Insert the new loop to the loop hierarchy and pass manager. - void updateAnalysis(); - - /// This instruction is un-vectorizable. Implement it as a sequence - /// of scalars. - void scalarizeInstruction(Instruction *Instr); - - /// Create a broadcast instruction. This method generates a broadcast - /// instruction (shuffle) for loop invariant values and for the induction - /// value. If this is the induction variable then we extend it to N, N+1, ... - /// this is needed because each iteration in the loop corresponds to a SIMD - /// element. - Value *getBroadcastInstrs(Value *V); - - /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 .. - /// for each element in the vector. Starting from zero. - Value *getConsecutiveVector(Value* Val); - - /// When we go over instructions in the basic block we rely on previous - /// values within the current basic block or on loop invariant values. - /// When we widen (vectorize) values we place them in the map. If the values - /// are not within the map, they have to be loop invariant, so we simply - /// broadcast them into a vector. - Value *getVectorValue(Value *V); - - /// Get a uniform vector of constant integers. We use this to get - /// vectors of ones and zeros for the reduction code. - Constant* getUniformVector(unsigned Val, Type* ScalarTy); - - typedef DenseMap<Value*, Value*> ValueMap; - - /// The original loop. - Loop *OrigLoop; - // Scev analysis to use. - ScalarEvolution *SE; - // Loop Info. - LoopInfo *LI; - // Dominator Tree. - DominatorTree *DT; - // Loop Pass Manager; - LPPassManager *LPM; - // The vectorization factor to use. - unsigned VF; - - // The builder that we use - IRBuilder<> Builder; - - // --- Vectorization state --- - - /// The vector-loop preheader. - BasicBlock *LoopVectorPreHeader; - /// The scalar-loop preheader. - BasicBlock *LoopScalarPreHeader; - /// Middle Block between the vector and the scalar. - BasicBlock *LoopMiddleBlock; - ///The ExitBlock of the scalar loop. - BasicBlock *LoopExitBlock; - ///The vector loop body. - BasicBlock *LoopVectorBody; - ///The scalar loop body. - BasicBlock *LoopScalarBody; - ///The first bypass block. - BasicBlock *LoopBypassBlock; - - /// The new Induction variable which was added to the new block. - PHINode *Induction; - /// The induction variable of the old basic block. - PHINode *OldInduction; - // Maps scalars to widened vectors. - ValueMap WidenMap; -}; - -/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and -/// to what vectorization factor. -/// This class does not look at the profitability of vectorization, only the -/// legality. This class has two main kinds of checks: -/// * Memory checks - The code in canVectorizeMemory checks if vectorization -/// will change the order of memory accesses in a way that will change the -/// correctness of the program. -/// * Scalars checks - The code in canVectorizeBlock checks for a number -/// of different conditions, such as the availability of a single induction -/// variable, that all types are supported and vectorize-able, etc. -/// This code reflects the capabilities of SingleBlockLoopVectorizer. -/// This class is also used by SingleBlockLoopVectorizer for identifying -/// induction variable and the different reduction variables. -class LoopVectorizationLegality { -public: - LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl): - TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { } - - /// This represents the kinds of reductions that we support. - enum ReductionKind { - NoReduction, /// Not a reduction. - IntegerAdd, /// Sum of numbers. - IntegerMult, /// Product of numbers. - IntegerOr, /// Bitwise or logical OR of numbers. - IntegerAnd, /// Bitwise or logical AND of numbers. - IntegerXor /// Bitwise or logical XOR of numbers. - }; - - /// This POD struct holds information about reduction variables. - struct ReductionDescriptor { - // Default C'tor - ReductionDescriptor(): - StartValue(0), LoopExitInstr(0), Kind(NoReduction) {} - - // C'tor. - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K): - StartValue(Start), LoopExitInstr(Exit), Kind(K) {} - - // The starting value of the reduction. - // It does not have to be zero! - Value *StartValue; - // The instruction who's value is used outside the loop. - Instruction *LoopExitInstr; - // The kind of the reduction. - ReductionKind Kind; - }; - - /// ReductionList contains the reduction descriptors for all - /// of the reductions that were found in the loop. - typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; - - /// Returns true if it is legal to vectorize this loop. - /// This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. - bool canVectorize(); - - /// Returns the Induction variable. - PHINode *getInduction() {return Induction;} - - /// Returns the reduction variables found in the loop. - ReductionList *getReductionVars() { return &Reductions; } - - /// Check if the pointer returned by this GEP is consecutive - /// when the index is vectorized. This happens when the last - /// index of the GEP is consecutive, like the induction variable. - /// This check allows us to vectorize A[idx] into a wide load/store. - bool isConsecutiveGep(Value *Ptr); - - /// Returns true if this instruction will remain scalar after vectorization. - bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);} - -private: - /// Check if a single basic block loop is vectorizable. - /// At this point we know that this is a loop with a constant trip count - /// and we only need to check individual instructions. - bool canVectorizeBlock(BasicBlock &BB); - - /// When we vectorize loops we may change the order in which - /// we read and write from memory. This method checks if it is - /// legal to vectorize the code, considering only memory constrains. - /// Returns true if BB is vectorizable - bool canVectorizeMemory(BasicBlock &BB); - - /// Returns True, if 'Phi' is the kind of reduction variable for type - /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. - bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns true if the instruction I can be a reduction variable of type - /// 'Kind'. - bool isReductionInstr(Instruction *I, ReductionKind Kind); - /// Returns True, if 'Phi' is an induction variable. - bool isInductionVariable(PHINode *Phi); - - /// The loop that we evaluate. - Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; - /// DataLayout analysis. - DataLayout *DL; - - // --- vectorization state --- // - - /// Holds the induction variable. - PHINode *Induction; - /// Holds the reduction variables. - ReductionList Reductions; - /// Allowed outside users. This holds the reduction - /// vars which can be accessed from outside the loop. - SmallPtrSet<Value*, 4> AllowedExit; - /// This set holds the variables which are known to be uniform after - /// vectorization. - SmallPtrSet<Instruction*, 4> Uniforms; -}; - -/// LoopVectorizationCostModel - estimates the expected speedups due to -/// vectorization. -/// In many cases vectorization is not profitable. This can happen because -/// of a number of reasons. In this class we mainly attempt to predict -/// the expected speedup/slowdowns due to the supported instruction set. -/// We use the VectorTargetTransformInfo to query the different backends -/// for the cost of different operations. -class LoopVectorizationCostModel { -public: - /// C'tor. - LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, - LoopVectorizationLegality *Leg, - const VectorTargetTransformInfo *Vtti): - TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } - - /// Returns the most profitable vectorization factor for the loop that is - /// smaller or equal to the VF argument. This method checks every power - /// of two up to VF. - unsigned findBestVectorizationFactor(unsigned VF = 8); - -private: - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. - unsigned expectedCost(unsigned VF); - - /// Returns the execution time cost of an instruction for a given vector - /// width. Vector width of one means scalar. - unsigned getInstructionCost(Instruction *I, unsigned VF); - - /// A helper function for converting Scalar types to vector types. - /// If the incoming type is void, we return void. If the VF is 1, we return - /// the scalar type. - static Type* ToVectorTy(Type *Scalar, unsigned VF); - - /// The loop that we evaluate. - Loop *TheLoop; - /// Scev analysis. - ScalarEvolution *SE; - - /// Vectorization legality. - LoopVectorizationLegality *Legal; - /// Vector target information. - const VectorTargetTransformInfo *VTTI; -}; - +/// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -383,7 +78,7 @@ struct LoopVectorize : public LoopPass { L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. - LoopVectorizationLegality LVL(L, SE, DL); + LoopVectorizationLegality LVL(L, SE, DL, DT); if (!LVL.canVectorize()) { DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; @@ -414,7 +109,7 @@ struct LoopVectorize : public LoopPass { "\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, DT, &LPM, VF); + InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -434,15 +129,44 @@ struct LoopVectorize : public LoopPass { }; -Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { - // Instructions that access the old induction variable - // actually want to get the new one. - if (V == OldInduction) - V = Induction; +}// namespace + +//===----------------------------------------------------------------------===// +// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and +// LoopVectorizationCostModel. +//===----------------------------------------------------------------------===// + +void +LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, + Loop *Lp, Value *Ptr) { + const SCEV *Sc = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); + assert(AR && "Invalid addrec expression"); + const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch()); + const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE); + Pointers.push_back(Ptr); + Starts.push_back(AR->getStart()); + Ends.push_back(ScEnd); +} + +Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { // Create the types. LLVMContext &C = V->getContext(); Type *VTy = VectorType::get(V->getType(), VF); Type *I32 = IntegerType::getInt32Ty(C); + + // Save the current insertion location. + Instruction *Loc = Builder.GetInsertPoint(); + + // We need to place the broadcast of invariant variables outside the loop. + Instruction *Instr = dyn_cast<Instruction>(V); + bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody); + bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr; + + // Place the code for broadcasting invariant variables in the new preheader. + if (Invariant) + Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator()); + Constant *Zero = ConstantInt::get(I32, 0); Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF)); Value *UndefVal = UndefValue::get(VTy); @@ -450,27 +174,28 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) { Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero); // Broadcast the scalar into all locations in the vector. Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros, - "broadcast"); - // We are accessing the induction variable. Make sure to promote the - // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes. - if (V == Induction) - return getConsecutiveVector(Shuf); + "broadcast"); + + // Restore the builder insertion point. + if (Invariant) + Builder.SetInsertPoint(Loc); + return Shuf; } -Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && "Elem must be an integer"); // Create the types. Type *ITy = Val->getType()->getScalarType(); VectorType *Ty = cast<VectorType>(Val->getType()); - unsigned VLen = Ty->getNumElements(); + int VLen = Ty->getNumElements(); SmallVector<Constant*, 8> Indices; // Create a vector of consecutive numbers from zero to VF. - for (unsigned i = 0; i < VLen; ++i) - Indices.push_back(ConstantInt::get(ITy, i)); + for (int i = 0; i < VLen; ++i) + Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i )); // Add the consecutive indices to the vector value. Constant *Cv = ConstantVector::get(Indices); @@ -478,7 +203,17 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) { return Builder.CreateAdd(Val, Cv, "induction"); } -bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { +bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) { + assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr"); + + // If this value is a pointer induction variable we know it is consecutive. + PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr); + if (Phi && Inductions.count(Phi)) { + InductionInfo II = Inductions[Phi]; + if (PtrInduction == II.IK) + return true; + } + GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr); if (!Gep) return false; @@ -491,7 +226,7 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop)) return false; - // We can emit wide load/stores only of the last index is the induction + // We can emit wide load/stores only if the last index is the induction // variable. const SCEV *Last = SE->getSCEV(LastIndex); if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) { @@ -506,7 +241,12 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) { return false; } -Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { +bool LoopVectorizationLegality::isUniform(Value *V) { + return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop)); +} + +Value *InnerLoopVectorizer::getVectorValue(Value *V) { + assert(V != Induction && "The new induction variable should not be used."); assert(!V->getType()->isVectorTy() && "Can't widen a vector"); // If we saved a vectorized copy of V, use it. Value *&MapEntry = WidenMap[V]; @@ -520,17 +260,11 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) { } Constant* -SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { - SmallVector<Constant*, 8> Indices; - // Create a vector of consecutive numbers from zero to VF. - for (unsigned i = 0; i < VF; ++i) - Indices.push_back(ConstantInt::get(ScalarTy, Val, true)); - - // Add the consecutive indices to the vector value. - return ConstantVector::get(Indices); +InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) { + return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true)); } -void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { +void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { assert(!Instr->getType()->isAggregateType() && "Can't handle vectors"); // Holds vector parameters or scalars, in case of uniform vals. SmallVector<Value*, 8> Params; @@ -541,7 +275,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // If we are accessing the old induction variable, use the new one. if (SrcOp == OldInduction) { - Params.push_back(getBroadcastInstrs(Induction)); + Params.push_back(getVectorValue(SrcOp)); continue; } @@ -600,63 +334,158 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) { WidenMap[Instr] = VecResults; } +Value* +InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, + Instruction *Loc) { + LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck = + Legal->getRuntimePointerCheck(); + + if (!PtrRtCheck->Need) + return NULL; + + Value *MemoryRuntimeCheck = 0; + unsigned NumPointers = PtrRtCheck->Pointers.size(); + SmallVector<Value* , 2> Starts; + SmallVector<Value* , 2> Ends; + + SCEVExpander Exp(*SE, "induction"); + + // Use this type for pointer arithmetic. + Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0); + + for (unsigned i = 0; i < NumPointers; ++i) { + Value *Ptr = PtrRtCheck->Pointers[i]; + const SCEV *Sc = SE->getSCEV(Ptr); + + if (SE->isLoopInvariant(Sc, OrigLoop)) { + DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" << + *Ptr <<"\n"); + Starts.push_back(Ptr); + Ends.push_back(Ptr); + } else { + DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n"); + + Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc); + Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc); + Starts.push_back(Start); + Ends.push_back(End); + } + } + + for (unsigned i = 0; i < NumPointers; ++i) { + for (unsigned j = i+1; j < NumPointers; ++j) { + Instruction::CastOps Op = Instruction::BitCast; + Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc); + Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc); + Value *End0 = CastInst::Create(Op, Ends[i], PtrArithTy, "bc", Loc); + Value *End1 = CastInst::Create(Op, Ends[j], PtrArithTy, "bc", Loc); + + Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Start0, End1, "bound0", Loc); + Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE, + Start1, End0, "bound1", Loc); + Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1, + "found.conflict", Loc); + if (MemoryRuntimeCheck) + MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or, + MemoryRuntimeCheck, + IsConflict, + "conflict.rdx", Loc); + else + MemoryRuntimeCheck = IsConflict; + + } + } + + return MemoryRuntimeCheck; +} + void -SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { +InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass. - / | - / v -| [ ] <-- vector pre header. -| | -| v -| [ ] \ -| [ ]_| <-- vector loop. -| | - \ v + [ ] <-- vector loop bypass. + / | + / v + | [ ] <-- vector pre header. + | | + | v + | [ ] \ + | [ ]_| <-- vector loop. + | | + \ v >[ ] <--- middle-block. - / | - / v -| [ ] <--- new preheader. -| | -| v -| [ ] \ -| [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v + / | + / v + | [ ] <--- new preheader. + | | + | v + | [ ] \ + | [ ]_| <-- old scalar loop to handle remainder. + \ | + \ v >[ ] <-- exit block. ... */ - // This is the original scalar-loop preheader. + BasicBlock *OldBasicBlock = OrigLoop->getHeader(); BasicBlock *BypassBlock = OrigLoop->getLoopPreheader(); BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); - // The loop index does not have to start at Zero. It starts with this value. + // Some loops have a single integer induction variable, while other loops + // don't. One example is c++ iterators that often have multiple pointer + // induction variables. In the code below we also support a case where we + // don't have a single induction variable. OldInduction = Legal->getInduction(); - Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock); + Type *IdxTy = OldInduction ? OldInduction->getType() : + DL->getIntPtrType(SE->getContext()); + + // Find the loop boundaries. + const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch()); + assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); + + // Get the total trip count from the count by adding 1. + ExitCount = SE->getAddExpr(ExitCount, + SE->getConstant(ExitCount->getType(), 1)); + + // Expand the trip count and place the new instructions in the preheader. + // Notice that the pre-header does not change, only the loop body. + SCEVExpander Exp(*SE, "induction"); + + // Count holds the overall loop count (N). + Value *Count = Exp.expandCodeFor(ExitCount, ExitCount->getType(), + BypassBlock->getTerminator()); + + // The loop index does not have to start at Zero. Find the original start + // value from the induction PHI node. If we don't have an induction variable + // then we know that it starts at zero. + Value *StartIdx = OldInduction ? + OldInduction->getIncomingValueForBlock(BypassBlock): + ConstantInt::get(IdxTy, 0); - assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop"); assert(BypassBlock && "Invalid loop structure"); - BasicBlock *VectorPH = - BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); - BasicBlock *VecBody = VectorPH->splitBasicBlock(VectorPH->getTerminator(), - "vector.body"); + // Generate the code that checks in runtime if arrays overlap. + Value *MemoryRuntimeCheck = addRuntimeCheck(Legal, + BypassBlock->getTerminator()); - BasicBlock *MiddleBlock = VecBody->splitBasicBlock(VecBody->getTerminator(), - "middle.block"); + // Split the single block loop into the two loop structure described above. + BasicBlock *VectorPH = + BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph"); + BasicBlock *VecBody = + VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body"); + BasicBlock *MiddleBlock = + VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block"); BasicBlock *ScalarPH = - MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), - "scalar.preheader"); - // Find the induction variable. - BasicBlock *OldBasicBlock = OrigLoop->getHeader(); - assert(OldInduction && "We must have a single phi node."); - Type *IdxTy = OldInduction->getType(); + MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph"); + + // This is the location in which we add all of the logic for bypassing + // the new vector loop. + Instruction *Loc = BypassBlock->getTerminator(); // Use this IR builder to create the loop instructions (Phi, Br, Cmp) // inside the loop. @@ -666,27 +495,16 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Induction = Builder.CreatePHI(IdxTy, 2, "index"); Constant *Step = ConstantInt::get(IdxTy, VF); - // Find the loop boundaries. - const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader()); - assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count"); - - // Get the total trip count from the count by adding 1. - ExitCount = SE->getAddExpr(ExitCount, - SE->getConstant(ExitCount->getType(), 1)); - - // Expand the trip count and place the new instructions in the preheader. - // Notice that the pre-header does not change, only the loop body. - SCEVExpander Exp(*SE, "induction"); - Instruction *Loc = BypassBlock->getTerminator(); - // We may need to extend the index in case there is a type mismatch. // We know that the count starts at zero and does not overflow. - // We are using Zext because it should be less expensive. - if (ExitCount->getType() != Induction->getType()) - ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy); - - // Count holds the overall loop count (N). - Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc); + if (Count->getType() != IdxTy) { + // The exit count can be of pointer type. Convert it to the correct + // integer type. + if (ExitCount->getType()->isPointerTy()) + Count = CastInst::CreatePointerCast(Count, IdxTy, "ptrcnt.to.int", Loc); + else + Count = CastInst::CreateZExtOrBitCast(Count, IdxTy, "zext.cnt", Loc); + } // Add the start index to the loop count to get the new end index. Value *IdxEnd = BinaryOperator::CreateAdd(Count, StartIdx, "end.idx", Loc); @@ -699,20 +517,112 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { Value *IdxEndRoundDown = BinaryOperator::CreateAdd(CountRoundDown, StartIdx, "end.idx.rnd.down", Loc); - // Now, compare the new count to zero. If it is zero, jump to the scalar part. + // Now, compare the new count to zero. If it is zero skip the vector loop and + // jump to the scalar loop. Value *Cmp = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEndRoundDown, StartIdx, "cmp.zero", Loc); + + // If we are using memory runtime checks, include them in. + if (MemoryRuntimeCheck) + Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck, + "CntOrMem", Loc); + BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc); // Remove the old terminator. Loc->eraseFromParent(); + // We are going to resume the execution of the scalar loop. + // Go over all of the induction variables that we found and fix the + // PHIs that are left in the scalar version of the loop. + // The starting values of PHI nodes depend on the counter of the last + // iteration in the vectorized loop. + // If we come from a bypass edge then we need to start from the original + // start value. + + // This variable saves the new starting index for the scalar loop. + PHINode *ResumeIndex = 0; + LoopVectorizationLegality::InductionList::iterator I, E; + LoopVectorizationLegality::InductionList *List = Legal->getInductionVars(); + for (I = List->begin(), E = List->end(); I != E; ++I) { + PHINode *OrigPhi = I->first; + LoopVectorizationLegality::InductionInfo II = I->second; + PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val", + MiddleBlock->getTerminator()); + Value *EndValue = 0; + switch (II.IK) { + case LoopVectorizationLegality::NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IntInduction: { + // Handle the integer induction counter: + assert(OrigPhi->getType()->isIntegerTy() && "Invalid type"); + assert(OrigPhi == OldInduction && "Unknown integer PHI"); + // We know what the end value is. + EndValue = IdxEndRoundDown; + // We also know which PHI node holds it. + ResumeIndex = ResumeVal; + break; + } + case LoopVectorizationLegality::ReverseIntInduction: { + // Convert the CountRoundDown variable to the PHI size. + unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits(); + unsigned IISize = II.StartValue->getType()->getScalarSizeInBits(); + Value *CRD = CountRoundDown; + if (CRDSize > IISize) + CRD = CastInst::Create(Instruction::Trunc, CountRoundDown, + II.StartValue->getType(), + "tr.crd", BypassBlock->getTerminator()); + else if (CRDSize < IISize) + CRD = CastInst::Create(Instruction::SExt, CountRoundDown, + II.StartValue->getType(), + "sext.crd", BypassBlock->getTerminator()); + // Handle reverse integer induction counter: + EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end", + BypassBlock->getTerminator()); + break; + } + case LoopVectorizationLegality::PtrInduction: { + // For pointer induction variables, calculate the offset using + // the end index. + EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown, + "ptr.ind.end", + BypassBlock->getTerminator()); + break; + } + }// end of case + + // The new PHI merges the original incoming value, in case of a bypass, + // or the value at the end of the vectorized loop. + ResumeVal->addIncoming(II.StartValue, BypassBlock); + ResumeVal->addIncoming(EndValue, VecBody); + + // Fix the scalar body counter (PHI node). + unsigned BlockIdx = OrigPhi->getBasicBlockIndex(ScalarPH); + OrigPhi->setIncomingValue(BlockIdx, ResumeVal); + } + + // If we are generating a new induction variable then we also need to + // generate the code that calculates the exit value. This value is not + // simply the end of the counter because we may skip the vectorized body + // in case of a runtime check. + if (!OldInduction){ + assert(!ResumeIndex && "Unexpected resume value found"); + ResumeIndex = PHINode::Create(IdxTy, 2, "new.indc.resume.val", + MiddleBlock->getTerminator()); + ResumeIndex->addIncoming(StartIdx, BypassBlock); + ResumeIndex->addIncoming(IdxEndRoundDown, VecBody); + } + + // Make sure that we found the index where scalar loop needs to continue. + assert(ResumeIndex && ResumeIndex->getType()->isIntegerTy() && + "Invalid resume Index"); + // Add a check in the middle block to see if we have completed // all of the iterations in the first vector loop. // If (N - N%VF) == N, then we *don't* need to run the remainder. Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd, - IdxEndRoundDown, "cmp.n", + ResumeIndex, "cmp.n", MiddleBlock->getTerminator()); BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator()); @@ -730,26 +640,25 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { // Now we have two terminators. Remove the old one from the block. VecBody->getTerminator()->eraseFromParent(); - // Fix the scalar body iteration count. - unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH); - OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown); - // Get ready to start creating new instructions into the vectorized body. Builder.SetInsertPoint(VecBody->getFirstInsertionPt()); - // Register the new loop. + // Create and register the new vector loop. Loop* Lp = new Loop(); - LPM->insertLoop(Lp, OrigLoop->getParentLoop()); - - Lp->addBasicBlockToLoop(VecBody, LI->getBase()); - Loop *ParentLoop = OrigLoop->getParentLoop(); + + // Insert the new loop into the loop nest and register the new basic blocks. if (ParentLoop) { + ParentLoop->addChildLoop(Lp); ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase()); ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase()); + } else { + LI->addTopLevelLoop(Lp); } + Lp->addBasicBlockToLoop(VecBody, LI->getBase()); + // Save the state. LoopVectorPreHeader = VectorPH; LoopScalarPreHeader = ScalarPH; @@ -781,8 +690,37 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) { } } +static bool +isTriviallyVectorizableIntrinsic(Instruction *Inst) { + IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst); + if (!II) + return false; + switch (II->getIntrinsicID()) { + case Intrinsic::sqrt: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::log: + case Intrinsic::log10: + case Intrinsic::log2: + case Intrinsic::fabs: + case Intrinsic::floor: + case Intrinsic::ceil: + case Intrinsic::trunc: + case Intrinsic::rint: + case Intrinsic::nearbyint: + case Intrinsic::pow: + case Intrinsic::fma: + return true; + default: + return false; + } + return false; +} + void -SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { +InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// // // Notice: any optimization or new instruction that go @@ -790,202 +728,31 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // the cost-model. // //===------------------------------------------------===// - typedef SmallVector<PHINode*, 4> PhiVector; BasicBlock &BB = *OrigLoop->getHeader(); - Constant *Zero = ConstantInt::get( - IntegerType::getInt32Ty(BB.getContext()), 0); + Constant *Zero = + ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0); // In order to support reduction variables we need to be able to vectorize // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two - // steages. First, we create a new vector PHI node with no incoming edges. + // stages. First, we create a new vector PHI node with no incoming edges. // We use this value when we vectorize all of the instructions that use the // PHI. Next, after all of the instructions in the block are complete we // add the new incoming edges to the PHI. At this point all of the // instructions in the basic block are vectorized, so we can use them to // construct the PHI. - PhiVector PHIsToFix; + PhiVector RdxPHIsToFix; - // For each instruction in the old loop. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *Inst = it; + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); - switch (Inst->getOpcode()) { - case Instruction::Br: - // Nothing to do for PHIs and BR, since we already took care of the - // loop control flow instructions. - continue; - case Instruction::PHI:{ - PHINode* P = cast<PHINode>(Inst); - // Special handling for the induction var. - if (OldInduction == Inst) - continue; - // This is phase one of vectorizing PHIs. - // This has to be a reduction variable. - assert(Legal->getReductionVars()->count(P) && "Not a Reduction"); - Type *VecTy = VectorType::get(Inst->getType(), VF); - WidenMap[Inst] = Builder.CreatePHI(VecTy, 2, "vec.phi"); - PHIsToFix.push_back(P); - continue; - } - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - // Just widen binops. - BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst); - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); - - // Use this vector value for all users of the original instruction. - Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); - WidenMap[Inst] = V; - - // Update the NSW, NUW and Exact flags. - BinaryOperator *VecOp = cast<BinaryOperator>(V); - if (isa<OverflowingBinaryOperator>(BinOp)) { - VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); - VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); - } - if (isa<PossiblyExactOperator>(VecOp)) - VecOp->setIsExact(BinOp->isExact()); - break; - } - case Instruction::Select: { - // Widen selects. - // If the selector is loop invariant we can create a select - // instruction with a scalar condition. Otherwise, use vector-select. - Value *Cond = Inst->getOperand(0); - bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); - - // The condition can be loop invariant but still defined inside the - // loop. This means that we can't just use the original 'cond' value. - // We have to take the 'vectorized' value and pick the first lane. - // Instcombine will make this a no-op. - Cond = getVectorValue(Cond); - if (InvariantCond) - Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); - - Value *Op0 = getVectorValue(Inst->getOperand(1)); - Value *Op1 = getVectorValue(Inst->getOperand(2)); - WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1); - break; - } - - case Instruction::ICmp: - case Instruction::FCmp: { - // Widen compares. Generate vector compares. - bool FCmp = (Inst->getOpcode() == Instruction::FCmp); - CmpInst *Cmp = dyn_cast<CmpInst>(Inst); - Value *A = getVectorValue(Inst->getOperand(0)); - Value *B = getVectorValue(Inst->getOperand(1)); - if (FCmp) - WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); - else - WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B); - break; - } - - case Instruction::Store: { - // Attempt to issue a wide store. - StoreInst *SI = dyn_cast<StoreInst>(Inst); - Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); - Value *Ptr = SI->getPointerOperand(); - unsigned Alignment = SI->getAlignment(); - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - // This store does not use GEPs. - if (!Legal->isConsecutiveGep(Gep)) { - scalarizeInstruction(Inst); - break; - } + // Vectorize all of the blocks in the original loop. + for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(), + be = DFS.endRPO(); bb != be; ++bb) + vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix); - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); - Value *Val = getVectorValue(SI->getValueOperand()); - Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); - break; - } - case Instruction::Load: { - // Attempt to issue a wide load. - LoadInst *LI = dyn_cast<LoadInst>(Inst); - Type *RetTy = VectorType::get(LI->getType(), VF); - Value *Ptr = LI->getPointerOperand(); - unsigned Alignment = LI->getAlignment(); - GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); - - // We don't have a gep. Scalarize the load. - if (!Legal->isConsecutiveGep(Gep)) { - scalarizeInstruction(Inst); - break; - } - - // The last index does not have to be the induction. It can be - // consecutive and be a function of the index. For example A[I+1]; - unsigned NumOperands = Gep->getNumOperands(); - Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); - LastIndex = Builder.CreateExtractElement(LastIndex, Zero); - - // Create the new GEP with the new induction variable. - GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); - Gep2->setOperand(NumOperands - 1, LastIndex); - Ptr = Builder.Insert(Gep2); - Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); - LI = Builder.CreateLoad(Ptr); - LI->setAlignment(Alignment); - // Use this vector value for all users of the load. - WidenMap[Inst] = LI; - break; - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - /// Vectorize bitcasts. - CastInst *CI = dyn_cast<CastInst>(Inst); - Value *A = getVectorValue(Inst->getOperand(0)); - Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); - WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy); - break; - } - - default: - /// All other instructions are unsupported. Scalarize them. - scalarizeInstruction(Inst); - break; - }// end of switch. - }// end of for_each instr. - - // At this point every instruction in the original loop is widended to + // At this point every instruction in the original loop is widened to // a vector form. We are almost done. Now, we need to fix the PHI nodes // that we vectorized. The PHI nodes are currently empty because we did // not want to introduce cycles. Notice that the remaining PHI nodes @@ -994,7 +761,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Create the 'reduced' values for each of the induction vars. // The reduced values are the vector values that we scalarize and combine // after the loop is finished. - for (PhiVector::iterator it = PHIsToFix.begin(), e = PHIsToFix.end(); + for (PhiVector::iterator it = RdxPHIsToFix.begin(), e = RdxPHIsToFix.end(); it != e; ++it) { PHINode *RdxPhi = *it; PHINode *VecRdxPhi = dyn_cast<PHINode>(WidenMap[RdxPhi]); @@ -1004,7 +771,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { assert(Legal->getReductionVars()->count(RdxPhi) && "Unable to find the reduction variable"); LoopVectorizationLegality::ReductionDescriptor RdxDesc = - (*Legal->getReductionVars())[RdxPhi]; + (*Legal->getReductionVars())[RdxPhi]; // We need to generate a reduction vector from the incoming scalar. // To do so, we need to generate the 'identity' vector and overide @@ -1024,8 +791,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // This vector is the Identity vector where the first element is the // incoming scalar reduction. Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); - + RdxDesc.StartValue, Zero); // Fix the vector-loop phi. // We created the induction variable so we know that the @@ -1035,8 +801,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reductions do not have to start at zero. They can start with // any loop invariant values. VecRdxPhi->addIncoming(VectorStart, VecPreheader); - unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); - Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx)); + Value *Val = + getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch())); VecRdxPhi->addIncoming(Val, LoopVectorBody); // Before each round, move the insertion point right between @@ -1053,29 +819,29 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Extract the first scalar. Value *Scalar0 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); + Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); // Extract and reduce the remaining vector elements. for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = - Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); + Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); switch (RdxDesc.Kind) { - case LoopVectorizationLegality::IntegerAdd: - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerMult: - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerOr: - Scalar0 = Builder.CreateOr(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerAnd: - Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); - break; - case LoopVectorizationLegality::IntegerXor: - Scalar0 = Builder.CreateXor(Scalar0, Scalar1); - break; - default: - llvm_unreachable("Unknown reduction operation"); + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx"); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx"); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx"); + break; + case LoopVectorizationLegality::IntegerAnd: + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx"); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx"); + break; + default: + llvm_unreachable("Unknown reduction operation"); } } @@ -1103,15 +869,373 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Fix the scalar loop reduction variable with the incoming reduction sum // from the vector body and from the backedge value. - int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody); - int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block. + int IncomingEdgeBlockIdx = + (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch()); + assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index"); + // Pick the other block. + int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0); (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr); }// end of for each redux variable. } -void SingleBlockLoopVectorizer::updateAnalysis() { - // The original basic block. +Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) { + assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) && + "Invalid edge"); + + Value *SrcMask = createBlockInMask(Src); + + // The terminator has to be a branch inst! + BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator()); + assert(BI && "Unexpected terminator found"); + + Value *EdgeMask = SrcMask; + if (BI->isConditional()) { + EdgeMask = getVectorValue(BI->getCondition()); + if (BI->getSuccessor(0) != Dst) + EdgeMask = Builder.CreateNot(EdgeMask); + } + + return Builder.CreateAnd(EdgeMask, SrcMask); +} + +Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) { + assert(OrigLoop->contains(BB) && "Block is not a part of a loop"); + + // Loop incoming mask is all-one. + if (OrigLoop->getHeader() == BB) { + Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1); + return getVectorValue(C); + } + + // This is the block mask. We OR all incoming edges, and with zero. + Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0); + Value *BlockMask = getVectorValue(Zero); + + // For each pred: + for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it) + BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB)); + + return BlockMask; +} + +void +InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, + BasicBlock *BB, PhiVector *PV) { + Constant *Zero = + ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0); + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + switch (it->getOpcode()) { + case Instruction::Br: + // Nothing to do for PHIs and BR, since we already took care of the + // loop control flow instructions. + continue; + case Instruction::PHI:{ + PHINode* P = cast<PHINode>(it); + // Handle reduction variables: + if (Legal->getReductionVars()->count(P)) { + // This is phase one of vectorizing PHIs. + Type *VecTy = VectorType::get(it->getType(), VF); + WidenMap[it] = + PHINode::Create(VecTy, 2, "vec.phi", + LoopVectorBody->getFirstInsertionPt()); + PV->push_back(P); + continue; + } + + // Check for PHI nodes that are lowered to vector selects. + if (P->getParent() != OrigLoop->getHeader()) { + // We know that all PHIs in non header blocks are converted into + // selects, so we don't have to worry about the insertion order and we + // can just use the builder. + + // At this point we generate the predication tree. There may be + // duplications since this is a simple recursive scan, but future + // optimizations will clean it up. + Value *Cond = createBlockInMask(P->getIncomingBlock(0)); + WidenMap[P] = + Builder.CreateSelect(Cond, + getVectorValue(P->getIncomingValue(0)), + getVectorValue(P->getIncomingValue(1)), + "predphi"); + continue; + } + + // This PHINode must be an induction variable. + // Make sure that we know about it. + assert(Legal->getInductionVars()->count(P) && + "Not an induction variable"); + + LoopVectorizationLegality::InductionInfo II = + Legal->getInductionVars()->lookup(P); + + switch (II.IK) { + case LoopVectorizationLegality::NoInduction: + llvm_unreachable("Unknown induction"); + case LoopVectorizationLegality::IntInduction: { + assert(P == OldInduction && "Unexpected PHI"); + Value *Broadcasted = getBroadcastInstrs(Induction); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding 0, 1, 2 ... + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted); + WidenMap[OldInduction] = ConsecutiveInduction; + continue; + } + case LoopVectorizationLegality::ReverseIntInduction: + case LoopVectorizationLegality::PtrInduction: + // Handle reverse integer and pointer inductions. + Value *StartIdx = 0; + // If we have a single integer induction variable then use it. + // Otherwise, start counting at zero. + if (OldInduction) { + LoopVectorizationLegality::InductionInfo OldII = + Legal->getInductionVars()->lookup(OldInduction); + StartIdx = OldII.StartValue; + } else { + StartIdx = ConstantInt::get(Induction->getType(), 0); + } + // This is the normalized GEP that starts counting at zero. + Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx, + "normalized.idx"); + + // Handle the reverse integer induction variable case. + if (LoopVectorizationLegality::ReverseIntInduction == II.IK) { + IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType()); + Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy, + "resize.norm.idx"); + Value *ReverseInd = Builder.CreateSub(II.StartValue, CNI, + "reverse.idx"); + + // This is a new value so do not hoist it out. + Value *Broadcasted = getBroadcastInstrs(ReverseInd); + // After broadcasting the induction variable we need to make the + // vector consecutive by adding ... -3, -2, -1, 0. + Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted, + true); + WidenMap[it] = ConsecutiveInduction; + continue; + } + + // Handle the pointer induction variable case. + assert(P->getType()->isPointerTy() && "Unexpected type."); + + // This is the vector of results. Notice that we don't generate + // vector geps because scalar geps result in better code. + Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF)); + for (unsigned int i = 0; i < VF; ++i) { + Constant *Idx = ConstantInt::get(Induction->getType(), i); + Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, + "gep.idx"); + Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx, + "next.gep"); + VecVal = Builder.CreateInsertElement(VecVal, SclrGep, + Builder.getInt32(i), + "insert.gep"); + } + + WidenMap[it] = VecVal; + continue; + } + + }// End of PHI. + + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + // Just widen binops. + BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it); + Value *A = getVectorValue(it->getOperand(0)); + Value *B = getVectorValue(it->getOperand(1)); + + // Use this vector value for all users of the original instruction. + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B); + WidenMap[it] = V; + + // Update the NSW, NUW and Exact flags. + BinaryOperator *VecOp = cast<BinaryOperator>(V); + if (isa<OverflowingBinaryOperator>(BinOp)) { + VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap()); + VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap()); + } + if (isa<PossiblyExactOperator>(VecOp)) + VecOp->setIsExact(BinOp->isExact()); + break; + } + case Instruction::Select: { + // Widen selects. + // If the selector is loop invariant we can create a select + // instruction with a scalar condition. Otherwise, use vector-select. + Value *Cond = it->getOperand(0); + bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop); + + // The condition can be loop invariant but still defined inside the + // loop. This means that we can't just use the original 'cond' value. + // We have to take the 'vectorized' value and pick the first lane. + // Instcombine will make this a no-op. + Cond = getVectorValue(Cond); + if (InvariantCond) + Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0)); + + Value *Op0 = getVectorValue(it->getOperand(1)); + Value *Op1 = getVectorValue(it->getOperand(2)); + WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1); + break; + } + + case Instruction::ICmp: + case Instruction::FCmp: { + // Widen compares. Generate vector compares. + bool FCmp = (it->getOpcode() == Instruction::FCmp); + CmpInst *Cmp = dyn_cast<CmpInst>(it); + Value *A = getVectorValue(it->getOperand(0)); + Value *B = getVectorValue(it->getOperand(1)); + if (FCmp) + WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B); + else + WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B); + break; + } + + case Instruction::Store: { + // Attempt to issue a wide store. + StoreInst *SI = dyn_cast<StoreInst>(it); + Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF); + Value *Ptr = SI->getPointerOperand(); + unsigned Alignment = SI->getAlignment(); + + assert(!Legal->isUniform(Ptr) && + "We do not allow storing to uniform addresses"); + + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + + // This store does not use GEPs. + if (!Legal->isConsecutivePtr(Ptr)) { + scalarizeInstruction(it); + break; + } + + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + } + Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo()); + Value *Val = getVectorValue(SI->getValueOperand()); + Builder.CreateStore(Val, Ptr)->setAlignment(Alignment); + break; + } + case Instruction::Load: { + // Attempt to issue a wide load. + LoadInst *LI = dyn_cast<LoadInst>(it); + Type *RetTy = VectorType::get(LI->getType(), VF); + Value *Ptr = LI->getPointerOperand(); + unsigned Alignment = LI->getAlignment(); + GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr); + + // If the pointer is loop invariant or if it is non consecutive, + // scalarize the load. + bool Con = Legal->isConsecutivePtr(Ptr); + if (Legal->isUniform(Ptr) || !Con) { + scalarizeInstruction(it); + break; + } + + if (Gep) { + // The last index does not have to be the induction. It can be + // consecutive and be a function of the index. For example A[I+1]; + unsigned NumOperands = Gep->getNumOperands(); + Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1)); + LastIndex = Builder.CreateExtractElement(LastIndex, Zero); + + // Create the new GEP with the new induction variable. + GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone()); + Gep2->setOperand(NumOperands - 1, LastIndex); + Ptr = Builder.Insert(Gep2); + } else { + // Use the induction element ptr. + assert(isa<PHINode>(Ptr) && "Invalid induction ptr"); + Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero); + } + + Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo()); + LI = Builder.CreateLoad(Ptr); + LI->setAlignment(Alignment); + // Use this vector value for all users of the load. + WidenMap[it] = LI; + break; + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + /// Vectorize bitcasts. + CastInst *CI = dyn_cast<CastInst>(it); + Value *A = getVectorValue(it->getOperand(0)); + Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); + WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); + break; + } + + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(it)); + Module *M = BB->getParent()->getParent(); + IntrinsicInst *II = cast<IntrinsicInst>(it); + Intrinsic::ID ID = II->getIntrinsicID(); + SmallVector<Value*, 4> Args; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Args.push_back(getVectorValue(II->getArgOperand(i))); + Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) }; + Function *F = Intrinsic::getDeclaration(M, ID, Tys); + WidenMap[it] = Builder.CreateCall(F, Args); + break; + } + + default: + // All other instructions are unsupported. Scalarize them. + scalarizeInstruction(it); + break; + }// end of switch. + }// end of for_each instr. +} + +void InnerLoopVectorizer::updateAnalysis() { + // Forget the original basic block. SE->forgetLoop(OrigLoop); // Update the dominator tree information. @@ -1128,46 +1252,93 @@ void SingleBlockLoopVectorizer::updateAnalysis() { DEBUG(DT->verifyAnalysis()); } -bool LoopVectorizationLegality::canVectorize() { - if (!TheLoop->getLoopPreheader()) { - assert(false && "No preheader!!"); - DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); - return false; +bool LoopVectorizationLegality::canVectorizeWithIfConvert() { + if (!EnableIfConversion) + return false; + + assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable"); + std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector(); + + // Collect the blocks that need predication. + for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { + BasicBlock *BB = LoopBlocks[i]; + + // We must have at most two predecessors because we need to convert + // all PHIs to selects. + unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); + if (Preds > 2) + return false; + + // We must be able to predicate all blocks that need to be predicated. + if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) + return false; } - // We can only vectorize single basic block loops. - unsigned NumBlocks = TheLoop->getNumBlocks(); - if (NumBlocks != 1) { - DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); + // We can if-convert this loop. + return true; +} + +bool LoopVectorizationLegality::canVectorize() { + assert(TheLoop->getLoopPreheader() && "No preheader!!"); + + // We can only vectorize innermost loops. + if (TheLoop->getSubLoopsVector().size()) return false; - } - // We need to have a loop header. - BasicBlock *BB = TheLoop->getHeader(); - DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n"); + // We must have a single backedge. + if (TheLoop->getNumBackEdges() != 1) + return false; - // Go over each instruction and look at memory deps. - if (!canVectorizeBlock(*BB)) { - DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); + // We must have a single exiting block. + if (!TheLoop->getExitingBlock()) + return false; + + unsigned NumBlocks = TheLoop->getNumBlocks(); + + // Check if we can if-convert non single-bb loops. + if (NumBlocks != 1 && !canVectorizeWithIfConvert()) { + DEBUG(dbgs() << "LV: Can't if-convert the loop.\n"); return false; } + // We need to have a loop header. + BasicBlock *Latch = TheLoop->getLoopLatch(); + DEBUG(dbgs() << "LV: Found a loop: " << + TheLoop->getHeader()->getName() << "\n"); + // ScalarEvolution needs to be able to find the exit count. - const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); + const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); return false; } // Do not loop-vectorize loops with a tiny trip count. - unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB); + unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch); if (TC > 0u && TC < TinyTripCountThreshold) { DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " << "This loop is not worth vectorizing.\n"); return false; } - DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); + // Check if we can vectorize the instructions and CFG in this loop. + if (!canVectorizeInstrs()) { + DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n"); + return false; + } + + // Go over each instruction and look at memory deps. + if (!canVectorizeMemory()) { + DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n"); + return false; + } + + // Collect all of the variables that remain uniform after vectorization. + collectLoopUniforms(); + + DEBUG(dbgs() << "LV: We can vectorize this loop" << + (PtrRtCheck.Need ? " (with a runtime bound check)" : "") + <<"!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis // which may limit our maximum vectorization factor, so just return true with @@ -1175,165 +1346,194 @@ bool LoopVectorizationLegality::canVectorize() { return true; } -bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { - // Scan the instructions in the block and look for hazards. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *I = it; +bool LoopVectorizationLegality::canVectorizeInstrs() { + BasicBlock *PreHeader = TheLoop->getLoopPreheader(); + BasicBlock *Header = TheLoop->getHeader(); - PHINode *Phi = dyn_cast<PHINode>(I); - if (Phi) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - // We only look at integer phi nodes. - if (!Phi->getType()->isIntegerTy()) { - DEBUG(dbgs() << "LV: Found an non-int PHI.\n"); - return false; - } + // For each block in the loop. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { - if (isInductionVariable(Phi)) { - if (Induction) { - DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + // Scan the instructions in the block and look for hazards. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + if (PHINode *Phi = dyn_cast<PHINode>(it)) { + // This should not happen because the loop should be normalized. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); return false; } - DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n"); - Induction = Phi; - continue; - } - if (AddReductionVar(Phi, IntegerAdd)) { - DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerMult)) { - DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerOr)) { - DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerAnd)) { - DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); - continue; - } - if (AddReductionVar(Phi, IntegerXor)) { - DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); - continue; - } - DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); - return false; - }// end of PHI handling + // Check that this PHI type is allowed. + if (!Phi->getType()->isIntegerTy() && + !Phi->getType()->isPointerTy()) { + DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n"); + return false; + } - // We still don't handle functions. - CallInst *CI = dyn_cast<CallInst>(I); - if (CI) { - DEBUG(dbgs() << "LV: Found a call site.\n"); - return false; - } + // If this PHINode is not in the header block, then we know that we + // can convert it to select during if-conversion. No need to check if + // the PHIs in this block are induction or reduction variables. + if (*bb != Header) + continue; - // We do not re-vectorize vectors. - if (!VectorType::isValidElementType(I->getType()) && - !I->getType()->isVoidTy()) { - DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); - return false; - } + // This is the value coming from the preheader. + Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); + // Check if this is an induction variable. + InductionKind IK = isInductionVariable(Phi); + + if (NoInduction != IK) { + // Int inductions are special because we only allow one IV. + if (IK == IntInduction) { + if (Induction) { + DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n"); + return false; + } + Induction = Phi; + } + + DEBUG(dbgs() << "LV: Found an induction variable.\n"); + Inductions[Phi] = InductionInfo(StartValue, IK); + continue; + } - // Reduction instructions are allowed to have exit users. - // All other instructions must not have external users. - if (!AllowedExit.count(I)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator it = I->use_begin(), e = I->use_end(); - it != e; ++it) { - Instruction *U = cast<Instruction>(*it); - // This user may be a reduction exit value. - BasicBlock *Parent = U->getParent(); - if (Parent != &BB) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; + if (AddReductionVar(Phi, IntegerAdd)) { + DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n"); + continue; } - } - } // next instr. + if (AddReductionVar(Phi, IntegerMult)) { + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); + continue; + } + + DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); + return false; + }// end of PHI handling + + // We still don't handle functions. + CallInst *CI = dyn_cast<CallInst>(it); + if (CI && !isTriviallyVectorizableIntrinsic(it)) { + DEBUG(dbgs() << "LV: Found a call site.\n"); + return false; + } + + // We do not re-vectorize vectors. + if (!VectorType::isValidElementType(it->getType()) && + !it->getType()->isVoidTy()) { + DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n"); + return false; + } + + // Reduction instructions are allowed to have exit users. + // All other instructions must not have external users. + if (!AllowedExit.count(it)) + //Check that all of the users of the loop are inside the BB. + for (Value::use_iterator I = it->use_begin(), E = it->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. + if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return false; + } + } + } // next instr. + + } if (!Induction) { - DEBUG(dbgs() << "LV: Did not find an induction var.\n"); - return false; + DEBUG(dbgs() << "LV: Did not find one integer induction var.\n"); + assert(getInductionVars()->size() && "No induction variables"); } - // Don't vectorize if the memory dependencies do not allow vectorization. - if (!canVectorizeMemory(BB)) - return false; + return true; +} +void LoopVectorizationLegality::collectLoopUniforms() { // We now know that the loop is vectorizable! // Collect variables that will remain uniform after vectorization. std::vector<Value*> Worklist; + BasicBlock *Latch = TheLoop->getLoopLatch(); // Start with the conditional branch and walk up the block. - Worklist.push_back(BB.getTerminator()->getOperand(0)); + Worklist.push_back(Latch->getTerminator()->getOperand(0)); while (Worklist.size()) { Instruction *I = dyn_cast<Instruction>(Worklist.back()); Worklist.pop_back(); - // Look at instructions inside this block. - if (!I) continue; - if (I->getParent() != &BB) continue; + // Look at instructions inside this loop. // Stop when reaching PHI nodes. - if (isa<PHINode>(I)) { - assert(I == Induction && "Found a uniform PHI that is not the induction"); - break; - } + // TODO: we need to follow values all over the loop, not only in this block. + if (!I || !TheLoop->contains(I) || isa<PHINode>(I)) + continue; // This is a known uniform. Uniforms.insert(I); // Insert all operands. - for (int i=0, Op = I->getNumOperands(); i < Op; ++i) { + for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) { Worklist.push_back(I->getOperand(i)); } } - - return true; } -bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { +bool LoopVectorizationLegality::canVectorizeMemory() { typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; // Holds the Load and Store *instructions*. ValueVector Loads; ValueVector Stores; - - // Scan the BB and collect legal loads and stores. - for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) { - Instruction *I = it; - - // If this is a load, save it. If this instruction can read from memory - // but is not a load, then we quit. Notice that we don't handle function - // calls that read or write. - if (I->mayReadFromMemory()) { - LoadInst *Ld = dyn_cast<LoadInst>(I); - if (!Ld) return false; - if (!Ld->isSimple()) { - DEBUG(dbgs() << "LV: Found a non-simple load.\n"); - return false; + PtrRtCheck.Pointers.clear(); + PtrRtCheck.Need = false; + + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + + // Scan the BB and collect legal loads and stores. + for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e; + ++it) { + + // If this is a load, save it. If this instruction can read from memory + // but is not a load, then we quit. Notice that we don't handle function + // calls that read or write. + if (it->mayReadFromMemory()) { + LoadInst *Ld = dyn_cast<LoadInst>(it); + if (!Ld) return false; + if (!Ld->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple load.\n"); + return false; + } + Loads.push_back(Ld); + continue; } - Loads.push_back(Ld); - continue; - } - // Save store instructions. Abort if other instructions write to memory. - if (I->mayWriteToMemory()) { - StoreInst *St = dyn_cast<StoreInst>(I); - if (!St) return false; - if (!St->isSimple()) { - DEBUG(dbgs() << "LV: Found a non-simple store.\n"); - return false; + // Save 'store' instructions. Abort if other instructions write to memory. + if (it->mayWriteToMemory()) { + StoreInst *St = dyn_cast<StoreInst>(it); + if (!St) return false; + if (!St->isSimple()) { + DEBUG(dbgs() << "LV: Found a non-simple store.\n"); + return false; + } + Stores.push_back(St); } - Stores.push_back(St); - } - } // next instr. + } // next instr. + } // next block. // Now we have two lists that hold the loads and the stores. // Next, we find the pointers that they use. @@ -1341,8 +1541,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // Check if we see any stores. If there are no stores, then we don't // care if the pointers are *restrict*. if (!Stores.size()) { - DEBUG(dbgs() << "LV: Found a read-only loop!\n"); - return true; + DEBUG(dbgs() << "LV: Found a read-only loop!\n"); + return true; } // Holds the read and read-write *pointers* that we find. @@ -1361,6 +1561,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { StoreInst *ST = dyn_cast<StoreInst>(*I); assert(ST && "Bad StoreInst"); Value* Ptr = ST->getPointerOperand(); + + if (isUniform(Ptr)) { + DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n"); + return false; + } + // If we did *not* see this pointer before, insert it to // the read-write list. At this phase it is only a 'write' list. if (Seen.insert(Ptr)) @@ -1379,7 +1585,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { // If the address of i is unknown (for example A[B[i]]) then we may // read a few words, modify, and write a few words, and some of the // words may be written to the same address. - if (Seen.insert(Ptr) || !isConsecutiveGep(Ptr)) + if (Seen.insert(Ptr) || !isConsecutivePtr(Ptr)) Reads.push_back(Ptr); } @@ -1390,6 +1596,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { return true; } + // Find pointers with computable bounds. We are going to use this information + // to place a runtime bound check. + bool RT = true; + for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.insert(SE, TheLoop, *I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + RT = false; + break; + } + for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I) + if (hasComputableBounds(*I)) { + PtrRtCheck.insert(SE, TheLoop, *I); + DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n"); + } else { + RT = false; + break; + } + + // Check that we did not collect too many pointers or found a + // unsizeable pointer. + if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + PtrRtCheck.reset(); + RT = false; + } + + PtrRtCheck.Need = RT; + + if (RT) { + DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n"); + } + // Now that the pointers are in two lists (Reads and ReadWrites), we // can check that there are no conflicts between each of the writes and // between the writes to the reads. @@ -1404,12 +1643,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); - return false; + return RT; } if (!WriteObjects.insert(*it)) { DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **it <<"\n"); - return false; + return RT; } } TempObjects.clear(); @@ -1422,18 +1661,20 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { it != e; ++it) { if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); - return false; + return RT; } if (WriteObjects.count(*it)) { DEBUG(dbgs() << "LV: Found a possible read/write reorder:" << **it <<"\n"); - return false; + return RT; } } TempObjects.clear(); } - // All is okay. + // It is safe to vectorize and we don't need any runtime checks. + DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n"); + PtrRtCheck.reset(); return true; } @@ -1442,11 +1683,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (Phi->getNumIncomingValues() != 2) return false; - // Find the possible incoming reduction variable. - BasicBlock *BB = Phi->getParent(); - int SelfEdgeIdx = Phi->getBasicBlockIndex(BB); - int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry. - Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx); + // Reduction variables are only found in the loop header block. + if (Phi->getParent() != TheLoop->getHeader()) + return false; + + // Obtain the reduction start value from the value that comes from the loop + // preheader. + Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader()); // ExitInstruction is the single value which is used outside the loop. // We only allow for a single reduction value to be used outside the loop. @@ -1461,20 +1704,20 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // Also, we can't have multiple block-local users. Instruction *Iter = Phi; while (true) { + // If the instruction has no users then this is a broken + // chain and can't be a reduction variable. + if (Iter->use_empty()) + return false; + // Any reduction instr must be of one of the allowed kinds. if (!isReductionInstr(Iter, Kind)) return false; - // Did we found a user inside this block ? + // Did we find a user inside this block ? bool FoundInBlockUser = false; // Did we reach the initial PHI node ? bool FoundStartPHI = false; - // If the instruction has no users then this is a broken - // chain and can't be a reduction variable. - if (Iter->use_empty()) - return false; - // For each of the *users* of iter. for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end(); it != e; ++it) { @@ -1484,14 +1727,23 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, FoundStartPHI = true; continue; } + // Check if we found the exit user. BasicBlock *Parent = U->getParent(); - if (Parent != BB) { - // We must have a single exit instruction. + if (!TheLoop->contains(Parent)) { + // Exit if you find multiple outside users. if (ExitInstruction != 0) return false; ExitInstruction = Iter; } + + // We allow in-loop PHINodes which are not the original reduction PHI + // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE + // structure) then don't skip this PHI. + if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() && + TheLoop->contains(U) && Iter->getNumUses() > 1) + continue; + // We can't have multiple inside users. if (FoundInBlockUser) return false; @@ -1502,60 +1754,121 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // We found a reduction var if we have reached the original // phi node and we only have a single instruction with out-of-loop // users. - if (FoundStartPHI && ExitInstruction) { - // This instruction is allowed to have out-of-loop users. - AllowedExit.insert(ExitInstruction); - - // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); - Reductions[Phi] = RD; - return true; - } + if (FoundStartPHI && ExitInstruction) { + // This instruction is allowed to have out-of-loop users. + AllowedExit.insert(ExitInstruction); + + // Save the description of this reduction variable. + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + Reductions[Phi] = RD; + return true; + } + + // If we've reached the start PHI but did not find an outside user then + // this is dead code. Abort. + if (FoundStartPHI) + return false; } } bool LoopVectorizationLegality::isReductionInstr(Instruction *I, ReductionKind Kind) { - switch (I->getOpcode()) { - default: - return false; - case Instruction::PHI: - // possibly. - return true; - case Instruction::Add: - case Instruction::Sub: - return Kind == IntegerAdd; - case Instruction::Mul: - case Instruction::UDiv: - case Instruction::SDiv: - return Kind == IntegerMult; - case Instruction::And: - return Kind == IntegerAnd; - case Instruction::Or: - return Kind == IntegerOr; - case Instruction::Xor: - return Kind == IntegerXor; - } + switch (I->getOpcode()) { + default: + return false; + case Instruction::PHI: + // possibly. + return true; + case Instruction::Add: + case Instruction::Sub: + return Kind == IntegerAdd; + case Instruction::Mul: + return Kind == IntegerMult; + case Instruction::And: + return Kind == IntegerAnd; + case Instruction::Or: + return Kind == IntegerOr; + case Instruction::Xor: + return Kind == IntegerXor; + } } -bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { +LoopVectorizationLegality::InductionKind +LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { + Type *PhiTy = Phi->getType(); + // We only handle integer and pointer inductions variables. + if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy()) + return NoInduction; + // Check that the PHI is consecutive and starts at zero. const SCEV *PhiScev = SE->getSCEV(Phi); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); if (!AR) { DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n"); - return false; + return NoInduction; } const SCEV *Step = AR->getStepRecurrence(*SE); - if (!Step->isOne()) { - DEBUG(dbgs() << "LV: PHI stride does not equal one.\n"); - return false; + // Integer inductions need to have a stride of one. + if (PhiTy->isIntegerTy()) { + if (Step->isOne()) + return IntInduction; + if (Step->isAllOnesValue()) + return ReverseIntInduction; + return NoInduction; + } + + // Calculate the pointer stride and check if it is consecutive. + const SCEVConstant *C = dyn_cast<SCEVConstant>(Step); + if (!C) + return NoInduction; + + assert(PhiTy->isPointerTy() && "The PHI must be a pointer"); + uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType()); + if (C->getValue()->equalsInt(Size)) + return PtrInduction; + + return NoInduction; +} + +bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { + assert(TheLoop->contains(BB) && "Unknown block used"); + + // Blocks that do not dominate the latch need predication. + BasicBlock* Latch = TheLoop->getLoopLatch(); + return !DT->dominates(BB, Latch); +} + +bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // We don't predicate loads/stores at the moment. + if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) + return false; + + // The isntructions below can trap. + switch (it->getOpcode()) { + default: continue; + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::URem: + case Instruction::SRem: + return false; + } } + return true; } +bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { + const SCEV *PhiScev = SE->getSCEV(Ptr); + const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev); + if (!AR) + return false; + + return AR->isAffine(); +} + unsigned LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { if (!VTTI) { @@ -1584,19 +1897,29 @@ LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { } unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { - // We can only estimate the cost of single basic block loops. - assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); - - BasicBlock *BB = TheLoop->getHeader(); unsigned Cost = 0; - // For each instruction in the old loop. - for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { - Instruction *Inst = it; - unsigned C = getInstructionCost(Inst, VF); - Cost += C; - DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << - " For instruction: "<< *Inst << "\n"); + // For each block. + for (Loop::block_iterator bb = TheLoop->block_begin(), + be = TheLoop->block_end(); bb != be; ++bb) { + unsigned BlockCost = 0; + BasicBlock *BB = *bb; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + unsigned C = getInstructionCost(it, VF); + Cost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << + VF << " For instruction: "<< *it << "\n"); + } + + // We assume that if-converted blocks have a 50% chance of being executed. + // When the code is scalar then some of the blocks are avoided due to CF. + // When the code is vectorized we execute all code paths. + if (Legal->blockNeedsPredication(*bb) && VF == 1) + BlockCost /= 2; + + Cost += BlockCost; } return Cost; @@ -1614,147 +1937,156 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { Type *RetTy = I->getType(); Type *VectorTy = ToVectorTy(RetTy, VF); - // TODO: We need to estimate the cost of intrinsic calls. switch (I->getOpcode()) { - case Instruction::GetElementPtr: - // We mark this instruction as zero-cost because scalar GEPs are usually - // lowered to the intruction addressing mode. At the moment we don't - // generate vector geps. - return 0; - case Instruction::Br: { - return VTTI->getCFInstrCost(I->getOpcode()); - } - case Instruction::PHI: - return 0; - case Instruction::Add: - case Instruction::FAdd: - case Instruction::Sub: - case Instruction::FSub: - case Instruction::Mul: - case Instruction::FMul: - case Instruction::UDiv: - case Instruction::SDiv: - case Instruction::FDiv: - case Instruction::URem: - case Instruction::SRem: - case Instruction::FRem: - case Instruction::Shl: - case Instruction::LShr: - case Instruction::AShr: - case Instruction::And: - case Instruction::Or: - case Instruction::Xor: { - return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); - } - case Instruction::Select: { - SelectInst *SI = cast<SelectInst>(I); - const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); - bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); - Type *CondTy = SI->getCondition()->getType(); - if (ScalarCond) - CondTy = VectorType::get(CondTy, VF); - - return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); - } - case Instruction::ICmp: - case Instruction::FCmp: { - Type *ValTy = I->getOperand(0)->getType(); - VectorTy = ToVectorTy(ValTy, VF); - return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); - } - case Instruction::Store: { - StoreInst *SI = cast<StoreInst>(I); - Type *ValTy = SI->getValueOperand()->getType(); - VectorTy = ToVectorTy(ValTy, VF); - - if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, - SI->getAlignment(), SI->getPointerAddressSpace()); - - // Scalarized stores. - if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { - unsigned Cost = 0; - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - ValTy); - // The cost of extracting from the value vector. - Cost += VF * (ExtCost); - // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - ValTy->getScalarType(), - SI->getAlignment(), - SI->getPointerAddressSpace()); - return Cost; - } - - // Wide stores. - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector geps. + return 0; + case Instruction::Br: { + return VTTI->getCFInstrCost(I->getOpcode()); + } + case Instruction::PHI: + //TODO: IF-converted IFs become selects. + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *ValTy = I->getOperand(0)->getType(); + VectorTy = ToVectorTy(ValTy, VF); + return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy); + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + Type *ValTy = SI->getValueOperand()->getType(); + VectorTy = ToVectorTy(ValTy, VF); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), ValTy, + SI->getAlignment(), SI->getPointerAddressSpace()); + + // Scalarized stores. + if (!Legal->isConsecutivePtr(SI->getPointerOperand())) { + unsigned Cost = 0; + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + ValTy); + // The cost of extracting from the value vector. + Cost += VF * (ExtCost); + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + ValTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); + return Cost; } - case Instruction::Load: { - LoadInst *LI = cast<LoadInst>(I); - - if (VF == 1) - return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, - LI->getAlignment(), - LI->getPointerAddressSpace()); - - // Scalarized loads. - if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { - unsigned Cost = 0; - unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); - // The cost of inserting the loaded value into the result vector. - Cost += VF * (InCost); - // The cost of the scalar stores. - Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), - RetTy->getScalarType(), - LI->getAlignment(), - LI->getPointerAddressSpace()); - return Cost; - } - // Wide loads. - return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + // Wide stores. + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(), + SI->getPointerAddressSpace()); + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(I); + + if (VF == 1) + return VTTI->getMemoryOpCost(I->getOpcode(), RetTy, + LI->getAlignment(), LI->getPointerAddressSpace()); - } - case Instruction::ZExt: - case Instruction::SExt: - case Instruction::FPToUI: - case Instruction::FPToSI: - case Instruction::FPExt: - case Instruction::PtrToInt: - case Instruction::IntToPtr: - case Instruction::SIToFP: - case Instruction::UIToFP: - case Instruction::Trunc: - case Instruction::FPTrunc: - case Instruction::BitCast: { - Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); - return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); - } - default: { - // We are scalarizing the instruction. Return the cost of the scalar - // instruction, plus the cost of insert and extract into vector - // elements, times the vector width. + + // Scalarized loads. + if (!Legal->isConsecutivePtr(LI->getPointerOperand())) { unsigned Cost = 0; + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy); + // The cost of inserting the loaded value into the result vector. + Cost += VF * (InCost); + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + RetTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); + return Cost; + } - bool IsVoid = RetTy->isVoidTy(); + // Wide loads. + return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); + return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); + } + case Instruction::Call: { + assert(isTriviallyVectorizableIntrinsic(I)); + IntrinsicInst *II = cast<IntrinsicInst>(I); + Type *RetTy = ToVectorTy(II->getType(), VF); + SmallVector<Type*, 4> Tys; + for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i) + Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF)); + return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; - unsigned InsCost = (IsVoid ? 0 : - VTTI->getInstrCost(Instruction::InsertElement, - VectorTy)); + bool IsVoid = RetTy->isVoidTy(); - unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, - VectorTy); + unsigned InsCost = (IsVoid ? 0 : + VTTI->getInstrCost(Instruction::InsertElement, + VectorTy)); - // The cost of inserting the results plus extracting each one of the - // operands. - Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VectorTy); - // The cost of executing VF copies of the scalar instruction. - Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); - return Cost; - } + // The cost of inserting the results plus extracting each one of the + // operands. + Cost += VF * (InsCost + ExtCost * I->getNumOperands()); + + // The cost of executing VF copies of the scalar instruction. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy); + return Cost; + } }// end of switch. } @@ -1764,8 +2096,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) { return VectorType::get(Scalar, VF); } -} // namespace - char LoopVectorize::ID = 0; static const char lv_name[] = "Loop Vectorization"; INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false) @@ -1780,3 +2110,4 @@ namespace llvm { } } + |