diff options
author | Alexander Kornienko <alexfh@google.com> | 2013-03-14 10:51:38 +0000 |
---|---|---|
committer | Alexander Kornienko <alexfh@google.com> | 2013-03-14 10:51:38 +0000 |
commit | 647735c781c5b37061ee03d6e9e6c7dda92218e2 (patch) | |
tree | 5a5e56606d41060263048b5a5586b3d2380898ba /lib/Transforms/Vectorize | |
parent | 6aed25d93d1cfcde5809a73ffa7dc1b0d6396f66 (diff) | |
parent | f635ef401786c84df32090251a8cf45981ecca33 (diff) |
Updating branches/google/stable to r176857
git-svn-id: https://llvm.org/svn/llvm-project/llvm/branches/google/stable@177040 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'lib/Transforms/Vectorize')
-rw-r--r-- | lib/Transforms/Vectorize/BBVectorize.cpp | 1108 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.cpp | 2588 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/LoopVectorize.h | 458 | ||||
-rw-r--r-- | lib/Transforms/Vectorize/Vectorize.cpp | 2 |
4 files changed, 2624 insertions, 1532 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index a48229132b..17900dabbe 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -29,26 +29,25 @@ #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Constants.h" -#include "llvm/DataLayout.h" -#include "llvm/DerivedTypes.h" -#include "llvm/Function.h" -#include "llvm/Instructions.h" -#include "llvm/IntrinsicInst.h" -#include "llvm/Intrinsics.h" -#include "llvm/LLVMContext.h" -#include "llvm/Metadata.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Type.h" #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/Type.h" #include <algorithm> -#include <map> using namespace llvm; static cl::opt<bool> @@ -89,6 +88,10 @@ MaxInsts("bb-vectorize-max-instr-per-group", cl::init(500), cl::Hidden, cl::desc("The maximum number of pairable instructions per group")); static cl::opt<unsigned> +MaxPairs("bb-vectorize-max-pairs-per-group", cl::init(3000), cl::Hidden, + cl::desc("The maximum number of candidate instruction pairs per group")); + +static cl::opt<unsigned> MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" " a full cycle check")); @@ -199,9 +202,7 @@ namespace { DT = &P->getAnalysis<DominatorTree>(); SE = &P->getAnalysis<ScalarEvolution>(); TD = P->getAnalysisIfAvailable<DataLayout>(); - TTI = IgnoreTargetInfo ? 0 : - P->getAnalysisIfAvailable<TargetTransformInfo>(); - VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; + TTI = IgnoreTargetInfo ? 0 : &P->getAnalysis<TargetTransformInfo>(); } typedef std::pair<Value *, Value *> ValuePair; @@ -209,18 +210,12 @@ namespace { typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair typedef std::pair<VPPair, unsigned> VPPairWithType; - typedef std::pair<std::multimap<Value *, Value *>::iterator, - std::multimap<Value *, Value *>::iterator> VPIteratorPair; - typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, - std::multimap<ValuePair, ValuePair>::iterator> - VPPIteratorPair; AliasAnalysis *AA; DominatorTree *DT; ScalarEvolution *SE; DataLayout *TD; - TargetTransformInfo *TTI; - const VectorTargetTransformInfo *VTTI; + const TargetTransformInfo *TTI; // FIXME: const correct? @@ -228,7 +223,7 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, DenseSet<ValuePair> &FixedOrderPairs, DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); @@ -242,33 +237,36 @@ namespace { PairConnectionSplat }; - void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes); + void computeConnectedPairs( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes); void buildDepMap(BasicBlock &BB, - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &PairableInstUsers); - - void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *>& ChosenPairs); + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); void fuseChosenPairs(BasicBlock &BB, - std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps); bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); @@ -280,56 +278,63 @@ namespace { bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers = true, - std::multimap<Value *, Value *> *LoadMoveSet = 0); + DenseSet<ValuePair> *LoadMoveSetPairs = 0); - void computePairsConnectedTo( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P); + void computePairsConnectedTo( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + ValuePair P); bool pairsConflict(ValuePair P, ValuePair Q, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > + *PairableInstUserMap = 0, + DenseSet<VPPair> *PairableInstUserPairSet = 0); bool pairWillFormCycle(ValuePair P, - std::multimap<ValuePair, ValuePair> &PairableInstUsers, - DenseSet<ValuePair> &CurrentPairs); - - void pruneTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, - DenseSet<ValuePair> &PrunedTree, ValuePair J, - bool UseCycleCheck); - - void buildInitialTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseSet<ValuePair> &PairableInstUsers, - DenseMap<Value *, Value *> &ChosenPairs, - DenseMap<ValuePair, size_t> &Tree, ValuePair J); - - void findBestTreeFor( - std::multimap<Value *, Value *> &CandidatePairs, - DenseMap<ValuePair, int> &CandidatePairCostSavings, - std::vector<Value *> &PairableInsts, - DenseSet<ValuePair> &FixedOrderPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, - DenseSet<ValuePair> &PairableInstUsers, - std::multimap<ValuePair, ValuePair> &PairableInstUserMap, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - int &BestEffSize, VPIteratorPair ChoiceRange, - bool UseCycleCheck); + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, + DenseSet<ValuePair> &PrunedDAG, ValuePair J, + bool UseCycleCheck); + + void buildInitialDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + std::vector<Value *> &PairableInsts, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &DAG, ValuePair J); + + void findBestDAGFor( + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, + DenseSet<ValuePair> &CandidatePairsSet, + DenseMap<ValuePair, int> &CandidatePairCostSavings, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairs, + DenseMap<ValuePair, std::vector<ValuePair> > &ConnectedPairDeps, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<ValuePair, std::vector<ValuePair> > &PairableInstUserMap, + DenseSet<VPPair> &PairableInstUserPairSet, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestDAG, size_t &BestMaxDepth, + int &BestEffSize, Value *II, std::vector<Value *>&JJ, + bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o); @@ -361,20 +366,22 @@ namespace { void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I); void collectLoadMoveSet(BasicBlock &BB, std::vector<Value *> &PairableInsts, DenseMap<Value *, Value *> &ChosenPairs, - std::multimap<Value *, Value *> &LoadMoveSet); + DenseMap<Value *, std::vector<Value *> > &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs); bool canMoveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *I, Instruction *J); void moveUsesOfIAfterJ(BasicBlock &BB, - std::multimap<Value *, Value *> &LoadMoveSet, + DenseSet<ValuePair> &LoadMoveSetPairs, Instruction *&InsertionPt, Instruction *I, Instruction *J); @@ -387,7 +394,7 @@ namespace { return false; } - DEBUG(if (VTTI) dbgs() << "BBV: using target information\n"); + DEBUG(if (TTI) dbgs() << "BBV: using target information\n"); bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, @@ -395,7 +402,7 @@ namespace { // target vector register. unsigned n = 1; for (unsigned v = 2; - (VTTI || v <= Config.VectorBits) && + (TTI || v <= Config.VectorBits) && (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << @@ -426,9 +433,7 @@ namespace { DT = &getAnalysis<DominatorTree>(); SE = &getAnalysis<ScalarEvolution>(); TD = getAnalysisIfAvailable<DataLayout>(); - TTI = IgnoreTargetInfo ? 0 : - getAnalysisIfAvailable<TargetTransformInfo>(); - VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; + TTI = IgnoreTargetInfo ? 0 : &getAnalysis<TargetTransformInfo>(); return vectorizeBB(BB); } @@ -438,6 +443,7 @@ namespace { AU.addRequired<AliasAnalysis>(); AU.addRequired<DominatorTree>(); AU.addRequired<ScalarEvolution>(); + AU.addRequired<TargetTransformInfo>(); AU.addPreserved<AliasAnalysis>(); AU.addPreserved<DominatorTree>(); AU.addPreserved<ScalarEvolution>(); @@ -467,18 +473,18 @@ namespace { static inline void getInstructionTypes(Instruction *I, Type *&T1, Type *&T2) { - if (isa<StoreInst>(I)) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) { // For stores, it is the value type, not the pointer type that matters // because the value is what will come from a vector register. - Value *IVal = cast<StoreInst>(I)->getValueOperand(); + Value *IVal = SI->getValueOperand(); T1 = IVal->getType(); } else { T1 = I->getType(); } - if (I->isCast()) - T2 = cast<CastInst>(I)->getSrcTy(); + if (CastInst *CI = dyn_cast<CastInst>(I)) + T2 = CI->getSrcTy(); else T2 = T1; @@ -504,7 +510,7 @@ namespace { // InsertElement and ExtractElement have a depth factor of zero. This is // for two reasons: First, they cannot be usefully fused. Second, because // the pass generates a lot of these, they can confuse the simple metric - // used to compare the trees in the next iteration. Thus, giving them a + // used to compare the dags in the next iteration. Thus, giving them a // weight of zero allows the pass to essentially ignore them in // subsequent iterations when looking for vectorization opportunities // while still tracking dependency chains that flow through those @@ -520,7 +526,7 @@ namespace { return 1; } - // Returns the cost of the provided instruction using VTTI. + // Returns the cost of the provided instruction using TTI. // This does not handle loads and stores. unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { switch (Opcode) { @@ -531,7 +537,7 @@ namespace { // generate vector GEPs. return 0; case Instruction::Br: - return VTTI->getCFInstrCost(Opcode); + return TTI->getCFInstrCost(Opcode); case Instruction::PHI: return 0; case Instruction::Add: @@ -552,11 +558,11 @@ namespace { case Instruction::And: case Instruction::Or: case Instruction::Xor: - return VTTI->getArithmeticInstrCost(Opcode, T1); + return TTI->getArithmeticInstrCost(Opcode, T1); case Instruction::Select: case Instruction::ICmp: case Instruction::FCmp: - return VTTI->getCmpSelInstrCost(Opcode, T1, T2); + return TTI->getCmpSelInstrCost(Opcode, T1, T2); case Instruction::ZExt: case Instruction::SExt: case Instruction::FPToUI: @@ -570,7 +576,7 @@ namespace { case Instruction::FPTrunc: case Instruction::BitCast: case Instruction::ShuffleVector: - return VTTI->getCastInstrCost(Opcode, T1, T2); + return TTI->getCastInstrCost(Opcode, T1, T2); } return 1; @@ -642,7 +648,7 @@ namespace { Function *F = I->getCalledFunction(); if (!F) return false; - unsigned IID = F->getIntrinsicID(); + Intrinsic::ID IID = (Intrinsic::ID) F->getIntrinsicID(); if (!IID) return false; switch(IID) { @@ -660,23 +666,11 @@ namespace { case Intrinsic::pow: return Config.VectorizeMath; case Intrinsic::fma: + case Intrinsic::fmuladd: return Config.VectorizeFMA; } } - // Returns true if J is the second element in some pair referenced by - // some multimap pair iterator pair. - template <typename V> - bool isSecondInIteratorPair(V J, std::pair< - typename std::multimap<V, V>::iterator, - typename std::multimap<V, V>::iterator> PairRange) { - for (typename std::multimap<V, V>::iterator K = PairRange.first; - K != PairRange.second; ++K) - if (K->second == J) return true; - - return false; - } - bool isPureIEChain(InsertElementInst *IE) { InsertElementInst *IENext = IE; do { @@ -701,11 +695,12 @@ namespace { DenseMap<Value *, Value *> AllChosenPairs; DenseSet<ValuePair> AllFixedOrderPairs; DenseMap<VPPair, unsigned> AllPairConnectionTypes; - std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; + DenseMap<ValuePair, std::vector<ValuePair> > AllConnectedPairs, + AllConnectedPairDeps; do { std::vector<Value *> PairableInsts; - std::multimap<Value *, Value *> CandidatePairs; + DenseMap<Value *, std::vector<Value *> > CandidatePairs; DenseSet<ValuePair> FixedOrderPairs; DenseMap<ValuePair, int> CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, @@ -714,6 +709,14 @@ namespace { PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; + // Build the candidate pair set for faster lookups. + DenseSet<ValuePair> CandidatePairsSet; + for (DenseMap<Value *, std::vector<Value *> >::iterator I = + CandidatePairs.begin(), E = CandidatePairs.end(); I != E; ++I) + for (std::vector<Value *>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + CandidatePairsSet.insert(ValuePair(I->first, *J)); + // Now we have a map of all of the pairable instructions and we need to // select the best possible pairing. A good pairing is one such that the // users of the pair are also paired. This defines a (directed) forest @@ -723,30 +726,33 @@ namespace { // Note that it only matters that both members of the second pair use some // element of the first pair (to allow for splatting). - std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; + DenseMap<ValuePair, std::vector<ValuePair> > ConnectedPairs, + ConnectedPairDeps; DenseMap<VPPair, unsigned> PairConnectionTypes; - computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, - PairConnectionTypes); + computeConnectedPairs(CandidatePairs, CandidatePairsSet, + PairableInsts, ConnectedPairs, PairConnectionTypes); if (ConnectedPairs.empty()) continue; - for (std::multimap<ValuePair, ValuePair>::iterator + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) { - ConnectedPairDeps.insert(VPPair(I->second, I->first)); - } + I != IE; ++I) + for (std::vector<ValuePair>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + ConnectedPairDeps[*J].push_back(I->first); // Build the pairable-instruction dependency map DenseSet<ValuePair> PairableInstUsers; buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); // There is now a graph of the connected pairs. For each variable, pick - // the pairing with the largest tree meeting the depth requirement on at - // least one branch. Then select all pairings that are part of that tree + // the pairing with the largest dag meeting the depth requirement on at + // least one branch. Then select all pairings that are part of that dag // and remove them from the list of available pairings and pairable // variables. DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, CandidatePairCostSavings, + choosePairs(CandidatePairs, CandidatePairsSet, + CandidatePairCostSavings, PairableInsts, FixedOrderPairs, PairConnectionTypes, ConnectedPairs, ConnectedPairDeps, PairableInstUsers, ChosenPairs); @@ -780,14 +786,15 @@ namespace { } } - for (std::multimap<ValuePair, ValuePair>::iterator + for (DenseMap<ValuePair, std::vector<ValuePair> >::iterator I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); - I != IE; ++I) { - if (AllPairConnectionTypes.count(*I)) { - AllConnectedPairs.insert(*I); - AllConnectedPairDeps.insert(VPPair(I->second, I->first)); - } - } + I != IE; ++I) + for (std::vector<ValuePair>::iterator J = I->second.begin(), + JE = I->second.end(); J != JE; ++J) + if (AllPairConnectionTypes.count(VPPair(I->first, *J))) { + AllConnectedPairs[I->first].push_back(*J); + AllConnectedPairDeps[*J].push_back(I->first); + } } while (ShouldContinue); if (AllChosenPairs.empty()) return false; @@ -903,8 +910,8 @@ namespace { T2->getScalarType()->isPointerTy())) return false; - if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits)) + if (!TTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits)) return false; return true; @@ -913,7 +920,7 @@ namespace { // This function returns true if the two provided instructions are compatible // (meaning that they can be fused into a vector instruction). This assumes // that I has already been determined to be vectorizable and that J is not - // in the use tree of I. + // in the use dag of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, bool IsSimpleLoadStore, bool NonPow2Len, int &CostSavings, int &FixedOrder) { @@ -935,7 +942,7 @@ namespace { unsigned MaxTypeBits = std::max( IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (!VTTI && MaxTypeBits > Config.VectorBits) + if (!TTI && MaxTypeBits > Config.VectorBits) return false; // FIXME: handle addsub-type operations! @@ -967,21 +974,26 @@ namespace { return false; } - if (VTTI) { - unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(), - IAlignment, IAddressSpace); - unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(), - JAlignment, JAddressSpace); - unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType, - BottomAlignment, - IAddressSpace); + if (TTI) { + unsigned ICost = TTI->getMemoryOpCost(I->getOpcode(), aTypeI, + IAlignment, IAddressSpace); + unsigned JCost = TTI->getMemoryOpCost(J->getOpcode(), aTypeJ, + JAlignment, JAddressSpace); + unsigned VCost = TTI->getMemoryOpCost(I->getOpcode(), VType, + BottomAlignment, + IAddressSpace); + + ICost += TTI->getAddressComputationCost(aTypeI); + JCost += TTI->getAddressComputationCost(aTypeJ); + VCost += TTI->getAddressComputationCost(VType); + if (VCost > ICost + JCost) return false; // We don't want to fuse to a type that will be split, even // if the two input types will also be split and there is no other // associated cost. - unsigned VParts = VTTI->getNumberOfParts(VType); + unsigned VParts = TTI->getNumberOfParts(VType); if (VParts > 1) return false; else if (!VParts && VCost == ICost + JCost) @@ -992,11 +1004,17 @@ namespace { } else { return false; } - } else if (VTTI) { + } else if (TTI) { unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); Type *VT1 = getVecTypeForPair(IT1, JT1), *VT2 = getVecTypeForPair(IT2, JT2); + + // Note that this procedure is incorrect for insert and extract element + // instructions (because combining these often results in a shuffle), + // but this cost is ignored (because insert and extract element + // instructions are assigned a zero depth factor and are not really + // fused in general). unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); if (VCost > ICost + JCost) @@ -1005,8 +1023,8 @@ namespace { // We don't want to fuse to a type that will be split, even // if the two input types will also be split and there is no other // associated cost. - unsigned VParts1 = VTTI->getNumberOfParts(VT1), - VParts2 = VTTI->getNumberOfParts(VT2); + unsigned VParts1 = TTI->getNumberOfParts(VT1), + VParts2 = TTI->getNumberOfParts(VT2); if (VParts1 > 1 || VParts2 > 1) return false; else if ((!VParts1 || !VParts2) && VCost == ICost + JCost) @@ -1019,14 +1037,67 @@ namespace { // vectorized, the second arguments must be equal. CallInst *CI = dyn_cast<CallInst>(I); Function *FI; - if (CI && (FI = CI->getCalledFunction()) && - FI->getIntrinsicID() == Intrinsic::powi) { - - Value *A1I = CI->getArgOperand(1), - *A1J = cast<CallInst>(J)->getArgOperand(1); - const SCEV *A1ISCEV = SE->getSCEV(A1I), - *A1JSCEV = SE->getSCEV(A1J); - return (A1ISCEV == A1JSCEV); + if (CI && (FI = CI->getCalledFunction())) { + Intrinsic::ID IID = (Intrinsic::ID) FI->getIntrinsicID(); + if (IID == Intrinsic::powi) { + Value *A1I = CI->getArgOperand(1), + *A1J = cast<CallInst>(J)->getArgOperand(1); + const SCEV *A1ISCEV = SE->getSCEV(A1I), + *A1JSCEV = SE->getSCEV(A1J); + return (A1ISCEV == A1JSCEV); + } + + if (IID && TTI) { + SmallVector<Type*, 4> Tys; + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) + Tys.push_back(CI->getArgOperand(i)->getType()); + unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys); + + Tys.clear(); + CallInst *CJ = cast<CallInst>(J); + for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i) + Tys.push_back(CJ->getArgOperand(i)->getType()); + unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys); + + Tys.clear(); + assert(CI->getNumArgOperands() == CJ->getNumArgOperands() && + "Intrinsic argument counts differ"); + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (IID == Intrinsic::powi && i == 1) + Tys.push_back(CI->getArgOperand(i)->getType()); + else + Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(), + CJ->getArgOperand(i)->getType())); + } + + Type *RetTy = getVecTypeForPair(IT1, JT1); + unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys); + + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned RetParts = TTI->getNumberOfParts(RetTy); + if (RetParts > 1) + return false; + else if (!RetParts && VCost == ICost + JCost) + return false; + + for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) { + if (!Tys[i]->isVectorTy()) + continue; + + unsigned NumParts = TTI->getNumberOfParts(Tys[i]); + if (NumParts > 1) + return false; + else if (!NumParts && VCost == ICost + JCost) + return false; + } + + CostSavings = ICost + JCost - VCost; + } } return true; @@ -1040,7 +1111,7 @@ namespace { // to contain any memory locations to which J writes. The function returns // true if J uses I. By default, alias analysis is used to determine // whether J reads from memory that overlaps with a location in WriteSet. - // If LoadMoveSet is not null, then it is a previously-computed multimap + // If LoadMoveSet is not null, then it is a previously-computed map // where the key is the memory-based user instruction and the value is // the instruction to be compared with I. So, if LoadMoveSet is provided, // then the alias analysis is not used. This is necessary because this @@ -1050,7 +1121,7 @@ namespace { bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, Instruction *J, bool UpdateUsers, - std::multimap<Value *, Value *> *LoadMoveSet) { + DenseSet<ValuePair> *LoadMoveSetPairs) { bool UsesI = false; // This instruction may already be marked as a user due, for example, to @@ -1068,9 +1139,8 @@ namespace { } } if (!UsesI && J->mayReadFromMemory()) { - if (LoadMoveSet) { - VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); - UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); + if (LoadMoveSetPairs) { + UsesI = LoadMoveSetPairs->count(ValuePair(J, I)); } else { for (AliasSetTracker::iterator W = WriteSet.begin(), WE = WriteSet.end(); W != WE; ++W) { @@ -1094,10 +1164,11 @@ namespace { // basic block and collects all candidate pairs for vectorization. bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, - std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<Value *, std::vector<Value *> > &CandidatePairs, DenseSet<ValuePair> &FixedOrderPairs, DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len) { + size_t TotalPairs = 0; BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -1143,8 +1214,9 @@ namespace { PairableInsts.push_back(I); } - CandidatePairs.insert(ValuePair(I, J)); - if (VTTI) + CandidatePairs[I].push_back(J); + ++TotalPairs; + if (TTI) CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), CostSavings)); @@ -1167,7 +1239,8 @@ namespace { // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected // during this invocation. - if (PairableInsts.size() >= Config.MaxInsts) { + if (PairableInsts.size() >= Config.MaxInsts || + TotalPairs >= Config.MaxPairs) { ShouldContinue = true; break; } @@ -1187,11 +1260,12 @@ namespace { // it looks for pairs such that both members have an input which is an // output of PI or PJ. void BBVectorize::computePairsConnectedTo( - std::multimap<Value *, Value *> &CandidatePairs, - std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs, - DenseMap<VPPair, unsigned> &PairConnectionTypes, - ValuePair P) { |