diff options
author | Derek Schuff <dschuff@chromium.org> | 2012-11-06 10:23:47 -0800 |
---|---|---|
committer | Derek Schuff <dschuff@chromium.org> | 2012-11-06 10:23:47 -0800 |
commit | 5bcab54cfde18b4b11f163d7d916711df70cbebf (patch) | |
tree | c5774bfc00faa412178497d9ae92dea73d717a7c /lib/Transforms/Vectorize/BBVectorize.cpp | |
parent | 96cb06677afe87ea958bf986ca2b9fb87daa2da1 (diff) | |
parent | cfe09ed28d8a65b671e8b7a716a933e98e810e32 (diff) |
Merge commit 'cfe09ed28d8a65b671e8b7a716a933e98e810e32'
Conflicts:
lib/Target/ARM/ARMFrameLowering.cpp
lib/Target/Mips/MipsRegisterInfo.cpp
lib/Target/X86/X86ISelLowering.cpp
lib/Transforms/IPO/ExtractGV.cpp
tools/Makefile
tools/gold/gold-plugin.cpp
The only interesting conflict was X86ISelLowering.ccp, which
meant I had to essentially revert r167104. The problem is that we are
using ESP as the stack pointer in X86ISelLowering and RSP as the
stack pointer in X86FrameLowering, and that revision made them
both consistently use X86RegisterInfo to determine which to use.
Diffstat (limited to 'lib/Transforms/Vectorize/BBVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/BBVectorize.cpp | 898 |
1 files changed, 703 insertions, 195 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 81125f22a6..4653a7d7c8 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -43,16 +43,26 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ValueHandle.h" #include "llvm/DataLayout.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Vectorize.h" #include <algorithm> #include <map> using namespace llvm; +static cl::opt<bool> +IgnoreTargetInfo("bb-vectorize-ignore-target-info", cl::init(false), + cl::Hidden, cl::desc("Ignore target information")); + static cl::opt<unsigned> ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, cl::desc("The required chain depth for vectorization")); +static cl::opt<bool> +UseChainDepthWithTI("bb-vectorize-use-chain-depth", cl::init(false), + cl::Hidden, cl::desc("Use the chain depth requirement with" + " target information")); + static cl::opt<unsigned> SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, cl::desc("The maximum search distance for instruction pairs")); @@ -94,8 +104,9 @@ static cl::opt<bool> NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, cl::desc("Don't try to vectorize floating-point values")); +// FIXME: This should default to false once pointer vector support works. static cl::opt<bool> -NoPointers("bb-vectorize-no-pointers", cl::init(false), cl::Hidden, +NoPointers("bb-vectorize-no-pointers", cl::init(/*false*/ true), cl::Hidden, cl::desc("Don't try to vectorize pointer values")); static cl::opt<bool> @@ -160,6 +171,12 @@ DebugCycleCheck("bb-vectorize-debug-cycle-check", cl::init(false), cl::Hidden, cl::desc("When debugging is enabled, output information on the" " cycle-checking process")); + +static cl::opt<bool> +PrintAfterEveryPair("bb-vectorize-debug-print-after-every-pair", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, dump the basic block after" + " every pair is fused")); #endif STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); @@ -181,11 +198,16 @@ namespace { DT = &P->getAnalysis<DominatorTree>(); SE = &P->getAnalysis<ScalarEvolution>(); TD = P->getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : + P->getAnalysisIfAvailable<TargetTransformInfo>(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; } typedef std::pair<Value *, Value *> ValuePair; + typedef std::pair<ValuePair, int> ValuePairWithCost; typedef std::pair<ValuePair, size_t> ValuePairWithDepth; typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair + typedef std::pair<VPPair, unsigned> VPPairWithType; typedef std::pair<std::multimap<Value *, Value *>::iterator, std::multimap<Value *, Value *>::iterator> VPIteratorPair; typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, @@ -196,6 +218,8 @@ namespace { DominatorTree *DT; ScalarEvolution *SE; DataLayout *TD; + TargetTransformInfo *TTI; + const VectorTargetTransformInfo *VTTI; // FIXME: const correct? @@ -204,11 +228,23 @@ namespace { bool getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len); + // FIXME: The current implementation does not account for pairs that + // are connected in multiple ways. For example: + // C1 = A1 / A2; C2 = A2 / A1 (which may be both direct and a swap) + enum PairConnectionType { + PairConnectionDirect, + PairConnectionSwap, + PairConnectionSplat + }; + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs); + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes); void buildDepMap(BasicBlock &BB, std::multimap<Value *, Value *> &CandidatePairs, @@ -216,19 +252,29 @@ namespace { DenseSet<ValuePair> &PairableInstUsers); void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs); void fuseChosenPairs(BasicBlock &BB, std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *>& ChosenPairs); + DenseMap<Value *, Value *>& ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps); + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); bool areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len); + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings, int &FixedOrder); bool trackUsesOfI(DenseSet<Value *> &Users, AliasSetTracker &WriteSet, Instruction *I, @@ -239,6 +285,7 @@ namespace { std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, ValuePair P); bool pairsConflict(ValuePair P, ValuePair Q, @@ -270,17 +317,21 @@ namespace { void findBestTreeFor( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, std::multimap<ValuePair, ValuePair> &PairableInstUserMap, DenseMap<Value *, Value *> &ChosenPairs, DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck); Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs); + Instruction *J, unsigned o); void fillNewShuffleMask(LLVMContext& Context, Instruction *J, unsigned MaskOffset, unsigned NumInElem, @@ -292,20 +343,20 @@ namespace { bool expandIEChain(LLVMContext& Context, Instruction *I, Instruction *J, unsigned o, Value *&LOp, unsigned numElemL, - Type *ArgTypeL, Type *ArgTypeR, + Type *ArgTypeL, Type *ArgTypeR, bool IBeforeJ, unsigned IdxOff = 0); Value *getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs); + Instruction *J, unsigned o, bool IBeforeJ); void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool FlipMemInputs); + bool IBeforeJ); void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, Instruction *&K1, - Instruction *&K2, bool FlipMemInputs); + Instruction *&K2); void collectPairLoadMoveSet(BasicBlock &BB, DenseMap<Value *, Value *> &ChosenPairs, @@ -317,10 +368,6 @@ namespace { DenseMap<Value *, Value *> &ChosenPairs, std::multimap<Value *, Value *> &LoadMoveSet); - void collectPtrInfo(std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<Value *> &LowPtrInsts); - bool canMoveUsesOfIAfterJ(BasicBlock &BB, std::multimap<Value *, Value *> &LoadMoveSet, Instruction *I, Instruction *J); @@ -339,13 +386,16 @@ namespace { return false; } + DEBUG(if (VTTI) dbgs() << "BBV: using target information\n"); + bool changed = false; // Iterate a sufficient number of times to merge types of size 1 bit, // then 2 bits, then 4, etc. up to half of the target vector width of the // target vector register. unsigned n = 1; for (unsigned v = 2; - v <= Config.VectorBits && (!Config.MaxIter || n <= Config.MaxIter); + (VTTI || v <= Config.VectorBits) && + (!Config.MaxIter || n <= Config.MaxIter); v *= 2, ++n) { DEBUG(dbgs() << "BBV: fusing loop #" << n << " for " << BB.getName() << " in " << @@ -375,6 +425,9 @@ namespace { DT = &getAnalysis<DominatorTree>(); SE = &getAnalysis<ScalarEvolution>(); TD = getAnalysisIfAvailable<DataLayout>(); + TTI = IgnoreTargetInfo ? 0 : + getAnalysisIfAvailable<TargetTransformInfo>(); + VTTI = TTI ? TTI->getVectorTargetTransformInfo() : 0; return vectorizeBB(BB); } @@ -427,6 +480,10 @@ namespace { T2 = cast<CastInst>(I)->getSrcTy(); else T2 = T1; + + if (SelectInst *SI = dyn_cast<SelectInst>(I)) { + T2 = SI->getCondition()->getType(); + } } // Returns the weight associated with the provided value. A chain of @@ -458,6 +515,62 @@ namespace { return 1; } + // Returns the cost of the provided instruction using VTTI. + // This does not handle loads and stores. + unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2) { + switch (Opcode) { + default: break; + case Instruction::GetElementPtr: + // We mark this instruction as zero-cost because scalar GEPs are usually + // lowered to the intruction addressing mode. At the moment we don't + // generate vector GEPs. + return 0; + case Instruction::Br: + return VTTI->getCFInstrCost(Opcode); + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: + return VTTI->getArithmeticInstrCost(Opcode, T1); + case Instruction::Select: + case Instruction::ICmp: + case Instruction::FCmp: + return VTTI->getCmpSelInstrCost(Opcode, T1, T2); + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::ShuffleVector: + return VTTI->getCastInstrCost(Opcode, T1, T2); + } + + return 1; + } + // This determines the relative offset of two loads or stores, returning // true if the offset could be determined to be some constant value. // For example, if OffsetInElmts == 1, then J accesses the memory directly @@ -465,20 +578,30 @@ namespace { // directly after J. bool getPairPtrInfo(Instruction *I, Instruction *J, Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, - int64_t &OffsetInElmts) { + unsigned &IAddressSpace, unsigned &JAddressSpace, + int64_t &OffsetInElmts, bool ComputeOffset = true) { OffsetInElmts = 0; - if (isa<LoadInst>(I)) { - IPtr = cast<LoadInst>(I)->getPointerOperand(); - JPtr = cast<LoadInst>(J)->getPointerOperand(); - IAlignment = cast<LoadInst>(I)->getAlignment(); - JAlignment = cast<LoadInst>(J)->getAlignment(); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + LoadInst *LJ = cast<LoadInst>(J); + IPtr = LI->getPointerOperand(); + JPtr = LJ->getPointerOperand(); + IAlignment = LI->getAlignment(); + JAlignment = LJ->getAlignment(); + IAddressSpace = LI->getPointerAddressSpace(); + JAddressSpace = LJ->getPointerAddressSpace(); } else { - IPtr = cast<StoreInst>(I)->getPointerOperand(); - JPtr = cast<StoreInst>(J)->getPointerOperand(); - IAlignment = cast<StoreInst>(I)->getAlignment(); - JAlignment = cast<StoreInst>(J)->getAlignment(); + StoreInst *SI = cast<StoreInst>(I), *SJ = cast<StoreInst>(J); + IPtr = SI->getPointerOperand(); + JPtr = SJ->getPointerOperand(); + IAlignment = SI->getAlignment(); + JAlignment = SJ->getAlignment(); + IAddressSpace = SI->getPointerAddressSpace(); + JAddressSpace = SJ->getPointerAddressSpace(); } + if (!ComputeOffset) + return true; + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); const SCEV *JPtrSCEV = SE->getSCEV(JPtr); @@ -558,11 +681,18 @@ namespace { std::vector<Value *> AllPairableInsts; DenseMap<Value *, Value *> AllChosenPairs; + DenseSet<ValuePair> AllFixedOrderPairs; + DenseMap<VPPair, unsigned> AllPairConnectionTypes; + std::multimap<ValuePair, ValuePair> AllConnectedPairs, AllConnectedPairDeps; do { std::vector<Value *> PairableInsts; std::multimap<Value *, Value *> CandidatePairs; + DenseSet<ValuePair> FixedOrderPairs; + DenseMap<ValuePair, int> CandidatePairCostSavings; ShouldContinue = getCandidatePairs(BB, Start, CandidatePairs, + FixedOrderPairs, + CandidatePairCostSavings, PairableInsts, NonPow2Len); if (PairableInsts.empty()) continue; @@ -575,10 +705,18 @@ namespace { // Note that it only matters that both members of the second pair use some // element of the first pair (to allow for splatting). - std::multimap<ValuePair, ValuePair> ConnectedPairs; - computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); + std::multimap<ValuePair, ValuePair> ConnectedPairs, ConnectedPairDeps; + DenseMap<VPPair, unsigned> PairConnectionTypes; + computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs, + PairConnectionTypes); if (ConnectedPairs.empty()) continue; + for (std::multimap<ValuePair, ValuePair>::iterator + I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); + I != IE; ++I) { + ConnectedPairDeps.insert(VPPair(I->second, I->first)); + } + // Build the pairable-instruction dependency map DenseSet<ValuePair> PairableInstUsers; buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); @@ -590,13 +728,48 @@ namespace { // variables. DenseMap<Value *, Value *> ChosenPairs; - choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + choosePairs(CandidatePairs, CandidatePairCostSavings, + PairableInsts, FixedOrderPairs, PairConnectionTypes, + ConnectedPairs, ConnectedPairDeps, PairableInstUsers, ChosenPairs); if (ChosenPairs.empty()) continue; AllPairableInsts.insert(AllPairableInsts.end(), PairableInsts.begin(), PairableInsts.end()); AllChosenPairs.insert(ChosenPairs.begin(), ChosenPairs.end()); + + // Only for the chosen pairs, propagate information on fixed-order pairs, + // pair connections, and their types to the data structures used by the + // pair fusion procedures. + for (DenseMap<Value *, Value *>::iterator I = ChosenPairs.begin(), + IE = ChosenPairs.end(); I != IE; ++I) { + if (FixedOrderPairs.count(*I)) + AllFixedOrderPairs.insert(*I); + else if (FixedOrderPairs.count(ValuePair(I->second, I->first))) + AllFixedOrderPairs.insert(ValuePair(I->second, I->first)); + + for (DenseMap<Value *, Value *>::iterator J = ChosenPairs.begin(); + J != IE; ++J) { + DenseMap<VPPair, unsigned>::iterator K = + PairConnectionTypes.find(VPPair(*I, *J)); + if (K != PairConnectionTypes.end()) { + AllPairConnectionTypes.insert(*K); + } else { + K = PairConnectionTypes.find(VPPair(*J, *I)); + if (K != PairConnectionTypes.end()) + AllPairConnectionTypes.insert(*K); + } + } + } + + for (std::multimap<ValuePair, ValuePair>::iterator + I = ConnectedPairs.begin(), IE = ConnectedPairs.end(); + I != IE; ++I) { + if (AllPairConnectionTypes.count(*I)) { + AllConnectedPairs.insert(*I); + AllConnectedPairDeps.insert(VPPair(I->second, I->first)); + } + } } while (ShouldContinue); if (AllChosenPairs.empty()) return false; @@ -609,7 +782,9 @@ namespace { // replaced with a vector_extract on the result. Subsequent optimization // passes should coalesce the build/extract combinations. - fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs); + fuseChosenPairs(BB, AllPairableInsts, AllChosenPairs, AllFixedOrderPairs, + AllPairConnectionTypes, + AllConnectedPairs, AllConnectedPairDeps); // It is important to cleanup here so that future iterations of this // function have less work to do. @@ -679,15 +854,22 @@ namespace { !(VectorType::isValidElementType(T2) || T2->isVectorTy())) return false; - if (T1->getScalarSizeInBits() == 1 && T2->getScalarSizeInBits() == 1) { + if (T1->getScalarSizeInBits() == 1) { if (!Config.VectorizeBools) return false; } else { - if (!Config.VectorizeInts - && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + if (!Config.VectorizeInts && T1->isIntOrIntVectorTy()) return false; } - + + if (T2->getScalarSizeInBits() == 1) { + if (!Config.VectorizeBools) + return false; + } else { + if (!Config.VectorizeInts && T2->isIntOrIntVectorTy()) + return false; + } + if (!Config.VectorizeFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) return false; @@ -703,8 +885,8 @@ namespace { T2->getScalarType()->isPointerTy())) return false; - if (T1->getPrimitiveSizeInBits() >= Config.VectorBits || - T2->getPrimitiveSizeInBits() >= Config.VectorBits) + if (!VTTI && (T1->getPrimitiveSizeInBits() >= Config.VectorBits || + T2->getPrimitiveSizeInBits() >= Config.VectorBits)) return false; return true; @@ -715,10 +897,14 @@ namespace { // that I has already been determined to be vectorizable and that J is not // in the use tree of I. bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, - bool IsSimpleLoadStore, bool NonPow2Len) { + bool IsSimpleLoadStore, bool NonPow2Len, + int &CostSavings, int &FixedOrder) { DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << " <-> " << *J << "\n"); + CostSavings = 0; + FixedOrder = 0; + // Loads and stores can be merged if they have different alignments, // but are otherwise the same. if (!J->isSameOperationAs(I, Instruction::CompareIgnoringAlignment | @@ -731,38 +917,83 @@ namespace { unsigned MaxTypeBits = std::max( IT1->getPrimitiveSizeInBits() + JT1->getPrimitiveSizeInBits(), IT2->getPrimitiveSizeInBits() + JT2->getPrimitiveSizeInBits()); - if (MaxTypeBits > Config.VectorBits) + if (!VTTI && MaxTypeBits > Config.VectorBits) return false; // FIXME: handle addsub-type operations! if (IsSimpleLoadStore) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts = 0; if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + IAddressSpace, JAddressSpace, OffsetInElmts) && abs64(OffsetInElmts) == 1) { - if (Config.AlignedOnly) { - Type *aTypeI = isa<StoreInst>(I) ? - cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); - Type *aTypeJ = isa<StoreInst>(J) ? - cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + FixedOrder = (int) OffsetInElmts; + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + Type *aTypeI = isa<StoreInst>(I) ? + cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + Type *aTypeJ = isa<StoreInst>(J) ? + cast<StoreInst>(J)->getValueOperand()->getType() : J->getType(); + Type *VType = getVecTypeForPair(aTypeI, aTypeJ); + + if (Config.AlignedOnly) { // An aligned load or store is possible only if the instruction // with the lower offset has an alignment suitable for the // vector type. - unsigned BottomAlignment = IAlignment; - if (OffsetInElmts < 0) BottomAlignment = JAlignment; - - Type *VType = getVecTypeForPair(aTypeI, aTypeJ); unsigned VecAlignment = TD->getPrefTypeAlignment(VType); if (BottomAlignment < VecAlignment) return false; } + + if (VTTI) { + unsigned ICost = VTTI->getMemoryOpCost(I->getOpcode(), I->getType(), + IAlignment, IAddressSpace); + unsigned JCost = VTTI->getMemoryOpCost(J->getOpcode(), J->getType(), + JAlignment, JAddressSpace); + unsigned VCost = VTTI->getMemoryOpCost(I->getOpcode(), VType, + BottomAlignment, + IAddressSpace); + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts = VTTI->getNumberOfParts(VType); + if (VParts > 1) + return false; + else if (!VParts && VCost == ICost + JCost) + return false; + + CostSavings = ICost + JCost - VCost; + } } else { return false; } + } else if (VTTI) { + unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2); + unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2); + Type *VT1 = getVecTypeForPair(IT1, JT1), + *VT2 = getVecTypeForPair(IT2, JT2); + unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2); + + if (VCost > ICost + JCost) + return false; + + // We don't want to fuse to a type that will be split, even + // if the two input types will also be split and there is no other + // associated cost. + unsigned VParts = VTTI->getNumberOfParts(VT1); + if (VParts > 1) + return false; + else if (!VParts && VCost == ICost + JCost) + return false; + + CostSavings = ICost + JCost - VCost; } // The powi intrinsic is special because only the first argument is @@ -845,6 +1076,8 @@ namespace { bool BBVectorize::getCandidatePairs(BasicBlock &BB, BasicBlock::iterator &Start, std::multimap<Value *, Value *> &CandidatePairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, bool NonPow2Len) { BasicBlock::iterator E = BB.end(); if (Start == E) return false; @@ -881,7 +1114,9 @@ namespace { // J does not use I, and comes before the first use of I, so it can be // merged with I if the instructions are compatible. - if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len)) continue; + int CostSavings, FixedOrder; + if (!areInstsCompatible(I, J, IsSimpleLoadStore, NonPow2Len, + CostSavings, FixedOrder)) continue; // J is a candidate for merging with I. if (!PairableInsts.size() || @@ -890,6 +1125,14 @@ namespace { } CandidatePairs.insert(ValuePair(I, J)); + if (VTTI) + CandidatePairCostSavings.insert(ValuePairWithCost(ValuePair(I, J), + CostSavings)); + + if (FixedOrder == 1) + FixedOrderPairs.insert(ValuePair(I, J)); + else if (FixedOrder == -1) + FixedOrderPairs.insert(ValuePair(J, I)); // The next call to this function must start after the last instruction // selected during this invocation. @@ -899,7 +1142,8 @@ namespace { } DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " - << *I << " <-> " << *J << "\n"); + << *I << " <-> " << *J << " (cost savings: " << + CostSavings << ")\n"); // If we have already found too many pairs, break here and this function // will be called again starting after the last instruction selected @@ -927,6 +1171,7 @@ namespace { std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, ValuePair P) { StoreInst *SI, *SJ; @@ -958,12 +1203,18 @@ namespace { VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); // Look for <I, J>: - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionDirect)); + } // Look for <J, I>: - if (isSecondInIteratorPair<Value*>(*I, JPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + if (isSecondInIteratorPair<Value*>(*I, JPairRange)) { + VPPair VP(P, ValuePair(*J, *I)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSwap)); + } } if (Config.SplatBreaksChain) continue; @@ -974,8 +1225,11 @@ namespace { P.first == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); + } } } @@ -997,8 +1251,11 @@ namespace { P.second == SJ->getPointerOperand()) continue; - if (isSecondInIteratorPair<Value*>(*J, IPairRange)) - ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) { + VPPair VP(P, ValuePair(*I, *J)); + ConnectedPairs.insert(VP); + PairConnectionTypes.insert(VPPairWithType(VP, PairConnectionSplat)); + } } } } @@ -1009,7 +1266,8 @@ namespace { void BBVectorize::computeConnectedPairs( std::multimap<Value *, Value *> &CandidatePairs, std::vector<Value *> &PairableInsts, - std::multimap<ValuePair, ValuePair> &ConnectedPairs) { + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes) { for (std::vector<Value *>::iterator PI = PairableInsts.begin(), PE = PairableInsts.end(); PI != PE; ++PI) { @@ -1018,7 +1276,7 @@ namespace { for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; P != choiceRange.second; ++P) computePairsConnectedTo(CandidatePairs, PairableInsts, - ConnectedPairs, *P); + ConnectedPairs, PairConnectionTypes, *P); } DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() @@ -1353,13 +1611,17 @@ namespace { // pairs, given the choice of root pairs as an iterator range. void BBVectorize::findBestTreeFor( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, std::multimap<ValuePair, ValuePair> &PairableInstUserMap, DenseMap<Value *, Value *> &ChosenPairs, DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, - size_t &BestEffSize, VPIteratorPair ChoiceRange, + int &BestEffSize, VPIteratorPair ChoiceRange, bool UseCycleCheck) { for (std::multimap<Value *, Value *>::iterator J = ChoiceRange.first; J != ChoiceRange.second; ++J) { @@ -1409,17 +1671,243 @@ namespace { PairableInstUsers, PairableInstUserMap, ChosenPairs, Tree, PrunedTree, *J, UseCycleCheck); - size_t EffSize = 0; - for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), - E = PrunedTree.end(); S != E; ++S) - EffSize += getDepthFactor(S->first); + int EffSize = 0; + if (VTTI) { + DenseSet<Value *> PrunedTreeInstrs; + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + PrunedTreeInstrs.insert(S->first); + PrunedTreeInstrs.insert(S->second); + } + + // The set of pairs that have already contributed to the total cost. + DenseSet<ValuePair> IncomingPairs; + + // The node weights represent the cost savings associated with + // fusing the pair of instructions. + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) { + bool FlipOrder = false; + + if (getDepthFactor(S->first)) { + int ESContrib = CandidatePairCostSavings.find(*S)->second; + DEBUG(if (DebugPairSelection) dbgs() << "\tweight {" + << *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize += ESContrib; + } + + // The edge weights contribute in a negative sense: they represent + // the cost of shuffles. + VPPIteratorPair IP = ConnectedPairDeps.equal_range(*S); + if (IP.first != ConnectedPairDeps.end()) { + unsigned NumDepsDirect = 0, NumDepsSwap = 0; + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + ++NumDepsDirect; + else if (R->second == PairConnectionSwap) + ++NumDepsSwap; + } + + // If there are more swaps than direct connections, then + // the pair order will be flipped during fusion. So the real + // number of swaps is the minimum number. + FlipOrder = !FixedOrderPairs.count(*S) && + ((NumDepsSwap > NumDepsDirect) || + FixedOrderPairs.count(ValuePair(S->second, S->first))); + + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + if (!PrunedTree.count(Q->second)) + continue; + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + Type *Ty1 = Q->second.first->getType(), + *Ty2 = Q->second.second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + if ((R->second == PairConnectionDirect && FlipOrder) || + (R->second == PairConnectionSwap && !FlipOrder) || + R->second == PairConnectionSplat) { + int ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *Q->second.first << " <-> " << *Q->second.second << + "} -> {" << + *S->first << " <-> " << *S->second << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + } + } + } + + // Compute the cost of outgoing edges. We assume that edges outgoing + // to shuffles, inserts or extracts can be merged, and so contribute + // no additional cost. + if (!S->first->getType()->isVoidTy()) { + Type *Ty1 = S->first->getType(), + *Ty2 = S->second->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + bool NeedsExtraction = false; + for (Value::use_iterator I = S->first->use_begin(), + IE = S->first->use_end(); I != IE; ++I) { + if (isa<ShuffleVectorInst>(*I) || + isa<InsertElementInst>(*I) || + isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty1->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty1, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 0); + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->first << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + + NeedsExtraction = false; + for (Value::use_iterator I = S->second->use_begin(), + IE = S->second->use_end(); I != IE; ++I) { + if (isa<ShuffleVectorInst>(*I) || + isa<InsertElementInst>(*I) || + isa<ExtractElementInst>(*I)) + continue; + if (PrunedTreeInstrs.count(*I)) + continue; + NeedsExtraction = true; + break; + } + + if (NeedsExtraction) { + int ESContrib; + if (Ty2->isVectorTy()) + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + Ty2, VTy); + else + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::ExtractElement, VTy, 1); + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" << + *S->second << "} = " << ESContrib << "\n"); + EffSize -= ESContrib; + } + } + + // Compute the cost of incoming edges. + if (!isa<LoadInst>(S->first) && !isa<StoreInst>(S->first)) { + Instruction *S1 = cast<Instruction>(S->first), + *S2 = cast<Instruction>(S->second); + for (unsigned o = 0; o < S1->getNumOperands(); ++o) { + Value *O1 = S1->getOperand(o), *O2 = S2->getOperand(o); + + // Combining constants into vector constants (or small vector + // constants into larger ones are assumed free). + if (isa<Constant>(O1) && isa<Constant>(O2)) + continue; + + if (FlipOrder) + std::swap(O1, O2); + + ValuePair VP = ValuePair(O1, O2); + ValuePair VPR = ValuePair(O2, O1); + + // Internal edges are not handled here. + if (PrunedTree.count(VP) || PrunedTree.count(VPR)) + continue; + + Type *Ty1 = O1->getType(), + *Ty2 = O2->getType(); + Type *VTy = getVecTypeForPair(Ty1, Ty2); + + // Combining vector operations of the same type is also assumed + // folded with other operations. + if (Ty1 == Ty2 && + (isa<ShuffleVectorInst>(O1) || + isa<InsertElementInst>(O1) || + isa<InsertElementInst>(O1)) && + (isa<ShuffleVectorInst>(O2) || + isa<InsertElementInst>(O2) || + isa<InsertElementInst>(O2))) + continue; + + int ESContrib; + // This pair has already been formed. + if (IncomingPairs.count(VP)) { + continue; + } else if (IncomingPairs.count(VPR)) { + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, VTy); + } else if (!Ty1->isVectorTy() && !Ty2->isVectorTy()) { + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 0); + ESContrib += (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, VTy, 1); + } else if (!Ty1->isVectorTy()) { + // O1 needs to be inserted into a vector of size O2, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty2, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty2); + } else if (!Ty2->isVectorTy()) { + // O2 needs to be inserted into a vector of size O1, and then + // both need to be shuffled together. + ESContrib = (int) VTTI->getVectorInstrCost( + Instruction::InsertElement, Ty1, 0); + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + VTy, Ty1); + } else { + Type *TyBig = Ty1, *TySmall = Ty2; + if (Ty2->getVectorNumElements() > Ty1->getVectorNumElements()) + std::swap(TyBig, TySmall); + + ESContrib = (int) getInstrCost(Instruction::ShuffleVector, + VTy, TyBig); + if (TyBig != TySmall) + ESContrib += (int) getInstrCost(Instruction::ShuffleVector, + TyBig, TySmall); + } + + DEBUG(if (DebugPairSelection) dbgs() << "\tcost {" + << *O1 << " <-> " << *O2 << "} = " << + ESContrib << "\n"); + EffSize -= ESContrib; + IncomingPairs.insert(VP); + } + } + } + } else { + for (DenseSet<ValuePair>::iterator S = PrunedTree.begin(), + E = PrunedTree.end(); S != E; ++S) + EffSize += (int) getDepthFactor(S->first); + } DEBUG(if (DebugPairSelection) dbgs() << "BBV: found pruned Tree for pair {" << *J->first << " <-> " << *J->second << "} of depth " << MaxDepth << " and size " << PrunedTree.size() << " (effective size: " << EffSize << ")\n"); - if (MaxDepth >= Config.ReqChainDepth && EffSize > BestEffSize) { + if (((VTTI && !UseChainDepthWithTI) || + MaxDepth >= Config.ReqChainDepth) && + EffSize > 0 && EffSize > BestEffSize) { BestMaxDepth = MaxDepth; BestEffSize = EffSize; BestTree = PrunedTree; @@ -1431,8 +1919,12 @@ namespace { // that will be fused into vector instructions. void BBVectorize::choosePairs( std::multimap<Value *, Value *> &CandidatePairs, + DenseMap<ValuePair, int> &CandidatePairCostSavings, std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps, DenseSet<ValuePair> &PairableInstUsers, DenseMap<Value *, Value *>& ChosenPairs) { bool UseCycleCheck = @@ -1447,9 +1939,12 @@ namespace { VPIteratorPair ChoiceRange = CandidatePairs.equal_range(*I); // The best pair to choose and its tree: - size_t BestMaxDepth = 0, BestEffSize = 0; + size_t BestMaxDepth = 0; + int BestEffSize = 0; DenseSet<ValuePair> BestTree; - findBestTreeFor(CandidatePairs, PairableInsts, ConnectedPairs, + findBestTreeFor(CandidatePairs, CandidatePairCostSavings, + PairableInsts, FixedOrderPairs, PairConnectionTypes, + ConnectedPairs, ConnectedPairDeps, PairableInstUsers, PairableInstUserMap, ChosenPairs, BestTree, BestMaxDepth, BestEffSize, ChoiceRange, UseCycleCheck); @@ -1502,24 +1997,19 @@ namespace { // Returns the value that is to be used as the pointer input to the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementPointerInput(LLVMContext& Context, - Instruction *I, Instruction *J, unsigned o, - bool FlipMemInputs) { + Instruction *I, Instruction *J, unsigned o) { Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; + unsigned IAlignment, JAlignment, IAddressSpace, JAddressSpace; int64_t OffsetInElmts; - // Note: the analysis might fail here, that is why FlipMemInputs has + // Note: the analysis might fail here, that is why the pair order has // been precomputed (OffsetInElmts must be unused here). (void) getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - OffsetInElmts); + IAddressSpace, JAddressSpace, + OffsetInElmts, false); // The pointer value is taken to be the one with the lowest offset. - Value *VPtr; - if (!FlipMemInputs) { - VPtr = IPtr; - } else { - VPtr = JPtr; - } + Value *VPtr = IPtr; Type *ArgTypeI = cast<PointerType>(IPtr->getType())->getElementType(); Type *ArgTypeJ = cast<PointerType>(JPtr->getType())->getElementType(); @@ -1527,7 +2017,7 @@ namespace { Type *VArgPtrType = PointerType::get(VArgType, cast<PointerType>(IPtr->getType())->getAddressSpace()); return new BitCastInst(VPtr, VArgPtrType, getReplacementName(I, true, o), - /* insert before */ FlipMemInputs ? J : I); + /* insert before */ I); } void BBVectorize::fillNewShuffleMask(LLVMContext& Context, Instruction *J, @@ -1597,7 +2087,7 @@ namespace { Instruction *J, unsigned o, Value *&LOp, unsigned numElemL, Type *ArgTypeL, Type *ArgTypeH, - unsigned IdxOff) { + bool IBeforeJ, unsigned IdxOff) { bool ExpandedIEChain = false; if (InsertElementInst *LIE = dyn_cast<InsertElementInst>(LOp)) { // If we have a pure insertelement chain, then this can be rewritten @@ -1631,8 +2121,9 @@ namespace { LIENext = InsertElementInst::Create(LIEPrev, VectElemts[i], ConstantInt::get(Type::getInt32Ty(Context), i + IdxOff), - getReplacementName(I, true, o, i+1)); - LIENext->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, i+1)); + LIENext->insertBefore(IBeforeJ ? J : I); LIEPrev = LIENext; } @@ -1647,7 +2138,7 @@ namespace { // Returns the value to be used as the specified operand of the vector // instruction that fuses I with J. Value *BBVectorize::getReplacementInput(LLVMContext& Context, Instruction *I, - Instruction *J, unsigned o, bool FlipMemInputs) { + Instruction *J, unsigned o, bool IBeforeJ) { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), 1); @@ -1658,12 +2149,6 @@ namespace { Instruction *L = I, *H = J; Type *ArgTypeL = ArgTypeI, *ArgTypeH = ArgTypeJ; - if (FlipMemInputs) { - L = J; - H = I; - ArgTypeL = ArgTypeJ; - ArgTypeH = ArgTypeI; - } unsigned numElemL; if (ArgTypeL->isVectorTy()) @@ -1816,8 +2301,9 @@ namespace { Instruction *S = new ShuffleVectorInst(I1, UndefValue::get(I1T), ConstantVector::get(Mask), - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } @@ -1838,8 +2324,9 @@ namespace { Instruction *NewI1 = new ShuffleVectorInst(I1, UndefValue::get(I1T), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); - NewI1->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + NewI1->insertBefore(IBeforeJ ? J : I); I1 = NewI1; I1T = I2T; I1Elem = I2Elem; @@ -1854,8 +2341,9 @@ namespace { Instruction *NewI2 = new ShuffleVectorInst(I2, UndefValue::get(I2T), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); - NewI2->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + NewI2->insertBefore(IBeforeJ ? J : I); I2 = NewI2; I2T = I1T; I2Elem = I1Elem; @@ -1875,8 +2363,8 @@ namespace { Instruction *NewOp = new ShuffleVectorInst(I1, I2, ConstantVector::get(Mask), - getReplacementName(I, true, o)); - NewOp->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, true, o)); + NewOp->insertBefore(IBeforeJ ? J : I); return NewOp; } } @@ -1884,17 +2372,17 @@ namespace { Type *ArgType = ArgTypeL; if (numElemL < numElemH) { if (numElemL == 1 && expandIEChain(Context, I, J, o, HOp, numElemH, - ArgTypeL, VArgType, 1)) { + ArgTypeL, VArgType, IBeforeJ, 1)) { // This is another short-circuit case: we're combining a scalar into // a vector that is formed by an IE chain. We've just expanded the IE // chain, now insert the scalar and we're done. Instruction *S = InsertElementInst::Create(HOp, LOp, CV0, - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } else if (!expandIEChain(Context, I, J, o, LOp, numElemL, ArgTypeL, - ArgTypeH)) { + ArgTypeH, IBeforeJ)) { // The two vector inputs to the shuffle must be the same length, // so extend the smaller vector to be the same length as the larger one. Instruction *NLOp; @@ -1909,29 +2397,32 @@ namespace { NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } else { NLOp = InsertElementInst::Create(UndefValue::get(ArgTypeH), LOp, CV0, - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } - NLOp->insertBefore(J); + NLOp->insertBefore(IBeforeJ ? J : I); LOp = NLOp; } ArgType = ArgTypeH; } else if (numElemL > numElemH) { if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL, - ArgTypeH, VArgType)) { + ArgTypeH, VArgType, IBeforeJ)) { Instruction *S = InsertElementInst::Create(LOp, HOp, ConstantInt::get(Type::getInt32Ty(Context), numElemL), - getReplacementName(I, true, o)); - S->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o)); + S->insertBefore(IBeforeJ ? J : I); return S; } else if (!expandIEChain(Context, I, J, o, HOp, numElemH, ArgTypeH, - ArgTypeL)) { + ArgTypeL, IBeforeJ)) { Instruction *NHOp; if (numElemH > 1) { std::vector<Constant *> Mask(numElemL); @@ -1943,13 +2434,15 @@ namespace { NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH), ConstantVector::get(Mask), - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } else { NHOp = InsertElementInst::Create(UndefValue::get(ArgTypeL), HOp, CV0, - getReplacementName(I, true, o, 1)); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); } - NHOp->insertBefore(J); + NHOp->insertBefore(IBeforeJ ? J : I); HOp = NHOp; } } @@ -1967,19 +2460,21 @@ namespace { } Instruction *BV = new ShuffleVectorInst(LOp, HOp, - ConstantVector::get(Mask), - getReplacementName(I, true, o)); - BV->insertBefore(J); + ConstantVector::get(Mask), + getReplacementName(IBeforeJ ? I : J, true, o)); + BV->insertBefore(IBeforeJ ? J : I); return BV; } Instruction *BV1 = InsertElementInst::Create( UndefValue::get(VArgType), LOp, CV0, - getReplacementName(I, true, o, 1)); - BV1->insertBefore(I); + getReplacementName(IBeforeJ ? I : J, + true, o, 1)); + BV1->insertBefore(IBeforeJ ? J : I); Instruction *BV2 = InsertElementInst::Create(BV1, HOp, CV1, - getReplacementName(I, true, o, 2)); - BV2->insertBefore(J); + getReplacementName(IBeforeJ ? I : J, + true, o, 2)); + BV2->insertBefore(IBeforeJ ? J : I); return BV2; } @@ -1988,7 +2483,7 @@ namespace { void BBVectorize::getReplacementInputsForPair(LLVMContext& Context, Instruction *I, Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, - bool FlipMemInputs) { + bool IBeforeJ) { unsigned NumOperands = I->getNumOperands(); for (unsigned p = 0, o = NumOperands-1; p < NumOperands; ++p, --o) { @@ -1997,8 +2492,7 @@ namespace { if (isa<LoadInst>(I) || (o == 1 && isa<StoreInst>(I))) { // This is the pointer for a load/store instruction. - ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o, - FlipMemInputs); + ReplacedOperands[o] = getReplacementPointerInput(Context, I, J, o); continue; } else if (isa<CallInst>(I)) { Function *F = cast<CallInst>(I)->getCalledFunction(); @@ -2026,8 +2520,7 @@ namespace { continue; } - ReplacedOperands[o] = - getReplacementInput(Context, I, J, o, FlipMemInputs); + ReplacedOperands[o] = getReplacementInput(Context, I, J, o, IBeforeJ); } } @@ -2038,8 +2531,7 @@ namespace { void BBVectorize::replaceOutputsOfPair(LLVMContext& Context, Instruction *I, Instruction *J, Instruction *K, Instruction *&InsertionPt, - Instruction *&K1, Instruction *&K2, - bool FlipMemInputs) { + Instruction *&K1, Instruction *&K2) { if (isa<StoreInst>(I)) { AA->replaceWithNewValue(I, K); AA->replaceWithNewValue(J, K); @@ -2069,13 +2561,11 @@ namespace { } K1 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask2 : Mask1), + ConstantVector::get( Mask1), getReplacementName(K, false, 1)); } else { Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); - Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K1 = ExtractElementInst::Create(K, FlipMemInputs ? CV1 : CV0, + K1 = ExtractElementInst::Create(K, CV0, getReplacementName(K, false, 1)); } @@ -2087,13 +2577,11 @@ namespace { } K2 = new ShuffleVectorInst(K, UndefValue::get(VType), - ConstantVector::get( - FlipMemInputs ? Mask1 : Mask2), + ConstantVector::get( Mask2), getReplacementName(K, false, 2)); } else { - Value *CV0 = ConstantInt::get(Type::getInt32Ty(Context), 0); Value *CV1 = ConstantInt::get(Type::getInt32Ty(Context), numElem-1); - K2 = ExtractElementInst::Create(K, FlipMemInputs ? CV0 : CV1, + K2 = ExtractElementInst::Create(K, CV1, getReplacementName(K, false, 2)); } @@ -2193,36 +2681,6 @@ namespace { } } - // As with the aliasing information, SCEV can also change because of - // vectorization. This information is used to compute relative pointer - // offsets; the necessary information will be cached here prior to - // fusion. - void BBVectorize::collectPtrInfo(std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs, - DenseSet<Value *> &LowPtrInsts) { - for (std::vector<Value *>::iterator PI = PairableInsts.begin(), - PIE = PairableInsts.end(); PI != PIE; ++PI) { - DenseMap<Value *, Value *>::iterator P = ChosenPairs.find(*PI); - if (P == ChosenPairs.end()) continue; - - Instruction *I = cast<Instruction>(P->first); - Instruction *J = cast<Instruction>(P->second); - - if (!isa<LoadInst>(I) && !isa<StoreInst>(I)) - continue; - - Value *IPtr, *JPtr; - unsigned IAlignment, JAlignment; - int64_t OffsetInElmts; - if (!getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, - OffsetInElmts) || abs64(OffsetInElmts) != 1) - llvm_unreachable("Pre-fusion pointer analysis failed"); - - Value *LowPI = (OffsetInElmts > 0) ? I : J; - LowPtrInsts.insert(LowPI); - } - } - // When the first instruction in each pair is cloned, it will inherit its // parent's metadata. This metadata must be combined with that of the other // instruction in a safe way. @@ -2256,27 +2714,27 @@ namespace { // second member). void BBVectorize::fuseChosenPairs(BasicBlock &BB, std::vector<Value *> &PairableInsts, - DenseMap<Value *, Value *> &ChosenPairs) { + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &FixedOrderPairs, + DenseMap<VPPair, unsigned> &PairConnectionTypes, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + std::multimap<ValuePair, ValuePair> &ConnectedPairDeps) { LLVMContext& Context = BB.getContext(); // During the vectorization process, the order of the pairs to be fused // could be flipped. So we'll add each pair, flipped, into the ChosenPairs // list. After a pair is fused, the flipped pair is removed from the list. - std::vector<ValuePair> FlippedPairs; - FlippedPairs.reserve(ChosenPairs.size()); + DenseSet<ValuePair> FlippedPairs; for (DenseMap<Value *, Value *>::iterator P = ChosenPairs.begin(), E = ChosenPairs.end(); P != E; ++P) - FlippedPairs.push_back(ValuePair(P->second, P->first)); - for (std::vector<ValuePair>::iterator P = FlippedPairs.begin(), + FlippedPairs.insert(ValuePair(P->second, P->first)); + for (DenseSet<ValuePair>::iterator P = FlippedPairs.begin(), E = FlippedPairs.end(); P != E; ++P) ChosenPairs.insert(*P); std::multimap<Value *, Value *> LoadMoveSet; collectLoadMoveSet(BB, PairableInsts, ChosenPairs, LoadMoveSet); - DenseSet<Value *> LowPtrInsts; - collectPtrInfo(PairableInsts, ChosenPairs, LowPtrInsts); - DEBUG(dbgs() << "BBV: initial: \n" << BB << "\n"); for (BasicBlock::iterator PI = BB.getFirstInsertionPt(); PI != BB.end();) { @@ -2316,44 +2774,91 @@ namespace { continue; } - bool FlipMemInputs = false; - if (isa<LoadInst>(I) || isa<StoreInst>(I)) - FlipMemInputs = (LowPtrInsts.find(I) == LowPtrInsts.end()); + // If the pair must have the other order, then flip it. + bool FlipPairOrder = FixedOrderPairs.count(ValuePair(J, I)); + if (!FlipPairOrder && !FixedOrderPairs.count(ValuePair(I, J))) { + // This pair does not have a fixed order, and so we might want to + // flip it if that will yield fewer shuffles. We count the number + // of dependencies connected via swaps, and those directly connected, + // and flip the order if the number of swaps is greater. + bool OrigOrder = true; + VPPIteratorPair IP = ConnectedPairDeps.equal_range(ValuePair(I, J)); + if (IP.first == ConnectedPairDeps.end()) { + IP = ConnectedPairDeps.equal_range(ValuePair(J, I)); + OrigOrder = false; + } + + if (IP.first != ConnectedPairDeps.end()) { + unsigned NumDepsDirect = 0, NumDepsSwap = 0; + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + DenseMap<VPPair, unsigned>::iterator R = + PairConnectionTypes.find(VPPair(Q->second, Q->first)); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + ++NumDepsDirect; + else if (R->second == PairConnectionSwap) + ++NumDepsSwap; + } + + if (!OrigOrder) + std::swap(NumDepsDirect, NumDepsSwap); + if (NumDepsSwap > NumDepsDirect) { + FlipPairOrder = true; + DEBUG(dbgs() << "BBV: reordering pair: " << *I << + " <-> " << *J << "\n"); + } + } + } + + Instruction *L = I, *H = J; + if (FlipPairOrder) + std::swap(H, L); + + // If the pair being fused uses the opposite order from that in the pair + // connection map, then we need to flip the types. + VPPIteratorPair IP = ConnectedPairs.equal_range(ValuePair(H, L)); + for (std::multimap<ValuePair, ValuePair>::iterator Q = IP.first; + Q != IP.second; ++Q) { + DenseMap<VPPair, unsigned>::iterator R = PairConnectionTypes.find(*Q); + assert(R != PairConnectionTypes.end() && + "Cannot find pair connection type"); + if (R->second == PairConnectionDirect) + R->second = PairConnectionSwap; + else if (R->second == PairConnectionSwap) + R->second = PairConnectionDirect; + } + + bool LBeforeH = !FlipPairOrder; unsigned NumOperands = I->getNumOperands(); SmallVector<Value *, 3> ReplacedOperands(NumOperands); - getReplacementInputsForPair(Context, I, J, ReplacedOperands, - FlipMemInputs); + getReplacementInputsForPair(Context, L, H, ReplacedOperands, + LBeforeH); // Make a copy of the original operation, change its type to the vector // type and replace its operands with the vector operands. - Instruction *K = I->clone(); - if (I->hasName()) K->takeName(I); + Instruction *K = L->clone(); + if (L->hasName()) + K->takeName(L); + else if (H->hasName()) + K->takeName(H); if (!isa<StoreInst>(K)) - K->mutateType(getVecTypeForPair(I->getType(), J->getType())); + K->mutateType(getVecTypeForPair(L->getType(), H->getType())); - combineMetadata(K, J); + combineMetadata(K, H); for (unsigned o = 0; o < NumOperands; ++o) K->setOperand(o, ReplacedOperands[o]); - // If we've flipped the memory inputs, make sure that we take the correct - // alignment. - if (FlipMemInputs) { - if (isa<StoreInst>(K)) - cast<StoreInst>(K)->setAlignment(cast<StoreInst>(J)->getAlignment()); - else - cast<LoadInst>(K)->setAlignment(cast<LoadInst>(J)->getAlignment()); - } - K->insertAfter(J); // Instruction insertion point: Instruction *InsertionPt = K; Instruction *K1 = 0, *K2 = 0; - replaceOutputsOfPair(Context, I, J, K, InsertionPt, K1, K2, - FlipMemInputs); + replaceOutputsOfPair(Context, L, H, K, InsertionPt, K1, K2); // The use tree of the first original instruction must be moved to after // the location of the second instruction. The entire use tree of the @@ -2363,10 +2868,10 @@ namespace { moveUsesOfIAfterJ(BB, LoadMoveSet, InsertionPt, I, J); if (!isa<StoreInst>(I)) { - I->replaceAllUsesWith(K1); - J->replaceAllUsesWith(K2); - AA->replaceWithNewValue(I, K1); - AA->replaceWithNewValue(J, K2); + L->replaceAllUsesWith(K1); + H->replaceAllUsesWith(K2); + AA->replaceWithNewValue(L, K1); + AA->replaceWithNewValue(H, K2); } // Instructions that may read from memory may be in the load move set. @@ -2399,6 +2904,9 @@ namespace { SE->forgetValue(J); I->eraseFromParent(); J->eraseFromParent(); + + DEBUG(if (PrintAfterEveryPair) dbgs() << "BBV: block is now: \n" << + BB << "\n"); } DEBUG(dbgs() << "BBV: final: \n" << BB << "\n"); |