diff options
Diffstat (limited to 'lib/Transforms/Vectorize/BBVectorize.cpp')
-rw-r--r-- | lib/Transforms/Vectorize/BBVectorize.cpp | 1796 |
1 files changed, 1796 insertions, 0 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp new file mode 100644 index 0000000000..9c2c8dd51b --- /dev/null +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -0,0 +1,1796 @@ +//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a basic-block vectorization pass. The algorithm was +// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, +// et al. It works by looking for chains of pairable operations and then +// pairing them. +// +//===----------------------------------------------------------------------===// + +#define BBV_NAME "bb-vectorize" +#define DEBUG_TYPE BBV_NAME +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Vectorize.h" +#include <algorithm> +#include <map> +using namespace llvm; + +static cl::opt<unsigned> +ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, + cl::desc("The required chain depth for vectorization")); + +static cl::opt<unsigned> +SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, + cl::desc("The maximum search distance for instruction pairs")); + +static cl::opt<bool> +SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, + cl::desc("Replicating one element to a pair breaks the chain")); + +static cl::opt<unsigned> +VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, + cl::desc("The size of the native vector registers")); + +static cl::opt<unsigned> +MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, + cl::desc("The maximum number of pairing iterations")); + +static cl::opt<unsigned> +MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), + cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" + " a full cycle check")); + +static cl::opt<bool> +NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize integer values")); + +static cl::opt<bool> +NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point values")); + +static cl::opt<bool> +NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize casting (conversion) operations")); + +static cl::opt<bool> +NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point math intrinsics")); + +static cl::opt<bool> +NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); + +static cl::opt<bool> +NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize loads and stores")); + +static cl::opt<bool> +AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, + cl::desc("Only generate aligned loads and stores")); + +static cl::opt<bool> +FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, + cl::desc("Use a fast instruction dependency analysis")); + +#ifndef NDEBUG +static cl::opt<bool> +DebugInstructionExamination("bb-vectorize-debug-instruction-examination", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " instruction-examination process")); +static cl::opt<bool> +DebugCandidateSelection("bb-vectorize-debug-candidate-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " candidate-selection process")); +static cl::opt<bool> +DebugPairSelection("bb-vectorize-debug-pair-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " pair-selection process")); +static cl::opt<bool> +DebugCycleCheck("bb-vectorize-debug-cycle-check", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " cycle-checking process")); +#endif + +STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); + +namespace { + struct BBVectorize : public BasicBlockPass { + static char ID; // Pass identification, replacement for typeid + BBVectorize() : BasicBlockPass(ID) { + initializeBBVectorizePass(*PassRegistry::getPassRegistry()); + } + + typedef std::pair<Value *, Value *> ValuePair; + typedef std::pair<ValuePair, size_t> ValuePairWithDepth; + typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair + typedef std::pair<std::multimap<Value *, Value *>::iterator, + std::multimap<Value *, Value *>::iterator> VPIteratorPair; + typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, + std::multimap<ValuePair, ValuePair>::iterator> + VPPIteratorPair; + + AliasAnalysis *AA; + ScalarEvolution *SE; + TargetData *TD; + + // FIXME: const correct? + + bool vectorizePairs(BasicBlock &BB); + + void getCandidatePairs(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts); + + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs); + + void buildDepMap(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); + + void fuseChosenPairs(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs); + + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); + + bool areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore); + + bool trackUsesOfI(DenseSet<Value *> &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers = true, + std::multimap<Value *, Value *> *LoadMoveSet = 0); + + void computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P); + + bool pairsConflict(ValuePair P, ValuePair Q, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + + bool pairWillFormCycle(ValuePair P, + std::multimap<ValuePair, ValuePair> &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, + DenseSet<ValuePair> &PrunedTree, ValuePair J, + bool UseCycleCheck); + + void buildInitialTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, ValuePair J); + + void findBestTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, + size_t &BestEffSize, VPIteratorPair ChoiceRange, + bool UseCycleCheck); + + Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool &FlipMemInputs); + + void fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector<Constant*> &Mask); + + Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, + Instruction *J); + + Value *getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs); + + void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, + Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, + bool &FlipMemInputs); + + void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, + Instruction *J, Instruction *K, + Instruction *&InsertionPt, Instruction *&K1, + Instruction *&K2, bool &FlipMemInputs); + + void collectPairLoadMoveSet(BasicBlock &BB, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I); + + void collectLoadMoveSet(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet); + + bool canMoveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I, Instruction *J); + + void moveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *&InsertionPt, + Instruction *I, Instruction *J); + + virtual bool runOnBasicBlock(BasicBlock &BB) { + AA = &getAnalysis<AliasAnalysis>(); + SE = &getAnalysis<ScalarEvolution>(); + TD = getAnalysisIfAvailable<TargetData>(); + + bool changed = false; + // Iterate a sufficient number of times to merge types of size 1 bit, + // then 2 bits, then 4, etc. up to half of the target vector width of the + // target vector register. + for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter); + v *= 2, ++n) { + DEBUG(dbgs() << "BBV: fusing loop #" << n << + " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (vectorizePairs(BB)) + changed = true; + else + break; + } + + DEBUG(dbgs() << "BBV: done!\n"); + return changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + BasicBlockPass::getAnalysisUsage(AU); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<ScalarEvolution>(); + } + + // This returns the vector type that holds a pair of the provided type. + // If the provided type is already a vector, then its length is doubled. + static inline VectorType *getVecTypeForPair(Type *ElemTy) { + if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { + unsigned numElem = VTy->getNumElements(); + return VectorType::get(ElemTy->getScalarType(), numElem*2); + } else { + return VectorType::get(ElemTy, 2); + } + } + + // Returns the weight associated with the provided value. A chain of + // candidate pairs has a length given by the sum of the weights of its + // members (one weight per pair; the weight of each member of the pair + // is assumed to be the same). This length is then compared to the + // chain-length threshold to determine if a given chain is significant + // enough to be vectorized. The length is also used in comparing + // candidate chains where longer chains are considered to be better. + // Note: when this function returns 0, the resulting instructions are + // not actually fused. + static inline size_t getDepthFactor(Value *V) { + // InsertElement and ExtractElement have a depth factor of zero. This is + // for two reasons: First, they cannot be usefully fused. Second, because + // the pass generates a lot of these, they can confuse the simple metric + // used to compare the trees in the next iteration. Thus, giving them a + // weight of zero allows the pass to essentially ignore them in + // subsequent iterations when looking for vectorization opportunities + // while still tracking dependency chains that flow through those + // instructions. + if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) + return 0; + + return 1; + } + + // This determines the relative offset of two loads or stores, returning + // true if the offset could be determined to be some constant value. + // For example, if OffsetInElmts == 1, then J accesses the memory directly + // after I; if OffsetInElmts == -1 then I accesses the memory + // directly after J. This function assumes that both instructions + // have the same type. + bool getPairPtrInfo(Instruction *I, Instruction *J, + Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, + int64_t &OffsetInElmts) { + OffsetInElmts = 0; + if (isa<LoadInst>(I)) { + IPtr = cast<LoadInst>(I)->getPointerOperand(); + JPtr = cast<LoadInst>(J)->getPointerOperand(); + IAlignment = cast<LoadInst>(I)->getAlignment(); + JAlignment = cast<LoadInst>(J)->getAlignment(); + } else { + IPtr = cast<StoreInst>(I)->getPointerOperand(); + JPtr = cast<StoreInst>(J)->getPointerOperand(); + IAlignment = cast<StoreInst>(I)->getAlignment(); + JAlignment = cast<StoreInst>(J)->getAlignment(); + } + + const SCEV *IPtrSCEV = SE->getSCEV(IPtr); + const SCEV *JPtrSCEV = SE->getSCEV(JPtr); + + // If this is a trivial offset, then we'll get something like + // 1*sizeof(type). With target data, which we need anyway, this will get + // constant folded into a number. + const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV); + if (const SCEVConstant *ConstOffSCEV = + dyn_cast<SCEVConstant>(OffsetSCEV)) { + ConstantInt *IntOff = ConstOffSCEV->getValue(); + int64_t Offset = IntOff->getSExtValue(); + + Type *VTy = cast<PointerType>(IPtr->getType())->getElementType(); + int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy); + + assert(VTy == cast<PointerType>(JPtr->getType())->getElementType()); + + OffsetInElmts = Offset/VTyTSS; + return (abs64(Offset) % VTyTSS) == 0; + } + + return false; + } + + // Returns true if the provided CallInst represents an intrinsic that can + // be vectorized. + bool isVectorizableIntrinsic(CallInst* I) { + Function *F = I->getCalledFunction(); + if (!F) return false; + + unsigned IID = F->getIntrinsicID(); + if (!IID) return false; + + switch(IID) { + default: + return false; + case Intrinsic::sqrt: + case Intrinsic::powi: + case Intrinsic::sin: + case Intrinsic::cos: + case Intrinsic::log: + case Intrinsic::log2: + case Intrinsic::log10: + case Intrinsic::exp: + case Intrinsic::exp2: + case Intrinsic::pow: + return !NoMath; + case Intrinsic::fma: + return !NoFMA; + } + } + + // Returns true if J is the second element in some pair referenced by + // some multimap pair iterator pair. + template <typename V> + bool isSecondInIteratorPair(V J, std::pair< + typename std::multimap<V, V>::iterator, + typename std::multimap<V, V>::iterator> PairRange) { + for (typename std::multimap<V, V>::iterator K = PairRange.first; + K != PairRange.second; ++K) + if (K->second == J) return true; + + return false; + } + }; + + // This function implements one vectorization iteration on the provided + // basic block. It returns true if the block is changed. + bool BBVectorize::vectorizePairs(BasicBlock &BB) { + std::vector<Value *> PairableInsts; + std::multimap<Value *, Value *> CandidatePairs; + getCandidatePairs(BB, CandidatePairs, PairableInsts); + if (PairableInsts.size() == 0) return false; + + // Now we have a map of all of the pairable instructions and we need to + // select the best possible pairing. A good pairing is one such that the + // users of the pair are also paired. This defines a (directed) forest + // over the pairs such that two pairs are connected iff the second pair + // uses the first. + + // Note that it only matters that both members of the second pair use some + // element of the first pair (to allow for splatting). + + std::multimap<ValuePair, ValuePair> ConnectedPairs; + computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs); + if (ConnectedPairs.size() == 0) return false; + + // Build the pairable-instruction dependency map + DenseSet<ValuePair> PairableInstUsers; + buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers); + + // There is now a graph of the connected pairs. For each variable, pick the + // pairing with the largest tree meeting the depth requirement on at least + // one branch. Then select all pairings that are part of that tree and + // remove them from the list of available pairings and pairable variables. + + DenseMap<Value *, Value *> ChosenPairs; + choosePairs(CandidatePairs, PairableInsts, ConnectedPairs, + PairableInstUsers, ChosenPairs); + + if (ChosenPairs.size() == 0) return false; + NumFusedOps += ChosenPairs.size(); + + // A set of pairs has now been selected. It is now necessary to replace the + // paired instructions with vector instructions. For this procedure each + // operand much be replaced with a vector operand. This vector is formed + // by using build_vector on the old operands. The replaced values are then + // replaced with a vector_extract on the result. Subsequent optimization + // passes should coalesce the build/extract combinations. + + fuseChosenPairs(BB, PairableInsts, ChosenPairs); + + return true; + } + + // This function returns true if the provided instruction is capable of being + // fused into a vector instruction. This determination is based only on the + // type and other attributes of the instruction. + bool BBVectorize::isInstVectorizable(Instruction *I, + bool &IsSimpleLoadStore) { + IsSimpleLoadStore = false; + + if (CallInst *C = dyn_cast<CallInst>(I)) { + if (!isVectorizableIntrinsic(C)) + return false; + } else if (LoadInst *L = dyn_cast<LoadInst>(I)) { + // Vectorize simple loads if possbile: + IsSimpleLoadStore = L->isSimple(); + if (!IsSimpleLoadStore || NoMemOps) + return false; + } else if (StoreInst *S = dyn_cast<StoreInst>(I)) { + // Vectorize simple stores if possbile: + IsSimpleLoadStore = S->isSimple(); + if (!IsSimpleLoadStore || NoMemOps) + return false; + } else if (CastInst *C = dyn_cast<CastInst>(I)) { + // We can vectorize casts, but not casts of pointer types, etc. + if (NoCasts) + return false; + + Type *SrcTy = C->getSrcTy(); + if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy()) + return false; + + Type *DestTy = C->getDestTy(); + if (!DestTy->isSingleValueType() || DestTy->isPointerTy()) + return false; + } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) || + isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) { + return false; + } + + // We can't vectorize memory operations without target data + if (TD == 0 && IsSimpleLoadStore) + return false; + + Type *T1, *T2; + if (isa<StoreInst>(I)) { + // For stores, it is the value type, not the pointer type that matters + // because the value is what will come from a vector register. + + Value *IVal = cast<StoreInst>(I)->getValueOperand(); + T1 = IVal->getType(); + } else { + T1 = I->getType(); + } + + if (I->isCast()) + T2 = cast<CastInst>(I)->getSrcTy(); + else + T2 = T1; + + // Not every type can be vectorized... + if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) || + !(VectorType::isValidElementType(T2) || T2->isVectorTy())) + return false; + + if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy())) + return false; + + if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy())) + return false; + + if (T1->getPrimitiveSizeInBits() > VectorBits/2 || + T2->getPrimitiveSizeInBits() > VectorBits/2) + return false; + + return true; + } + + // This function returns true if the two provided instructions are compatible + // (meaning that they can be fused into a vector instruction). This assumes + // that I has already been determined to be vectorizable and that J is not + // in the use tree of I. + bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore) { + DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I << + " <-> " << *J << "\n"); + + // Loads and stores can be merged if they have different alignments, + // but are otherwise the same. + LoadInst *LI, *LJ; + StoreInst *SI, *SJ; + if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) { + if (I->getType() != J->getType()) + return false; + + if (LI->getPointerOperand()->getType() != + LJ->getPointerOperand()->getType() || + LI->isVolatile() != LJ->isVolatile() || + LI->getOrdering() != LJ->getOrdering() || + LI->getSynchScope() != LJ->getSynchScope()) + return false; + } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) { + if (SI->getValueOperand()->getType() != + SJ->getValueOperand()->getType() || + SI->getPointerOperand()->getType() != + SJ->getPointerOperand()->getType() || + SI->isVolatile() != SJ->isVolatile() || + SI->getOrdering() != SJ->getOrdering() || + SI->getSynchScope() != SJ->getSynchScope()) + return false; + } else if (!J->isSameOperationAs(I)) { + return false; + } + // FIXME: handle addsub-type operations! + + if (IsSimpleLoadStore) { + Value *IPtr, *JPtr; + unsigned IAlignment, JAlignment; + int64_t OffsetInElmts = 0; + if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment, + OffsetInElmts) && abs64(OffsetInElmts) == 1) { + if (AlignedOnly) { + Type *aType = isa<StoreInst>(I) ? + cast<StoreInst>(I)->getValueOperand()->getType() : I->getType(); + // An aligned load or store is possible only if the instruction + // with the lower offset has an alignment suitable for the + // vector type. + + unsigned BottomAlignment = IAlignment; + if (OffsetInElmts < 0) BottomAlignment = JAlignment; + + Type *VType = getVecTypeForPair(aType); + unsigned VecAlignment = TD->getPrefTypeAlignment(VType); + if (BottomAlignment < VecAlignment) + return false; + } + } else { + return false; + } + } else if (isa<ShuffleVectorInst>(I)) { + // Only merge two shuffles if they're both constant + return isa<Constant>(I->getOperand(2)) && + isa<Constant>(J->getOperand(2)); + // FIXME: We may want to vectorize non-constant shuffles also. + } + + return true; + } + + // Figure out whether or not J uses I and update the users and write-set + // structures associated with I. Specifically, Users represents the set of + // instructions that depend on I. WriteSet represents the set + // of memory locations that are dependent on I. If UpdateUsers is true, + // and J uses I, then Users is updated to contain J and WriteSet is updated + // to contain any memory locations to which J writes. The function returns + // true if J uses I. By default, alias analysis is used to determine + // whether J reads from memory that overlaps with a location in WriteSet. + // If LoadMoveSet is not null, then it is a previously-computed multimap + // where the key is the memory-based user instruction and the value is + // the instruction to be compared with I. So, if LoadMoveSet is provided, + // then the alias analysis is not used. This is necessary because this + // function is called during the process of moving instructions during + // vectorization and the results of the alias analysis are not stable during + // that process. + bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers, + std::multimap<Value *, Value *> *LoadMoveSet) { + bool UsesI = false; + + // This instruction may already be marked as a user due, for example, to + // being a member of a selected pair. + if (Users.count(J)) + UsesI = true; + + if (!UsesI) + for (User::op_iterator JU = J->op_begin(), e = J->op_end(); + JU != e; ++JU) { + Value *V = *JU; + if (I == V || Users.count(V)) { + UsesI = true; + break; + } + } + if (!UsesI && J->mayReadFromMemory()) { + if (LoadMoveSet) { + VPIteratorPair JPairRange = LoadMoveSet->equal_range(J); + UsesI = isSecondInIteratorPair<Value*>(I, JPairRange); + } else { + for (AliasSetTracker::iterator W = WriteSet.begin(), + WE = WriteSet.end(); W != WE; ++W) { + for (AliasSet::iterator A = W->begin(), AE = W->end(); + A != AE; ++A) { + AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(), + A->getTBAAInfo()); + if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) { + UsesI = true; + break; + } + } + if (UsesI) break; + } + } + } + + if (UsesI && UpdateUsers) { + if (J->mayWriteToMemory()) WriteSet.add(J); + Users.insert(J); + } + + return UsesI; + } + + // This function iterates over all instruction pairs in the provided + // basic block and collects all candidate pairs for vectorization. + void BBVectorize::getCandidatePairs(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts) { + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { + bool IsSimpleLoadStore; + if (!isInstVectorizable(I, IsSimpleLoadStore)) continue; + + // Look for an instruction with which to pair instruction *I... + DenseSet<Value *> Users; + AliasSetTracker WriteSet(*AA); + BasicBlock::iterator J = I; ++J; + for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) { + // Determine if J uses I, if so, exit the loop. + bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep); + if (FastDep) { + // Note: For this heuristic to be effective, independent operations + // must tend to be intermixed. This is likely to be true from some + // kinds of grouped loop unrolling (but not the generic LLVM pass), + // but otherwise may require some kind of reordering pass. + + // When using fast dependency analysis, + // stop searching after first use: + if (UsesI) break; + } else { + if (UsesI) continue; + } + + // J does not use I, and comes before the first use of I, so it can be + // merged with I if the instructions are compatible. + if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue; + + // J is a candidate for merging with I. + if (!PairableInsts.size() || + PairableInsts[PairableInsts.size()-1] != I) { + PairableInsts.push_back(I); + } + CandidatePairs.insert(ValuePair(I, J)); + DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair " + << *I << " <-> " << *J << "\n"); + } + } + + DEBUG(dbgs() << "BBV: found " << PairableInsts.size() + << " instructions with candidate pairs\n"); + } + + // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that + // it looks for pairs such that both members have an input which is an + // output of PI or PJ. + void BBVectorize::computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P) { + // For each possible pairing for this variable, look at the uses of + // the first value... + for (Value::use_iterator I = P.first->use_begin(), + E = P.first->use_end(); I != E; ++I) { + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + // For each use of the first variable, look for uses of the second + // variable... + for (Value::use_iterator J = P.second->use_begin(), + E2 = P.second->use_end(); J != E2; ++J) { + VPIteratorPair JPairRange = CandidatePairs.equal_range(*J); + + // Look for <I, J>: + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + + // Look for <J, I>: + if (isSecondInIteratorPair<Value*>(*I, JPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I))); + } + + if (SplatBreaksChain) continue; + // Look for cases where just the first value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) { + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + + if (SplatBreaksChain) return; + // Look for cases where just the second value in the pair is used by + // both members of another pair (splatting). + for (Value::use_iterator I = P.second->use_begin(), + E = P.second->use_end(); I != E; ++I) { + VPIteratorPair IPairRange = CandidatePairs.equal_range(*I); + + for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) { + if (isSecondInIteratorPair<Value*>(*J, IPairRange)) + ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J))); + } + } + } + + // This function figures out which pairs are connected. Two pairs are + // connected if some output of the first pair forms an input to both members + // of the second pair. + void BBVectorize::computeConnectedPairs( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs) { + + for (std::vector<Value *>::iterator PI = PairableInsts.begin(), + PE = PairableInsts.end(); PI != PE; ++PI) { + VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI); + + for (std::multimap<Value *, Value *>::iterator P = choiceRange.first; + P != choiceRange.second; ++P) + computePairsConnectedTo(CandidatePairs, PairableInsts, + ConnectedPairs, *P); + } + + DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size() + << " pair connections.\n"); + } + + // This function builds a set of use tuples such that <A, B> is in the set + // if B is in the use tree of A. If B is in the use tree of A, then B + // depends on the output of A. + void BBVectorize::buildDepMap( + BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers) { + DenseSet<Value *> IsInPair; + for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(), + E = CandidatePairs.end(); C != E; ++C) { + IsInPair.insert(C->first); + IsInPair.insert(C->second); + } + + // Iterate through the basic block, recording all Users of each + // pairable instruction. + + BasicBlock::iterator E = BB.end(); + for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) { + if (IsInPair.find(I) == IsInPair.end()) continue; + + DenseSet<Value *> Users; + AliasSetTracker WriteSet(*AA); + for (BasicBlock::iterator J = llvm::next(I); J != E; ++J) + (void) trackUsesOfI(Users, WriteSet, I, J); + + for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end(); + U != E; ++U) + PairableInstUsers.insert(ValuePair(I, *U)); + } + } + + // Returns true if an input to pair P is an output of pair Q and also an + // input of pair Q is an output of pair P. If this is the case, then these + // two pairs cannot be simultaneously fused. + bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> *PairableInstUserMap) { + // Two pairs are in conflict if they are mutual Users of eachother. + bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) || + PairableInstUsers.count(ValuePair(P.first, Q.second)) || + PairableInstUsers.count(ValuePair(P.second, Q.first)) || + PairableInstUsers.count(ValuePair(P.second, Q.second)); + bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) || + PairableInstUsers.count(ValuePair(Q.first, P.second)) || + PairableInstUsers.count(ValuePair(Q.second, P.first)) || + PairableInstUsers.count(ValuePair(Q.second, P.second)); + if (PairableInstUserMap) { + // FIXME: The expensive part of the cycle check is not so much the cycle + // check itself but this edge insertion procedure. This needs some + // profiling and probably a different data structure (same is true of + // most uses of std::multimap). + if (PUsesQ) { + VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q); + if (!isSecondInIteratorPair(P, QPairRange)) + PairableInstUserMap->insert(VPPair(Q, P)); + } + if (QUsesP) { + VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P); + if (!isSecondInIteratorPair(Q, PPairRange)) + PairableInstUserMap->insert(VPPair(P, Q)); + } + } + + return (QUsesP && PUsesQ); + } + + // This function walks the use graph of current pairs to see if, starting + // from P, the walk returns to P. + bool BBVectorize::pairWillFormCycle(ValuePair P, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseSet<ValuePair> &CurrentPairs) { + DEBUG(if (DebugCycleCheck) + dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> " + << *P.second << "\n"); + // A lookup table of visisted pairs is kept because the PairableInstUserMap + // contains non-direct associations. + DenseSet<ValuePair> Visited; + std::vector<ValuePair> Q; + // General depth-first post-order traversal: + Q.push_back(P); + while (!Q.empty()) { + ValueP |