aboutsummaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/BBVectorize.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/Transforms/Vectorize/BBVectorize.cpp')
-rw-r--r--lib/Transforms/Vectorize/BBVectorize.cpp1796
1 files changed, 1796 insertions, 0 deletions
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
new file mode 100644
index 0000000000..9c2c8dd51b
--- /dev/null
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -0,0 +1,1796 @@
+//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a basic-block vectorization pass. The algorithm was
+// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral,
+// et al. It works by looking for chains of pairable operations and then
+// pairing them.
+//
+//===----------------------------------------------------------------------===//
+
+#define BBV_NAME "bb-vectorize"
+#define DEBUG_TYPE BBV_NAME
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Vectorize.h"
+#include <algorithm>
+#include <map>
+using namespace llvm;
+
+static cl::opt<unsigned>
+ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden,
+ cl::desc("The required chain depth for vectorization"));
+
+static cl::opt<unsigned>
+SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden,
+ cl::desc("The maximum search distance for instruction pairs"));
+
+static cl::opt<bool>
+SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden,
+ cl::desc("Replicating one element to a pair breaks the chain"));
+
+static cl::opt<unsigned>
+VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden,
+ cl::desc("The size of the native vector registers"));
+
+static cl::opt<unsigned>
+MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden,
+ cl::desc("The maximum number of pairing iterations"));
+
+static cl::opt<unsigned>
+MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200),
+ cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use"
+ " a full cycle check"));
+
+static cl::opt<bool>
+NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize integer values"));
+
+static cl::opt<bool>
+NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize floating-point values"));
+
+static cl::opt<bool>
+NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize casting (conversion) operations"));
+
+static cl::opt<bool>
+NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize floating-point math intrinsics"));
+
+static cl::opt<bool>
+NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize the fused-multiply-add intrinsic"));
+
+static cl::opt<bool>
+NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden,
+ cl::desc("Don't try to vectorize loads and stores"));
+
+static cl::opt<bool>
+AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden,
+ cl::desc("Only generate aligned loads and stores"));
+
+static cl::opt<bool>
+FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden,
+ cl::desc("Use a fast instruction dependency analysis"));
+
+#ifndef NDEBUG
+static cl::opt<bool>
+DebugInstructionExamination("bb-vectorize-debug-instruction-examination",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " instruction-examination process"));
+static cl::opt<bool>
+DebugCandidateSelection("bb-vectorize-debug-candidate-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " candidate-selection process"));
+static cl::opt<bool>
+DebugPairSelection("bb-vectorize-debug-pair-selection",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " pair-selection process"));
+static cl::opt<bool>
+DebugCycleCheck("bb-vectorize-debug-cycle-check",
+ cl::init(false), cl::Hidden,
+ cl::desc("When debugging is enabled, output information on the"
+ " cycle-checking process"));
+#endif
+
+STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize");
+
+namespace {
+ struct BBVectorize : public BasicBlockPass {
+ static char ID; // Pass identification, replacement for typeid
+ BBVectorize() : BasicBlockPass(ID) {
+ initializeBBVectorizePass(*PassRegistry::getPassRegistry());
+ }
+
+ typedef std::pair<Value *, Value *> ValuePair;
+ typedef std::pair<ValuePair, size_t> ValuePairWithDepth;
+ typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair
+ typedef std::pair<std::multimap<Value *, Value *>::iterator,
+ std::multimap<Value *, Value *>::iterator> VPIteratorPair;
+ typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator,
+ std::multimap<ValuePair, ValuePair>::iterator>
+ VPPIteratorPair;
+
+ AliasAnalysis *AA;
+ ScalarEvolution *SE;
+ TargetData *TD;
+
+ // FIXME: const correct?
+
+ bool vectorizePairs(BasicBlock &BB);
+
+ void getCandidatePairs(BasicBlock &BB,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts);
+
+ void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs);
+
+ void buildDepMap(BasicBlock &BB,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers);
+
+ void choosePairs(std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ void fuseChosenPairs(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *>& ChosenPairs);
+
+ bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore);
+
+ bool areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore);
+
+ bool trackUsesOfI(DenseSet<Value *> &Users,
+ AliasSetTracker &WriteSet, Instruction *I,
+ Instruction *J, bool UpdateUsers = true,
+ std::multimap<Value *, Value *> *LoadMoveSet = 0);
+
+ void computePairsConnectedTo(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ ValuePair P);
+
+ bool pairsConflict(ValuePair P, ValuePair Q,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0);
+
+ bool pairWillFormCycle(ValuePair P,
+ std::multimap<ValuePair, ValuePair> &PairableInstUsers,
+ DenseSet<ValuePair> &CurrentPairs);
+
+ void pruneTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree,
+ DenseSet<ValuePair> &PrunedTree, ValuePair J,
+ bool UseCycleCheck);
+
+ void buildInitialTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseMap<ValuePair, size_t> &Tree, ValuePair J);
+
+ void findBestTreeFor(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth,
+ size_t &BestEffSize, VPIteratorPair ChoiceRange,
+ bool UseCycleCheck);
+
+ Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool &FlipMemInputs);
+
+ void fillNewShuffleMask(LLVMContext& Context, Instruction *J,
+ unsigned NumElem, unsigned MaskOffset, unsigned NumInElem,
+ unsigned IdxOffset, std::vector<Constant*> &Mask);
+
+ Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I,
+ Instruction *J);
+
+ Value *getReplacementInput(LLVMContext& Context, Instruction *I,
+ Instruction *J, unsigned o, bool FlipMemInputs);
+
+ void getReplacementInputsForPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, SmallVector<Value *, 3> &ReplacedOperands,
+ bool &FlipMemInputs);
+
+ void replaceOutputsOfPair(LLVMContext& Context, Instruction *I,
+ Instruction *J, Instruction *K,
+ Instruction *&InsertionPt, Instruction *&K1,
+ Instruction *&K2, bool &FlipMemInputs);
+
+ void collectPairLoadMoveSet(BasicBlock &BB,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *I);
+
+ void collectLoadMoveSet(BasicBlock &BB,
+ std::vector<Value *> &PairableInsts,
+ DenseMap<Value *, Value *> &ChosenPairs,
+ std::multimap<Value *, Value *> &LoadMoveSet);
+
+ bool canMoveUsesOfIAfterJ(BasicBlock &BB,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *I, Instruction *J);
+
+ void moveUsesOfIAfterJ(BasicBlock &BB,
+ std::multimap<Value *, Value *> &LoadMoveSet,
+ Instruction *&InsertionPt,
+ Instruction *I, Instruction *J);
+
+ virtual bool runOnBasicBlock(BasicBlock &BB) {
+ AA = &getAnalysis<AliasAnalysis>();
+ SE = &getAnalysis<ScalarEvolution>();
+ TD = getAnalysisIfAvailable<TargetData>();
+
+ bool changed = false;
+ // Iterate a sufficient number of times to merge types of size 1 bit,
+ // then 2 bits, then 4, etc. up to half of the target vector width of the
+ // target vector register.
+ for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter);
+ v *= 2, ++n) {
+ DEBUG(dbgs() << "BBV: fusing loop #" << n <<
+ " for " << BB.getName() << " in " <<
+ BB.getParent()->getName() << "...\n");
+ if (vectorizePairs(BB))
+ changed = true;
+ else
+ break;
+ }
+
+ DEBUG(dbgs() << "BBV: done!\n");
+ return changed;
+ }
+
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ BasicBlockPass::getAnalysisUsage(AU);
+ AU.addRequired<AliasAnalysis>();
+ AU.addRequired<ScalarEvolution>();
+ AU.addPreserved<AliasAnalysis>();
+ AU.addPreserved<ScalarEvolution>();
+ }
+
+ // This returns the vector type that holds a pair of the provided type.
+ // If the provided type is already a vector, then its length is doubled.
+ static inline VectorType *getVecTypeForPair(Type *ElemTy) {
+ if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) {
+ unsigned numElem = VTy->getNumElements();
+ return VectorType::get(ElemTy->getScalarType(), numElem*2);
+ } else {
+ return VectorType::get(ElemTy, 2);
+ }
+ }
+
+ // Returns the weight associated with the provided value. A chain of
+ // candidate pairs has a length given by the sum of the weights of its
+ // members (one weight per pair; the weight of each member of the pair
+ // is assumed to be the same). This length is then compared to the
+ // chain-length threshold to determine if a given chain is significant
+ // enough to be vectorized. The length is also used in comparing
+ // candidate chains where longer chains are considered to be better.
+ // Note: when this function returns 0, the resulting instructions are
+ // not actually fused.
+ static inline size_t getDepthFactor(Value *V) {
+ // InsertElement and ExtractElement have a depth factor of zero. This is
+ // for two reasons: First, they cannot be usefully fused. Second, because
+ // the pass generates a lot of these, they can confuse the simple metric
+ // used to compare the trees in the next iteration. Thus, giving them a
+ // weight of zero allows the pass to essentially ignore them in
+ // subsequent iterations when looking for vectorization opportunities
+ // while still tracking dependency chains that flow through those
+ // instructions.
+ if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V))
+ return 0;
+
+ return 1;
+ }
+
+ // This determines the relative offset of two loads or stores, returning
+ // true if the offset could be determined to be some constant value.
+ // For example, if OffsetInElmts == 1, then J accesses the memory directly
+ // after I; if OffsetInElmts == -1 then I accesses the memory
+ // directly after J. This function assumes that both instructions
+ // have the same type.
+ bool getPairPtrInfo(Instruction *I, Instruction *J,
+ Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment,
+ int64_t &OffsetInElmts) {
+ OffsetInElmts = 0;
+ if (isa<LoadInst>(I)) {
+ IPtr = cast<LoadInst>(I)->getPointerOperand();
+ JPtr = cast<LoadInst>(J)->getPointerOperand();
+ IAlignment = cast<LoadInst>(I)->getAlignment();
+ JAlignment = cast<LoadInst>(J)->getAlignment();
+ } else {
+ IPtr = cast<StoreInst>(I)->getPointerOperand();
+ JPtr = cast<StoreInst>(J)->getPointerOperand();
+ IAlignment = cast<StoreInst>(I)->getAlignment();
+ JAlignment = cast<StoreInst>(J)->getAlignment();
+ }
+
+ const SCEV *IPtrSCEV = SE->getSCEV(IPtr);
+ const SCEV *JPtrSCEV = SE->getSCEV(JPtr);
+
+ // If this is a trivial offset, then we'll get something like
+ // 1*sizeof(type). With target data, which we need anyway, this will get
+ // constant folded into a number.
+ const SCEV *OffsetSCEV = SE->getMinusSCEV(JPtrSCEV, IPtrSCEV);
+ if (const SCEVConstant *ConstOffSCEV =
+ dyn_cast<SCEVConstant>(OffsetSCEV)) {
+ ConstantInt *IntOff = ConstOffSCEV->getValue();
+ int64_t Offset = IntOff->getSExtValue();
+
+ Type *VTy = cast<PointerType>(IPtr->getType())->getElementType();
+ int64_t VTyTSS = (int64_t) TD->getTypeStoreSize(VTy);
+
+ assert(VTy == cast<PointerType>(JPtr->getType())->getElementType());
+
+ OffsetInElmts = Offset/VTyTSS;
+ return (abs64(Offset) % VTyTSS) == 0;
+ }
+
+ return false;
+ }
+
+ // Returns true if the provided CallInst represents an intrinsic that can
+ // be vectorized.
+ bool isVectorizableIntrinsic(CallInst* I) {
+ Function *F = I->getCalledFunction();
+ if (!F) return false;
+
+ unsigned IID = F->getIntrinsicID();
+ if (!IID) return false;
+
+ switch(IID) {
+ default:
+ return false;
+ case Intrinsic::sqrt:
+ case Intrinsic::powi:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::pow:
+ return !NoMath;
+ case Intrinsic::fma:
+ return !NoFMA;
+ }
+ }
+
+ // Returns true if J is the second element in some pair referenced by
+ // some multimap pair iterator pair.
+ template <typename V>
+ bool isSecondInIteratorPair(V J, std::pair<
+ typename std::multimap<V, V>::iterator,
+ typename std::multimap<V, V>::iterator> PairRange) {
+ for (typename std::multimap<V, V>::iterator K = PairRange.first;
+ K != PairRange.second; ++K)
+ if (K->second == J) return true;
+
+ return false;
+ }
+ };
+
+ // This function implements one vectorization iteration on the provided
+ // basic block. It returns true if the block is changed.
+ bool BBVectorize::vectorizePairs(BasicBlock &BB) {
+ std::vector<Value *> PairableInsts;
+ std::multimap<Value *, Value *> CandidatePairs;
+ getCandidatePairs(BB, CandidatePairs, PairableInsts);
+ if (PairableInsts.size() == 0) return false;
+
+ // Now we have a map of all of the pairable instructions and we need to
+ // select the best possible pairing. A good pairing is one such that the
+ // users of the pair are also paired. This defines a (directed) forest
+ // over the pairs such that two pairs are connected iff the second pair
+ // uses the first.
+
+ // Note that it only matters that both members of the second pair use some
+ // element of the first pair (to allow for splatting).
+
+ std::multimap<ValuePair, ValuePair> ConnectedPairs;
+ computeConnectedPairs(CandidatePairs, PairableInsts, ConnectedPairs);
+ if (ConnectedPairs.size() == 0) return false;
+
+ // Build the pairable-instruction dependency map
+ DenseSet<ValuePair> PairableInstUsers;
+ buildDepMap(BB, CandidatePairs, PairableInsts, PairableInstUsers);
+
+ // There is now a graph of the connected pairs. For each variable, pick the
+ // pairing with the largest tree meeting the depth requirement on at least
+ // one branch. Then select all pairings that are part of that tree and
+ // remove them from the list of available pairings and pairable variables.
+
+ DenseMap<Value *, Value *> ChosenPairs;
+ choosePairs(CandidatePairs, PairableInsts, ConnectedPairs,
+ PairableInstUsers, ChosenPairs);
+
+ if (ChosenPairs.size() == 0) return false;
+ NumFusedOps += ChosenPairs.size();
+
+ // A set of pairs has now been selected. It is now necessary to replace the
+ // paired instructions with vector instructions. For this procedure each
+ // operand much be replaced with a vector operand. This vector is formed
+ // by using build_vector on the old operands. The replaced values are then
+ // replaced with a vector_extract on the result. Subsequent optimization
+ // passes should coalesce the build/extract combinations.
+
+ fuseChosenPairs(BB, PairableInsts, ChosenPairs);
+
+ return true;
+ }
+
+ // This function returns true if the provided instruction is capable of being
+ // fused into a vector instruction. This determination is based only on the
+ // type and other attributes of the instruction.
+ bool BBVectorize::isInstVectorizable(Instruction *I,
+ bool &IsSimpleLoadStore) {
+ IsSimpleLoadStore = false;
+
+ if (CallInst *C = dyn_cast<CallInst>(I)) {
+ if (!isVectorizableIntrinsic(C))
+ return false;
+ } else if (LoadInst *L = dyn_cast<LoadInst>(I)) {
+ // Vectorize simple loads if possbile:
+ IsSimpleLoadStore = L->isSimple();
+ if (!IsSimpleLoadStore || NoMemOps)
+ return false;
+ } else if (StoreInst *S = dyn_cast<StoreInst>(I)) {
+ // Vectorize simple stores if possbile:
+ IsSimpleLoadStore = S->isSimple();
+ if (!IsSimpleLoadStore || NoMemOps)
+ return false;
+ } else if (CastInst *C = dyn_cast<CastInst>(I)) {
+ // We can vectorize casts, but not casts of pointer types, etc.
+ if (NoCasts)
+ return false;
+
+ Type *SrcTy = C->getSrcTy();
+ if (!SrcTy->isSingleValueType() || SrcTy->isPointerTy())
+ return false;
+
+ Type *DestTy = C->getDestTy();
+ if (!DestTy->isSingleValueType() || DestTy->isPointerTy())
+ return false;
+ } else if (!(I->isBinaryOp() || isa<ShuffleVectorInst>(I) ||
+ isa<ExtractElementInst>(I) || isa<InsertElementInst>(I))) {
+ return false;
+ }
+
+ // We can't vectorize memory operations without target data
+ if (TD == 0 && IsSimpleLoadStore)
+ return false;
+
+ Type *T1, *T2;
+ if (isa<StoreInst>(I)) {
+ // For stores, it is the value type, not the pointer type that matters
+ // because the value is what will come from a vector register.
+
+ Value *IVal = cast<StoreInst>(I)->getValueOperand();
+ T1 = IVal->getType();
+ } else {
+ T1 = I->getType();
+ }
+
+ if (I->isCast())
+ T2 = cast<CastInst>(I)->getSrcTy();
+ else
+ T2 = T1;
+
+ // Not every type can be vectorized...
+ if (!(VectorType::isValidElementType(T1) || T1->isVectorTy()) ||
+ !(VectorType::isValidElementType(T2) || T2->isVectorTy()))
+ return false;
+
+ if (NoInts && (T1->isIntOrIntVectorTy() || T2->isIntOrIntVectorTy()))
+ return false;
+
+ if (NoFloats && (T1->isFPOrFPVectorTy() || T2->isFPOrFPVectorTy()))
+ return false;
+
+ if (T1->getPrimitiveSizeInBits() > VectorBits/2 ||
+ T2->getPrimitiveSizeInBits() > VectorBits/2)
+ return false;
+
+ return true;
+ }
+
+ // This function returns true if the two provided instructions are compatible
+ // (meaning that they can be fused into a vector instruction). This assumes
+ // that I has already been determined to be vectorizable and that J is not
+ // in the use tree of I.
+ bool BBVectorize::areInstsCompatible(Instruction *I, Instruction *J,
+ bool IsSimpleLoadStore) {
+ DEBUG(if (DebugInstructionExamination) dbgs() << "BBV: looking at " << *I <<
+ " <-> " << *J << "\n");
+
+ // Loads and stores can be merged if they have different alignments,
+ // but are otherwise the same.
+ LoadInst *LI, *LJ;
+ StoreInst *SI, *SJ;
+ if ((LI = dyn_cast<LoadInst>(I)) && (LJ = dyn_cast<LoadInst>(J))) {
+ if (I->getType() != J->getType())
+ return false;
+
+ if (LI->getPointerOperand()->getType() !=
+ LJ->getPointerOperand()->getType() ||
+ LI->isVolatile() != LJ->isVolatile() ||
+ LI->getOrdering() != LJ->getOrdering() ||
+ LI->getSynchScope() != LJ->getSynchScope())
+ return false;
+ } else if ((SI = dyn_cast<StoreInst>(I)) && (SJ = dyn_cast<StoreInst>(J))) {
+ if (SI->getValueOperand()->getType() !=
+ SJ->getValueOperand()->getType() ||
+ SI->getPointerOperand()->getType() !=
+ SJ->getPointerOperand()->getType() ||
+ SI->isVolatile() != SJ->isVolatile() ||
+ SI->getOrdering() != SJ->getOrdering() ||
+ SI->getSynchScope() != SJ->getSynchScope())
+ return false;
+ } else if (!J->isSameOperationAs(I)) {
+ return false;
+ }
+ // FIXME: handle addsub-type operations!
+
+ if (IsSimpleLoadStore) {
+ Value *IPtr, *JPtr;
+ unsigned IAlignment, JAlignment;
+ int64_t OffsetInElmts = 0;
+ if (getPairPtrInfo(I, J, IPtr, JPtr, IAlignment, JAlignment,
+ OffsetInElmts) && abs64(OffsetInElmts) == 1) {
+ if (AlignedOnly) {
+ Type *aType = isa<StoreInst>(I) ?
+ cast<StoreInst>(I)->getValueOperand()->getType() : I->getType();
+ // An aligned load or store is possible only if the instruction
+ // with the lower offset has an alignment suitable for the
+ // vector type.
+
+ unsigned BottomAlignment = IAlignment;
+ if (OffsetInElmts < 0) BottomAlignment = JAlignment;
+
+ Type *VType = getVecTypeForPair(aType);
+ unsigned VecAlignment = TD->getPrefTypeAlignment(VType);
+ if (BottomAlignment < VecAlignment)
+ return false;
+ }
+ } else {
+ return false;
+ }
+ } else if (isa<ShuffleVectorInst>(I)) {
+ // Only merge two shuffles if they're both constant
+ return isa<Constant>(I->getOperand(2)) &&
+ isa<Constant>(J->getOperand(2));
+ // FIXME: We may want to vectorize non-constant shuffles also.
+ }
+
+ return true;
+ }
+
+ // Figure out whether or not J uses I and update the users and write-set
+ // structures associated with I. Specifically, Users represents the set of
+ // instructions that depend on I. WriteSet represents the set
+ // of memory locations that are dependent on I. If UpdateUsers is true,
+ // and J uses I, then Users is updated to contain J and WriteSet is updated
+ // to contain any memory locations to which J writes. The function returns
+ // true if J uses I. By default, alias analysis is used to determine
+ // whether J reads from memory that overlaps with a location in WriteSet.
+ // If LoadMoveSet is not null, then it is a previously-computed multimap
+ // where the key is the memory-based user instruction and the value is
+ // the instruction to be compared with I. So, if LoadMoveSet is provided,
+ // then the alias analysis is not used. This is necessary because this
+ // function is called during the process of moving instructions during
+ // vectorization and the results of the alias analysis are not stable during
+ // that process.
+ bool BBVectorize::trackUsesOfI(DenseSet<Value *> &Users,
+ AliasSetTracker &WriteSet, Instruction *I,
+ Instruction *J, bool UpdateUsers,
+ std::multimap<Value *, Value *> *LoadMoveSet) {
+ bool UsesI = false;
+
+ // This instruction may already be marked as a user due, for example, to
+ // being a member of a selected pair.
+ if (Users.count(J))
+ UsesI = true;
+
+ if (!UsesI)
+ for (User::op_iterator JU = J->op_begin(), e = J->op_end();
+ JU != e; ++JU) {
+ Value *V = *JU;
+ if (I == V || Users.count(V)) {
+ UsesI = true;
+ break;
+ }
+ }
+ if (!UsesI && J->mayReadFromMemory()) {
+ if (LoadMoveSet) {
+ VPIteratorPair JPairRange = LoadMoveSet->equal_range(J);
+ UsesI = isSecondInIteratorPair<Value*>(I, JPairRange);
+ } else {
+ for (AliasSetTracker::iterator W = WriteSet.begin(),
+ WE = WriteSet.end(); W != WE; ++W) {
+ for (AliasSet::iterator A = W->begin(), AE = W->end();
+ A != AE; ++A) {
+ AliasAnalysis::Location ptrLoc(A->getValue(), A->getSize(),
+ A->getTBAAInfo());
+ if (AA->getModRefInfo(J, ptrLoc) != AliasAnalysis::NoModRef) {
+ UsesI = true;
+ break;
+ }
+ }
+ if (UsesI) break;
+ }
+ }
+ }
+
+ if (UsesI && UpdateUsers) {
+ if (J->mayWriteToMemory()) WriteSet.add(J);
+ Users.insert(J);
+ }
+
+ return UsesI;
+ }
+
+ // This function iterates over all instruction pairs in the provided
+ // basic block and collects all candidate pairs for vectorization.
+ void BBVectorize::getCandidatePairs(BasicBlock &BB,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts) {
+ BasicBlock::iterator E = BB.end();
+ for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+ bool IsSimpleLoadStore;
+ if (!isInstVectorizable(I, IsSimpleLoadStore)) continue;
+
+ // Look for an instruction with which to pair instruction *I...
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ BasicBlock::iterator J = I; ++J;
+ for (unsigned ss = 0; J != E && ss <= SearchLimit; ++J, ++ss) {
+ // Determine if J uses I, if so, exit the loop.
+ bool UsesI = trackUsesOfI(Users, WriteSet, I, J, !FastDep);
+ if (FastDep) {
+ // Note: For this heuristic to be effective, independent operations
+ // must tend to be intermixed. This is likely to be true from some
+ // kinds of grouped loop unrolling (but not the generic LLVM pass),
+ // but otherwise may require some kind of reordering pass.
+
+ // When using fast dependency analysis,
+ // stop searching after first use:
+ if (UsesI) break;
+ } else {
+ if (UsesI) continue;
+ }
+
+ // J does not use I, and comes before the first use of I, so it can be
+ // merged with I if the instructions are compatible.
+ if (!areInstsCompatible(I, J, IsSimpleLoadStore)) continue;
+
+ // J is a candidate for merging with I.
+ if (!PairableInsts.size() ||
+ PairableInsts[PairableInsts.size()-1] != I) {
+ PairableInsts.push_back(I);
+ }
+ CandidatePairs.insert(ValuePair(I, J));
+ DEBUG(if (DebugCandidateSelection) dbgs() << "BBV: candidate pair "
+ << *I << " <-> " << *J << "\n");
+ }
+ }
+
+ DEBUG(dbgs() << "BBV: found " << PairableInsts.size()
+ << " instructions with candidate pairs\n");
+ }
+
+ // Finds candidate pairs connected to the pair P = <PI, PJ>. This means that
+ // it looks for pairs such that both members have an input which is an
+ // output of PI or PJ.
+ void BBVectorize::computePairsConnectedTo(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs,
+ ValuePair P) {
+ // For each possible pairing for this variable, look at the uses of
+ // the first value...
+ for (Value::use_iterator I = P.first->use_begin(),
+ E = P.first->use_end(); I != E; ++I) {
+ VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
+
+ // For each use of the first variable, look for uses of the second
+ // variable...
+ for (Value::use_iterator J = P.second->use_begin(),
+ E2 = P.second->use_end(); J != E2; ++J) {
+ VPIteratorPair JPairRange = CandidatePairs.equal_range(*J);
+
+ // Look for <I, J>:
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+
+ // Look for <J, I>:
+ if (isSecondInIteratorPair<Value*>(*I, JPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*J, *I)));
+ }
+
+ if (SplatBreaksChain) continue;
+ // Look for cases where just the first value in the pair is used by
+ // both members of another pair (splatting).
+ for (Value::use_iterator J = P.first->use_begin(); J != E; ++J) {
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ }
+ }
+
+ if (SplatBreaksChain) return;
+ // Look for cases where just the second value in the pair is used by
+ // both members of another pair (splatting).
+ for (Value::use_iterator I = P.second->use_begin(),
+ E = P.second->use_end(); I != E; ++I) {
+ VPIteratorPair IPairRange = CandidatePairs.equal_range(*I);
+
+ for (Value::use_iterator J = P.second->use_begin(); J != E; ++J) {
+ if (isSecondInIteratorPair<Value*>(*J, IPairRange))
+ ConnectedPairs.insert(VPPair(P, ValuePair(*I, *J)));
+ }
+ }
+ }
+
+ // This function figures out which pairs are connected. Two pairs are
+ // connected if some output of the first pair forms an input to both members
+ // of the second pair.
+ void BBVectorize::computeConnectedPairs(
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ std::multimap<ValuePair, ValuePair> &ConnectedPairs) {
+
+ for (std::vector<Value *>::iterator PI = PairableInsts.begin(),
+ PE = PairableInsts.end(); PI != PE; ++PI) {
+ VPIteratorPair choiceRange = CandidatePairs.equal_range(*PI);
+
+ for (std::multimap<Value *, Value *>::iterator P = choiceRange.first;
+ P != choiceRange.second; ++P)
+ computePairsConnectedTo(CandidatePairs, PairableInsts,
+ ConnectedPairs, *P);
+ }
+
+ DEBUG(dbgs() << "BBV: found " << ConnectedPairs.size()
+ << " pair connections.\n");
+ }
+
+ // This function builds a set of use tuples such that <A, B> is in the set
+ // if B is in the use tree of A. If B is in the use tree of A, then B
+ // depends on the output of A.
+ void BBVectorize::buildDepMap(
+ BasicBlock &BB,
+ std::multimap<Value *, Value *> &CandidatePairs,
+ std::vector<Value *> &PairableInsts,
+ DenseSet<ValuePair> &PairableInstUsers) {
+ DenseSet<Value *> IsInPair;
+ for (std::multimap<Value *, Value *>::iterator C = CandidatePairs.begin(),
+ E = CandidatePairs.end(); C != E; ++C) {
+ IsInPair.insert(C->first);
+ IsInPair.insert(C->second);
+ }
+
+ // Iterate through the basic block, recording all Users of each
+ // pairable instruction.
+
+ BasicBlock::iterator E = BB.end();
+ for (BasicBlock::iterator I = BB.getFirstInsertionPt(); I != E; ++I) {
+ if (IsInPair.find(I) == IsInPair.end()) continue;
+
+ DenseSet<Value *> Users;
+ AliasSetTracker WriteSet(*AA);
+ for (BasicBlock::iterator J = llvm::next(I); J != E; ++J)
+ (void) trackUsesOfI(Users, WriteSet, I, J);
+
+ for (DenseSet<Value *>::iterator U = Users.begin(), E = Users.end();
+ U != E; ++U)
+ PairableInstUsers.insert(ValuePair(I, *U));
+ }
+ }
+
+ // Returns true if an input to pair P is an output of pair Q and also an
+ // input of pair Q is an output of pair P. If this is the case, then these
+ // two pairs cannot be simultaneously fused.
+ bool BBVectorize::pairsConflict(ValuePair P, ValuePair Q,
+ DenseSet<ValuePair> &PairableInstUsers,
+ std::multimap<ValuePair, ValuePair> *PairableInstUserMap) {
+ // Two pairs are in conflict if they are mutual Users of eachother.
+ bool QUsesP = PairableInstUsers.count(ValuePair(P.first, Q.first)) ||
+ PairableInstUsers.count(ValuePair(P.first, Q.second)) ||
+ PairableInstUsers.count(ValuePair(P.second, Q.first)) ||
+ PairableInstUsers.count(ValuePair(P.second, Q.second));
+ bool PUsesQ = PairableInstUsers.count(ValuePair(Q.first, P.first)) ||
+ PairableInstUsers.count(ValuePair(Q.first, P.second)) ||
+ PairableInstUsers.count(ValuePair(Q.second, P.first)) ||
+ PairableInstUsers.count(ValuePair(Q.second, P.second));
+ if (PairableInstUserMap) {
+ // FIXME: The expensive part of the cycle check is not so much the cycle
+ // check itself but this edge insertion procedure. This needs some
+ // profiling and probably a different data structure (same is true of
+ // most uses of std::multimap).
+ if (PUsesQ) {
+ VPPIteratorPair QPairRange = PairableInstUserMap->equal_range(Q);
+ if (!isSecondInIteratorPair(P, QPairRange))
+ PairableInstUserMap->insert(VPPair(Q, P));
+ }
+ if (QUsesP) {
+ VPPIteratorPair PPairRange = PairableInstUserMap->equal_range(P);
+ if (!isSecondInIteratorPair(Q, PPairRange))
+ PairableInstUserMap->insert(VPPair(P, Q));
+ }
+ }
+
+ return (QUsesP && PUsesQ);
+ }
+
+ // This function walks the use graph of current pairs to see if, starting
+ // from P, the walk returns to P.
+ bool BBVectorize::pairWillFormCycle(ValuePair P,
+ std::multimap<ValuePair, ValuePair> &PairableInstUserMap,
+ DenseSet<ValuePair> &CurrentPairs) {
+ DEBUG(if (DebugCycleCheck)
+ dbgs() << "BBV: starting cycle check for : " << *P.first << " <-> "
+ << *P.second << "\n");
+ // A lookup table of visisted pairs is kept because the PairableInstUserMap
+ // contains non-direct associations.
+ DenseSet<ValuePair> Visited;
+ std::vector<ValuePair> Q;
+ // General depth-first post-order traversal:
+ Q.push_back(P);
+ while (!Q.empty()) {
+ ValueP