diff options
author | Hal Finkel <hfinkel@anl.gov> | 2012-02-01 03:51:43 +0000 |
---|---|---|
committer | Hal Finkel <hfinkel@anl.gov> | 2012-02-01 03:51:43 +0000 |
commit | de5e5ec3045a73a06b1054417f9ac6c02929e9ce (patch) | |
tree | 02f43070729c0328db2d86e33acd94df3ab52d53 | |
parent | d0e277d272d517ca1cda368267d199f0da7cad95 (diff) |
Add a basic-block autovectorization pass.
This is the initial checkin of the basic-block autovectorization pass along with some supporting vectorization infrastructure.
Special thanks to everyone who helped review this code over the last several months (especially Tobias Grosser).
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@149468 91177308-0d34-0410-b5e6-96231b3b80d8
35 files changed, 2635 insertions, 12 deletions
diff --git a/docs/Passes.html b/docs/Passes.html index 5c42f3fdd5..840b2eff06 100644 --- a/docs/Passes.html +++ b/docs/Passes.html @@ -126,6 +126,7 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if ! <tr><td><a href="#adce">-adce</a></td><td>Aggressive Dead Code Elimination</td></tr> <tr><td><a href="#always-inline">-always-inline</a></td><td>Inliner for always_inline functions</td></tr> <tr><td><a href="#argpromotion">-argpromotion</a></td><td>Promote 'by reference' arguments to scalars</td></tr> +<tr><td><a href="#bb-vectorize">-bb-vectorize</a></td><td>Combine instructions to form vector instructions within basic blocks</td></tr> <tr><td><a href="#block-placement">-block-placement</a></td><td>Profile Guided Basic Block Placement</td></tr> <tr><td><a href="#break-crit-edges">-break-crit-edges</a></td><td>Break critical edges in CFG</td></tr> <tr><td><a href="#codegenprepare">-codegenprepare</a></td><td>Optimize for code generation</td></tr> @@ -817,6 +818,26 @@ perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print " <p>\n" if ! <!-------------------------------------------------------------------------- --> <h3> + <a name="bb-vectorize">-bb-vectorize: Basic-Block Vectorization</a> +</h3> +<div> + <p>This pass combines instructions inside basic blocks to form vector + instructions. It iterates over each basic block, attempting to pair + compatible instructions, repeating this process until no additional + pairs are selected for vectorization. When the outputs of some pair + of compatible instructions are used as inputs by some other pair of + compatible instructions, those pairs are part of a potential + vectorization chain. Instruction pairs are only fused into vector + instructions when they are part of a chain longer than some + threshold length. Moreover, the pass attempts to find the best + possible chain for each pair of compatible instructions. These + heuristics are intended to prevent vectorization in cases where + it would not yield a performance increase of the resulting code. + </p> +</div> + +<!-------------------------------------------------------------------------- --> +<h3> <a name="block-placement">-block-placement: Profile Guided Basic Block Placement</a> </h3> <div> diff --git a/include/llvm-c/Initialization.h b/include/llvm-c/Initialization.h index 3b59abbec0..cbe60df908 100644 --- a/include/llvm-c/Initialization.h +++ b/include/llvm-c/Initialization.h @@ -25,6 +25,7 @@ extern "C" { void LLVMInitializeCore(LLVMPassRegistryRef R); void LLVMInitializeTransformUtils(LLVMPassRegistryRef R); void LLVMInitializeScalarOpts(LLVMPassRegistryRef R); +void LLVMInitializeVectorization(LLVMPassRegistryRef R); void LLVMInitializeInstCombine(LLVMPassRegistryRef R); void LLVMInitializeIPO(LLVMPassRegistryRef R); void LLVMInitializeInstrumentation(LLVMPassRegistryRef R); diff --git a/include/llvm-c/Transforms/Vectorize.h b/include/llvm-c/Transforms/Vectorize.h new file mode 100644 index 0000000000..178465ac82 --- /dev/null +++ b/include/llvm-c/Transforms/Vectorize.h @@ -0,0 +1,37 @@ +/*===---------------------------Vectorize.h ------------------- -*- C++ -*-===*\ +|*===----------- Vectorization Transformation Library C Interface ---------===*| +|* *| +|* The LLVM Compiler Infrastructure *| +|* *| +|* This file is distributed under the University of Illinois Open Source *| +|* License. See LICENSE.TXT for details. *| +|* *| +|*===----------------------------------------------------------------------===*| +|* *| +|* This header declares the C interface to libLLVMVectorize.a, which *| +|* implements various vectorization transformations of the LLVM IR. *| +|* *| +|* Many exotic languages can interoperate with C code but have a harder time *| +|* with C++ due to name mangling. So in addition to C, this interface enables *| +|* tools written in such languages. *| +|* *| +\*===----------------------------------------------------------------------===*/ + +#ifndef LLVM_C_TRANSFORMS_VECTORIZE_H +#define LLVM_C_TRANSFORMS_VECTORIZE_H + +#include "llvm-c/Core.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** See llvm::createBBVectorizePass function. */ +void LLVMAddBBVectorizePass(LLVMPassManagerRef PM); + +#ifdef __cplusplus +} +#endif /* defined(__cplusplus) */ + +#endif + diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 6cc3f195c7..ac129c77d5 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -31,6 +31,10 @@ void initializeTransformUtils(PassRegistry&); /// ScalarOpts library. void initializeScalarOpts(PassRegistry&); +/// initializeVectorization - Initialize all passes linked into the +/// Vectorize library. +void initializeVectorization(PassRegistry&); + /// initializeInstCombine - Initialize all passes linked into the /// ScalarOpts library. void initializeInstCombine(PassRegistry&); @@ -236,7 +240,7 @@ void initializeVirtRegMapPass(PassRegistry&); void initializeInstSimplifierPass(PassRegistry&); void initializeUnpackMachineBundlesPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); - +void initializeBBVectorizePass(PassRegistry&); } #endif diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 537fab757b..2258d45ce9 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -31,6 +31,7 @@ #include "llvm/Transforms/Instrumentation.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include <cstdlib> @@ -151,6 +152,7 @@ namespace { (void) llvm::createCorrelatedValuePropagationPass(); (void) llvm::createMemDepPrinter(); (void) llvm::createInstructionSimplifierPass(); + (void) llvm::createBBVectorizePass(); (void)new llvm::IntervalPartition(); (void)new llvm::FindUsedTypes(); diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h index b4ba184781..a1b4f5cd90 100644 --- a/include/llvm/Transforms/IPO/PassManagerBuilder.h +++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -99,6 +99,7 @@ public: bool DisableSimplifyLibCalls; bool DisableUnitAtATime; bool DisableUnrollLoops; + bool Vectorize; private: /// ExtensionList - This is list of all of the extensions that are registered. diff --git a/include/llvm/Transforms/Vectorize.h b/include/llvm/Transforms/Vectorize.h new file mode 100644 index 0000000000..dfc099ddb9 --- /dev/null +++ b/include/llvm/Transforms/Vectorize.h @@ -0,0 +1,30 @@ +//===-- Vectorize.h - Vectorization Transformations -------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This header file defines prototypes for accessor functions that expose passes +// in the Vectorize transformations library. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_H +#define LLVM_TRANSFORMS_VECTORIZE_H + +namespace llvm { + +class BasicBlockPass; + +//===----------------------------------------------------------------------===// +// +// BBVectorize - A basic-block vectorization pass. +// +BasicBlockPass *createBBVectorizePass(); + +} // End llvm namespace + +#endif diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index 10e0cc6b56..de1353e6c1 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -3,4 +3,5 @@ add_subdirectory(Instrumentation) add_subdirectory(InstCombine) add_subdirectory(Scalar) add_subdirectory(IPO) +add_subdirectory(Vectorize) add_subdirectory(Hello) diff --git a/lib/Transforms/IPO/LLVMBuild.txt b/lib/Transforms/IPO/LLVMBuild.txt index b358fabdfa..b18c9150f4 100644 --- a/lib/Transforms/IPO/LLVMBuild.txt +++ b/lib/Transforms/IPO/LLVMBuild.txt @@ -20,4 +20,4 @@ type = Library name = IPO parent = Transforms library_name = ipo -required_libraries = Analysis Core IPA InstCombine Scalar Support Target TransformUtils +required_libraries = Analysis Core IPA InstCombine Scalar Vectorize Support Target TransformUtils diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index afd25dc317..84084374b3 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -21,14 +21,20 @@ #include "llvm/DefaultPasses.h" #include "llvm/PassManager.h" #include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Vectorize.h" #include "llvm/Transforms/IPO.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/ManagedStatic.h" using namespace llvm; +static cl::opt<bool> +RunVectorization("vectorize", cl::desc("Run vectorization passes")); + PassManagerBuilder::PassManagerBuilder() { OptLevel = 2; SizeLevel = 0; @@ -37,6 +43,7 @@ PassManagerBuilder::PassManagerBuilder() { DisableSimplifyLibCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; + Vectorize = RunVectorization; } PassManagerBuilder::~PassManagerBuilder() { @@ -172,6 +179,13 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { addExtensionsToPM(EP_ScalarOptimizerLate, MPM); + if (Vectorize) { + MPM.add(createBBVectorizePass()); + MPM.add(createInstructionCombiningPass()); + if (OptLevel > 1) + MPM.add(createGVNPass()); // Remove redundancies + } + MPM.add(createAggressiveDCEPass()); // Delete dead instructions MPM.add(createCFGSimplificationPass()); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt index b2ef49a4c6..f7bca064c7 100644 --- a/lib/Transforms/LLVMBuild.txt +++ b/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = IPO InstCombine Instrumentation Scalar Utils +subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize [component_0] type = Group diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index e527be25de..8b1df92fa2 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -8,7 +8,7 @@ ##===----------------------------------------------------------------------===## LEVEL = ../.. -PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Hello +PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello include $(LEVEL)/Makefile.config diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp new file mode 100644 index 0000000000..9c2c8dd51b --- /dev/null +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -0,0 +1,1796 @@ +//===- BBVectorize.cpp - A Basic-Block Vectorizer -------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements a basic-block vectorization pass. The algorithm was +// inspired by that used by the Vienna MAP Vectorizor by Franchetti and Kral, +// et al. It works by looking for chains of pairable operations and then +// pairing them. +// +//===----------------------------------------------------------------------===// + +#define BBV_NAME "bb-vectorize" +#define DEBUG_TYPE BBV_NAME +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" +#include "llvm/Pass.h" +#include "llvm/Type.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AliasSetTracker.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Transforms/Vectorize.h" +#include <algorithm> +#include <map> +using namespace llvm; + +static cl::opt<unsigned> +ReqChainDepth("bb-vectorize-req-chain-depth", cl::init(6), cl::Hidden, + cl::desc("The required chain depth for vectorization")); + +static cl::opt<unsigned> +SearchLimit("bb-vectorize-search-limit", cl::init(400), cl::Hidden, + cl::desc("The maximum search distance for instruction pairs")); + +static cl::opt<bool> +SplatBreaksChain("bb-vectorize-splat-breaks-chain", cl::init(false), cl::Hidden, + cl::desc("Replicating one element to a pair breaks the chain")); + +static cl::opt<unsigned> +VectorBits("bb-vectorize-vector-bits", cl::init(128), cl::Hidden, + cl::desc("The size of the native vector registers")); + +static cl::opt<unsigned> +MaxIter("bb-vectorize-max-iter", cl::init(0), cl::Hidden, + cl::desc("The maximum number of pairing iterations")); + +static cl::opt<unsigned> +MaxCandPairsForCycleCheck("bb-vectorize-max-cycle-check-pairs", cl::init(200), + cl::Hidden, cl::desc("The maximum number of candidate pairs with which to use" + " a full cycle check")); + +static cl::opt<bool> +NoInts("bb-vectorize-no-ints", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize integer values")); + +static cl::opt<bool> +NoFloats("bb-vectorize-no-floats", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point values")); + +static cl::opt<bool> +NoCasts("bb-vectorize-no-casts", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize casting (conversion) operations")); + +static cl::opt<bool> +NoMath("bb-vectorize-no-math", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize floating-point math intrinsics")); + +static cl::opt<bool> +NoFMA("bb-vectorize-no-fma", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize the fused-multiply-add intrinsic")); + +static cl::opt<bool> +NoMemOps("bb-vectorize-no-mem-ops", cl::init(false), cl::Hidden, + cl::desc("Don't try to vectorize loads and stores")); + +static cl::opt<bool> +AlignedOnly("bb-vectorize-aligned-only", cl::init(false), cl::Hidden, + cl::desc("Only generate aligned loads and stores")); + +static cl::opt<bool> +FastDep("bb-vectorize-fast-dep", cl::init(false), cl::Hidden, + cl::desc("Use a fast instruction dependency analysis")); + +#ifndef NDEBUG +static cl::opt<bool> +DebugInstructionExamination("bb-vectorize-debug-instruction-examination", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " instruction-examination process")); +static cl::opt<bool> +DebugCandidateSelection("bb-vectorize-debug-candidate-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " candidate-selection process")); +static cl::opt<bool> +DebugPairSelection("bb-vectorize-debug-pair-selection", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " pair-selection process")); +static cl::opt<bool> +DebugCycleCheck("bb-vectorize-debug-cycle-check", + cl::init(false), cl::Hidden, + cl::desc("When debugging is enabled, output information on the" + " cycle-checking process")); +#endif + +STATISTIC(NumFusedOps, "Number of operations fused by bb-vectorize"); + +namespace { + struct BBVectorize : public BasicBlockPass { + static char ID; // Pass identification, replacement for typeid + BBVectorize() : BasicBlockPass(ID) { + initializeBBVectorizePass(*PassRegistry::getPassRegistry()); + } + + typedef std::pair<Value *, Value *> ValuePair; + typedef std::pair<ValuePair, size_t> ValuePairWithDepth; + typedef std::pair<ValuePair, ValuePair> VPPair; // A ValuePair pair + typedef std::pair<std::multimap<Value *, Value *>::iterator, + std::multimap<Value *, Value *>::iterator> VPIteratorPair; + typedef std::pair<std::multimap<ValuePair, ValuePair>::iterator, + std::multimap<ValuePair, ValuePair>::iterator> + VPPIteratorPair; + + AliasAnalysis *AA; + ScalarEvolution *SE; + TargetData *TD; + + // FIXME: const correct? + + bool vectorizePairs(BasicBlock &BB); + + void getCandidatePairs(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts); + + void computeConnectedPairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs); + + void buildDepMap(BasicBlock &BB, + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + DenseSet<ValuePair> &PairableInstUsers); + + void choosePairs(std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *>& ChosenPairs); + + void fuseChosenPairs(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *>& ChosenPairs); + + bool isInstVectorizable(Instruction *I, bool &IsSimpleLoadStore); + + bool areInstsCompatible(Instruction *I, Instruction *J, + bool IsSimpleLoadStore); + + bool trackUsesOfI(DenseSet<Value *> &Users, + AliasSetTracker &WriteSet, Instruction *I, + Instruction *J, bool UpdateUsers = true, + std::multimap<Value *, Value *> *LoadMoveSet = 0); + + void computePairsConnectedTo( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + ValuePair P); + + bool pairsConflict(ValuePair P, ValuePair Q, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> *PairableInstUserMap = 0); + + bool pairWillFormCycle(ValuePair P, + std::multimap<ValuePair, ValuePair> &PairableInstUsers, + DenseSet<ValuePair> &CurrentPairs); + + void pruneTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, + DenseSet<ValuePair> &PrunedTree, ValuePair J, + bool UseCycleCheck); + + void buildInitialTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + DenseMap<Value *, Value *> &ChosenPairs, + DenseMap<ValuePair, size_t> &Tree, ValuePair J); + + void findBestTreeFor( + std::multimap<Value *, Value *> &CandidatePairs, + std::vector<Value *> &PairableInsts, + std::multimap<ValuePair, ValuePair> &ConnectedPairs, + DenseSet<ValuePair> &PairableInstUsers, + std::multimap<ValuePair, ValuePair> &PairableInstUserMap, + DenseMap<Value *, Value *> &ChosenPairs, + DenseSet<ValuePair> &BestTree, size_t &BestMaxDepth, + size_t &BestEffSize, VPIteratorPair ChoiceRange, + bool UseCycleCheck); + + Value *getReplacementPointerInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool &FlipMemInputs); + + void fillNewShuffleMask(LLVMContext& Context, Instruction *J, + unsigned NumElem, unsigned MaskOffset, unsigned NumInElem, + unsigned IdxOffset, std::vector<Constant*> &Mask); + + Value *getReplacementShuffleMask(LLVMContext& Context, Instruction *I, + Instruction *J); + + Value *getReplacementInput(LLVMContext& Context, Instruction *I, + Instruction *J, unsigned o, bool FlipMemInputs); + + void getReplacementInputsForPair(LLVMContext& Context, Instruction *I, + Instruction *J, SmallVector<Value *, 3> &ReplacedOperands, + bool &FlipMemInputs); + + void replaceOutputsOfPair(LLVMContext& Context, Instruction *I, + Instruction *J, Instruction *K, + Instruction *&InsertionPt, Instruction *&K1, + Instruction *&K2, bool &FlipMemInputs); + + void collectPairLoadMoveSet(BasicBlock &BB, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I); + + void collectLoadMoveSet(BasicBlock &BB, + std::vector<Value *> &PairableInsts, + DenseMap<Value *, Value *> &ChosenPairs, + std::multimap<Value *, Value *> &LoadMoveSet); + + bool canMoveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *I, Instruction *J); + + void moveUsesOfIAfterJ(BasicBlock &BB, + std::multimap<Value *, Value *> &LoadMoveSet, + Instruction *&InsertionPt, + Instruction *I, Instruction *J); + + virtual bool runOnBasicBlock(BasicBlock &BB) { + AA = &getAnalysis<AliasAnalysis>(); + SE = &getAnalysis<ScalarEvolution>(); + TD = getAnalysisIfAvailable<TargetData>(); + + bool changed = false; + // Iterate a sufficient number of times to merge types of size 1 bit, + // then 2 bits, then 4, etc. up to half of the target vector width of the + // target vector register. + for (unsigned v = 2, n = 1; v <= VectorBits && (!MaxIter || n <= MaxIter); + v *= 2, ++n) { + DEBUG(dbgs() << "BBV: fusing loop #" << n << + " for " << BB.getName() << " in " << + BB.getParent()->getName() << "...\n"); + if (vectorizePairs(BB)) + changed = true; + else + break; + } + + DEBUG(dbgs() << "BBV: done!\n"); + return changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + BasicBlockPass::getAnalysisUsage(AU); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<AliasAnalysis>(); + AU.addPreserved<ScalarEvolution>(); + } + + // This returns the vector type that holds a pair of the provided type. + // If the provided type is already a vector, then its length is doubled. + static inline VectorType *getVecTypeForPair(Type *ElemTy) { + if (VectorType *VTy = dyn_cast<VectorType>(ElemTy)) { + unsigned numElem = VTy->getNumElements(); + return VectorType::get(ElemTy->getScalarType(), numElem*2); + } else { + return VectorType::get(ElemTy, 2); + } + } + + // Returns the weight associated with the provided value. A chain of + // candidate pairs has a length given by the sum of the weights of its + // members (one weight per pair; the weight of each member of the pair + // is assumed to be the same). This length is then compared to the + // chain-length threshold to determine if a given chain is significant + // enough to be vectorized. The length is also used in comparing + // candidate chains where longer chains are considered to be better. + // Note: when this function returns 0, the resulting instructions are + // not actually fused. + static inline size_t getDepthFactor(Value *V) { + // InsertElement and ExtractElement have a depth factor of zero. This is + // for two reasons: First, they cannot be usefully fused. Second, because + // the pass generates a lot of these, they can confuse the simple metric + // used to compare the trees in the next iteration. Thus, giving them a + // weight of zero allows the pass to essentially ignore them in + // subsequent iterations when looking for vectorization opportunities + // while still tracking dependency chains that flow through those + // instructions. + if (isa<InsertElementInst>(V) || isa<ExtractElementInst>(V)) + return 0; + + return 1; + } + + // This determines the relative offset of two loads or stores, returning + // true if the offset could be determined to be some constant value. + // For example, if OffsetInElmts == 1, then J accesses the memory directly + // after I; if OffsetInElmts == -1 then I accesses the memory + // directly after J. This function assumes that both instructions + // have the same type. + bool getPairPtrInfo(Instruction *I, Instruction *J, + Value *&IPtr, Value *&JPtr, unsigned &IAlignment, unsigned &JAlignment, + int64_t &OffsetInElmts) { + OffsetInElmts = 0; + if (isa<LoadInst>(I)) { + IPt |