aboutsummaryrefslogtreecommitdiff
path: root/lib/Transforms/Vectorize/LoopVectorize.cpp
diff options
context:
space:
mode:
authorDerek Schuff <dschuff@chromium.org>2013-01-30 11:34:40 -0800
committerDerek Schuff <dschuff@chromium.org>2013-01-30 11:34:40 -0800
commit1843e19bce9b11fc840858e136c6c52cf8b42e0b (patch)
treee8bfc928152e2d3b3dd120d141d13dc08a9b49e4 /lib/Transforms/Vectorize/LoopVectorize.cpp
parentaa0fa8a8df25807f784ec9ca9deeb40328636595 (diff)
parenta662a9862501fc86904e90054f7c1519101d9126 (diff)
Merge commit 'a662a9862501fc86904e90054f7c1519101d9126'
Conflicts: include/llvm/CodeGen/IntrinsicLowering.h include/llvm/MC/MCAssembler.h include/llvm/MC/MCObjectStreamer.h lib/LLVMBuild.txt lib/Linker/LinkArchives.cpp lib/MC/MCAssembler.cpp lib/MC/MCELFStreamer.cpp lib/MC/MCParser/AsmParser.cpp lib/MC/MCPureStreamer.cpp lib/MC/WinCOFFStreamer.cpp lib/Makefile lib/Support/Unix/Memory.inc lib/Support/Unix/Process.inc lib/Support/Unix/Program.inc lib/Target/ARM/ARM.h lib/Target/ARM/ARMFastISel.cpp lib/Target/ARM/ARMISelLowering.cpp lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp lib/Target/Mips/MipsInstrFPU.td lib/Target/X86/CMakeLists.txt lib/Target/X86/X86ISelLowering.cpp lib/Target/X86/X86TargetMachine.cpp lib/Target/X86/X86TargetObjectFile.cpp lib/Transforms/InstCombine/InstCombineCalls.cpp test/CodeGen/X86/fast-isel-x86-64.ll tools/llc/llc.cpp tools/lto/LTOModule.cpp utils/TableGen/EDEmitter.cpp
Diffstat (limited to 'lib/Transforms/Vectorize/LoopVectorize.cpp')
-rw-r--r--lib/Transforms/Vectorize/LoopVectorize.cpp1788
1 files changed, 1404 insertions, 384 deletions
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index feeececedb..464ed97506 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6,7 +6,51 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-#include "LoopVectorize.h"
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizes uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iteration into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+// of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+// widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+// of vectorization. It decides on the optimal vector width, which
+// can be one, if vectorization is not profitable.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
+// Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AliasSetTracker.h"
@@ -14,46 +58,526 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/Verifier.h"
-#include "llvm/Constants.h"
-#include "llvm/DataLayout.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetTransformInfo.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
-#include "llvm/Type.h"
-#include "llvm/Value.h"
+#include <algorithm>
+#include <map>
+
+using namespace llvm;
static cl::opt<unsigned>
VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
cl::desc("Sets the SIMD width. Zero is autoselect."));
+static cl::opt<unsigned>
+VectorizationUnroll("force-vector-unroll", cl::init(0), cl::Hidden,
+ cl::desc("Sets the vectorization unroll count. "
+ "Zero is autoselect."));
+
static cl::opt<bool>
EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
cl::desc("Enable if-conversion during vectorization."));
+/// We don't vectorize loops with a known constant trip count below this number.
+static const unsigned TinyTripCountVectorThreshold = 16;
+
+/// We don't unroll loops with a known constant trip count below this number.
+static const unsigned TinyTripCountUnrollThreshold = 128;
+
+/// We don't unroll loops that are larget than this threshold.
+static const unsigned MaxLoopSizeThreshold = 32;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+static const unsigned RuntimeMemoryCheckThreshold = 4;
+
namespace {
+// Forward declarations.
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+/// counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+/// instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+ InnerLoopVectorizer(Loop *OrigLoop, ScalarEvolution *SE, LoopInfo *LI,
+ DominatorTree *DT, DataLayout *DL, unsigned VecWidth,
+ unsigned UnrollFactor)
+ : OrigLoop(OrigLoop), SE(SE), LI(LI), DT(DT), DL(DL), VF(VecWidth),
+ UF(UnrollFactor), Builder(SE->getContext()), Induction(0),
+ OldInduction(0), WidenMap(UnrollFactor) {}
+
+ // Perform the actual loop widening (vectorization).
+ void vectorize(LoopVectorizationLegality *Legal) {
+ // Create a new empty loop. Unlink the old loop and connect the new one.
+ createEmptyLoop(Legal);
+ // Widen each instruction in the old loop to a new one in the new loop.
+ // Use the Legality module to find the induction and reduction variables.
+ vectorizeLoop(Legal);
+ // Register the new loop and update the analysis passes.
+ updateAnalysis();
+ }
+
+private:
+ /// A small list of PHINodes.
+ typedef SmallVector<PHINode*, 4> PhiVector;
+ /// When we unroll loops we have multiple vector values for each scalar.
+ /// This data structure holds the unrolled and vectorized values that
+ /// originated from one scalar instruction.
+ typedef SmallVector<Value*, 2> VectorParts;
+
+ /// Add code that checks at runtime if the accessed arrays overlap.
+ /// Returns the comparator value or NULL if no check is needed.
+ Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+ Instruction *Loc);
+ /// Create an empty loop, based on the loop ranges of the old loop.
+ void createEmptyLoop(LoopVectorizationLegality *Legal);
+ /// Copy and widen the instructions from the old loop.
+ void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+ /// A helper function that computes the predicate of the block BB, assuming
+ /// that the header block of the loop is set to True. It returns the *entry*
+ /// mask for the block BB.
+ VectorParts createBlockInMask(BasicBlock *BB);
+ /// A helper function that computes the predicate of the edge between SRC
+ /// and DST.
+ VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+ /// A helper function to vectorize a single BB within the innermost loop.
+ void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
+ PhiVector *PV);
+
+ /// Insert the new loop to the loop hierarchy and pass manager
+ /// and update the analysis passes.
+ void updateAnalysis();
+
+ /// This instruction is un-vectorizable. Implement it as a sequence
+ /// of scalars.
+ void scalarizeInstruction(Instruction *Instr);
+
+ /// Create a broadcast instruction. This method generates a broadcast
+ /// instruction (shuffle) for loop invariant values and for the induction
+ /// value. If this is the induction variable then we extend it to N, N+1, ...
+ /// this is needed because each iteration in the loop corresponds to a SIMD
+ /// element.
+ Value *getBroadcastInstrs(Value *V);
+
+ /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
+ /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
+ /// The sequence starts at StartIndex.
+ Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate);
+
+ /// When we go over instructions in the basic block we rely on previous
+ /// values within the current basic block or on loop invariant values.
+ /// When we widen (vectorize) values we place them in the map. If the values
+ /// are not within the map, they have to be loop invariant, so we simply
+ /// broadcast them into a vector.
+ VectorParts &getVectorValue(Value *V);
+
+ /// Generate a shuffle sequence that will reverse the vector Vec.
+ Value *reverseVector(Value *Vec);
+
+ /// This is a helper class that holds the vectorizer state. It maps scalar
+ /// instructions to vector instructions. When the code is 'unrolled' then
+ /// then a single scalar value is mapped to multiple vector parts. The parts
+ /// are stored in the VectorPart type.
+ struct ValueMap {
+ /// C'tor. UnrollFactor controls the number of vectors ('parts') that
+ /// are mapped.
+ ValueMap(unsigned UnrollFactor) : UF(UnrollFactor) {}
+
+ /// \return True if 'Key' is saved in the Value Map.
+ bool has(Value *Key) { return MapStoreage.count(Key); }
+
+ /// Initializes a new entry in the map. Sets all of the vector parts to the
+ /// save value in 'Val'.
+ /// \return A reference to a vector with splat values.
+ VectorParts &splat(Value *Key, Value *Val) {
+ MapStoreage[Key].clear();
+ MapStoreage[Key].append(UF, Val);
+ return MapStoreage[Key];
+ }
+
+ ///\return A reference to the value that is stored at 'Key'.
+ VectorParts &get(Value *Key) {
+ if (!has(Key))
+ MapStoreage[Key].resize(UF);
+ return MapStoreage[Key];
+ }
+
+ /// The unroll factor. Each entry in the map stores this number of vector
+ /// elements.
+ unsigned UF;
+
+ /// Map storage. We use std::map and not DenseMap because insertions to a
+ /// dense map invalidates its iterators.
+ std::map<Value*, VectorParts> MapStoreage;
+ };
+
+ /// The original loop.
+ Loop *OrigLoop;
+ /// Scev analysis to use.
+ ScalarEvolution *SE;
+ /// Loop Info.
+ LoopInfo *LI;
+ /// Dominator Tree.
+ DominatorTree *DT;
+ /// Data Layout.
+ DataLayout *DL;
+ /// The vectorization SIMD factor to use. Each vector will have this many
+ /// vector elements.
+ unsigned VF;
+ /// The vectorization unroll factor to use. Each scalar is vectorized to this
+ /// many different vector instructions.
+ unsigned UF;
+
+ /// The builder that we use
+ IRBuilder<> Builder;
+
+ // --- Vectorization state ---
+
+ /// The vector-loop preheader.
+ BasicBlock *LoopVectorPreHeader;
+ /// The scalar-loop preheader.
+ BasicBlock *LoopScalarPreHeader;
+ /// Middle Block between the vector and the scalar.
+ BasicBlock *LoopMiddleBlock;
+ ///The ExitBlock of the scalar loop.
+ BasicBlock *LoopExitBlock;
+ ///The vector loop body.
+ BasicBlock *LoopVectorBody;
+ ///The scalar loop body.
+ BasicBlock *LoopScalarBody;
+ ///The first bypass block.
+ BasicBlock *LoopBypassBlock;
+
+ /// The new Induction variable which was added to the new block.
+ PHINode *Induction;
+ /// The induction variable of the old basic block.
+ PHINode *OldInduction;
+ /// Maps scalars to widened vectors.
+ ValueMap WidenMap;
+};
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+/// will change the order of memory accesses in a way that will change the
+/// correctness of the program.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+ LoopVectorizationLegality(Loop *L, ScalarEvolution *SE, DataLayout *DL,
+ DominatorTree *DT)
+ : TheLoop(L), SE(SE), DL(DL), DT(DT), Induction(0) {}
+
+ /// This enum represents the kinds of reductions that we support.
+ enum ReductionKind {
+ RK_NoReduction, ///< Not a reduction.
+ RK_IntegerAdd, ///< Sum of integers.
+ RK_IntegerMult, ///< Product of integers.
+ RK_IntegerOr, ///< Bitwise or logical OR of numbers.
+ RK_IntegerAnd, ///< Bitwise or logical AND of numbers.
+ RK_IntegerXor, ///< Bitwise or logical XOR of numbers.
+ RK_FloatAdd, ///< Sum of floats.
+ RK_FloatMult ///< Product of floats.
+ };
+
+ /// This enum represents the kinds of inductions that we support.
+ enum InductionKind {
+ IK_NoInduction, ///< Not an induction variable.
+ IK_IntInduction, ///< Integer induction variable. Step = 1.
+ IK_ReverseIntInduction, ///< Reverse int induction variable. Step = -1.
+ IK_PtrInduction ///< Pointer induction variable. Step = sizeof(elem).
+ };
+
+ /// This POD struct holds information about reduction variables.
+ struct ReductionDescriptor {
+ ReductionDescriptor() : StartValue(0), LoopExitInstr(0),
+ Kind(RK_NoReduction) {}
+
+ ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K)
+ : StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+ // The starting value of the reduction.
+ // It does not have to be zero!
+ Value *StartValue;
+ // The instruction who's value is used outside the loop.
+ Instruction *LoopExitInstr;
+ // The kind of the reduction.
+ ReductionKind Kind;
+ };
+
+ // This POD struct holds information about the memory runtime legality
+ // check that a group of pointers do not overlap.
+ struct RuntimePointerCheck {
+ RuntimePointerCheck() : Need(false) {}
+
+ /// Reset the state of the pointer runtime information.
+ void reset() {
+ Need = false;
+ Pointers.clear();
+ Starts.clear();
+ Ends.clear();
+ }
+
+ /// Insert a pointer and calculate the start and end SCEVs.
+ void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+
+ /// This flag indicates if we need to add the runtime check.
+ bool Need;
+ /// Holds the pointers that we need to check.
+ SmallVector<Value*, 2> Pointers;
+ /// Holds the pointer value at the beginning of the loop.
+ SmallVector<const SCEV*, 2> Starts;
+ /// Holds the pointer value at the end of the loop.
+ SmallVector<const SCEV*, 2> Ends;
+ };
+
+ /// A POD for saving information about induction variables.
+ struct InductionInfo {
+ InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {}
+ InductionInfo() : StartValue(0), IK(IK_NoInduction) {}
+ /// Start value.
+ Value *StartValue;
+ /// Induction kind.
+ InductionKind IK;
+ };
+
+ /// ReductionList contains the reduction descriptors for all
+ /// of the reductions that were found in the loop.
+ typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+
+ /// InductionList saves induction variables and maps them to the
+ /// induction descriptor.
+ typedef MapVector<PHINode*, InductionInfo> InductionList;
+
+ /// Returns true if it is legal to vectorize this loop.
+ /// This does not mean that it is profitable to vectorize this
+ /// loop, only that it is legal to do so.
+ bool canVectorize();
+
+ /// Returns the Induction variable.
+ PHINode *getInduction() { return Induction; }
+
+ /// Returns the reduction variables found in the loop.
+ ReductionList *getReductionVars() { return &Reductions; }
+
+ /// Returns the induction variables found in the loop.
+ InductionList *getInductionVars() { return &Inductions; }
+
+ /// Returns True if V is an induction variable in this loop.
+ bool isInductionVariable(const Value *V);
+
+ /// Return true if the block BB needs to be predicated in order for the loop
+ /// to be vectorized.
+ bool blockNeedsPredication(BasicBlock *BB);
+
+ /// Check if this pointer is consecutive when vectorizing. This happens
+ /// when the last index of the GEP is the induction variable, or that the
+ /// pointer itself is an induction variable.
+ /// This check allows us to vectorize A[idx] into a wide load/store.
+ /// Returns:
+ /// 0 - Stride is unknown or non consecutive.
+ /// 1 - Address is consecutive.
+ /// -1 - Address is consecutive, and decreasing.
+ int isConsecutivePtr(Value *Ptr);
+
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
+ /// Returns true if this instruction will remain scalar after vectorization.
+ bool isUniformAfterVectorization(Instruction* I) { return Uniforms.count(I); }
+
+ /// Returns the information that we collected about runtime memory check.
+ RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
+private:
+ /// Check if a single basic block loop is vectorizable.
+ /// At this point we know that this is a loop with a constant trip count
+ /// and we only need to check individual instructions.
+ bool canVectorizeInstrs();
+
+ /// When we vectorize loops we may change the order in which
+ /// we read and write from memory. This method checks if it is
+ /// legal to vectorize the code, considering only memory constrains.
+ /// Returns true if the loop is vectorizable
+ bool canVectorizeMemory();
+
+ /// Return true if we can vectorize this loop using the IF-conversion
+ /// transformation.
+ bool canVectorizeWithIfConvert();
+
+ /// Collect the variables that need to stay uniform after vectorization.
+ void collectLoopUniforms();
+
+ /// Return true if all of the instructions in the block can be speculatively
+ /// executed.
+ bool blockCanBePredicated(BasicBlock *BB);
+
+ /// Returns True, if 'Phi' is the kind of reduction variable for type
+ /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+ bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+ /// Returns true if the instruction I can be a reduction variable of type
+ /// 'Kind'.
+ bool isReductionInstr(Instruction *I, ReductionKind Kind);
+ /// Returns the induction kind of Phi. This function may return NoInduction
+ /// if the PHI is not an induction variable.
+ InductionKind isInductionVariable(PHINode *Phi);
+ /// Return true if can compute the address bounds of Ptr within the loop.
+ bool hasComputableBounds(Value *Ptr);
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ /// DataLayout analysis.
+ DataLayout *DL;
+ // Dominators.
+ DominatorTree *DT;
+
+ // --- vectorization state --- //
+
+ /// Holds the integer induction variable. This is the counter of the
+ /// loop.
+ PHINode *Induction;
+ /// Holds the reduction variables.
+ ReductionList Reductions;
+ /// Holds all of the induction variables that we found in the loop.
+ /// Notice that inductions don't need to start at zero and that induction
+ /// variables can be pointers.
+ InductionList Inductions;
+
+ /// Allowed outside users. This holds the reduction
+ /// vars which can be accessed from outside the loop.
+ SmallPtrSet<Value*, 4> AllowedExit;
+ /// This set holds the variables which are known to be uniform after
+ /// vectorization.
+ SmallPtrSet<Instruction*, 4> Uniforms;
+ /// We need to check that all of the pointers in this list are disjoint
+ /// at runtime.
+ RuntimePointerCheck PtrRtCheck;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because of
+/// a number of reasons. In this class we mainly attempt to predict the
+/// expected speedup/slowdowns due to the supported instruction set. We use the
+/// TargetTransformInfo to query the different backends for the cost of
+/// different operations.
+class LoopVectorizationCostModel {
+public:
+ LoopVectorizationCostModel(Loop *L, ScalarEvolution *SE, LoopInfo *LI,
+ LoopVectorizationLegality *Legal,
+ const TargetTransformInfo &TTI)
+ : TheLoop(L), SE(SE), LI(LI), Legal(Legal), TTI(TTI) {}
+
+ /// \return The most profitable vectorization factor.
+ /// This method checks every power of two up to VF. If UserVF is not ZERO
+ /// then this vectorization factor will be selected if vectorization is
+ /// possible.
+ unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF);
+
+ /// \returns The size (in bits) of the widest type in the code that
+ /// needs to be vectorized. We ignore values that remain scalar such as
+ /// 64 bit loop indices.
+ unsigned getWidestType();
+
+ /// \return The most profitable unroll factor.
+ /// If UserUF is non-zero then this method finds the best unroll-factor
+ /// based on register pressure and other parameters.
+ unsigned selectUnrollFactor(bool OptForSize, unsigned UserUF);
+
+ /// \brief A struct that represents some properties of the register usage
+ /// of a loop.
+ struct RegisterUsage {
+ /// Holds the number of loop invariant values that are used in the loop.
+ unsigned LoopInvariantRegs;
+ /// Holds the maximum number of concurrent live intervals in the loop.
+ unsigned MaxLocalUsers;
+ /// Holds the number of instructions in the loop.
+ unsigned NumInstructions;
+ };
+
+ /// \return information about the register usage of the loop.
+ RegisterUsage calculateRegisterUsage();
+
+private:
+ /// Returns the expected execution cost. The unit of the cost does
+ /// not matter because we use the 'cost' units to compare different
+ /// vector widths. The cost that is returned is *not* normalized by
+ /// the factor width.
+ unsigned expectedCost(unsigned VF);
+
+ /// Returns the execution time cost of an instruction for a given vector
+ /// width. Vector width of one means scalar.
+ unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+ /// A helper function for converting Scalar types to vector types.
+ /// If the incoming type is void, we return void. If the VF is 1, we return
+ /// the scalar type.
+ static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
+ /// The loop that we evaluate.
+ Loop *TheLoop;
+ /// Scev analysis.
+ ScalarEvolution *SE;
+ /// Loop Info analysis.
+ LoopInfo *LI;
+ /// Vectorization legality.
+ LoopVectorizationLegality *Legal;
+ /// Vector target information.
+ const TargetTransformInfo &TTI;
+};
+
/// The LoopVectorize Pass.
struct LoopVectorize : public LoopPass {
- static char ID; // Pass identification, replacement for typeid
+ /// Pass identification, replacement for typeid
+ static char ID;
- LoopVectorize() : LoopPass(ID) {
+ explicit LoopVectorize() : LoopPass(ID) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
@@ -71,7 +595,7 @@ struct LoopVectorize : public LoopPass {
SE = &getAnalysis<ScalarEvolution>();
DL = getAnalysisIfAvailable<DataLayout>();
LI = &getAnalysis<LoopInfo>();
- TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+ TTI = &getAnalysis<TargetTransformInfo>();
DT = &getAnalysis<DominatorTree>();
DEBUG(dbgs() << "LV: Checking a loop in \"" <<
@@ -84,32 +608,38 @@ struct LoopVectorize : public LoopPass {
return false;
}
- // Select the preffered vectorization factor.
- unsigned VF = 1;
- if (VectorizationFactor == 0) {
- const VectorTargetTransformInfo *VTTI = 0;
- if (TTI)
- VTTI = TTI->getVectorTargetTransformInfo();
- // Use the cost model.
- LoopVectorizationCostModel CM(L, SE, &LVL, VTTI);
- VF = CM.findBestVectorizationFactor();
-
- if (VF == 1) {
- DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
- return false;
- }
+ // Use the cost model.
+ LoopVectorizationCostModel CM(L, SE, LI, &LVL, *TTI);
+
+ // Check the function attribues to find out if this function should be
+ // optimized for size.
+ Function *F = L->getHeader()->getParent();
+ Attribute::AttrKind SzAttr = Attribute::OptimizeForSize;
+ Attribute::AttrKind FlAttr = Attribute::NoImplicitFloat;
+ unsigned FnIndex = AttributeSet::FunctionIndex;
+ bool OptForSize = F->getAttributes().hasAttribute(FnIndex, SzAttr);
+ bool NoFloat = F->getAttributes().hasAttribute(FnIndex, FlAttr);
+
+ if (NoFloat) {
+ DEBUG(dbgs() << "LV: Can't vectorize when the NoImplicitFloat"
+ "attribute is used.\n");
+ return false;
+ }
- } else {
- // Use the user command flag.
- VF = VectorizationFactor;
+ unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor);
+ unsigned UF = CM.selectUnrollFactor(OptForSize, VectorizationUnroll);
+
+ if (VF == 1) {
+ DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
+ return false;
}
DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<<
- L->getHeader()->getParent()->getParent()->getModuleIdentifier()<<
- "\n");
+ F->getParent()->getModuleIdentifier()<<"\n");
+ DEBUG(dbgs() << "LV: Unroll Factor is " << UF << "\n");
// If we decided that it is *legal* to vectorizer the loop then do it.
- InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
+ InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF, UF);
LB.vectorize(&LVL);
DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -120,16 +650,17 @@ struct LoopVectorize : public LoopPass {
LoopPass::getAnalysisUsage(AU);
AU.addRequiredID(LoopSimplifyID);
AU.addRequiredID(LCSSAID);
+ AU.addRequired<DominatorTree>();
AU.addRequired<LoopInfo>();
AU.addRequired<ScalarEvolution>();
- AU.addRequired<DominatorTree>();
+ AU.addRequired<TargetTransformInfo>();
AU.addPreserved<LoopInfo>();
AU.addPreserved<DominatorTree>();
}
};
-}// namespace
+} // end anonymous namespace
//===----------------------------------------------------------------------===//
// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
@@ -150,11 +681,6 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
}
Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
- // Create the types.
- LLVMContext &C = V->getContext();
- Type *VTy = VectorType::get(V->getType(), VF);
- Type *I32 = IntegerType::getInt32Ty(C);
-
// Save the current insertion location.
Instruction *Loc = Builder.GetInsertPoint();
@@ -167,14 +693,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
if (Invariant)
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
- Constant *Zero = ConstantInt::get(I32, 0);
- Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
- Value *UndefVal = UndefValue::get(VTy);
- // Insert the value into a new vector.
- Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
// Broadcast the scalar into all locations in the vector.
- Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
- "broadcast");
+ Value *Shuf = Builder.CreateVectorSplat(VF, V, "broadcast");
// Restore the builder insertion point.
if (Invariant)
@@ -183,7 +703,8 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
return Shuf;
}
-Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx,
+ bool Negate) {
assert(Val->getType()->isVectorTy() && "Must be a vector");
assert(Val->getType()->getScalarType()->isIntegerTy() &&
"Elem must be an integer");
@@ -194,8 +715,10 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
SmallVector<Constant*, 8> Indices;
// Create a vector of consecutive numbers from zero to VF.
- for (int i = 0; i < VLen; ++i)
- Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
+ for (int i = 0; i < VLen; ++i) {
+ int Idx = Negate ? (-i): i;
+ Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx));
+ }
// Add the consecutive indices to the vector value.
Constant *Cv = ConstantVector::get(Indices);
@@ -203,20 +726,20 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
return Builder.CreateAdd(Val, Cv, "induction");
}
-bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
+int LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
// If this value is a pointer induction variable we know it is consecutive.
PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
if (Phi && Inductions.count(Phi)) {
InductionInfo II = Inductions[Phi];
- if (PtrInduction == II.IK)
- return true;
+ if (IK_PtrInduction == II.IK)
+ return 1;
}
GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
if (!Gep)
- return false;
+ return 0;
unsigned NumOperands = Gep->getNumOperands();
Value *LastIndex = Gep->getOperand(NumOperands - 1);
@@ -224,7 +747,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// Check that all of the gep indices are uniform except for the last.
for (unsigned i = 0; i < NumOperands - 1; ++i)
if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
- return false;
+ return 0;
// We can emit wide load/stores only if the last index is the induction
// variable.
@@ -235,39 +758,49 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
// The memory is consecutive because the last index is consecutive
// and all other indices are loop invariant.
if (Step->isOne())
- return true;
+ return 1;
+ if (Step->isAllOnesValue())
+ return -1;
}
- return false;
+ return 0;
}
bool LoopVectorizationLegality::isUniform(Value *V) {
return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}
-Value *InnerLoopVectorizer::getVectorValue(Value *V) {
+InnerLoopVectorizer::VectorParts&
+InnerLoopVectorizer::getVectorValue(Value *V) {
assert(V != Induction && "The new induction variable should not be used.");
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
- // If we saved a vectorized copy of V, use it.
- Value *&MapEntry = WidenMap[V];
- if (MapEntry)
- return MapEntry;
- // Broadcast V and save the value for future uses.
+ // If we have this scalar in the map, return it.
+ if (WidenMap.has(V))
+ return WidenMap.get(V);
+
+ // If this scalar is unknown, assume that it is a constant or that it is
+ // loop invariant. Broadcast V and save the value for future uses.
Value *B = getBroadcastInstrs(V);
- MapEntry = B;
- return B;
+ WidenMap.splat(V, B);
+ return WidenMap.get(V);
}
-Constant*
-InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
- return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
+Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
+ assert(Vec->getType()->isVectorTy() && "Invalid type");
+ SmallVector<Constant*, 8> ShuffleMask;
+ for (unsigned i = 0; i < VF; ++i)
+ ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+
+ return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
+ ConstantVector::get(ShuffleMask),
+ "reverse");
}
void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
// Holds vector parameters or scalars, in case of uniform vals.
- SmallVector<Value*, 8> Params;
+ SmallVector<VectorParts, 4> Params;
// Find all of the vectorized parameters.
for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
@@ -284,13 +817,15 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// If the src is an instruction that appeared earlier in the basic block
// then it should already be vectorized.
- if (SrcInst && SrcInst->getParent() == Instr->getParent()) {
- assert(WidenMap.count(SrcInst) && "Source operand is unavailable");
+ if (SrcInst && OrigLoop->contains(SrcInst)) {
+ assert(WidenMap.has(SrcInst) && "Source operand is unavailable");
// The parameter is a vector value from earlier.
- Params.push_back(WidenMap[SrcInst]);
+ Params.push_back(WidenMap.get(SrcInst));
} else {
// The parameter is a scalar from outside the loop. Maybe even a constant.
- Params.push_back(SrcOp);
+ VectorParts Scalars;
+ Scalars.append(UF, SrcOp);
+ Params.push_back(Scalars);
}
}
@@ -299,39 +834,38 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
// Does this instruction return a value ?
bool IsVoidRetTy = Instr->getType()->isVoidTy();
- Value *VecResults = 0;
- // If we have a return value, create an empty vector. We place the scalarized
- // instructions in this vector.
- if (!IsVoidRetTy)
- VecResults = UndefValue::get(VectorType::get(Instr->getType(), VF));
+ Value *UndefVec = IsVoidRetTy ? 0 :
+ UndefValue::get(VectorType::get(Instr->getType(), VF));
+ // Create a new entry in the WidenMap and initialize it to Undef or Null.
+ VectorParts &VecResults = WidenMap.splat(Instr, UndefVec);
// For each scalar that we create:
- for (unsigned i = 0; i < VF; ++i) {
- Instruction *Cloned = Instr->clone();
- if (!IsVoidRetTy)
- Cloned->setName(Instr->getName() + ".cloned");
- // Replace the operands of the cloned instrucions with extracted scalars.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- Value *Op = Params[op];
- // Param is a vector. Need to extract the right lane.
- if (Op->getType()->isVectorTy())
- Op = Builder.CreateExtractElement(Op, Builder.getInt32(i));
- Cloned->setOperand(op, Op);
- }
+ for (unsigned Width = 0; Width < VF; ++Width) {
+ // For each vector unroll 'part':
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Instruction *Cloned = Instr->clone();
+ if (!IsVoidRetTy)
+ Cloned->setName(Instr->getName() + ".cloned");
+ // Replace the operands of the cloned instrucions with extracted scalars.
+ for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
+ Value *Op = Params[op][Part];
+ // Param is a vector. Need to extract the right lane.
+