author     Dan Gohman <gohman@apple.com>    2010-01-21 02:09:26 +0000
committer  Dan Gohman <gohman@apple.com>    2010-01-21 02:09:26 +0000
commit     a10756ee657a4d43a48cca5c166919093930ed6b (patch)
tree       52a9cf0b867521026963ca127f6a46d19f0f8f78 /lib/Transforms/Scalar/LoopStrengthReduce.cpp
parent     ac8b4bf66b7263018fe6c133604a30780c24982e (diff)
Re-implement the main strength-reduction portion of LoopStrengthReduction.
This new version is much more aggressive about doing "full" reduction in
cases where it reduces register pressure, and also more aggressive about
rewriting induction variables to count down (or up) to zero when doing so
reduces register pressure.
It currently uses fairly simplistic algorithms for finding reuse
opportunities, but it introduces a new framework that allows it to combine
multiple strategies at once to form hybrid solutions, instead of doing
all full-reduction or all base+index.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@94061 91177308-0d34-0410-b5e6-96231b3b80d8
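As a rough illustration of the "count down to zero" rewrite described above (not part of this commit; the functions and names below are invented for exposition), consider a loop whose induction variable is used only to index an array and to bound the trip count:

// Before: i is live across the loop just to index a[] and test the bound.
void scale(float *a, int n, float k) {
  for (int i = 0; i < n; ++i)
    a[i] *= k;
}

// After strength-reducing the address and counting down to zero, the loop
// behaves as if written like this: the pointer is advanced directly and the
// exit test compares against zero, which the decrement typically produces
// for free, so no separate register is needed for an upward-counting index.
void scale_reduced(float *a, int n, float k) {
  float *p = a;
  for (int count = n; count > 0; --count, ++p)
    *p *= k;
}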
Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp  4574
1 file changed, 2305 insertions, 2269 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index fa820ed8e4..6456ca1eb0 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -17,6 +17,45 @@
 // available on the target, and it performs a variety of other optimizations
 // related to loop induction variables.
 //
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
+//   ...
+//   %i.next = add %i, 1
+//   %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is
+//       that {0,+,1}<%bb> is getting picked first because all 7 uses can
+//       use it, and while it's a pretty good solution, it means that LSR
+//       doesn't look further to find an even better solution.
+//
+// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
+//       instead of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+//       smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+//       multiple base registers, or as the increment expression in an addrec),
+//       we may not actually need both reg and (-1 * reg) in registers; the
+//       negation can be implemented by using a sub instead of an add. The
+//       lack of support for taking this into consideration when making
+//       register pressure decisions is partly worked around by the "Special"
+//       use kind.
+// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-reduce" @@ -26,2025 +65,1524 @@ #include "llvm/IntrinsicInst.h" #include "llvm/DerivedTypes.h" #include "llvm/Analysis/IVUsers.h" +#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/SmallBitVector.h" +#include "llvm/ADT/SetVector.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; -STATISTIC(NumReduced , "Number of IV uses strength reduced"); -STATISTIC(NumInserted, "Number of PHIs inserted"); -STATISTIC(NumVariable, "Number of PHIs with variable strides"); -STATISTIC(NumEliminated, "Number of strides eliminated"); -STATISTIC(NumShadow, "Number of Shadow IVs optimized"); -STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses"); -STATISTIC(NumLoopCond, "Number of loop terminating conds optimized"); -STATISTIC(NumCountZero, "Number of count iv optimized to count toward zero"); - -static cl::opt<bool> EnableFullLSRMode("enable-full-lsr", - cl::init(false), - cl::Hidden); - namespace { - struct BasedUser; +// Constant strides come first which in turns are sorted by their absolute +// values. If absolute values are the same, then positive strides comes first. +// e.g. +// 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X +struct StrideCompare { + const ScalarEvolution &SE; + explicit StrideCompare(const ScalarEvolution &se) : SE(se) {} + + bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const { + const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS); + const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS); + if (LHSC && RHSC) { + unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()), + SE.getTypeSizeInBits(RHS->getType())); + APInt LV = LHSC->getValue()->getValue(); + APInt RV = RHSC->getValue()->getValue(); + LV.sextOrTrunc(BitWidth); + RV.sextOrTrunc(BitWidth); + APInt ALV = LV.abs(); + APInt ARV = RV.abs(); + if (ALV == ARV) { + if (LV != RV) + return LV.sgt(RV); + } else { + return ALV.ult(ARV); + } + + // If it's the same value but different type, sort by bit width so + // that we emit larger induction variables before smaller + // ones, letting the smaller be re-written in terms of larger ones. + return SE.getTypeSizeInBits(RHS->getType()) < + SE.getTypeSizeInBits(LHS->getType()); + } + return LHSC && !RHSC; + } +}; - /// IVInfo - This structure keeps track of one IV expression inserted during - /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as - /// well as the PHI node and increment value created for rewrite. - struct IVExpr { - const SCEV *Stride; - const SCEV *Base; - PHINode *PHI; +/// RegSortData - This class holds data which is used to order reuse +/// candidates. +class RegSortData { +public: + /// Bits - This represents the set of LSRUses (by index) which reference a + /// particular register. 
+ SmallBitVector Bits; - IVExpr(const SCEV *const stride, const SCEV *const base, PHINode *phi) - : Stride(stride), Base(base), PHI(phi) {} - }; + /// MaxComplexity - This represents the greatest complexity (see the comments + /// on Formula::getComplexity) seen with a particular register. + uint32_t MaxComplexity; - /// IVsOfOneStride - This structure keeps track of all IV expression inserted - /// during StrengthReduceStridedIVUsers for a particular stride of the IV. - struct IVsOfOneStride { - std::vector<IVExpr> IVs; + /// Index - This holds an arbitrary value used as a last-resort tie breaker + /// to ensure deterministic behavior. + unsigned Index; - void addIV(const SCEV *const Stride, const SCEV *const Base, PHINode *PHI) { - IVs.push_back(IVExpr(Stride, Base, PHI)); - } - }; + RegSortData() {} - class LoopStrengthReduce : public LoopPass { - IVUsers *IU; - ScalarEvolution *SE; - bool Changed; - - /// IVsByStride - Keep track of all IVs that have been inserted for a - /// particular stride. - std::map<const SCEV *, IVsOfOneStride> IVsByStride; - - /// DeadInsts - Keep track of instructions we may have made dead, so that - /// we can remove them after we are done working. - SmallVector<WeakVH, 16> DeadInsts; - - /// TLI - Keep a pointer of a TargetLowering to consult for determining - /// transformation profitability. - const TargetLowering *TLI; - - public: - static char ID; // Pass ID, replacement for typeid - explicit LoopStrengthReduce(const TargetLowering *tli = NULL) : - LoopPass(&ID), TLI(tli) {} - - bool runOnLoop(Loop *L, LPPassManager &LPM); - - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - // We split critical edges, so we change the CFG. However, we do update - // many analyses if they are around. - AU.addPreservedID(LoopSimplifyID); - AU.addPreserved("loops"); - AU.addPreserved("domfrontier"); - AU.addPreserved("domtree"); - - AU.addRequiredID(LoopSimplifyID); - AU.addRequired<ScalarEvolution>(); - AU.addPreserved<ScalarEvolution>(); - AU.addRequired<IVUsers>(); - AU.addPreserved<IVUsers>(); - } + void print(raw_ostream &OS) const; + void dump() const; +}; - private: - void OptimizeIndvars(Loop *L); - - /// OptimizeLoopTermCond - Change loop terminating condition to use the - /// postinc iv when possible. - void OptimizeLoopTermCond(Loop *L); - - /// OptimizeShadowIV - If IV is used in a int-to-float cast - /// inside the loop then try to eliminate the cast opeation. - void OptimizeShadowIV(Loop *L); - - /// OptimizeMax - Rewrite the loop's terminating condition - /// if it uses a max computation. - ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse); - - /// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for - /// deciding when to exit the loop is used only for that purpose, try to - /// rearrange things so it counts down to a test against zero. - bool OptimizeLoopCountIV(Loop *L); - bool OptimizeLoopCountIVOfStride(const SCEV* &Stride, - IVStrideUse* &CondUse, Loop *L); - - /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a - /// single stride of IV. All of the users may have different starting - /// values, and this may not be the only stride. 
- void StrengthReduceIVUsersOfStride(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L); - void StrengthReduceIVUsers(Loop *L); - - ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond, - IVStrideUse* &CondUse, - const SCEV* &CondStride, - bool PostPass = false); - - bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, - const SCEV* &CondStride); - bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); - const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *, - IVExpr&, const Type*, - const std::vector<BasedUser>& UsersToProcess); - bool ValidScale(bool, int64_t, - const std::vector<BasedUser>& UsersToProcess); - bool ValidOffset(bool, int64_t, int64_t, - const std::vector<BasedUser>& UsersToProcess); - const SCEV *CollectIVUsers(const SCEV *Stride, - IVUsersOfOneStride &Uses, - Loop *L, - bool &AllUsesAreAddresses, - bool &AllUsesAreOutsideLoop, - std::vector<BasedUser> &UsersToProcess); - bool StrideMightBeShared(const SCEV *Stride, Loop *L, bool CheckPreInc); - bool ShouldUseFullStrengthReductionMode( - const std::vector<BasedUser> &UsersToProcess, - const Loop *L, - bool AllUsesAreAddresses, - const SCEV *Stride); - void PrepareToStrengthReduceFully( - std::vector<BasedUser> &UsersToProcess, - const SCEV *Stride, - const SCEV *CommonExprs, - const Loop *L, - SCEVExpander &PreheaderRewriter); - void PrepareToStrengthReduceFromSmallerStride( - std::vector<BasedUser> &UsersToProcess, - Value *CommonBaseV, - const IVExpr &ReuseIV, - Instruction *PreInsertPt); - void PrepareToStrengthReduceWithNewPhi( - std::vector<BasedUser> &UsersToProcess, - const SCEV *Stride, - const SCEV *CommonExprs, - Value *CommonBaseV, - Instruction *IVIncInsertPt, - const Loop *L, - SCEVExpander &PreheaderRewriter); - - void DeleteTriviallyDeadInstructions(); - }; } -char LoopStrengthReduce::ID = 0; -static RegisterPass<LoopStrengthReduce> -X("loop-reduce", "Loop Strength Reduction"); +void RegSortData::print(raw_ostream &OS) const { + OS << "[NumUses=" << Bits.count() + << ", MaxComplexity="; + OS.write_hex(MaxComplexity); + OS << ", Index=" << Index << ']'; +} -Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { - return new LoopStrengthReduce(TLI); +void RegSortData::dump() const { + print(errs()); errs() << '\n'; } -/// DeleteTriviallyDeadInstructions - If any of the instructions is the -/// specified set are trivially dead, delete them and see if this makes any of -/// their operands subsequently dead. -void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { - while (!DeadInsts.empty()) { - Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); +namespace { - if (I == 0 || !isInstructionTriviallyDead(I)) - continue; +/// RegCount - This is a helper class to sort a given set of registers +/// according to associated RegSortData values. +class RegCount { +public: + const SCEV *Reg; + RegSortData Sort; + + RegCount(const SCEV *R, const RegSortData &RSD) + : Reg(R), Sort(RSD) {} + + // Sort by count. Returning true means the register is preferred. + bool operator<(const RegCount &Other) const { + // Sort by the number of unique uses of this register. + unsigned A = Sort.Bits.count(); + unsigned B = Other.Sort.Bits.count(); + if (A != B) return A > B; + + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { + const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg); + // AddRecs have higher priority than other things. + if (!BR) return true; + + // Prefer affine values. 
+ if (AR->isAffine() != BR->isAffine()) + return AR->isAffine(); + + const Loop *AL = AR->getLoop(); + const Loop *BL = BR->getLoop(); + if (AL != BL) { + unsigned ADepth = AL->getLoopDepth(); + unsigned BDepth = BL->getLoopDepth(); + // Prefer a less deeply nested addrec. + if (ADepth != BDepth) + return ADepth < BDepth; + + // Different loops at the same depth; do something arbitrary. + BasicBlock *AH = AL->getHeader(); + BasicBlock *BH = BL->getHeader(); + for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I) + if (&*I == BH) return true; + return false; + } - for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) - if (Instruction *U = dyn_cast<Instruction>(*OI)) { - *OI = 0; - if (U->use_empty()) - DeadInsts.push_back(U); + // Sort addrecs by stride. + const SCEV *AStep = AR->getOperand(1); + const SCEV *BStep = BR->getOperand(1); + if (AStep != BStep) { + if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) { + const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep); + if (!BC) return true; + // Arbitrarily prefer wider registers. + if (AC->getValue()->getValue().getBitWidth() != + BC->getValue()->getValue().getBitWidth()) + return AC->getValue()->getValue().getBitWidth() > + BC->getValue()->getValue().getBitWidth(); + // Ignore the sign bit, assuming that striding by a negative value + // is just as easy as by a positive value. + // Prefer the addrec with the lesser absolute value stride, as it + // will allow uses to have simpler addressing modes. + return AC->getValue()->getValue().abs() + .ult(BC->getValue()->getValue().abs()); + } } - I->eraseFromParent(); - Changed = true; + // Then sort by the register which will permit the simplest uses. + // This is a heuristic; currently we only track the most complex use as a + // representative. + if (Sort.MaxComplexity != Other.Sort.MaxComplexity) + return Sort.MaxComplexity < Other.Sort.MaxComplexity; + + // Then sort them by their start values. + const SCEV *AStart = AR->getStart(); + const SCEV *BStart = BR->getStart(); + if (AStart != BStart) { + if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) { + const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart); + if (!BC) return true; + // Arbitrarily prefer wider registers. + if (AC->getValue()->getValue().getBitWidth() != + BC->getValue()->getValue().getBitWidth()) + return AC->getValue()->getValue().getBitWidth() > + BC->getValue()->getValue().getBitWidth(); + // Prefer positive over negative if the absolute values are the same. + if (AC->getValue()->getValue().abs() == + BC->getValue()->getValue().abs()) + return AC->getValue()->getValue().isStrictlyPositive(); + // Prefer the addrec with the lesser absolute value start. + return AC->getValue()->getValue().abs() + .ult(BC->getValue()->getValue().abs()); + } + } + } else { + // AddRecs have higher priority than other things. + if (isa<SCEVAddRecExpr>(Other.Reg)) return false; + // Sort by the register which will permit the simplest uses. + // This is a heuristic; currently we only track the most complex use as a + // representative. + if (Sort.MaxComplexity != Other.Sort.MaxComplexity) + return Sort.MaxComplexity < Other.Sort.MaxComplexity; + } + + + // Tie-breaker: the arbitrary index, to ensure a reliable ordering. + return Sort.Index < Other.Sort.Index; } + + void print(raw_ostream &OS) const; + void dump() const; +}; + } -/// isAddressUse - Returns true if the specified instruction is using the -/// specified value as an address. 
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) { - bool isAddress = isa<LoadInst>(Inst); - if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { - if (SI->getOperand(1) == OperandVal) - isAddress = true; - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - // Addressing modes can also be folded into prefetches and a variety - // of intrinsics. - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::prefetch: - case Intrinsic::x86_sse2_loadu_dq: - case Intrinsic::x86_sse2_loadu_pd: - case Intrinsic::x86_sse_loadu_ps: - case Intrinsic::x86_sse_storeu_ps: - case Intrinsic::x86_sse2_storeu_pd: - case Intrinsic::x86_sse2_storeu_dq: - case Intrinsic::x86_sse2_storel_dq: - if (II->getOperand(1) == OperandVal) - isAddress = true; - break; - } - } - return isAddress; +void RegCount::print(raw_ostream &OS) const { + OS << *Reg << ':'; + Sort.print(OS); } -/// getAccessType - Return the type of the memory being accessed. -static const Type *getAccessType(const Instruction *Inst) { - const Type *AccessTy = Inst->getType(); - if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) - AccessTy = SI->getOperand(0)->getType(); - else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { - // Addressing modes can also be folded into prefetches and a variety - // of intrinsics. - switch (II->getIntrinsicID()) { - default: break; - case Intrinsic::x86_sse_storeu_ps: - case Intrinsic::x86_sse2_storeu_pd: - case Intrinsic::x86_sse2_storeu_dq: - case Intrinsic::x86_sse2_storel_dq: - AccessTy = II->getOperand(1)->getType(); - break; - } - } - return AccessTy; +void RegCount::dump() const { + print(errs()); errs() << '\n'; } namespace { - /// BasedUser - For a particular base value, keep information about how we've - /// partitioned the expression so far. - struct BasedUser { - /// Base - The Base value for the PHI node that needs to be inserted for - /// this use. As the use is processed, information gets moved from this - /// field to the Imm field (below). BasedUser values are sorted by this - /// field. - const SCEV *Base; - - /// Inst - The instruction using the induction variable. - Instruction *Inst; - - /// OperandValToReplace - The operand value of Inst to replace with the - /// EmittedBase. - Value *OperandValToReplace; - - /// Imm - The immediate value that should be added to the base immediately - /// before Inst, because it will be folded into the imm field of the - /// instruction. This is also sometimes used for loop-variant values that - /// must be added inside the loop. - const SCEV *Imm; - - /// Phi - The induction variable that performs the striding that - /// should be used for this user. - PHINode *Phi; - - // isUseOfPostIncrementedValue - True if this should use the - // post-incremented version of this IV, not the preincremented version. - // This can only be set in special cases, such as the terminating setcc - // instruction for a loop and uses outside the loop that are dominated by - // the loop. - bool isUseOfPostIncrementedValue; - - BasedUser(IVStrideUse &IVSU, ScalarEvolution *se) - : Base(IVSU.getOffset()), Inst(IVSU.getUser()), - OperandValToReplace(IVSU.getOperandValToReplace()), - Imm(se->getIntegerSCEV(0, Base->getType())), - isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {} - - // Once we rewrite the code to insert the new IVs we want, update the - // operands of Inst to use the new expression 'NewBase', with 'Imm' added - // to it. 
- void RewriteInstructionToUseNewBase(const SCEV *NewBase, - Instruction *InsertPt, - SCEVExpander &Rewriter, Loop *L, Pass *P, - SmallVectorImpl<WeakVH> &DeadInsts, - ScalarEvolution *SE); - - Value *InsertCodeForBaseAtPosition(const SCEV *NewBase, - const Type *Ty, - SCEVExpander &Rewriter, - Instruction *IP, - ScalarEvolution *SE); - void dump() const; - }; -} -void BasedUser::dump() const { - dbgs() << " Base=" << *Base; - dbgs() << " Imm=" << *Imm; - dbgs() << " Inst: " << *Inst; -} +/// Formula - This class holds information that describes a formula for +/// satisfying a use. It may include broken-out immediates and scaled registers. +struct Formula { + /// AM - This is used to represent complex addressing, as well as other kinds + /// of interesting uses. + TargetLowering::AddrMode AM; -Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *NewBase, - const Type *Ty, - SCEVExpander &Rewriter, - Instruction *IP, - ScalarEvolution *SE) { - Value *Base = Rewriter.expandCodeFor(NewBase, 0, IP); + /// BaseRegs - The list of "base" registers for this use. When this is + /// non-empty, AM.HasBaseReg should be set to true. + SmallVector<const SCEV *, 2> BaseRegs; - // Wrap the base in a SCEVUnknown so that ScalarEvolution doesn't try to - // re-analyze it. - const SCEV *NewValSCEV = SE->getUnknown(Base); + /// ScaledReg - The 'scaled' register for this use. This should be non-null + /// when AM.Scale is not zero. + const SCEV *ScaledReg; - // Always emit the immediate into the same block as the user. - NewValSCEV = SE->getAddExpr(NewValSCEV, Imm); + Formula() : ScaledReg(0) {} - return Rewriter.expandCodeFor(NewValSCEV, Ty, IP); -} + unsigned getNumRegs() const; + uint32_t getComplexity() const; + const Type *getType() const; + void InitialMatch(const SCEV *S, Loop *L, + ScalarEvolution &SE, DominatorTree &DT); -// Once we rewrite the code to insert the new IVs we want, update the -// operands of Inst to use the new expression 'NewBase', with 'Imm' added -// to it. NewBasePt is the last instruction which contributes to the -// value of NewBase in the case that it's a diffferent instruction from -// the PHI that NewBase is computed from, or null otherwise. -// -void BasedUser::RewriteInstructionToUseNewBase(const SCEV *NewBase, - Instruction *NewBasePt, - SCEVExpander &Rewriter, Loop *L, Pass *P, - SmallVectorImpl<WeakVH> &DeadInsts, - ScalarEvolution *SE) { - if (!isa<PHINode>(Inst)) { - // By default, insert code at the user instruction. - BasicBlock::iterator InsertPt = Inst; - - // However, if the Operand is itself an instruction, the (potentially - // complex) inserted code may be shared by many users. Because of this, we - // want to emit code for the computation of the operand right before its old - // computation. This is usually safe, because we obviously used to use the - // computation when it was computed in its current block. However, in some - // cases (e.g. use of a post-incremented induction variable) the NewBase - // value will be pinned to live somewhere after the original computation. - // In this case, we have to back off. - // - // If this is a use outside the loop (which means after, since it is based - // on a loop indvar) we use the post-incremented value, so that we don't - // artificially make the preinc value live out the bottom of the loop. 
- if (!isUseOfPostIncrementedValue && L->contains(Inst)) { - if (NewBasePt && isa<PHINode>(OperandValToReplace)) { - InsertPt = NewBasePt; - ++InsertPt; - } else if (Instruction *OpInst - = dyn_cast<Instruction>(OperandValToReplace)) { - InsertPt = OpInst; - while (isa<PHINode>(InsertPt)) ++InsertPt; - } - } - Value *NewVal = InsertCodeForBaseAtPosition(NewBase, - OperandValToReplace->getType(), - Rewriter, InsertPt, SE); - // Replace the use of the operand Value with the new Phi we just created. - Inst->replaceUsesOfWith(OperandValToReplace, NewVal); - - DEBUG(dbgs() << " Replacing with "); - DEBUG(WriteAsOperand(dbgs(), NewVal, /*PrintType=*/false)); - DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " - << *Imm << "\n"); - return; + /// referencesReg - Test if this formula references the given register. + bool referencesReg(const SCEV *S) const { + return S == ScaledReg || + std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); } - // PHI nodes are more complex. We have to insert one copy of the NewBase+Imm - // expression into each operand block that uses it. Note that PHI nodes can - // have multiple entries for the same predecessor. We use a map to make sure - // that a PHI node only has a single Value* for each predecessor (which also - // prevents us from inserting duplicate code in some blocks). - DenseMap<BasicBlock*, Value*> InsertedCode; - PHINode *PN = cast<PHINode>(Inst); - for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { - if (PN->getIncomingValue(i) == OperandValToReplace) { - // If the original expression is outside the loop, put the replacement - // code in the same place as the original expression, - // which need not be an immediate predecessor of this PHI. This way we - // need only one copy of it even if it is referenced multiple times in - // the PHI. We don't do this when the original expression is inside the - // loop because multiple copies sometimes do useful sinking of code in - // that case(?). - Instruction *OldLoc = dyn_cast<Instruction>(OperandValToReplace); - BasicBlock *PHIPred = PN->getIncomingBlock(i); - if (L->contains(OldLoc)) { - // If this is a critical edge, split the edge so that we do not insert - // the code on all predecessor/successor paths. We do this unless this - // is the canonical backedge for this loop, as this can make some - // inserted code be in an illegal position. - if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 && - !isa<IndirectBrInst>(PHIPred->getTerminator()) && - (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) { - - // First step, split the critical edge. - BasicBlock *NewBB = SplitCriticalEdge(PHIPred, PN->getParent(), - P, false); - - // Next step: move the basic block. In particular, if the PHI node - // is outside of the loop, and PredTI is in the loop, we want to - // move the block to be immediately before the PHI block, not - // immediately after PredTI. - if (L->contains(PHIPred) && !L->contains(PN)) - NewBB->moveBefore(PN->getParent()); + bool operator==(const Formula &Other) const { + return BaseRegs == Other.BaseRegs && + ScaledReg == Other.ScaledReg && + AM.Scale == Other.AM.Scale && + AM.BaseOffs == Other.AM.BaseOffs && + AM.BaseGV == Other.AM.BaseGV; + } - // Splitting the edge can reduce the number of PHI entries we have. - e = PN->getNumIncomingValues(); - PHIPred = NewBB; - i = PN->getBasicBlockIndex(PHIPred); - } - } - Value *&Code = InsertedCode[PHIPred]; - if (!Code) { - // Insert the code into the end of the predecessor block. 
- Instruction *InsertPt = (L->contains(OldLoc)) ? - PHIPred->getTerminator() : - OldLoc->getParent()->getTerminator(); - Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(), - Rewriter, InsertPt, SE); - - DEBUG(dbgs() << " Changing PHI use to "); - DEBUG(WriteAsOperand(dbgs(), Code, /*PrintType=*/false)); - DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " - << *Imm << "\n"); - } + // This sorts at least partially based on host pointer values which are + // not deterministic, so it is only usable for uniqification. + bool operator<(const Formula &Other) const { + if (BaseRegs != Other.BaseRegs) + return BaseRegs < Other.BaseRegs; + if (ScaledReg != Other.ScaledReg) + return ScaledReg < Other.ScaledReg; + if (AM.Scale != Other.AM.Scale) + return AM.Scale < Other.AM.Scale; + if (AM.BaseOffs != Other.AM.BaseOffs) + return AM.BaseOffs < Other.AM.BaseOffs; + if (AM.BaseGV != Other.AM.BaseGV) + return AM.BaseGV < Other.AM.BaseGV; + return false; + } - // Replace the use of the operand Value with the new Phi we just created. - PN->setIncomingValue(i, Code); - Rewriter.clear(); - } + void print(raw_ostream &OS) const; + void dump() const; +}; + +} + +/// getNumRegs - Return the total number of register operands used by this +/// formula. +unsigned Formula::getNumRegs() const { + return !!ScaledReg + BaseRegs.size(); +} + +/// getComplexity - Return an oversimplified value indicating the complexity +/// of this formula. This is used as a tie-breaker in choosing register +/// preferences. +uint32_t Formula::getComplexity() const { + // Encode the information in a uint32_t so that comparing with operator< + // will be interesting. + return + // Most significant, the number of registers. This saturates because we + // need the bits, and because beyond a few hundred it doesn't really matter. + (std::min(getNumRegs(), (1u<<15)-1) << 17) | + // Having multiple base regs is worse than having a base reg and a scale. + ((BaseRegs.size() > 1) << 16) | + // Scale absolute value. + ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) | + // Scale sign, which is less significant than the absolute value. + ((AM.Scale < 0) << 8) | + // Offset absolute value. + ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) | + // If a GV is present, treat it like a maximal offset. + ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) | + // Offset sign, which is less significant than the absolute offset. + ((AM.BaseOffs < 0) << 0); +} + +/// getType - Return the type of this formula, if it has one, or null +/// otherwise. This type is meaningless except for the bit size. +const Type *Formula::getType() const { + return !BaseRegs.empty() ? BaseRegs.front()->getType() : + ScaledReg ? ScaledReg->getType() : + AM.BaseGV ? AM.BaseGV->getType() : + 0; +} + +namespace { + +/// ComplexitySorter - A predicate which orders Formulae by the number of +/// registers they contain. +struct ComplexitySorter { + bool operator()(const Formula &LHS, const Formula &RHS) const { + unsigned L = LHS.getNumRegs(); + unsigned R = RHS.getNumRegs(); + if (L != R) return L < R; + + return LHS.getComplexity() < RHS.getComplexity(); } +}; - // PHI node might have become a constant value after SplitCriticalEdge. - DeadInsts.push_back(Inst); } +/// DoInitialMatch - Recurrsion helper for InitialMatch. 
+static void DoInitialMatch(const SCEV *S, Loop *L, + SmallVectorImpl<const SCEV *> &Good, + SmallVectorImpl<const SCEV *> &Bad, + ScalarEvolution &SE, DominatorTree &DT) { + // Collect expressions which properly dominate the loop header. + if (S->properlyDominates(L->getHeader(), &DT)) { + Good.push_back(S); + return; + } -/// fitsInAddressMode - Return true if V can be subsumed within an addressing -/// mode, and does not need to be put in a register first. -static bool fitsInAddressMode(const SCEV *V, const Type *AccessTy, - const TargetLowering *TLI, bool HasBaseReg) { - if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) { - int64_t VC = SC->getValue()->getSExtValue(); - if (TLI) { - TargetLowering::AddrMode AM; - AM.BaseOffs = VC; - AM.HasBaseReg = HasBaseReg; - return TLI->isLegalAddressingMode(AM, AccessTy); - } else { - // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field. - return (VC > -(1 << 16) && VC < (1 << 16)-1); + // Look at add operands. + if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { + for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); + I != E; ++I) + DoInitialMatch(*I, L, Good, Bad, SE, DT); + return; + } + + // Look at addrec operands. + if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { + if (!AR->getStart()->isZero()) { + DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); + DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), + AR->getStepRecurrence(SE), + AR->getLoop()), + L, Good, Bad, SE, DT); + return; } } - if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V)) - if (GlobalValue *GV = dyn_cast<GlobalValue>(SU->getValue())) { - if (TLI) { - TargetLowering::AddrMode AM; - AM.BaseGV = GV; - AM.HasBaseReg = HasBaseReg; - return TLI->isLegalAddressingMode(AM, AccessTy); - } else { - // Default: assume global addresses are not legal. - } + // Handle a multiplication by -1 (negation) if it didn't fold. + if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) + if (Mul->getOperand(0)->isAllOnesValue()) { + SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end()); + const SCEV *NewMul = SE.getMulExpr(Ops); + + SmallVector<const SCEV *, 4> MyGood; + SmallVector<const SCEV *, 4> MyBad; + |
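The new Formula::getComplexity() introduced in this diff packs several ranking criteria (register count, scale, offset, presence of a GlobalValue) into a single uint32_t so that an ordinary integer compare orders formulae lexicographically, most important criterion first. A minimal standalone sketch of that packing idea, using an invented toy struct and made-up field widths rather than the commit's actual layout:

#include <algorithm>
#include <cstdint>

// Toy two-criterion version: prefer fewer registers, then a smaller offset
// magnitude. Putting the more important criterion in the higher bits means
// operator< on the packed word agrees with comparing the fields in order.
struct ToyFormula {
  unsigned NumRegs;      // number of register operands
  unsigned OffsMagLog2;  // log2 of |offset|, 0 when the offset is 0

  uint32_t complexity() const {
    return (std::min(NumRegs, (1u << 15) - 1) << 8) |  // saturate, keep in high bits
           std::min(OffsMagLog2, 255u);                // low bits: lesser criterion
  }
};

// A formula with two registers and any offset still ranks better (lower)
// than one with three registers, because the register field dominates.
inline bool simplerThan(const ToyFormula &A, const ToyFormula &B) {
  return A.complexity() < B.complexity();
}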