Diffstat (limited to 'lib/Transforms/Scalar/LoopStrengthReduce.cpp')
-rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 4574
1 file changed, 2305 insertions, 2269 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index fa820ed8e4..6456ca1eb0 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -17,6 +17,45 @@
 // available on the target, and it performs a variety of other optimizations
 // related to loop induction variables.
 //
+// Terminology note: this code has a lot of handling for "post-increment" or
+// "post-inc" users. This is not talking about post-increment addressing modes;
+// it is instead talking about code like this:
+//
+//   %i = phi [ 0, %entry ], [ %i.next, %latch ]
+//   ...
+//   %i.next = add %i, 1
+//   %c = icmp eq %i.next, %n
+//
+// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however
+// it's useful to think about these as the same register, with some uses using
+// the value of the register before the add and some using it after. In this
+// example, the icmp is a post-increment user, since it uses %i.next, which is
+// the value of the induction variable after the increment. The other common
+// case of post-increment users is users outside the loop.
+//
+// TODO: More sophistication in the way Formulae are generated.
+//
+// TODO: Handle multiple loops at a time.
+//
+// TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is
+//       that {0,+,1}<%bb> is getting picked first because all 7 uses can
+//       use it, and while it's a pretty good solution, it means that LSR
+//       doesn't look further to find an even better solution.
+//
+// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr
+//       instead of a GlobalValue?
+//
+// TODO: When truncation is free, truncate ICmp users' operands to make it a
+//       smaller encoding (on x86 at least).
+//
+// TODO: When a negated register is used by an add (such as in a list of
+//       multiple base registers, or as the increment expression in an addrec),
+//       we may not actually need both reg and (-1 * reg) in registers; the
+//       negation can be implemented by using a sub instead of an add. The
+//       lack of support for taking this into consideration when making
+//       register pressure decisions is partly worked around by the "Special"
+//       use kind.
+//
 //===----------------------------------------------------------------------===//
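[Editor's note, not part of the patch: a minimal sketch of how the two SCEVs in
the terminology example above relate. `Phi` and `Inc` are hypothetical handles
for the %i and %i.next instructions, and `SE` is a ScalarEvolution reference.]

    const SCEV *Pre  = SE.getSCEV(Phi); // {0,+,1}<%L>
    const SCEV *Post = SE.getSCEV(Inc); // {1,+,1}<%L>
    // SCEV expressions are uniqued, so the post-inc expression compares equal
    // to the pre-inc expression plus one step of the induction variable:
    assert(Post == SE.getAddExpr(Pre, SE.getConstant(Pre->getType(), 1)));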
 
 #define DEBUG_TYPE "loop-reduce"
 
@@ -26,2025 +65,1524 @@
 #include "llvm/IntrinsicInst.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Transforms/Utils/AddrModeMatcher.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
 #include <algorithm>
 using namespace llvm;
 
-STATISTIC(NumReduced,    "Number of IV uses strength reduced");
-STATISTIC(NumInserted,   "Number of PHIs inserted");
-STATISTIC(NumVariable,   "Number of PHIs with variable strides");
-STATISTIC(NumEliminated, "Number of strides eliminated");
-STATISTIC(NumShadow,     "Number of Shadow IVs optimized");
-STATISTIC(NumImmSunk,    "Number of common expr immediates sunk into uses");
-STATISTIC(NumLoopCond,   "Number of loop terminating conds optimized");
-STATISTIC(NumCountZero,  "Number of count iv optimized to count toward zero");
-
-static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
-                                       cl::init(false),
-                                       cl::Hidden);
-
 namespace {
-  struct BasedUser;
+
+// Constant strides come first, which in turn are sorted by their absolute
+// values. If absolute values are the same, then positive strides come first.
+// e.g.
+// 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
+struct StrideCompare {
+  const ScalarEvolution &SE;
+  explicit StrideCompare(const ScalarEvolution &se) : SE(se) {}
+
+  bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const {
+    const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
+    const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+    if (LHSC && RHSC) {
+      unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()),
+                                   SE.getTypeSizeInBits(RHS->getType()));
+      APInt LV = LHSC->getValue()->getValue();
+      APInt RV = RHSC->getValue()->getValue();
+      LV.sextOrTrunc(BitWidth);
+      RV.sextOrTrunc(BitWidth);
+      APInt ALV = LV.abs();
+      APInt ARV = RV.abs();
+      if (ALV == ARV) {
+        if (LV != RV)
+          return LV.sgt(RV);
+      } else {
+        return ALV.ult(ARV);
+      }
+
+      // If it's the same value but different type, sort by bit width so
+      // that we emit larger induction variables before smaller
+      // ones, letting the smaller be re-written in terms of larger ones.
+      return SE.getTypeSizeInBits(RHS->getType()) <
+             SE.getTypeSizeInBits(LHS->getType());
+    }
+    return LHSC && !RHSC;
+  }
+};
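[Editor's note, not part of the patch: StrideCompare is a comparison predicate,
so it can be handed directly to std::sort. A hedged usage sketch, assuming
`Strides` is a hypothetical SmallVector<const SCEV *, 8> holding the example
strides from the comment above:]

    std::sort(Strides.begin(), Strides.end(), StrideCompare(SE));
    // Constants are ordered by absolute value (positive first on ties) and
    // precede the symbolic stride X:  4, -1, X, 1, 2  ==>  1, -1, 2, 4, X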
 
-  /// IVInfo - This structure keeps track of one IV expression inserted during
-  /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
-  /// well as the PHI node and increment value created for rewrite.
-  struct IVExpr {
-    const SCEV *Stride;
-    const SCEV *Base;
-    PHINode    *PHI;
+/// RegSortData - This class holds data which is used to order reuse
+/// candidates.
+class RegSortData {
+public:
+  /// Bits - This represents the set of LSRUses (by index) which reference a
+  /// particular register.
+  SmallBitVector Bits;
 
-    IVExpr(const SCEV *const stride, const SCEV *const base, PHINode *phi)
-      : Stride(stride), Base(base), PHI(phi) {}
-  };
+  /// MaxComplexity - This represents the greatest complexity (see the comments
+  /// on Formula::getComplexity) seen with a particular register.
+  uint32_t MaxComplexity;
 
-  /// IVsOfOneStride - This structure keeps track of all IV expression inserted
-  /// during StrengthReduceStridedIVUsers for a particular stride of the IV.
-  struct IVsOfOneStride {
-    std::vector<IVExpr> IVs;
+  /// Index - This holds an arbitrary value used as a last-resort tie breaker
+  /// to ensure deterministic behavior.
+  unsigned Index;
 
-    void addIV(const SCEV *const Stride, const SCEV *const Base, PHINode *PHI) {
-      IVs.push_back(IVExpr(Stride, Base, PHI));
-    }
-  };
+  RegSortData() {}
 
-  class LoopStrengthReduce : public LoopPass {
-    IVUsers *IU;
-    ScalarEvolution *SE;
-    bool Changed;
-
-    /// IVsByStride - Keep track of all IVs that have been inserted for a
-    /// particular stride.
-    std::map<const SCEV *, IVsOfOneStride> IVsByStride;
-
-    /// DeadInsts - Keep track of instructions we may have made dead, so that
-    /// we can remove them after we are done working.
-    SmallVector<WeakVH, 16> DeadInsts;
-
-    /// TLI - Keep a pointer of a TargetLowering to consult for determining
-    /// transformation profitability.
-    const TargetLowering *TLI;
-
-  public:
-    static char ID; // Pass ID, replacement for typeid
-    explicit LoopStrengthReduce(const TargetLowering *tli = NULL) :
-      LoopPass(&ID), TLI(tli) {}
-
-    bool runOnLoop(Loop *L, LPPassManager &LPM);
-
-    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
-      // We split critical edges, so we change the CFG. However, we do update
-      // many analyses if they are around.
-      AU.addPreservedID(LoopSimplifyID);
-      AU.addPreserved("loops");
-      AU.addPreserved("domfrontier");
-      AU.addPreserved("domtree");
-
-      AU.addRequiredID(LoopSimplifyID);
-      AU.addRequired<ScalarEvolution>();
-      AU.addPreserved<ScalarEvolution>();
-      AU.addRequired<IVUsers>();
-      AU.addPreserved<IVUsers>();
-    }
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
 
-  private:
-    void OptimizeIndvars(Loop *L);
-
-    /// OptimizeLoopTermCond - Change loop terminating condition to use the
-    /// postinc iv when possible.
-    void OptimizeLoopTermCond(Loop *L);
-
-    /// OptimizeShadowIV - If IV is used in a int-to-float cast
-    /// inside the loop then try to eliminate the cast opeation.
-    void OptimizeShadowIV(Loop *L);
-
-    /// OptimizeMax - Rewrite the loop's terminating condition
-    /// if it uses a max computation.
-    ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond,
-                          IVStrideUse* &CondUse);
-
-    /// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for
-    /// deciding when to exit the loop is used only for that purpose, try to
-    /// rearrange things so it counts down to a test against zero.
-    bool OptimizeLoopCountIV(Loop *L);
-    bool OptimizeLoopCountIVOfStride(const SCEV* &Stride,
-                                     IVStrideUse* &CondUse, Loop *L);
-
-    /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a
-    /// single stride of IV. All of the users may have different starting
-    /// values, and this may not be the only stride.
-    void StrengthReduceIVUsersOfStride(const SCEV *Stride,
-                                       IVUsersOfOneStride &Uses,
-                                       Loop *L);
-    void StrengthReduceIVUsers(Loop *L);
-
-    ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
-                                  IVStrideUse* &CondUse,
-                                  const SCEV* &CondStride,
-                                  bool PostPass = false);
-
-    bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
-                           const SCEV* &CondStride);
-    bool RequiresTypeConversion(const Type *Ty, const Type *NewTy);
-    const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *,
-                                IVExpr&, const Type*,
-                                const std::vector<BasedUser>& UsersToProcess);
-    bool ValidScale(bool, int64_t,
-                    const std::vector<BasedUser>& UsersToProcess);
-    bool ValidOffset(bool, int64_t, int64_t,
-                     const std::vector<BasedUser>& UsersToProcess);
-    const SCEV *CollectIVUsers(const SCEV *Stride,
-                               IVUsersOfOneStride &Uses,
-                               Loop *L,
-                               bool &AllUsesAreAddresses,
-                               bool &AllUsesAreOutsideLoop,
-                               std::vector<BasedUser> &UsersToProcess);
-    bool StrideMightBeShared(const SCEV *Stride, Loop *L, bool CheckPreInc);
-    bool ShouldUseFullStrengthReductionMode(
-                          const std::vector<BasedUser> &UsersToProcess,
-                          const Loop *L,
-                          bool AllUsesAreAddresses,
-                          const SCEV *Stride);
-    void PrepareToStrengthReduceFully(
-                          std::vector<BasedUser> &UsersToProcess,
-                          const SCEV *Stride,
-                          const SCEV *CommonExprs,
-                          const Loop *L,
-                          SCEVExpander &PreheaderRewriter);
-    void PrepareToStrengthReduceFromSmallerStride(
-                          std::vector<BasedUser> &UsersToProcess,
-                          Value *CommonBaseV,
-                          const IVExpr &ReuseIV,
-                          Instruction *PreInsertPt);
-    void PrepareToStrengthReduceWithNewPhi(
-                          std::vector<BasedUser> &UsersToProcess,
-                          const SCEV *Stride,
-                          const SCEV *CommonExprs,
-                          Value *CommonBaseV,
-                          Instruction *IVIncInsertPt,
-                          const Loop *L,
-                          SCEVExpander &PreheaderRewriter);
-
-    void DeleteTriviallyDeadInstructions();
-  };
 }
 
-char LoopStrengthReduce::ID = 0;
-static RegisterPass<LoopStrengthReduce>
-X("loop-reduce", "Loop Strength Reduction");
+void RegSortData::print(raw_ostream &OS) const {
+  OS << "[NumUses=" << Bits.count()
+     << ", MaxComplexity=";
+  OS.write_hex(MaxComplexity);
+  OS << ", Index=" << Index << ']';
+}
 
-Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
-  return new LoopStrengthReduce(TLI);
+void RegSortData::dump() const {
+  print(errs()); errs() << '\n';
 }
 
-/// DeleteTriviallyDeadInstructions - If any of the instructions is the
-/// specified set are trivially dead, delete them and see if this makes any of
-/// their operands subsequently dead.
-void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
-  while (!DeadInsts.empty()) {
-    Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+namespace {
 
-    if (I == 0 || !isInstructionTriviallyDead(I))
-      continue;
+/// RegCount - This is a helper class to sort a given set of registers
+/// according to associated RegSortData values.
+class RegCount {
+public:
+  const SCEV *Reg;
+  RegSortData Sort;
+
+  RegCount(const SCEV *R, const RegSortData &RSD)
+    : Reg(R), Sort(RSD) {}
+
+  // Sort by count. Returning true means the register is preferred.
+  bool operator<(const RegCount &Other) const {
+    // Sort by the number of unique uses of this register.
+    unsigned A = Sort.Bits.count();
+    unsigned B = Other.Sort.Bits.count();
+    if (A != B) return A > B;
+
+    if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
+      const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg);
+      // AddRecs have higher priority than other things.
+      if (!BR) return true;
+
+      // Prefer affine values.
+      if (AR->isAffine() != BR->isAffine())
+        return AR->isAffine();
+
+      const Loop *AL = AR->getLoop();
+      const Loop *BL = BR->getLoop();
+      if (AL != BL) {
+        unsigned ADepth = AL->getLoopDepth();
+        unsigned BDepth = BL->getLoopDepth();
+        // Prefer a less deeply nested addrec.
+        if (ADepth != BDepth)
+          return ADepth < BDepth;
+
+        // Different loops at the same depth; do something arbitrary.
+        BasicBlock *AH = AL->getHeader();
+        BasicBlock *BH = BL->getHeader();
+        for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I)
+          if (&*I == BH) return true;
+        return false;
+      }
 
-    for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
-      if (Instruction *U = dyn_cast<Instruction>(*OI)) {
-        *OI = 0;
-        if (U->use_empty())
-          DeadInsts.push_back(U);
-      }
 
+      // Sort addrecs by stride.
+      const SCEV *AStep = AR->getOperand(1);
+      const SCEV *BStep = BR->getOperand(1);
+      if (AStep != BStep) {
+        if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) {
+          const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep);
+          if (!BC) return true;
+          // Arbitrarily prefer wider registers.
+          if (AC->getValue()->getValue().getBitWidth() !=
+              BC->getValue()->getValue().getBitWidth())
+            return AC->getValue()->getValue().getBitWidth() >
+                   BC->getValue()->getValue().getBitWidth();
+          // Ignore the sign bit, assuming that striding by a negative value
+          // is just as easy as by a positive value.
+          // Prefer the addrec with the lesser absolute value stride, as it
+          // will allow uses to have simpler addressing modes.
+          return AC->getValue()->getValue().abs()
+                   .ult(BC->getValue()->getValue().abs());
+        }
+      }
 
-    I->eraseFromParent();
-    Changed = true;
+      // Then sort by the register which will permit the simplest uses.
+      // This is a heuristic; currently we only track the most complex use as a
+      // representative.
+      if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
+        return Sort.MaxComplexity < Other.Sort.MaxComplexity;
+
+      // Then sort them by their start values.
+      const SCEV *AStart = AR->getStart();
+      const SCEV *BStart = BR->getStart();
+      if (AStart != BStart) {
+        if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) {
+          const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart);
+          if (!BC) return true;
+          // Arbitrarily prefer wider registers.
+          if (AC->getValue()->getValue().getBitWidth() !=
+              BC->getValue()->getValue().getBitWidth())
+            return AC->getValue()->getValue().getBitWidth() >
+                   BC->getValue()->getValue().getBitWidth();
+          // Prefer positive over negative if the absolute values are the same.
+          if (AC->getValue()->getValue().abs() ==
+              BC->getValue()->getValue().abs())
+            return AC->getValue()->getValue().isStrictlyPositive();
+          // Prefer the addrec with the lesser absolute value start.
+          return AC->getValue()->getValue().abs()
+                   .ult(BC->getValue()->getValue().abs());
+        }
+      }
+    } else {
+      // AddRecs have higher priority than other things.
+      if (isa<SCEVAddRecExpr>(Other.Reg)) return false;
+      // Sort by the register which will permit the simplest uses.
+      // This is a heuristic; currently we only track the most complex use as a
+      // representative.
+      if (Sort.MaxComplexity != Other.Sort.MaxComplexity)
+        return Sort.MaxComplexity < Other.Sort.MaxComplexity;
+    }
+
+
+    // Tie-breaker: the arbitrary index, to ensure a reliable ordering.
+    return Sort.Index < Other.Sort.Index;
+  }
+
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
 }
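[Editor's note, not part of the patch: RegCount::operator< returns true when
*this is the preferred register, so sorting a hypothetical
std::vector<RegCount> named `RegCounts` puts the best reuse candidates first:

    std::sort(RegCounts.begin(), RegCounts.end()); // best candidates first

Reading the predicate above, the tie-break chain is: more referencing uses;
addrecs before non-addrecs; affine before non-affine; shallower loop nest;
wider, then smaller-magnitude, constant stride; lower MaxComplexity; wider,
positive-on-ties, smaller-magnitude constant start; and finally the
deterministic Index.]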
-/// isAddressUse - Returns true if the specified instruction is using the
-/// specified value as an address.
-static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
-  bool isAddress = isa<LoadInst>(Inst);
-  if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-    if (SI->getOperand(1) == OperandVal)
-      isAddress = true;
-  } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-    // Addressing modes can also be folded into prefetches and a variety
-    // of intrinsics.
-    switch (II->getIntrinsicID()) {
-      default: break;
-      case Intrinsic::prefetch:
-      case Intrinsic::x86_sse2_loadu_dq:
-      case Intrinsic::x86_sse2_loadu_pd:
-      case Intrinsic::x86_sse_loadu_ps:
-      case Intrinsic::x86_sse_storeu_ps:
-      case Intrinsic::x86_sse2_storeu_pd:
-      case Intrinsic::x86_sse2_storeu_dq:
-      case Intrinsic::x86_sse2_storel_dq:
-        if (II->getOperand(1) == OperandVal)
-          isAddress = true;
-        break;
-    }
-  }
-  return isAddress;
+void RegCount::print(raw_ostream &OS) const {
+  OS << *Reg << ':';
+  Sort.print(OS);
 }
 
-/// getAccessType - Return the type of the memory being accessed.
-static const Type *getAccessType(const Instruction *Inst) {
-  const Type *AccessTy = Inst->getType();
-  if (const StoreInst *SI = dyn_cast<StoreInst>(Inst))
-    AccessTy = SI->getOperand(0)->getType();
-  else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
-    // Addressing modes can also be folded into prefetches and a variety
-    // of intrinsics.
-    switch (II->getIntrinsicID()) {
-    default: break;
-    case Intrinsic::x86_sse_storeu_ps:
-    case Intrinsic::x86_sse2_storeu_pd:
-    case Intrinsic::x86_sse2_storeu_dq:
-    case Intrinsic::x86_sse2_storel_dq:
-      AccessTy = II->getOperand(1)->getType();
-      break;
-    }
-  }
-  return AccessTy;
+void RegCount::dump() const {
+  print(errs()); errs() << '\n';
 }
 
 namespace {
 
-  /// BasedUser - For a particular base value, keep information about how we've
-  /// partitioned the expression so far.
-  struct BasedUser {
-    /// Base - The Base value for the PHI node that needs to be inserted for
-    /// this use. As the use is processed, information gets moved from this
-    /// field to the Imm field (below). BasedUser values are sorted by this
-    /// field.
-    const SCEV *Base;
-
-    /// Inst - The instruction using the induction variable.
-    Instruction *Inst;
-
-    /// OperandValToReplace - The operand value of Inst to replace with the
-    /// EmittedBase.
-    Value *OperandValToReplace;
-
-    /// Imm - The immediate value that should be added to the base immediately
-    /// before Inst, because it will be folded into the imm field of the
-    /// instruction. This is also sometimes used for loop-variant values that
-    /// must be added inside the loop.
-    const SCEV *Imm;
-
-    /// Phi - The induction variable that performs the striding that
-    /// should be used for this user.
-    PHINode *Phi;
-
-    // isUseOfPostIncrementedValue - True if this should use the
-    // post-incremented version of this IV, not the preincremented version.
-    // This can only be set in special cases, such as the terminating setcc
-    // instruction for a loop and uses outside the loop that are dominated by
-    // the loop.
-    bool isUseOfPostIncrementedValue;
-
-    BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
-      : Base(IVSU.getOffset()), Inst(IVSU.getUser()),
-        OperandValToReplace(IVSU.getOperandValToReplace()),
-        Imm(se->getIntegerSCEV(0, Base->getType())),
-        isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {}
-
-    // Once we rewrite the code to insert the new IVs we want, update the
-    // operands of Inst to use the new expression 'NewBase', with 'Imm' added
-    // to it.
-    void RewriteInstructionToUseNewBase(const SCEV *NewBase,
-                                        Instruction *InsertPt,
-                                        SCEVExpander &Rewriter, Loop *L, Pass *P,
-                                        SmallVectorImpl<WeakVH> &DeadInsts,
-                                        ScalarEvolution *SE);
-
-    Value *InsertCodeForBaseAtPosition(const SCEV *NewBase,
-                                       const Type *Ty,
-                                       SCEVExpander &Rewriter,
-                                       Instruction *IP,
-                                       ScalarEvolution *SE);
-    void dump() const;
-  };
-}
 
-void BasedUser::dump() const {
-  dbgs() << " Base=" << *Base;
-  dbgs() << " Imm=" << *Imm;
-  dbgs() << "   Inst: " << *Inst;
-}
+/// Formula - This class holds information that describes a formula for
+/// satisfying a use. It may include broken-out immediates and scaled registers.
+struct Formula {
+  /// AM - This is used to represent complex addressing, as well as other kinds
+  /// of interesting uses.
+  TargetLowering::AddrMode AM;
 
-Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *NewBase,
-                                              const Type *Ty,
-                                              SCEVExpander &Rewriter,
-                                              Instruction *IP,
-                                              ScalarEvolution *SE) {
-  Value *Base = Rewriter.expandCodeFor(NewBase, 0, IP);
+  /// BaseRegs - The list of "base" registers for this use. When this is
+  /// non-empty, AM.HasBaseReg should be set to true.
+  SmallVector<const SCEV *, 2> BaseRegs;
 
-  // Wrap the base in a SCEVUnknown so that ScalarEvolution doesn't try to
-  // re-analyze it.
-  const SCEV *NewValSCEV = SE->getUnknown(Base);
+  /// ScaledReg - The 'scaled' register for this use. This should be non-null
+  /// when AM.Scale is not zero.
+  const SCEV *ScaledReg;
 
-  // Always emit the immediate into the same block as the user.
-  NewValSCEV = SE->getAddExpr(NewValSCEV, Imm);
+  Formula() : ScaledReg(0) {}
 
-  return Rewriter.expandCodeFor(NewValSCEV, Ty, IP);
-}
+  unsigned getNumRegs() const;
+  uint32_t getComplexity() const;
+  const Type *getType() const;
+  void InitialMatch(const SCEV *S, Loop *L,
+                    ScalarEvolution &SE, DominatorTree &DT);
 
-// Once we rewrite the code to insert the new IVs we want, update the
-// operands of Inst to use the new expression 'NewBase', with 'Imm' added
-// to it. NewBasePt is the last instruction which contributes to the
-// value of NewBase in the case that it's a diffferent instruction from
-// the PHI that NewBase is computed from, or null otherwise.
-//
-void BasedUser::RewriteInstructionToUseNewBase(const SCEV *NewBase,
-                                               Instruction *NewBasePt,
-                                               SCEVExpander &Rewriter, Loop *L, Pass *P,
-                                               SmallVectorImpl<WeakVH> &DeadInsts,
-                                               ScalarEvolution *SE) {
-  if (!isa<PHINode>(Inst)) {
-    // By default, insert code at the user instruction.
-    BasicBlock::iterator InsertPt = Inst;
-
-    // However, if the Operand is itself an instruction, the (potentially
-    // complex) inserted code may be shared by many users. Because of this, we
-    // want to emit code for the computation of the operand right before its old
-    // computation. This is usually safe, because we obviously used to use the
-    // computation when it was computed in its current block. However, in some
-    // cases (e.g. use of a post-incremented induction variable) the NewBase
-    // value will be pinned to live somewhere after the original computation.
-    // In this case, we have to back off.
-    //
-    // If this is a use outside the loop (which means after, since it is based
-    // on a loop indvar) we use the post-incremented value, so that we don't
-    // artificially make the preinc value live out the bottom of the loop.
-    if (!isUseOfPostIncrementedValue && L->contains(Inst)) {
-      if (NewBasePt && isa<PHINode>(OperandValToReplace)) {
-        InsertPt = NewBasePt;
-        ++InsertPt;
-      } else if (Instruction *OpInst
-                 = dyn_cast<Instruction>(OperandValToReplace)) {
-        InsertPt = OpInst;
-        while (isa<PHINode>(InsertPt)) ++InsertPt;
-      }
-    }
-    Value *NewVal = InsertCodeForBaseAtPosition(NewBase,
-                                                OperandValToReplace->getType(),
-                                                Rewriter, InsertPt, SE);
-    // Replace the use of the operand Value with the new Phi we just created.
-    Inst->replaceUsesOfWith(OperandValToReplace, NewVal);
-
-    DEBUG(dbgs() << "      Replacing with ");
-    DEBUG(WriteAsOperand(dbgs(), NewVal, /*PrintType=*/false));
-    DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM "
-                 << *Imm << "\n");
-    return;
+  /// referencesReg - Test if this formula references the given register.
+  bool referencesReg(const SCEV *S) const {
+    return S == ScaledReg ||
+           std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end();
   }
 
-  // PHI nodes are more complex. We have to insert one copy of the NewBase+Imm
-  // expression into each operand block that uses it. Note that PHI nodes can
-  // have multiple entries for the same predecessor. We use a map to make sure
-  // that a PHI node only has a single Value* for each predecessor (which also
-  // prevents us from inserting duplicate code in some blocks).
-  DenseMap<BasicBlock*, Value*> InsertedCode;
-  PHINode *PN = cast<PHINode>(Inst);
-  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
-    if (PN->getIncomingValue(i) == OperandValToReplace) {
-      // If the original expression is outside the loop, put the replacement
-      // code in the same place as the original expression,
-      // which need not be an immediate predecessor of this PHI. This way we
-      // need only one copy of it even if it is referenced multiple times in
-      // the PHI. We don't do this when the original expression is inside the
-      // loop because multiple copies sometimes do useful sinking of code in
-      // that case(?).
-      Instruction *OldLoc = dyn_cast<Instruction>(OperandValToReplace);
-      BasicBlock *PHIPred = PN->getIncomingBlock(i);
-      if (L->contains(OldLoc)) {
-        // If this is a critical edge, split the edge so that we do not insert
-        // the code on all predecessor/successor paths. We do this unless this
-        // is the canonical backedge for this loop, as this can make some
-        // inserted code be in an illegal position.
-        if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 &&
-            !isa<IndirectBrInst>(PHIPred->getTerminator()) &&
-            (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) {
-
-          // First step, split the critical edge.
-          BasicBlock *NewBB = SplitCriticalEdge(PHIPred, PN->getParent(),
-                                                P, false);
-
-          // Next step: move the basic block. In particular, if the PHI node
-          // is outside of the loop, and PredTI is in the loop, we want to
-          // move the block to be immediately before the PHI block, not
-          // immediately after PredTI.
-          if (L->contains(PHIPred) && !L->contains(PN))
-            NewBB->moveBefore(PN->getParent());
+  bool operator==(const Formula &Other) const {
+    return BaseRegs == Other.BaseRegs &&
+           ScaledReg == Other.ScaledReg &&
+           AM.Scale == Other.AM.Scale &&
+           AM.BaseOffs == Other.AM.BaseOffs &&
+           AM.BaseGV == Other.AM.BaseGV;
+  }
 
-          // Splitting the edge can reduce the number of PHI entries we have.
-          e = PN->getNumIncomingValues();
-          PHIPred = NewBB;
-          i = PN->getBasicBlockIndex(PHIPred);
-        }
-      }
-      Value *&Code = InsertedCode[PHIPred];
-      if (!Code) {
-        // Insert the code into the end of the predecessor block.
-        Instruction *InsertPt = (L->contains(OldLoc)) ?
-                                PHIPred->getTerminator() :
-                                OldLoc->getParent()->getTerminator();
-        Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(),
-                                           Rewriter, InsertPt, SE);
-
-        DEBUG(dbgs() << "      Changing PHI use to ");
-        DEBUG(WriteAsOperand(dbgs(), Code, /*PrintType=*/false));
-        DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM "
-                     << *Imm << "\n");
-      }
+  // This sorts at least partially based on host pointer values which are
+  // not deterministic, so it is only usable for uniquification.
+  bool operator<(const Formula &Other) const {
+    if (BaseRegs != Other.BaseRegs)
+      return BaseRegs < Other.BaseRegs;
+    if (ScaledReg != Other.ScaledReg)
+      return ScaledReg < Other.ScaledReg;
+    if (AM.Scale != Other.AM.Scale)
+      return AM.Scale < Other.AM.Scale;
+    if (AM.BaseOffs != Other.AM.BaseOffs)
+      return AM.BaseOffs < Other.AM.BaseOffs;
+    if (AM.BaseGV != Other.AM.BaseGV)
+      return AM.BaseGV < Other.AM.BaseGV;
+    return false;
+  }
 
-      // Replace the use of the operand Value with the new Phi we just created.
-      PN->setIncomingValue(i, Code);
-      Rewriter.clear();
-    }
+  void print(raw_ostream &OS) const;
+  void dump() const;
+};
+
+}
 
+/// getNumRegs - Return the total number of register operands used by this
+/// formula.
+unsigned Formula::getNumRegs() const {
+  return !!ScaledReg + BaseRegs.size();
+}
+
+/// getComplexity - Return an oversimplified value indicating the complexity
+/// of this formula. This is used as a tie-breaker in choosing register
+/// preferences.
+uint32_t Formula::getComplexity() const {
+  // Encode the information in a uint32_t so that comparing with operator<
+  // will be interesting.
+  return
+    // Most significant, the number of registers. This saturates because we
+    // need the bits, and because beyond a few hundred it doesn't really matter.
+    (std::min(getNumRegs(), (1u<<15)-1) << 17) |
+    // Having multiple base regs is worse than having a base reg and a scale.
+    ((BaseRegs.size() > 1) << 16) |
+    // Scale absolute value.
+    ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) |
+    // Scale sign, which is less significant than the absolute value.
+    ((AM.Scale < 0) << 8) |
+    // Offset absolute value.
+    ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) |
+    // If a GV is present, treat it like a maximal offset.
+    ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) |
+    // Offset sign, which is less significant than the absolute offset.
+    ((AM.BaseOffs < 0) << 0);
+}
+
+/// getType - Return the type of this formula, if it has one, or null
+/// otherwise. This type is meaningless except for the bit size.
+const Type *Formula::getType() const {
+  return !BaseRegs.empty() ? BaseRegs.front()->getType() :
+         ScaledReg ? ScaledReg->getType() :
+         AM.BaseGV ? AM.BaseGV->getType() :
+         0;
+}
+
+namespace {
+
+/// ComplexitySorter - A predicate which orders Formulae by the number of
+/// registers they contain.
+struct ComplexitySorter {
+  bool operator()(const Formula &LHS, const Formula &RHS) const {
+    unsigned L = LHS.getNumRegs();
+    unsigned R = RHS.getNumRegs();
+    if (L != R) return L < R;
+
+    return LHS.getComplexity() < RHS.getComplexity();
+  }
+};
 
-  // PHI node might have become a constant value after SplitCriticalEdge.
-  DeadInsts.push_back(Inst);
 }
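[Editor's note, not part of the patch: a worked example of the getComplexity
encoding above. A formula with two base registers and BaseOffs = 8 encodes as

    (2 << 17) | (1 << 16) | ((Log2_64(8) + 1) << 1)  ==  0x50008

while a formula with one base register plus a scaled register with Scale = 4
and the same offset encodes as

    (2 << 17) | ((Log2_64(4) + 1) << 9) | ((Log2_64(8) + 1) << 1)  ==  0x40608

Since 0x40608 < 0x50008, operator< prefers base + scale over two base
registers, which is what the comment on bit 16 promises.]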
+/// DoInitialMatch - Recursion helper for InitialMatch.
+static void DoInitialMatch(const SCEV *S, Loop *L,
+                           SmallVectorImpl<const SCEV *> &Good,
+                           SmallVectorImpl<const SCEV *> &Bad,
+                           ScalarEvolution &SE, DominatorTree &DT) {
+  // Collect expressions which properly dominate the loop header.
+  if (S->properlyDominates(L->getHeader(), &DT)) {
+    Good.push_back(S);
+    return;
+  }
 
-/// fitsInAddressMode - Return true if V can be subsumed within an addressing
-/// mode, and does not need to be put in a register first.
-static bool fitsInAddressMode(const SCEV *V, const Type *AccessTy,
-                              const TargetLowering *TLI, bool HasBaseReg) {
-  if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(V)) {
-    int64_t VC = SC->getValue()->getSExtValue();
-    if (TLI) {
-      TargetLowering::AddrMode AM;
-      AM.BaseOffs = VC;
-      AM.HasBaseReg = HasBaseReg;
-      return TLI->isLegalAddressingMode(AM, AccessTy);
-    } else {
-      // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
-      return (VC > -(1 << 16) && VC < (1 << 16)-1);
 
+  // Look at add operands.
+  if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) {
+    for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end();
+         I != E; ++I)
+      DoInitialMatch(*I, L, Good, Bad, SE, DT);
+    return;
+  }
+
+  // Look at addrec operands.
+  if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) {
+    if (!AR->getStart()->isZero()) {
+      DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT);
+      DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()),
+                                      AR->getStepRecurrence(SE),
+                                      AR->getLoop()),
+                     L, Good, Bad, SE, DT);
+      return;
+    }
+  }
 
-  if (const SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V))
-    if (GlobalValue *GV = dyn_cast<GlobalValue>(SU->getValue())) {
-      if (TLI) {
-        TargetLowering::AddrMode AM;
-        AM.BaseGV = GV;
-        AM.HasBaseReg = HasBaseReg;
-        return TLI->isLegalAddressingMode(AM, AccessTy);
-      } else {
-        // Default: assume global addresses are not legal.
-      }
 
+  // Handle a multiplication by -1 (negation) if it didn't fold.
+  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S))
+    if (Mul->getOperand(0)->isAllOnesValue()) {
+      SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end());
+      const SCEV *NewMul = SE.getMulExpr(Ops);
+
+      SmallVector<const SCEV *, 4> MyGood;
+      SmallVector<const SCEV *, 4> MyBad;
+      DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT);
+      const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue(
+        SE.getEffectiveSCEVType(NewMul->getType())));
+      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(),
+           E = MyGood.end(); I != E; ++I)
+        Good.push_back(SE.getMulExpr(NegOne, *I));
+      for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(),
+           E = MyBad.end(); I != E; ++I)
+        Bad.push_back(SE.getMulExpr(NegOne, *I));
+      return;
+    }
 
-  return false;
+  // Ok, we can't do anything interesting. Just stuff the whole thing into a
+  // register and hope for the best.
+  Bad.push_back(S);
 }
 
-/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
-/// loop varying to the Imm operand.
-static void MoveLoopVariantsToImmediateField(const SCEV *&Val, const SCEV *&Imm,
-                                             Loop *L, ScalarEvolution *SE) {
-  if (Val->isLoopInvariant(L)) return; // Nothing to do.
-
-  if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
-    SmallVector<const SCEV *, 4> NewOps;
-    NewOps.reserve(SAE->getNumOperands());
-
-    for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
-      if (!SAE->getOperand(i)->isLoopInvariant(L)) {
-        // If this is a loop-variant expression, it must stay in
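[Editor's note, not part of the patch: a worked example of the Good/Bad split
performed by DoInitialMatch above. Assume %a is defined above the loop %L and
S = {(8 + %a),+,4}<%L>. The addrec's start is nonzero, so it is split into
(8 + %a) and the stripped addrec {0,+,4}<%L>. The add (8 + %a) splits into 8
and %a, both of which properly dominate the loop header, so Good = { 8, %a }.
The stripped addrec varies inside %L and matches no other case, so it lands in
Bad = { {0,+,4}<%L> }, meaning it must be materialized in a register.]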