Diffstat (limited to 'lib/Transforms')
-rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 4623
1 file changed, 2268 insertions(+), 2355 deletions(-)
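
A note before the patch body: the first removed hunk below defines a StrideCompare functor whose ordering the old comment summarizes as "4, -1, X, 1, 2 ==> 1, -1, 2, 4, X". As a reading aid, here is a minimal standalone sketch of that ordering on plain 64-bit strides; the name strideLess and the use of int64_t are illustrative assumptions, since the pass actually compares SCEVConstant strides and sorts any non-constant stride after all constant ones.

    #include <cstdint>

    // Minimal model of the removed StrideCompare ordering (illustrative
    // only): smaller absolute values come first, and on equal magnitude
    // the positive stride precedes the negative one, so
    //   4, -1, 1, 2  ==>  1, -1, 2, 4
    // Non-constant strides (the "X" in the comment) would sort last.
    bool strideLess(int64_t L, int64_t R) {
      // Compute magnitudes in unsigned arithmetic so INT64_MIN is safe.
      uint64_t AL = L < 0 ? 0 - static_cast<uint64_t>(L) : static_cast<uint64_t>(L);
      uint64_t AR = R < 0 ? 0 - static_cast<uint64_t>(R) : static_cast<uint64_t>(R);
      if (AL != AR)
        return AL < AR; // sort by absolute value first
      return L > R;     // same magnitude: positive before negative
    }

Used with std::sort, this yields the order the comment describes; the real functor additionally breaks ties between equal constants of different types by bit width, so larger induction variables are emitted before smaller ones.
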
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index acc66246c3..fa820ed8e4 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -17,45 +17,6 @@ // available on the target, and it performs a variety of other optimizations // related to loop induction variables. // -// Terminology note: this code has a lot of handling for "post-increment" or -// "post-inc" users. This is not talking about post-increment addressing modes; -// it is instead talking about code like this: -// -// %i = phi [ 0, %entry ], [ %i.next, %latch ] -// ... -// %i.next = add %i, 1 -// %c = icmp eq %i.next, %n -// -// The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however -// it's useful to think about these as the same register, with some uses using -// the value of the register before the add and some using // it after. In this -// example, the icmp is a post-increment user, since it uses %i.next, which is -// the value of the induction variable after the increment. The other common -// case of post-increment users is users outside the loop. -// -// TODO: More sophistication in the way Formulae are generated. -// -// TODO: Handle multiple loops at a time. -// -// TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is -// that {0,+,1}<%bb> is getting picked first because all 7 uses can -// use it, and while it's a pretty good solution, it means that LSR -// doesn't look further to find an even better solution. -// -// TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr -// instead of a GlobalValue? -// -// TODO: When truncation is free, truncate ICmp users' operands to make it a -// smaller encoding (on x86 at least). -// -// TODO: When a negated register is used by an add (such as in a list of -// multiple base registers, or as the increment expression in an addrec), -// we may not actually need both reg and (-1 * reg) in registers; the -// negation can be implemented by using a sub instead of an add. The -// lack of support for taking this into consideration when making -// register pressure decisions is partly worked around by the "Special" -// use kind. -// //===----------------------------------------------------------------------===// #define DEBUG_TYPE "loop-reduce" @@ -65,1549 +26,2025 @@ #include "llvm/IntrinsicInst.h" #include "llvm/DerivedTypes.h" #include "llvm/Analysis/IVUsers.h" -#include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" +#include "llvm/Transforms/Utils/AddrModeMatcher.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" -#include "llvm/ADT/SmallBitVector.h" -#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/Statistic.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/ValueHandle.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetLowering.h" #include <algorithm> using namespace llvm; -namespace { - -// Constant strides come first which in turns are sorted by their absolute -// values. If absolute values are the same, then positive strides comes first. -// e.g. 
-// 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X -struct StrideCompare { - const ScalarEvolution &SE; - explicit StrideCompare(const ScalarEvolution &se) : SE(se) {} - - bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const { - const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS); - const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS); - if (LHSC && RHSC) { - unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()), - SE.getTypeSizeInBits(RHS->getType())); - APInt LV = LHSC->getValue()->getValue(); - APInt RV = RHSC->getValue()->getValue(); - LV.sextOrTrunc(BitWidth); - RV.sextOrTrunc(BitWidth); - APInt ALV = LV.abs(); - APInt ARV = RV.abs(); - if (ALV == ARV) { - if (LV != RV) - return LV.sgt(RV); - } else { - return ALV.ult(ARV); - } - - // If it's the same value but different type, sort by bit width so - // that we emit larger induction variables before smaller - // ones, letting the smaller be re-written in terms of larger ones. - return SE.getTypeSizeInBits(RHS->getType()) < - SE.getTypeSizeInBits(LHS->getType()); - } - return LHSC && !RHSC; - } -}; - -/// RegSortData - This class holds data which is used to order reuse -/// candidates. -class RegSortData { -public: - /// Bits - This represents the set of LSRUses (by index) which reference a - /// particular register. - SmallBitVector Bits; - - /// MaxComplexity - This represents the greatest complexity (see the comments - /// on Formula::getComplexity) seen with a particular register. - uint32_t MaxComplexity; - - /// Index - This holds an arbitrary value used as a last-resort tie breaker - /// to ensure deterministic behavior. - unsigned Index; - - RegSortData() {} - - void print(raw_ostream &OS) const; - void dump() const; -}; +STATISTIC(NumReduced , "Number of IV uses strength reduced"); +STATISTIC(NumInserted, "Number of PHIs inserted"); +STATISTIC(NumVariable, "Number of PHIs with variable strides"); +STATISTIC(NumEliminated, "Number of strides eliminated"); +STATISTIC(NumShadow, "Number of Shadow IVs optimized"); +STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses"); +STATISTIC(NumLoopCond, "Number of loop terminating conds optimized"); +STATISTIC(NumCountZero, "Number of count iv optimized to count toward zero"); -} - -void RegSortData::print(raw_ostream &OS) const { - OS << "[NumUses=" << Bits.count() - << ", MaxComplexity="; - OS.write_hex(MaxComplexity); - OS << ", Index=" << Index << ']'; -} - -void RegSortData::dump() const { - print(errs()); errs() << '\n'; -} +static cl::opt<bool> EnableFullLSRMode("enable-full-lsr", + cl::init(false), + cl::Hidden); namespace { -/// RegCount - This is a helper class to sort a given set of registers -/// according to associated RegSortData values. -class RegCount { -public: - const SCEV *Reg; - RegSortData Sort; - - RegCount(const SCEV *R, const RegSortData &RSD) - : Reg(R), Sort(RSD) {} - - // Sort by count. Returning true means the register is preferred. - bool operator<(const RegCount &Other) const { - // Sort by the number of unique uses of this register. - unsigned A = Sort.Bits.count(); - unsigned B = Other.Sort.Bits.count(); - if (A != B) return A > B; - - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { - const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg); - // AddRecs have higher priority than other things. - if (!BR) return true; - - // Prefer affine values. 
- if (AR->isAffine() != BR->isAffine()) - return AR->isAffine(); - - const Loop *AL = AR->getLoop(); - const Loop *BL = BR->getLoop(); - if (AL != BL) { - unsigned ADepth = AL->getLoopDepth(); - unsigned BDepth = BL->getLoopDepth(); - // Prefer a less deeply nested addrec. - if (ADepth != BDepth) - return ADepth < BDepth; - - // Different loops at the same depth; do something arbitrary. - BasicBlock *AH = AL->getHeader(); - BasicBlock *BH = BL->getHeader(); - for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I) - if (&*I == BH) return true; - return false; - } + struct BasedUser; - // Sort addrecs by stride. - const SCEV *AStep = AR->getOperand(1); - const SCEV *BStep = BR->getOperand(1); - if (AStep != BStep) { - if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) { - const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep); - if (!BC) return true; - // Arbitrarily prefer wider registers. - if (AC->getValue()->getValue().getBitWidth() != - BC->getValue()->getValue().getBitWidth()) - return AC->getValue()->getValue().getBitWidth() > - BC->getValue()->getValue().getBitWidth(); - // Ignore the sign bit, assuming that striding by a negative value - // is just as easy as by a positive value. - // Prefer the addrec with the lesser absolute value stride, as it - // will allow uses to have simpler addressing modes. - return AC->getValue()->getValue().abs() - .ult(BC->getValue()->getValue().abs()); - } - } + /// IVInfo - This structure keeps track of one IV expression inserted during + /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as + /// well as the PHI node and increment value created for rewrite. + struct IVExpr { + const SCEV *Stride; + const SCEV *Base; + PHINode *PHI; - // Then sort by the register which will permit the simplest uses. - // This is a heuristic; currently we only track the most complex use as a - // representative. - if (Sort.MaxComplexity != Other.Sort.MaxComplexity) - return Sort.MaxComplexity < Other.Sort.MaxComplexity; - - // Then sort them by their start values. - const SCEV *AStart = AR->getStart(); - const SCEV *BStart = BR->getStart(); - if (AStart != BStart) { - if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) { - const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart); - if (!BC) return true; - // Arbitrarily prefer wider registers. - if (AC->getValue()->getValue().getBitWidth() != - BC->getValue()->getValue().getBitWidth()) - return AC->getValue()->getValue().getBitWidth() > - BC->getValue()->getValue().getBitWidth(); - // Prefer positive over negative if the absolute values are the same. - if (AC->getValue()->getValue().abs() == - BC->getValue()->getValue().abs()) - return AC->getValue()->getValue().isStrictlyPositive(); - // Prefer the addrec with the lesser absolute value start. - return AC->getValue()->getValue().abs() - .ult(BC->getValue()->getValue().abs()); - } - } - } else { - // AddRecs have higher priority than other things. - if (isa<SCEVAddRecExpr>(Other.Reg)) return false; - // Sort by the register which will permit the simplest uses. - // This is a heuristic; currently we only track the most complex use as a - // representative. 
- if (Sort.MaxComplexity != Other.Sort.MaxComplexity) - return Sort.MaxComplexity < Other.Sort.MaxComplexity; - } + IVExpr(const SCEV *const stride, const SCEV *const base, PHINode *phi) + : Stride(stride), Base(base), PHI(phi) {} + }; + /// IVsOfOneStride - This structure keeps track of all IV expression inserted + /// during StrengthReduceStridedIVUsers for a particular stride of the IV. + struct IVsOfOneStride { + std::vector<IVExpr> IVs; - // Tie-breaker: the arbitrary index, to ensure a reliable ordering. - return Sort.Index < Other.Sort.Index; - } + void addIV(const SCEV *const Stride, const SCEV *const Base, PHINode *PHI) { + IVs.push_back(IVExpr(Stride, Base, PHI)); + } + }; - void print(raw_ostream &OS) const; - void dump() const; -}; + class LoopStrengthReduce : public LoopPass { + IVUsers *IU; + ScalarEvolution *SE; + bool Changed; + + /// IVsByStride - Keep track of all IVs that have been inserted for a + /// particular stride. + std::map<const SCEV *, IVsOfOneStride> IVsByStride; + + /// DeadInsts - Keep track of instructions we may have made dead, so that + /// we can remove them after we are done working. + SmallVector<WeakVH, 16> DeadInsts; + + /// TLI - Keep a pointer of a TargetLowering to consult for determining + /// transformation profitability. + const TargetLowering *TLI; + + public: + static char ID; // Pass ID, replacement for typeid + explicit LoopStrengthReduce(const TargetLowering *tli = NULL) : + LoopPass(&ID), TLI(tli) {} + + bool runOnLoop(Loop *L, LPPassManager &LPM); + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + // We split critical edges, so we change the CFG. However, we do update + // many analyses if they are around. + AU.addPreservedID(LoopSimplifyID); + AU.addPreserved("loops"); + AU.addPreserved("domfrontier"); + AU.addPreserved("domtree"); + + AU.addRequiredID(LoopSimplifyID); + AU.addRequired<ScalarEvolution>(); + AU.addPreserved<ScalarEvolution>(); + AU.addRequired<IVUsers>(); + AU.addPreserved<IVUsers>(); + } + private: + void OptimizeIndvars(Loop *L); + + /// OptimizeLoopTermCond - Change loop terminating condition to use the + /// postinc iv when possible. + void OptimizeLoopTermCond(Loop *L); + + /// OptimizeShadowIV - If IV is used in a int-to-float cast + /// inside the loop then try to eliminate the cast opeation. + void OptimizeShadowIV(Loop *L); + + /// OptimizeMax - Rewrite the loop's terminating condition + /// if it uses a max computation. + ICmpInst *OptimizeMax(Loop *L, ICmpInst *Cond, + IVStrideUse* &CondUse); + + /// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for + /// deciding when to exit the loop is used only for that purpose, try to + /// rearrange things so it counts down to a test against zero. + bool OptimizeLoopCountIV(Loop *L); + bool OptimizeLoopCountIVOfStride(const SCEV* &Stride, + IVStrideUse* &CondUse, Loop *L); + + /// StrengthReduceIVUsersOfStride - Strength reduce all of the users of a + /// single stride of IV. All of the users may have different starting + /// values, and this may not be the only stride. 
+ void StrengthReduceIVUsersOfStride(const SCEV *Stride, + IVUsersOfOneStride &Uses, + Loop *L); + void StrengthReduceIVUsers(Loop *L); + + ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond, + IVStrideUse* &CondUse, + const SCEV* &CondStride, + bool PostPass = false); + + bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, + const SCEV* &CondStride); + bool RequiresTypeConversion(const Type *Ty, const Type *NewTy); + const SCEV *CheckForIVReuse(bool, bool, bool, const SCEV *, + IVExpr&, const Type*, + const std::vector<BasedUser>& UsersToProcess); + bool ValidScale(bool, int64_t, + const std::vector<BasedUser>& UsersToProcess); + bool ValidOffset(bool, int64_t, int64_t, + const std::vector<BasedUser>& UsersToProcess); + const SCEV *CollectIVUsers(const SCEV *Stride, + IVUsersOfOneStride &Uses, + Loop *L, + bool &AllUsesAreAddresses, + bool &AllUsesAreOutsideLoop, + std::vector<BasedUser> &UsersToProcess); + bool StrideMightBeShared(const SCEV *Stride, Loop *L, bool CheckPreInc); + bool ShouldUseFullStrengthReductionMode( + const std::vector<BasedUser> &UsersToProcess, + const Loop *L, + bool AllUsesAreAddresses, + const SCEV *Stride); + void PrepareToStrengthReduceFully( + std::vector<BasedUser> &UsersToProcess, + const SCEV *Stride, + const SCEV *CommonExprs, + const Loop *L, + SCEVExpander &PreheaderRewriter); + void PrepareToStrengthReduceFromSmallerStride( + std::vector<BasedUser> &UsersToProcess, + Value *CommonBaseV, + const IVExpr &ReuseIV, + Instruction *PreInsertPt); + void PrepareToStrengthReduceWithNewPhi( + std::vector<BasedUser> &UsersToProcess, + const SCEV *Stride, + const SCEV *CommonExprs, + Value *CommonBaseV, + Instruction *IVIncInsertPt, + const Loop *L, + SCEVExpander &PreheaderRewriter); + + void DeleteTriviallyDeadInstructions(); + }; } -void RegCount::print(raw_ostream &OS) const { - OS << *Reg << ':'; - Sort.print(OS); -} +char LoopStrengthReduce::ID = 0; +static RegisterPass<LoopStrengthReduce> +X("loop-reduce", "Loop Strength Reduction"); -void RegCount::dump() const { - print(errs()); errs() << '\n'; +Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { + return new LoopStrengthReduce(TLI); } -namespace { - -/// Formula - This class holds information that describes a formula for -/// satisfying a use. It may include broken-out immediates and scaled registers. -struct Formula { - /// AM - This is used to represent complex addressing, as well as other kinds - /// of interesting uses. - TargetLowering::AddrMode AM; - - /// BaseRegs - The list of "base" registers for this use. When this is - /// non-empty, AM.HasBaseReg should be set to true. - SmallVector<const SCEV *, 2> BaseRegs; - - /// ScaledReg - The 'scaled' register for this use. This should be non-null - /// when AM.Scale is not zero. - const SCEV *ScaledReg; - - Formula() : ScaledReg(0) {} - - unsigned getNumRegs() const; - uint32_t getComplexity() const; - const Type *getType() const; +/// DeleteTriviallyDeadInstructions - If any of the instructions is the +/// specified set are trivially dead, delete them and see if this makes any of +/// their operands subsequently dead. +void LoopStrengthReduce::DeleteTriviallyDeadInstructions() { + while (!DeadInsts.empty()) { + Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); - void InitialMatch(const SCEV *S, Loop *L, - ScalarEvolution &SE, DominatorTree &DT); + if (I == 0 || !isInstructionTriviallyDead(I)) + continue; - /// referencesReg - Test if this formula references the given register. 
- bool referencesReg(const SCEV *S) const { - return S == ScaledReg || - std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); - } + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) + if (Instruction *U = dyn_cast<Instruction>(*OI)) { + *OI = 0; + if (U->use_empty()) + DeadInsts.push_back(U); + } - bool operator==(const Formula &Other) const { - return BaseRegs == Other.BaseRegs && - ScaledReg == Other.ScaledReg && - AM.Scale == Other.AM.Scale && - AM.BaseOffs == Other.AM.BaseOffs && - AM.BaseGV == Other.AM.BaseGV; + I->eraseFromParent(); + Changed = true; } +} - // This sorts at least partially based on host pointer values which are - // not deterministic, so it is only usable for uniqification. - bool operator<(const Formula &Other) const { - if (BaseRegs != Other.BaseRegs) - return BaseRegs < Other.BaseRegs; - if (ScaledReg != Other.ScaledReg) - return ScaledReg < Other.ScaledReg; - if (AM.Scale != Other.AM.Scale) - return AM.Scale < Other.AM.Scale; - if (AM.BaseOffs != Other.AM.BaseOffs) - return AM.BaseOffs < Other.AM.BaseOffs; - if (AM.BaseGV != Other.AM.BaseGV) - return AM.BaseGV < Other.AM.BaseGV; - return false; +/// isAddressUse - Returns true if the specified instruction is using the +/// specified value as an address. +static bool isAddressUse(Instruction *Inst, Value *OperandVal) { + bool isAddress = isa<LoadInst>(Inst); + if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { + if (SI->getOperand(1) == OperandVal) + isAddress = true; + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::prefetch: + case Intrinsic::x86_sse2_loadu_dq: + case Intrinsic::x86_sse2_loadu_pd: + case Intrinsic::x86_sse_loadu_ps: + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + if (II->getOperand(1) == OperandVal) + isAddress = true; + break; + } } - - void print(raw_ostream &OS) const; - void dump() const; -}; - + return isAddress; } -/// getNumRegs - Return the total number of register operands used by this -/// formula. This does not include register uses implied by non-constant -/// addrec strides. -unsigned Formula::getNumRegs() const { - return !!ScaledReg + BaseRegs.size(); +/// getAccessType - Return the type of the memory being accessed. +static const Type *getAccessType(const Instruction *Inst) { + const Type *AccessTy = Inst->getType(); + if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) + AccessTy = SI->getOperand(0)->getType(); + else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { + // Addressing modes can also be folded into prefetches and a variety + // of intrinsics. + switch (II->getIntrinsicID()) { + default: break; + case Intrinsic::x86_sse_storeu_ps: + case Intrinsic::x86_sse2_storeu_pd: + case Intrinsic::x86_sse2_storeu_dq: + case Intrinsic::x86_sse2_storel_dq: + AccessTy = II->getOperand(1)->getType(); + break; + } + } + return AccessTy; } -/// getComplexity - Return an oversimplified value indicating the complexity -/// of this formula. This is used as a tie-breaker in choosing register -/// preferences. -uint32_t Formula::getComplexity() const { - // Encode the information in a uint32_t so that comparing with operator< - // will be interesting. - return - // Most significant, the number of registers. 
This saturates because we - // need the bits, and because beyond a few hundred it doesn't really matter. - (std::min(getNumRegs(), (1u<<15)-1) << 17) | - // Having multiple base regs is worse than having a base reg and a scale. - ((BaseRegs.size() > 1) << 16) | - // Scale absolute value. - ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) | - // Scale sign, which is less significant than the absolute value. - ((AM.Scale < 0) << 8) | - // Offset absolute value. - ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) | - // If a GV is present, treat it like a maximal offset. - ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) | - // Offset sign, which is less significant than the absolute offset. - ((AM.BaseOffs < 0) << 0); +namespace { + /// BasedUser - For a particular base value, keep information about how we've + /// partitioned the expression so far. + struct BasedUser { + /// Base - The Base value for the PHI node that needs to be inserted for + /// this use. As the use is processed, information gets moved from this + /// field to the Imm field (below). BasedUser values are sorted by this + /// field. + const SCEV *Base; + + /// Inst - The instruction using the induction variable. + Instruction *Inst; + + /// OperandValToReplace - The operand value of Inst to replace with the + /// EmittedBase. + Value *OperandValToReplace; + + /// Imm - The immediate value that should be added to the base immediately + /// before Inst, because it will be folded into the imm field of the + /// instruction. This is also sometimes used for loop-variant values that + /// must be added inside the loop. + const SCEV *Imm; + + /// Phi - The induction variable that performs the striding that + /// should be used for this user. + PHINode *Phi; + + // isUseOfPostIncrementedValue - True if this should use the + // post-incremented version of this IV, not the preincremented version. + // This can only be set in special cases, such as the terminating setcc + // instruction for a loop and uses outside the loop that are dominated by + // the loop. + bool isUseOfPostIncrementedValue; + + BasedUser(IVStrideUse &IVSU, ScalarEvolution *se) + : Base(IVSU.getOffset()), Inst(IVSU.getUser()), + OperandValToReplace(IVSU.getOperandValToReplace()), + Imm(se->getIntegerSCEV(0, Base->getType())), + isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {} + + // Once we rewrite the code to insert the new IVs we want, update the + // operands of Inst to use the new expression 'NewBase', with 'Imm' added + // to it. + void RewriteInstructionToUseNewBase(const SCEV *NewBase, + Instruction *InsertPt, + SCEVExpander &Rewriter, Loop *L, Pass *P, + SmallVectorImpl<WeakVH> &DeadInsts, + ScalarEvolution *SE); + + Value *InsertCodeForBaseAtPosition(const SCEV *NewBase, + const Type *Ty, + SCEVExpander &Rewriter, + Instruction *IP, + ScalarEvolution *SE); + void dump() const; + }; } -/// getType - Return the type of this formula, if it has one, or null -/// otherwise. This type is meaningless except for the bit size. -const Type *Formula::getType() const { - return !BaseRegs.empty() ? BaseRegs.front()->getType() : - ScaledReg ? ScaledReg->getType() : - AM.BaseGV ? 
AM.BaseGV->getType() : - 0; +void BasedUser::dump() const { + dbgs() << " Base=" << *Base; + dbgs() << " Imm=" << *Imm; + dbgs() << " Inst: " << *Inst; } -namespace { +Value *BasedUser::InsertCodeForBaseAtPosition(const SCEV *NewBase, + const Type *Ty, + SCEVExpander &Rewriter, + Instruction *IP, + ScalarEvolution *SE) { + Value *Base = Rewriter.expandCodeFor(NewBase, 0, IP); -/// ComplexitySorter - A predicate which orders Formulae by the number of -/// registers they contain. -struct ComplexitySorter { - bool operator()(const Formula &LHS, const Formula &RHS) const { - unsigned L = LHS.getNumRegs(); - unsigned R = RHS.getNumRegs(); - if (L != R) return L < R; + // Wrap the base in a SCEVUnknown so that ScalarEvolution doesn't try to + // re-analyze it. + const SCEV *NewValSCEV = SE->getUnknown(Base); - return LHS.getComplexity() < RHS.getComplexity(); - } -}; + // Always emit the immediate into the same block as the user. + NewValSCEV = SE->getAddExpr(NewValSCEV, Imm); + return Rewriter.expandCodeFor(NewValSCEV, Ty, IP); } -/// DoInitialMatch - Recurrsion helper for InitialMatch. -static void DoInitialMatch(const SCEV *S, Loop *L, - SmallVectorImpl<const SCEV *> &Good, - SmallVectorImpl<const SCEV *> &Bad, - ScalarEvolution &SE, DominatorTree &DT) { - // Collect expressions which properly dominate the loop header. - if (S->properlyDominates(L->getHeader(), &DT)) { - Good.push_back(S); - return; - } - - // Look at add operands. - if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { - for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); - I != E; ++I) - DoInitialMatch(*I, L, Good, Bad, SE, DT); - return; - } - // Look at addrec operands. - if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { - if (!AR->getStart()->isZero()) { - DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); - DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), - AR->getStepRecurrence(SE), - AR->getLoop()), - L, Good, Bad, SE, DT); - return; +// Once we rewrite the code to insert the new IVs we want, update the +// operands of Inst to use the new expression 'NewBase', with 'Imm' added +// to it. NewBasePt is the last instruction which contributes to the +// value of NewBase in the case that it's a diffferent instruction from +// the PHI that NewBase is computed from, or null otherwise. +// +void BasedUser::RewriteInstructionToUseNewBase(const SCEV *NewBase, + Instruction *NewBasePt, + SCEVExpander &Rewriter, Loop *L, Pass *P, + SmallVectorImpl<WeakVH> &DeadInsts, + ScalarEvolution *SE) { + if (!isa<PHINode>(Inst)) { + // By default, insert code at the user instruction. + BasicBlock::iterator InsertPt = Inst; + + // However, if the Operand is itself an instruction, the (potentially + // complex) inserted code may be shared by many users. Because of this, we + // want to emit code for the computation of the operand right before its old + // computation. This is usually safe, because we obviously used to use the + // computation when it was computed in its current block. However, in some + // cases (e.g. use of a post-incremented induction variable) the NewBase + // value will be pinned to live somewhere after the original computation. + // In this case, we have to back off. + // + // If this is a use outside the loop (which means after, since it is based + // on a loop indvar) we use the post-incremented value, so that we don't + // artificially make the preinc value live out the bottom of the loop. 
+ if (!isUseOfPostIncrementedValue && L->contains(Inst)) { + if (NewBasePt && isa<PHINode>(OperandValToReplace)) { + InsertPt = NewBasePt; + ++InsertPt; + } else if (Instruction *OpInst + = dyn_cast<Instruction>(OperandValToReplace)) { + InsertPt = OpInst; + while (isa<PHINode>(InsertPt)) ++InsertPt; + } } + Value *NewVal = InsertCodeForBaseAtPosition(NewBase, + OperandValToReplace->getType(), + Rewriter, InsertPt, SE); + // Replace the use of the operand Value with the new Phi we just created. + Inst->replaceUsesOfWith(OperandValToReplace, NewVal); + + DEBUG(dbgs() << " Replacing with "); + DEBUG(WriteAsOperand(dbgs(), NewVal, /*PrintType=*/false)); + DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " + << *Imm << "\n"); + return; } - // Handle a multiplication by -1 (negation) if it didn't fold. - if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) - if (Mul->getOperand(0)->isAllOnesValue()) { - SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end()); - const SCEV *NewMul = SE.getMulExpr(Ops); - - SmallVector<const SCEV *, 4> MyGood; - SmallVector<const SCEV *, 4> MyBad; - DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT); - const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( - SE.getEffectiveSCEVType(NewMul->getType()))); - for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(), - E = MyGood.end(); I != E; ++I) - Good.push_back(SE.getMulExpr(NegOne, *I)); - for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(), - E = MyBad.end(); I != E; ++I) - Bad.push_back(SE.getMulExpr(NegOne, *I)); - return; - } - - // Ok, we can't do anything interesting. Just stuff the whole thing into a - // register and hope for the best. - Bad.push_back(S); -} + // PHI nodes are more complex. We have to insert one copy of the NewBase+Imm + // expression into each operand block that uses it. Note that PHI nodes can + // have multiple entries for the same predecessor. We use a map to make sure + // that a PHI node only has a single Value* for each predecessor (which also + // prevents us from inserting duplicate code in some blocks). + DenseMap<BasicBlock*, Value*> InsertedCode; + PHINode *PN = cast<PHINode>(Inst); + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { + if (PN->getIncomingValue(i) == OperandValToReplace) { + // If the original expression is outside the loop, put the replacement + // code in the same place as the original expression, + // which need not be an immediate predecessor of this PHI. This way we + // need only one copy of it even if it is referenced multiple times in + // the PHI. We don't do this when the original expression is inside the + // loop because multiple copies sometimes do useful sinking of code in + // that case(?). + Instruction *OldLoc = dyn_cast<Instruction>(OperandValToReplace); + BasicBlock *PHIPred = PN->getIncomingBlock(i); + if (L->contains(OldLoc)) { + // If this is a critical edge, split the edge so that we do not insert + // the code on all predecessor/successor paths. We do this unless this + // is the canonical backedge for this loop, as this can make some + // inserted code be in an illegal position. + if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 && + !isa<IndirectBrInst>(PHIPred->getTerminator()) && + (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) { + + // First step, split the critical edge. + BasicBlock *NewBB = SplitCriticalEdge(PHIPred, PN->getParent(), + P, false); + + // Next step: move the basic block. 
In particular, if the PHI node + // is outside of the loop, and PredTI is in the loop, we want to + // move the block to be immediately before the PHI block, not + // immediately after PredTI. + if (L->contains(PHIPred) && !L->contains(PN)) + NewBB->moveBefore(PN->getParent()); -/// InitialMatch - Incorporate loop-variant parts of S into this Formula, -/// attempting to keep all loop-invariant and loop-computable values in a -/// single base register. -void Formula::InitialMatch(const SCEV *S, Loop *L, - ScalarEvolution &SE, DominatorTree &DT) { - SmallVector<const SCEV *, 4> Good; - SmallVector<const SCEV *, 4> Bad; - DoInitialMatch(S, L, Good, Bad, SE, DT); - if (!Good.empty()) { - BaseRegs.push_back(SE.getAddExpr(Good)); - AM.HasBaseReg = true; - } - if (!Bad.empty()) { - BaseRegs.push_back(SE.getAddExpr(Bad)); - AM.HasBaseReg = true; - } -} + // Splitting the edge can reduce the number of PHI entries we have. + e = PN->getNumIncomingValues(); + PHIPred = NewBB; + i = PN->getBasicBlockIndex(PHIPred); + } + } + Value *&Code = InsertedCode[PHIPred]; + if (!Code) { + // Insert the code into the end of the predecessor block. + Instruction *InsertPt = (L->contains(OldLoc)) ? + PHIPred->getTerminator() : + OldLoc->getParent()->getTerminator(); + Code = InsertCodeForBaseAtPosition(NewBase, PN->getType(), + Rewriter, InsertPt, SE); + + DEBUG(dbgs() << " Changing PHI use to "); + DEBUG(WriteAsOperand(dbgs(), Code, /*PrintType=*/false)); + DEBUG(dbgs() << ", which has value " << *NewBase << " plus IMM " + << *Imm << "\n"); + } -void Formula::print(raw_ostream &OS) const { - bool First = true; - if (AM.BaseGV) { - if (!First) OS << " + "; else First = false; - WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false); - } - if (AM.BaseOffs != 0) { - if (!First) OS << " + "; else First = false; - OS << AM.BaseOffs; - } - for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), - E = BaseRegs.end(); I != E; ++I) { - if (!First) OS << " + "; else First = false; - OS << "reg("; - OS << **I; - OS << ")"; - } - if (AM.Scale != 0) { - if (!First) OS << " + "; else First = false; - OS << AM.Scale << "*reg("; - if (ScaledReg) - OS << *ScaledReg; - else - OS << "<unknown>"; - OS << ")"; + // Replace the use of the operand Value with the new Phi we just created. + PN->setIncomingValue(i, Code); + Rewriter.clear(); + } } -} -void Formula::dump() const { - print(errs()); errs() << '\n'; + // PHI node might have become a constant value after SplitCriticalEdge. + DeadInsts.push_back(Inst); } -/// getSDiv - Return an expression for LHS /s RHS, if it can be determined, -/// or null otherwise. If IgnoreSignificantBits is true, expressions like -/// (X * Y) /s Y are simplified to Y, ignoring that the multiplication may -/// overflow, which is useful when the result will be used in a context where -/// the most significant bits are ignored. -static const SCEV *getSDiv(c |
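
The final removed helper, getSDiv, is documented above as returning an expression for LHS /s RHS when the quotient can be determined and null otherwise, with IgnoreSignificantBits additionally allowing cancellations such as (X * Y) /s Y even though the multiplication might overflow. Below is a rough standalone model of that contract on plain 64-bit integers; exactSDiv and its exactness check are illustrative assumptions, not the SCEV implementation.

    #include <cstdint>
    #include <optional>

    // Rough model of the getSDiv contract (illustrative only; the real
    // helper walks SCEV expression trees): return LHS /s RHS when the
    // quotient is known exactly, and nothing when it cannot be determined.
    std::optional<int64_t> exactSDiv(int64_t LHS, int64_t RHS) {
      // Reject division by zero and the one overflowing case, and treat
      // a non-exact quotient as "cannot be determined".
      if (RHS == 0 || (LHS == INT64_MIN && RHS == -1) || LHS % RHS != 0)
        return std::nullopt;
      return LHS / RHS;
    }

On SCEVs, the IgnoreSignificantBits form instead matches structure, cancelling a factor of Y out of (X * Y) to produce X directly rather than performing any arithmetic.
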