Diffstat (limited to 'lib/CodeGen')
61 files changed, 5002 insertions, 1898 deletions
diff --git a/lib/CodeGen/AsmPrinter/ARMException.cpp b/lib/CodeGen/AsmPrinter/ARMException.cpp index b60fda86a6..bf5d8c4880 100644 --- a/lib/CodeGen/AsmPrinter/ARMException.cpp +++ b/lib/CodeGen/AsmPrinter/ARMException.cpp @@ -44,9 +44,7 @@ EnableARMEHABIDescriptors("arm-enable-ehabi-descriptors", cl::Hidden, ARMException::ARMException(AsmPrinter *A) - : DwarfException(A), - shouldEmitTable(false), shouldEmitMoves(false), shouldEmitTableModule(false) - {} + : DwarfException(A) {} ARMException::~ARMException() {} diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 84277604ef..64a68d3d9c 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -218,16 +218,16 @@ void AsmPrinter::EmitLinkage(unsigned Linkage, MCSymbol *GVSym) const { case GlobalValue::CommonLinkage: case GlobalValue::LinkOnceAnyLinkage: case GlobalValue::LinkOnceODRLinkage: + case GlobalValue::LinkOnceODRAutoHideLinkage: case GlobalValue::WeakAnyLinkage: case GlobalValue::WeakODRLinkage: case GlobalValue::LinkerPrivateWeakLinkage: - case GlobalValue::LinkerPrivateWeakDefAutoLinkage: if (MAI->getWeakDefDirective() != 0) { // .globl _foo OutStreamer.EmitSymbolAttribute(GVSym, MCSA_Global); if ((GlobalValue::LinkageTypes)Linkage != - GlobalValue::LinkerPrivateWeakDefAutoLinkage) + GlobalValue::LinkOnceODRAutoHideLinkage) // .weak_definition _foo OutStreamer.EmitSymbolAttribute(GVSym, MCSA_WeakDefinition); else diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 9d17767f3b..d30e5bbd8e 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// // -// This file contains support for writing dwarf compile unit. +// This file contains support for constructing a dwarf compile unit. // //===----------------------------------------------------------------------===// @@ -419,27 +419,12 @@ void CompileUnit::addBlockByrefAddress(DbgVariable *&DV, DIE *Die, // Decode the original location, and use that as the start of the byref // variable's location. - const TargetRegisterInfo *RI = Asm->TM.getRegisterInfo(); - unsigned Reg = RI->getDwarfRegNum(Location.getReg(), false); DIEBlock *Block = new (DIEValueAllocator) DIEBlock(); - if (Location.isReg()) { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_reg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_regx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - } else { - if (Reg < 32) - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_breg0 + Reg); - else { - addUInt(Block, 0, dwarf::DW_FORM_data1, dwarf::DW_OP_bregx); - addUInt(Block, 0, dwarf::DW_FORM_udata, Reg); - } - - addUInt(Block, 0, dwarf::DW_FORM_sdata, Location.getOffset()); - } + if (Location.isReg()) + addRegisterOp(Block, Location.getReg()); + else + addRegisterOffset(Block, Location.getReg(), Location.getOffset()); // If we started with a pointer to the __Block_byref... struct, then // the first thing we need to do is dereference the pointer (DW_OP_deref). 
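For illustration (register numbers and offset invented, standard DWARF opcode values): the new addRegisterOp/addRegisterOffset helpers are expected to produce the same location expressions as the removed open-coded block, roughly:

  reg 5, no offset:   DW_OP_reg5                            (0x50 + 5 = 0x55, a single byte)
  reg 40, no offset:  DW_OP_regx, ULEB128 40                (0x90 0x28)
  reg 5, offset -8:   DW_OP_breg5, SLEB128 -8               (0x75 0x78)
  reg 40, offset -8:  DW_OP_bregx, ULEB128 40, SLEB128 -8   (0x92 0x28 0x78)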
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index fa7fb1f8d0..abac337042 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -1410,7 +1410,7 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) { MF->getFunction()->getContext()); recordSourceLine(FnStartDL.getLine(), FnStartDL.getCol(), FnStartDL.getScope(MF->getFunction()->getContext()), - 0); + DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0); } } @@ -1450,6 +1450,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) { DIVariable DV(Variables.getElement(i)); if (!DV || !DV.Verify() || !ProcessedVars.insert(DV)) continue; + // Check that DbgVariable for DV wasn't created earlier, when + // findAbstractVariable() was called for inlined instance of DV. + LLVMContext &Ctx = DV->getContext(); + DIVariable CleanDV = cleanseInlinedVariable(DV, Ctx); + if (AbstractVariables.lookup(CleanDV)) + continue; if (LexicalScope *Scope = LScopes.findAbstractScope(DV.getContext())) addScopeVariable(Scope, new DbgVariable(DV, NULL)); } diff --git a/lib/CodeGen/AsmPrinter/DwarfException.h b/lib/CodeGen/AsmPrinter/DwarfException.h index b5f86ab1b9..75f6056c44 100644 --- a/lib/CodeGen/AsmPrinter/DwarfException.h +++ b/lib/CodeGen/AsmPrinter/DwarfException.h @@ -175,17 +175,6 @@ public: }; class ARMException : public DwarfException { - /// shouldEmitTable - Per-function flag to indicate if EH tables should - /// be emitted. - bool shouldEmitTable; - - /// shouldEmitMoves - Per-function flag to indicate if frame moves info - /// should be emitted. - bool shouldEmitMoves; - - /// shouldEmitTableModule - Per-module flag to indicate if EH tables - /// should be emitted. - bool shouldEmitTableModule; public: //===--------------------------------------------------------------------===// // Main entry points. diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index a2ce7a004d..2e189ad7e7 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -11,6 +11,7 @@ add_llvm_library(LLVMCodeGen DeadMachineInstructionElim.cpp DFAPacketizer.cpp DwarfEHPrepare.cpp + EarlyIfConversion.cpp EdgeBundles.cpp ExecutionDepsFix.cpp ExpandISelPseudos.cpp @@ -60,6 +61,7 @@ add_llvm_library(LLVMCodeGen MachineSSAUpdater.cpp MachineScheduler.cpp MachineSink.cpp + MachineTraceMetrics.cpp MachineVerifier.cpp OcamlGC.cpp OptimizePHIs.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 7b015d850d..fb2c2e83f1 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -23,6 +23,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeCalculateSpillWeightsPass(Registry); initializeCodePlacementOptPass(Registry); initializeDeadMachineInstructionElimPass(Registry); + initializeEarlyIfConverterPass(Registry); initializeExpandPostRAPass(Registry); initializeExpandISelPseudosPass(Registry); initializeFinalizeMachineBundlesPass(Registry); diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp new file mode 100644 index 0000000000..f9347efdb0 --- /dev/null +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -0,0 +1,803 @@ +//===-- EarlyIfConversion.cpp - If-conversion on SSA form machine code ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Early if-conversion is for out-of-order CPUs that don't have a lot of +// predicable instructions. The goal is to eliminate conditional branches that +// may mispredict. +// +// Instructions from both sides of the branch are executed specutatively, and a +// cmov instruction selects the result. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "early-ifcvt" +#include "MachineTraceMetrics.h" +#include "llvm/Function.h" +#include "llvm/ADT/BitVector.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SparseSet.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +// Absolute maximum number of instructions allowed per speculated block. +// This bypasses all other heuristics, so it should be set fairly high. +static cl::opt<unsigned> +BlockInstrLimit("early-ifcvt-limit", cl::init(30), cl::Hidden, + cl::desc("Maximum number of instructions per speculated block.")); + +// Stress testing mode - disable heuristics. +static cl::opt<bool> Stress("stress-early-ifcvt", cl::Hidden, + cl::desc("Turn all knobs to 11")); + +STATISTIC(NumDiamondsSeen, "Number of diamonds"); +STATISTIC(NumDiamondsConv, "Number of diamonds converted"); +STATISTIC(NumTrianglesSeen, "Number of triangles"); +STATISTIC(NumTrianglesConv, "Number of triangles converted"); + +//===----------------------------------------------------------------------===// +// SSAIfConv +//===----------------------------------------------------------------------===// +// +// The SSAIfConv class performs if-conversion on SSA form machine code after +// determining if it is possible. The class contains no heuristics; external +// code should be used to determine when if-conversion is a good idea. +// +// SSAIfConv can convert both triangles and diamonds: +// +// Triangle: Head Diamond: Head +// | \ / \_ +// | \ / | +// | [TF]BB FBB TBB +// | / \ / +// | / \ / +// Tail Tail +// +// Instructions in the conditional blocks TBB and/or FBB are spliced into the +// Head block, and phis in the Tail block are converted to select instructions. +// +namespace { +class SSAIfConv { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + +public: + /// The block containing the conditional branch. + MachineBasicBlock *Head; + + /// The block containing phis after the if-then-else. + MachineBasicBlock *Tail; + + /// The 'true' conditional block as determined by AnalyzeBranch. + MachineBasicBlock *TBB; + + /// The 'false' conditional block as determined by AnalyzeBranch. + MachineBasicBlock *FBB; + + /// isTriangle - When there is no 'else' block, either TBB or FBB will be + /// equal to Tail. + bool isTriangle() const { return TBB == Tail || FBB == Tail; } + + /// Returns the Tail predecessor for the True side. 
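For illustration only, not from the patch (function name and arithmetic invented): the transformation SSAIfConv performs, seen at the source level. Both arms are computed unconditionally in the head block and the phi in the tail becomes a select, so the branch that could mispredict disappears:

int ifcvt_example(bool c, int a, int b) {
  int t = a + 1;     // former 'true' block, now executed speculatively
  int f = b - 1;     // former 'false' block, now executed speculatively
  return c ? t : f;  // the tail phi becomes a select (a cmov on suitable targets)
}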
+ MachineBasicBlock *getTPred() const { return TBB == Tail ? Head : TBB; } + + /// Returns the Tail predecessor for the False side. + MachineBasicBlock *getFPred() const { return FBB == Tail ? Head : FBB; } + + /// Information about each phi in the Tail block. + struct PHIInfo { + MachineInstr *PHI; + unsigned TReg, FReg; + // Latencies from Cond+Branch, TReg, and FReg to DstReg. + int CondCycles, TCycles, FCycles; + + PHIInfo(MachineInstr *phi) + : PHI(phi), TReg(0), FReg(0), CondCycles(0), TCycles(0), FCycles(0) {} + }; + + SmallVector<PHIInfo, 8> PHIs; + +private: + /// The branch condition determined by AnalyzeBranch. + SmallVector<MachineOperand, 4> Cond; + + /// Instructions in Head that define values used by the conditional blocks. + /// The hoisted instructions must be inserted after these instructions. + SmallPtrSet<MachineInstr*, 8> InsertAfter; + + /// Register units clobbered by the conditional blocks. + BitVector ClobberedRegUnits; + + // Scratch pad for findInsertionPoint. + SparseSet<unsigned> LiveRegUnits; + + /// Insertion point in Head for speculatively executed instructions form TBB + /// and FBB. + MachineBasicBlock::iterator InsertionPoint; + + /// Return true if all non-terminator instructions in MBB can be safely + /// speculated. + bool canSpeculateInstrs(MachineBasicBlock *MBB); + + /// Find a valid insertion point in Head. + bool findInsertionPoint(); + + /// Replace PHI instructions in Tail with selects. + void replacePHIInstrs(); + + /// Insert selects and rewrite PHI operands to use them. + void rewritePHIOperands(); + +public: + /// runOnMachineFunction - Initialize per-function data structures. + void runOnMachineFunction(MachineFunction &MF) { + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + MRI = &MF.getRegInfo(); + LiveRegUnits.clear(); + LiveRegUnits.setUniverse(TRI->getNumRegUnits()); + ClobberedRegUnits.clear(); + ClobberedRegUnits.resize(TRI->getNumRegUnits()); + } + + /// canConvertIf - If the sub-CFG headed by MBB can be if-converted, + /// initialize the internal state, and return true. + bool canConvertIf(MachineBasicBlock *MBB); + + /// convertIf - If-convert the last block passed to canConvertIf(), assuming + /// it is possible. Add any erased blocks to RemovedBlocks. + void convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks); +}; +} // end anonymous namespace + + +/// canSpeculateInstrs - Returns true if all the instructions in MBB can safely +/// be speculated. The terminators are not considered. +/// +/// If instructions use any values that are defined in the head basic block, +/// the defining instructions are added to InsertAfter. +/// +/// Any clobbered regunits are added to ClobberedRegUnits. +/// +bool SSAIfConv::canSpeculateInstrs(MachineBasicBlock *MBB) { + // Reject any live-in physregs. It's probably CPSR/EFLAGS, and very hard to + // get right. + if (!MBB->livein_empty()) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has live-ins.\n"); + return false; + } + + unsigned InstrCount = 0; + + // Check all instructions, except the terminators. It is assumed that + // terminators never have side effects or define any used register values. 
+ for (MachineBasicBlock::iterator I = MBB->begin(), + E = MBB->getFirstTerminator(); I != E; ++I) { + if (I->isDebugValue()) + continue; + + if (++InstrCount > BlockInstrLimit && !Stress) { + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " has more than " + << BlockInstrLimit << " instructions.\n"); + return false; + } + + // There shouldn't normally be any phis in a single-predecessor block. + if (I->isPHI()) { + DEBUG(dbgs() << "Can't hoist: " << *I); + return false; + } + + // Don't speculate loads. Note that it may be possible and desirable to + // speculate GOT or constant pool loads that are guaranteed not to trap, + // but we don't support that for now. + if (I->mayLoad()) { + DEBUG(dbgs() << "Won't speculate load: " << *I); + return false; + } + + // We never speculate stores, so an AA pointer isn't necessary. + bool DontMoveAcrossStore = true; + if (!I->isSafeToMove(TII, 0, DontMoveAcrossStore)) { + DEBUG(dbgs() << "Can't speculate: " << *I); + return false; + } + + // Check for any dependencies on Head instructions. + for (MIOperands MO(I); MO.isValid(); ++MO) { + if (MO->isRegMask()) { + DEBUG(dbgs() << "Won't speculate regmask: " << *I); + return false; + } + if (!MO->isReg()) + continue; + unsigned Reg = MO->getReg(); + + // Remember clobbered regunits. + if (MO->isDef() && TargetRegisterInfo::isPhysicalRegister(Reg)) + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + ClobberedRegUnits.set(*Units); + + if (!MO->readsReg() || !TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + MachineInstr *DefMI = MRI->getVRegDef(Reg); + if (!DefMI || DefMI->getParent() != Head) + continue; + if (InsertAfter.insert(DefMI)) + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " depends on " << *DefMI); + if (DefMI->isTerminator()) { + DEBUG(dbgs() << "Can't insert instructions below terminator.\n"); + return false; + } + } + } + return true; +} + + +/// Find an insertion point in Head for the speculated instructions. The +/// insertion point must be: +/// +/// 1. Before any terminators. +/// 2. After any instructions in InsertAfter. +/// 3. Not have any clobbered regunits live. +/// +/// This function sets InsertionPoint and returns true when successful, it +/// returns false if no valid insertion point could be found. +/// +bool SSAIfConv::findInsertionPoint() { + // Keep track of live regunits before the current position. + // Only track RegUnits that are also in ClobberedRegUnits. + LiveRegUnits.clear(); + SmallVector<unsigned, 8> Reads; + MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator(); + MachineBasicBlock::iterator I = Head->end(); + MachineBasicBlock::iterator B = Head->begin(); + while (I != B) { + --I; + // Some of the conditional code depends in I. + if (InsertAfter.count(I)) { + DEBUG(dbgs() << "Can't insert code after " << *I); + return false; + } + + // Update live regunits. + for (MIOperands MO(I); MO.isValid(); ++MO) { + // We're ignoring regmask operands. That is conservatively correct. + if (!MO->isReg()) + continue; + unsigned Reg = MO->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + // I clobbers Reg, so it isn't live before I. + if (MO->isDef()) + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) + LiveRegUnits.erase(*Units); + // Unless I reads Reg. + if (MO->readsReg()) + Reads.push_back(Reg); + } + // Anything read by I is live before I. 
+ while (!Reads.empty()) + for (MCRegUnitIterator Units(Reads.pop_back_val(), TRI); Units.isValid(); + ++Units) + if (ClobberedRegUnits.test(*Units)) + LiveRegUnits.insert(*Units); + + // We can't insert before a terminator. + if (I != FirstTerm && I->isTerminator()) + continue; + + // Some of the clobbered registers are live before I, not a valid insertion + // point. + if (!LiveRegUnits.empty()) { + DEBUG({ + dbgs() << "Would clobber"; + for (SparseSet<unsigned>::const_iterator + i = LiveRegUnits.begin(), e = LiveRegUnits.end(); i != e; ++i) + dbgs() << ' ' << PrintRegUnit(*i, TRI); + dbgs() << " live before " << *I; + }); + continue; + } + + // This is a valid insertion point. + InsertionPoint = I; + DEBUG(dbgs() << "Can insert before " << *I); + return true; + } + DEBUG(dbgs() << "No legal insertion point found.\n"); + return false; +} + + + +/// canConvertIf - analyze the sub-cfg rooted in MBB, and return true if it is +/// a potential candidate for if-conversion. Fill out the internal state. +/// +bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB) { + Head = MBB; + TBB = FBB = Tail = 0; + + if (Head->succ_size() != 2) + return false; + MachineBasicBlock *Succ0 = Head->succ_begin()[0]; + MachineBasicBlock *Succ1 = Head->succ_begin()[1]; + + // Canonicalize so Succ0 has MBB as its single predecessor. + if (Succ0->pred_size() != 1) + std::swap(Succ0, Succ1); + + if (Succ0->pred_size() != 1 || Succ0->succ_size() != 1) + return false; + + Tail = Succ0->succ_begin()[0]; + + // This is not a triangle. + if (Tail != Succ1) { + // Check for a diamond. We won't deal with any critical edges. + if (Succ1->pred_size() != 1 || Succ1->succ_size() != 1 || + Succ1->succ_begin()[0] != Tail) + return false; + DEBUG(dbgs() << "\nDiamond: BB#" << Head->getNumber() + << " -> BB#" << Succ0->getNumber() + << "/BB#" << Succ1->getNumber() + << " -> BB#" << Tail->getNumber() << '\n'); + + // Live-in physregs are tricky to get right when speculating code. + if (!Tail->livein_empty()) { + DEBUG(dbgs() << "Tail has live-ins.\n"); + return false; + } + } else { + DEBUG(dbgs() << "\nTriangle: BB#" << Head->getNumber() + << " -> BB#" << Succ0->getNumber() + << " -> BB#" << Tail->getNumber() << '\n'); + } + + // This is a triangle or a diamond. + // If Tail doesn't have any phis, there must be side effects. + if (Tail->empty() || !Tail->front().isPHI()) { + DEBUG(dbgs() << "No phis in tail.\n"); + return false; + } + + // The branch we're looking to eliminate must be analyzable. + Cond.clear(); + if (TII->AnalyzeBranch(*Head, TBB, FBB, Cond)) { + DEBUG(dbgs() << "Branch not analyzable.\n"); + return false; + } + + // This is weird, probably some sort of degenerate CFG. + if (!TBB) { + DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n"); + return false; + } + + // AnalyzeBranch doesn't set FBB on a fall-through branch. + // Make sure it is always set. + FBB = TBB == Succ0 ? Succ1 : Succ0; + + // Any phis in the tail block must be convertible to selects. + PHIs.clear(); + MachineBasicBlock *TPred = getTPred(); + MachineBasicBlock *FPred = getFPred(); + for (MachineBasicBlock::iterator I = Tail->begin(), E = Tail->end(); + I != E && I->isPHI(); ++I) { + PHIs.push_back(&*I); + PHIInfo &PI = PHIs.back(); + // Find PHI operands corresponding to TPred and FPred. 
+ for (unsigned i = 1; i != PI.PHI->getNumOperands(); i += 2) { + if (PI.PHI->getOperand(i+1).getMBB() == TPred) + PI.TReg = PI.PHI->getOperand(i).getReg(); + if (PI.PHI->getOperand(i+1).getMBB() == FPred) + PI.FReg = PI.PHI->getOperand(i).getReg(); + } + assert(TargetRegisterInfo::isVirtualRegister(PI.TReg) && "Bad PHI"); + assert(TargetRegisterInfo::isVirtualRegister(PI.FReg) && "Bad PHI"); + + // Get target information. + if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg, + PI.CondCycles, PI.TCycles, PI.FCycles)) { + DEBUG(dbgs() << "Can't convert: " << *PI.PHI); + return false; + } + } + + // Check that the conditional instructions can be speculated. + InsertAfter.clear(); + ClobberedRegUnits.reset(); + if (TBB != Tail && !canSpeculateInstrs(TBB)) + return false; + if (FBB != Tail && !canSpeculateInstrs(FBB)) + return false; + + // Try to find a valid insertion point for the speculated instructions in the + // head basic block. + if (!findInsertionPoint()) + return false; + + if (isTriangle()) + ++NumTrianglesSeen; + else + ++NumDiamondsSeen; + return true; +} + +/// replacePHIInstrs - Completely replace PHI instructions with selects. +/// This is possible when the only Tail predecessors are the if-converted +/// blocks. +void SSAIfConv::replacePHIInstrs() { + assert(Tail->pred_size() == 2 && "Cannot replace PHIs"); + MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator(); + assert(FirstTerm != Head->end() && "No terminators"); + DebugLoc HeadDL = FirstTerm->getDebugLoc(); + + // Convert all PHIs to select instructions inserted before FirstTerm. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + PHIInfo &PI = PHIs[i]; + DEBUG(dbgs() << "If-converting " << *PI.PHI); + assert(PI.PHI->getNumOperands() == 5 && "Unexpected PHI operands."); + unsigned DstReg = PI.PHI->getOperand(0).getReg(); + TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); + DEBUG(dbgs() << " --> " << *llvm::prior(FirstTerm)); + PI.PHI->eraseFromParent(); + PI.PHI = 0; + } +} + +/// rewritePHIOperands - When there are additional Tail predecessors, insert +/// select instructions in Head and rewrite PHI operands to use the selects. +/// Keep the PHI instructions in Tail to handle the other predecessors. +void SSAIfConv::rewritePHIOperands() { + MachineBasicBlock::iterator FirstTerm = Head->getFirstTerminator(); + assert(FirstTerm != Head->end() && "No terminators"); + DebugLoc HeadDL = FirstTerm->getDebugLoc(); + + // Convert all PHIs to select instructions inserted before FirstTerm. + for (unsigned i = 0, e = PHIs.size(); i != e; ++i) { + PHIInfo &PI = PHIs[i]; + DEBUG(dbgs() << "If-converting " << *PI.PHI); + unsigned PHIDst = PI.PHI->getOperand(0).getReg(); + unsigned DstReg = MRI->createVirtualRegister(MRI->getRegClass(PHIDst)); + TII->insertSelect(*Head, FirstTerm, HeadDL, DstReg, Cond, PI.TReg, PI.FReg); + DEBUG(dbgs() << " --> " << *llvm::prior(FirstTerm)); + + // Rewrite PHI operands TPred -> (DstReg, Head), remove FPred. + for (unsigned i = PI.PHI->getNumOperands(); i != 1; i -= 2) { + MachineBasicBlock *MBB = PI.PHI->getOperand(i-1).getMBB(); + if (MBB == getTPred()) { + PI.PHI->getOperand(i-1).setMBB(Head); + PI.PHI->getOperand(i-2).setReg(DstReg); + } else if (MBB == getFPred()) { + PI.PHI->RemoveOperand(i-1); + PI.PHI->RemoveOperand(i-2); + } + } + DEBUG(dbgs() << " --> " << *PI.PHI); + } +} + +/// convertIf - Execute the if conversion after canConvertIf has determined the +/// feasibility. +/// +/// Any basic blocks erased will be added to RemovedBlocks. 
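For illustration (virtual register and block numbers invented): when Tail has extra predecessors, rewritePHIOperands keeps the PHI but routes the TPred/FPred values through a new select in Head, roughly:

//   Before, in Tail (BB#1 = TPred, BB#2 = FPred, BB#3 = an unrelated pred):
//     %v9 = PHI %v5, <BB#1>, %v6, <BB#2>, %v7, <BB#3>
//   After: a select is inserted in Head before its terminator,
//     %v10 = SELECT Cond, %v5, %v6
//   and the PHI is rewritten to
//     %v9 = PHI %v10, <Head>, %v7, <BB#3>
// With only the two if-converted predecessors, replacePHIInstrs instead
// deletes the PHI and writes the select result directly to %v9.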
+/// +void SSAIfConv::convertIf(SmallVectorImpl<MachineBasicBlock*> &RemovedBlocks) { + assert(Head && Tail && TBB && FBB && "Call canConvertIf first."); + + // Update statistics. + if (isTriangle()) + ++NumTrianglesConv; + else + ++NumDiamondsConv; + + // Move all instructions into Head, except for the terminators. + if (TBB != Tail) + Head->splice(InsertionPoint, TBB, TBB->begin(), TBB->getFirstTerminator()); + if (FBB != Tail) + Head->splice(InsertionPoint, FBB, FBB->begin(), FBB->getFirstTerminator()); + + // Are there extra Tail predecessors? + bool ExtraPreds = Tail->pred_size() != 2; + if (ExtraPreds) + rewritePHIOperands(); + else + replacePHIInstrs(); + + // Fix up the CFG, temporarily leave Head without any successors. + Head->removeSuccessor(TBB); + Head->removeSuccessor(FBB); + if (TBB != Tail) + TBB->removeSuccessor(Tail); + if (FBB != Tail) + FBB->removeSuccessor(Tail); + + // Fix up Head's terminators. + // It should become a single branch or a fallthrough. + DebugLoc HeadDL = Head->getFirstTerminator()->getDebugLoc(); + TII->RemoveBranch(*Head); + + // Erase the now empty conditional blocks. It is likely that Head can fall + // through to Tail, and we can join the two blocks. + if (TBB != Tail) { + RemovedBlocks.push_back(TBB); + TBB->eraseFromParent(); + } + if (FBB != Tail) { + RemovedBlocks.push_back(FBB); + FBB->eraseFromParent(); + } + + assert(Head->succ_empty() && "Additional head successors?"); + if (!ExtraPreds && Head->isLayoutSuccessor(Tail)) { + // Splice Tail onto the end of Head. + DEBUG(dbgs() << "Joining tail BB#" << Tail->getNumber() + << " into head BB#" << Head->getNumber() << '\n'); + Head->splice(Head->end(), Tail, + Tail->begin(), Tail->end()); + Head->transferSuccessorsAndUpdatePHIs(Tail); + RemovedBlocks.push_back(Tail); + Tail->eraseFromParent(); + } else { + // We need a branch to Tail, let code placement work it out later. 
+ DEBUG(dbgs() << "Converting to unconditional branch.\n"); + SmallVector<MachineOperand, 0> EmptyCond; + TII->InsertBranch(*Head, Tail, 0, EmptyCond, HeadDL); + Head->addSuccessor(Tail); + } + DEBUG(dbgs() << *Head); +} + + +//===----------------------------------------------------------------------===// +// EarlyIfConverter Pass +//===----------------------------------------------------------------------===// + +namespace { +class EarlyIfConverter : public MachineFunctionPass { + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const MCSchedModel *SchedModel; + MachineRegisterInfo *MRI; + MachineDominatorTree *DomTree; + MachineLoopInfo *Loops; + MachineTraceMetrics *Traces; + MachineTraceMetrics::Ensemble *MinInstr; + SSAIfConv IfConv; + +public: + static char ID; + EarlyIfConverter() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &MF); + +private: + bool tryConvertIf(MachineBasicBlock*); + void updateDomTree(ArrayRef<MachineBasicBlock*> Removed); + void updateLoops(ArrayRef<MachineBasicBlock*> Removed); + void invalidateTraces(); + bool shouldConvertIf(); +}; +} // end anonymous namespace + +char EarlyIfConverter::ID = 0; +char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; + +INITIALIZE_PASS_BEGIN(EarlyIfConverter, + "early-ifcvt", "Early If Converter", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_END(EarlyIfConverter, + "early-ifcvt", "Early If Converter", false, false) + +void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineDominatorTree>(); + AU.addPreserved<MachineDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); + AU.addPreserved<MachineLoopInfo>(); + AU.addRequired<MachineTraceMetrics>(); + AU.addPreserved<MachineTraceMetrics>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/// Update the dominator tree after if-conversion erased some blocks. +void EarlyIfConverter::updateDomTree(ArrayRef<MachineBasicBlock*> Removed) { + // convertIf can remove TBB, FBB, and Tail can be merged into Head. + // TBB and FBB should not dominate any blocks. + // Tail children should be transferred to Head. + MachineDomTreeNode *HeadNode = DomTree->getNode(IfConv.Head); + for (unsigned i = 0, e = Removed.size(); i != e; ++i) { + MachineDomTreeNode *Node = DomTree->getNode(Removed[i]); + assert(Node != HeadNode && "Cannot erase the head node"); + while (Node->getNumChildren()) { + assert(Node->getBlock() == IfConv.Tail && "Unexpected children"); + DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode); + } + DomTree->eraseNode(Removed[i]); + } +} + +/// Update LoopInfo after if-conversion. +void EarlyIfConverter::updateLoops(ArrayRef<MachineBasicBlock*> Removed) { + if (!Loops) + return; + // If-conversion doesn't change loop structure, and it doesn't mess with back + // edges, so updating LoopInfo is simply removing the dead blocks. + for (unsigned i = 0, e = Removed.size(); i != e; ++i) + Loops->removeBlock(Removed[i]); +} + +/// Invalidate MachineTraceMetrics before if-conversion. 
+void EarlyIfConverter::invalidateTraces() { + Traces->verifyAnalysis(); + Traces->invalidate(IfConv.Head); + Traces->invalidate(IfConv.Tail); + Traces->invalidate(IfConv.TBB); + Traces->invalidate(IfConv.FBB); + Traces->verifyAnalysis(); +} + +// Adjust cycles with downward saturation. +static unsigned adjCycles(unsigned Cyc, int Delta) { + if (Delta < 0 && Cyc + Delta > Cyc) + return 0; + return Cyc + Delta; +} + +/// Apply cost model and heuristics to the if-conversion in IfConv. +/// Return true if the conversion is a good idea. +/// +bool EarlyIfConverter::shouldConvertIf() { + // Stress testing mode disables all cost considerations. + if (Stress) + return true; + + if (!MinInstr) + MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount); + + MachineTraceMetrics::Trace TBBTrace = MinInstr->getTrace(IfConv.getTPred()); + MachineTraceMetrics::Trace FBBTrace = MinInstr->getTrace(IfConv.getFPred()); + DEBUG(dbgs() << "TBB: " << TBBTrace << "FBB: " << FBBTrace); + unsigned MinCrit = std::min(TBBTrace.getCriticalPath(), + FBBTrace.getCriticalPath()); + + // Set a somewhat arbitrary limit on the critical path extension we accept. + unsigned CritLimit = SchedModel->MispredictPenalty/2; + + // If-conversion only makes sense when there is unexploited ILP. Compute the + // maximum-ILP resource length of the trace after if-conversion. Compare it + // to the shortest critical path. + SmallVector<const MachineBasicBlock*, 1> ExtraBlocks; + if (IfConv.TBB != IfConv.Tail) + ExtraBlocks.push_back(IfConv.TBB); + unsigned ResLength = FBBTrace.getResourceLength(ExtraBlocks); + DEBUG(dbgs() << "Resource length " << ResLength + << ", minimal critical path " << MinCrit << '\n'); + if (ResLength > MinCrit + CritLimit) { + DEBUG(dbgs() << "Not enough available ILP.\n"); + return false; + } + + // Assume that the depth of the first head terminator will also be the depth + // of the select instruction inserted, as determined by the flag dependency. + // TBB / FBB data dependencies may delay the select even more. + MachineTraceMetrics::Trace HeadTrace = MinInstr->getTrace(IfConv.Head); + unsigned BranchDepth = + HeadTrace.getInstrCycles(IfConv.Head->getFirstTerminator()).Depth; + DEBUG(dbgs() << "Branch depth: " << BranchDepth << '\n'); + + // Look at all the tail phis, and compute the critical path extension caused + // by inserting select instructions. + MachineTraceMetrics::Trace TailTrace = MinInstr->getTrace(IfConv.Tail); + for (unsigned i = 0, e = IfConv.PHIs.size(); i != e; ++i) { + SSAIfConv::PHIInfo &PI = IfConv.PHIs[i]; + unsigned Slack = TailTrace.getInstrSlack(PI.PHI); + unsigned MaxDepth = Slack + TailTrace.getInstrCycles(PI.PHI).Depth; + DEBUG(dbgs() << "Slack " << Slack << ":\t" << *PI.PHI); + + // The condition is pulled into the critical path. + unsigned CondDepth = adjCycles(BranchDepth, PI.CondCycles); + if (CondDepth > MaxDepth) { + unsigned Extra = CondDepth - MaxDepth; + DEBUG(dbgs() << "Condition adds " << Extra << " cycles.\n"); + if (Extra > CritLimit) { + DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n'); + return false; + } + } + + // The TBB value is pulled into the critical path. + unsigned TDepth = adjCycles(TBBTrace.getPHIDepth(PI.PHI), PI.TCycles); + if (TDepth > MaxDepth) { + unsigned Extra = TDepth - MaxDepth; + DEBUG(dbgs() << "TBB data adds " << Extra << " cycles.\n"); + if (Extra > CritLimit) { + DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n'); + return false; + } + } + + // The FBB value is pulled into the critical path. 
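A worked example of the heuristic above, with every number invented for illustration:

// If MispredictPenalty is 16 cycles, CritLimit = 8. With ResLength = 20 and
// MinCrit = 14, 20 <= 14 + 8 holds, so there is enough unexploited ILP.
// If BranchDepth = 10 and a PHI has CondCycles = 2, depth 9 and slack 1,
// then CondDepth = 12 and MaxDepth = 10, so the select stretches the
// critical path by 2 cycles, well under the 8 cycle limit, and the
// conversion is still considered profitable.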
+ unsigned FDepth = adjCycles(FBBTrace.getPHIDepth(PI.PHI), PI.FCycles); + if (FDepth > MaxDepth) { + unsigned Extra = FDepth - MaxDepth; + DEBUG(dbgs() << "FBB data adds " << Extra << " cycles.\n"); + if (Extra > CritLimit) { + DEBUG(dbgs() << "Exceeds limit of " << CritLimit << '\n'); + return false; + } + } + } + return true; +} + +/// Attempt repeated if-conversion on MBB, return true if successful. +/// +bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { + bool Changed = false; + while (IfConv.canConvertIf(MBB) && shouldConvertIf()) { + // If-convert MBB and update analyses. + invalidateTraces(); + SmallVector<MachineBasicBlock*, 4> RemovedBlocks; + IfConv.convertIf(RemovedBlocks); + Changed = true; + updateDomTree(RemovedBlocks); + updateLoops(RemovedBlocks); + } + return Changed; +} + +bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n" + << "********** Function: " + << ((Value*)MF.getFunction())->getName() << '\n'); + TII = MF.getTarget().getInstrInfo(); + TRI = MF.getTarget().getRegisterInfo(); + SchedModel = MF.getTarget().getInstrItineraryData()->SchedModel; + MRI = &MF.getRegInfo(); + DomTree = &getAnalysis<MachineDominatorTree>(); + Loops = getAnalysisIfAvailable<MachineLoopInfo>(); + Traces = &getAnalysis<MachineTraceMetrics>(); + MinInstr = 0; + + bool Changed = false; + IfConv.runOnMachineFunction(MF); + + // Visit blocks in dominator tree post-order. The post-order enables nested + // if-conversion in a single pass. The tryConvertIf() function may erase + // blocks, but only blocks dominated by the head block. This makes it safe to + // update the dominator tree while the post-order iterator is still active. + for (po_iterator<MachineDominatorTree*> + I = po_begin(DomTree), E = po_end(DomTree); I != E; ++I) + if (tryConvertIf(I->getBlock())) + Changed = true; + + MF.verify(this, "After early if-conversion"); + return Changed; +} diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index b14afc286d..7a17331ba1 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -131,13 +131,16 @@ bool ExpandPostRA::LowerSubregToReg(MachineInstr *MI) { } else { TII->copyPhysReg(*MBB, MI, MI->getDebugLoc(), DstSubReg, InsReg, MI->getOperand(2).isKill()); + + // Implicitly define DstReg for subsequent uses. + MachineBasicBlock::iterator CopyMI = MI; + --CopyMI; + CopyMI->addRegisterDefined(DstReg); + // Transfer the kill/dead flags, if needed. if (MI->getOperand(0).isDead()) TransferDeadFlag(MI, DstSubReg, TRI); - DEBUG({ - MachineBasicBlock::iterator dMI = MI; - dbgs() << "subreg: " << *(--dMI); - }); + DEBUG(dbgs() << "subreg: " << *CopyMI); } DEBUG(dbgs() << '\n'); diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp index 4c7f5d8c88..07e37af57f 100644 --- a/lib/CodeGen/InlineSpiller.cpp +++ b/lib/CodeGen/InlineSpiller.cpp @@ -1081,6 +1081,10 @@ void InlineSpiller::insertReload(LiveInterval &NewLI, MRI.getRegClass(NewLI.reg), &TRI); --MI; // Point to load instruction. SlotIndex LoadIdx = LIS.InsertMachineInstrInMaps(MI).getRegSlot(); + // Some (out-of-tree) targets have EC reload instructions. 
+ if (MachineOperand *MO = MI->findRegisterDefOperand(NewLI.reg)) + if (MO->isEarlyClobber()) + LoadIdx = LoadIdx.getRegSlot(true); DEBUG(dbgs() << "\treload: " << LoadIdx << '\t' << *MI); VNInfo *LoadVNI = NewLI.getNextValue(LoadIdx, LIS.getVNInfoAllocator()); NewLI.addRange(LiveRange(LoadIdx, Idx, LoadVNI)); diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp index 8990360a72..0a795e644c 100644 --- a/lib/CodeGen/LiveInterval.cpp +++ b/lib/CodeGen/LiveInterval.cpp @@ -160,7 +160,7 @@ void LiveInterval::markValNoForDeletion(VNInfo *ValNo) { valnos.pop_back(); } while (!valnos.empty() && valnos.back()->isUnused()); } else { - ValNo->setIsUnused(true); + ValNo->markUnused(); } } @@ -196,16 +196,16 @@ void LiveInterval::extendIntervalEndTo(Ranges::iterator I, SlotIndex NewEnd) { // If NewEnd was in the middle of an interval, make sure to get its endpoint. I->end = std::max(NewEnd, prior(MergeTo)->end); - // Erase any dead ranges. - ranges.erase(llvm::next(I), MergeTo); - // If the newly formed range now touches the range after it and if they have // the same value number, merge the two ranges into one range. - Ranges::iterator Next = llvm::next(I); - if (Next != ranges.end() && Next->start <= I->end && Next->valno == ValNo) { - I->end = Next->end; - ranges.erase(Next); + if (MergeTo != ranges.end() && MergeTo->start <= I->end && + MergeTo->valno == ValNo) { + I->end = MergeTo->end; + ++MergeTo; } + + // Erase any dead ranges. + ranges.erase(llvm::next(I), MergeTo); } @@ -381,6 +381,8 @@ void LiveInterval::join(LiveInterval &Other, const int *RHSValNoAssignments, SmallVector<VNInfo*, 16> &NewVNInfo, MachineRegisterInfo *MRI) { + verify(); + // Determine if any of our live range values are mapped. This is uncommon, so // we want to avoid the interval scan if not. bool MustMapCurValNos = false; @@ -448,14 +450,148 @@ void LiveInterval::join(LiveInterval &Other, valnos.resize(NumNewVals); // shrinkify // Okay, now insert the RHS live ranges into the LHS. - iterator InsertPos = begin(); unsigned RangeNo = 0; for (iterator I = Other.begin(), E = Other.end(); I != E; ++I, ++RangeNo) { // Map the valno in the other live range to the current live range. I->valno = NewVNInfo[OtherAssignments[RangeNo]]; assert(I->valno && "Adding a dead range?"); - InsertPos = addRangeFrom(*I, InsertPos); } + mergeIntervalRanges(Other); + + verify(); +} + +/// \brief Helper function for merging in another LiveInterval's ranges. +/// +/// This is a helper routine implementing an efficient merge of another +/// LiveIntervals ranges into the current interval. +/// +/// \param LHSValNo If non-NULL, set as the new value number for every range +/// from RHS which is merged into the LHS. +/// \param RHSValNo If non-NULL, then only ranges in RHS whose original value +/// number maches this value number will be merged into LHS. +void LiveInterval::mergeIntervalRanges(const LiveInterval &RHS, + VNInfo *LHSValNo, + const VNInfo *RHSValNo) { + if (RHS.empty()) + return; + + // Ensure we're starting with a valid range. Note that we don't verify RHS + // because it may have had its value numbers adjusted in preparation for + // merging. + verify(); + + // The strategy for merging these efficiently is as follows: + // + // 1) Find the beginning of the impacted ranges in the LHS. + // 2) Create a new, merged sub-squence of ranges merging from the position in + // #1 until either LHS or RHS is exhausted. Any part of LHS between RHS + // entries being merged will be copied into this new range. 
+ // 3) Replace the relevant section in LHS with these newly merged ranges. + // 4) Append any remaning ranges from RHS if LHS is exhausted in #2. + // + // We don't follow the typical in-place merge strategy for sorted ranges of + // appending the new ranges to the back and then using std::inplace_merge + // because one step of the merge can both mutate the original elements and + // remove elements from the original. Essentially, because the merge includes + // collapsing overlapping ranges, a more complex approach is required. + + // We do an initial binary search to optimize for a common pattern: a large + // LHS, and a very small RHS. + const_iterator RI = RHS.begin(), RE = RHS.end(); + iterator LE = end(), LI = std::upper_bound(begin(), LE, *RI); + + // Merge into NewRanges until one of the ranges is exhausted. + SmallVector<LiveRange, 4> NewRanges; + + // Keep track of where to begin the replacement. + iterator ReplaceI = LI; + + // If there are preceding ranges in the LHS, put the last one into NewRanges + // so we can optionally extend it. Adjust the replacement point accordingly. + if (LI != begin()) { + ReplaceI = llvm::prior(LI); + NewRanges.push_back(*ReplaceI); + } + + // Now loop over the mergable portions of both LHS and RHS, merging into + // NewRanges. + while (LI != LE && RI != RE) { + // Skip incoming ranges with the wrong value. + if (RHSValNo && RI->valno != RHSValNo) { + ++RI; + continue; + } + + // Select the first range. We pick the earliest start point, and then the + // largest range. + LiveRange R = *LI; + if (*RI < R) { + R = *RI; + ++RI; + if (LHSValNo) + R.valno = LHSValNo; + } else { + ++LI; + } + + if (NewRanges.empty()) { + NewRanges.push_back(R); + continue; + } + + LiveRange &LastR = NewRanges.back(); + if (R.valno == LastR.valno) { + // Try to merge this range into the last one. + if (R.start <= LastR.end) { + LastR.end = std::max(LastR.end, R.end); + continue; + } + } else { + // We can't merge ranges across a value number. + assert(R.start >= LastR.end && + "Cannot overlap two LiveRanges with differing ValID's"); + } + + // If all else fails, just append the range. + NewRanges.push_back(R); + } + assert(RI == RE || LI == LE); + + // Check for being able to merge into the trailing sequence of ranges on the LHS. + if (!NewRanges.empty()) + for (; LI != LE && (LI->valno == NewRanges.back().valno && + LI->start <= NewRanges.back().end); + ++LI) + NewRanges.back().end = std::max(NewRanges.back().end, LI->end); + + // Replace the ranges in the LHS with the newly merged ones. It would be + // really nice if there were a move-supporting 'replace' directly in + // SmallVector, but as there is not, we pay the price of copies to avoid + // wasted memory allocations. + SmallVectorImpl<LiveRange>::iterator NRI = NewRanges.begin(), + NRE = NewRanges.end(); + for (; ReplaceI != LI && NRI != NRE; ++ReplaceI, ++NRI) + *ReplaceI = *NRI; + if (NRI == NRE) + ranges.erase(ReplaceI, LI); + else + ranges.insert(LI, NRI, NRE); + + // And finally insert any trailing end of RHS (if we have one). + for (; RI != RE; ++RI) { + LiveRange R = *RI; + if (LHSValNo) + R.valno = LHSValNo; + if (!ranges.empty() && + ranges.back().valno == R.valno && R.start <= ranges.back().end) + ranges.back().end = std::max(ranges.back().end, R.end); + else + ranges.push_back(R); + } + + // Ensure we finished with a valid new sequence of ranges. 
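The same merge-and-coalesce idea, reduced to plain half-open integer ranges and ignoring value numbers, looks roughly like the sketch below. It is a simplified analogy, not the LiveInterval API, and it re-sorts the combined list, whereas mergeIntervalRanges does a single in-place merge pass precisely to avoid that cost:

#include <algorithm>
#include <utility>
#include <vector>

typedef std::pair<unsigned, unsigned> Range; // half-open [start, end)

// Merge two sorted range lists, coalescing ranges that touch or overlap.
std::vector<Range> mergeRanges(const std::vector<Range> &LHS,
                               const std::vector<Range> &RHS) {
  std::vector<Range> All(LHS);
  All.insert(All.end(), RHS.begin(), RHS.end());
  std::sort(All.begin(), All.end());
  std::vector<Range> Out;
  for (size_t i = 0, e = All.size(); i != e; ++i) {
    if (!Out.empty() && All[i].first <= Out.back().second)
      Out.back().second = std::max(Out.back().second, All[i].second);
    else
      Out.push_back(All[i]);
  }
  return Out;
}
// e.g. merging {[0,4), [10,12)} with {[3,6), [12,20)} yields {[0,6), [10,20)}.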
+ verify(); } /// MergeRangesInAsValue - Merge all of the intervals in RHS into this live @@ -464,38 +600,20 @@ void LiveInterval::join(LiveInterval &Other, /// the overlapping LiveRanges have the specified value number. void LiveInterval::MergeRangesInAsValue(const LiveInterval &RHS, VNInfo *LHSValNo) { - // TODO: Make this more efficient. - iterator InsertPos = begin(); - for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) { - // Map the valno in the other live range to the current live range. - LiveRange Tmp = *I; - Tmp.valno = LHSValNo; - InsertPos = addRangeFrom(Tmp, InsertPos); - } + mergeIntervalRanges(RHS, LHSValNo); } - /// MergeValueInAsValue - Merge all of the live ranges of a specific val# /// in RHS into this live interval as the specified value number. /// The LiveRanges in RHS are allowed to overlap with LiveRanges in the /// current interval, it will replace the value numbers of the overlaped /// live ranges with the specified value number. -void LiveInterval::MergeValueInAsValue( - const LiveInterval &RHS, - const VNInfo *RHSValNo, VNInfo *LHSValNo) { - // TODO: Make this more efficient. - iterator InsertPos = begin(); - for (const_iterator I = RHS.begin(), E = RHS.end(); I != E; ++I) { - if (I->valno != RHSValNo) - continue; - // Map the valno in the other live range to the current live range. - LiveRange Tmp = *I; - Tmp.valno = LHSValNo; - InsertPos = addRangeFrom(Tmp, InsertPos); - } +void LiveInterval::MergeValueInAsValue(const LiveInterval &RHS, + const VNInfo *RHSValNo, + VNInfo *LHSValNo) { + mergeIntervalRanges(RHS, LHSValNo, RHSValNo); } - /// MergeValueNumberInto - This method is called when two value nubmers /// are found to be equivalent. This eliminates V1, replacing all /// LiveRanges with the V1 value number with the V2 value number. This can @@ -549,9 +667,6 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) { } } - // Merge the relevant flags. - V2->mergeFlags(V1); - // Now that V1 is dead, remove it. markValNoForDeletion(V1); @@ -575,6 +690,8 @@ void LiveInterval::Copy(const LiveInterval &RHS, const LiveRange &LR = RHS.ranges[i]; addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id))); } + + verify(); } unsigned LiveInterval::getSize() const { @@ -617,9 +734,7 @@ void LiveInterval::print(raw_ostream &OS) const { } else { OS << vni->def; if (vni->isPHIDef()) - OS << "-phidef"; - if (vni->hasPHIKill()) - OS << "-phikill"; + OS << "-phi"; } } } @@ -629,6 +744,23 @@ void LiveInterval::dump() const { dbgs() << *this << "\n"; } +#ifndef NDEBUG +void LiveInterval::verify() const { + for (const_iterator I = begin(), E = end(); I != E; ++I) { + assert(I->start.isValid()); + assert(I->end.isValid()); + assert(I->start < I->end); + assert(I->valno != 0); + assert(I->valno == valnos[I->valno->id]); + if (llvm::next(I) != E) { + assert(I->end <= llvm::next(I)->start); + if (I->end == llvm::next(I)->start) + assert(I->valno != llvm::next(I)->valno); + } + } +} +#endif + void LiveRange::print(raw_ostream &os) const { os << *this; @@ -690,14 +822,11 @@ void ConnectedVNInfoEqClasses::Distribute(LiveInterval *LIV[], MachineOperand &MO = RI.getOperand(); MachineInstr *MI = MO.getParent(); ++RI; - if (MO.isUse() && MO.isUndef()) - continue; // DBG_VALUE instructions should have been eliminated earlier. - SlotIndex Idx = LIS.getInstructionIndex(MI); - Idx = Idx.getRegSlot(MO.isUse()); - const VNInfo *VNI = LI.getVNInfoAt(Idx); - // FIXME: We should be able to assert(VNI) here, but the coalescer leaves - // dangling defs around. 
+ LiveRangeQuery LRQ(LI, LIS.getInstructionIndex(MI)); + const VNInfo *VNI = MO.readsReg() ? LRQ.valueIn() : LRQ.valueDefined(); + // In the case of an <undef> use that isn't tied to any def, VNI will be + // NULL. If the use is tied to a def, VNI will be the defined value. if (!VNI) continue; MO.setReg(LIV[getEqClass(VNI)]->reg); diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp index 819707f59f..d0f8ae1af3 100644 --- a/lib/CodeGen/LiveIntervalAnalysis.cpp +++ b/lib/CodeGen/LiveIntervalAnalysis.cpp @@ -27,6 +27,7 @@ #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -38,7 +39,13 @@ #include <cmath> using namespace llvm; +// Switch to the new experimental algorithm for computing live intervals. +static cl::opt<bool> +NewLiveIntervals("new-live-intervals", cl::Hidden, + cl::desc("Use new algorithm forcomputing live intervals")); + char LiveIntervals::ID = 0; +char &llvm::LiveIntervalsID = LiveIntervals::ID; INITIALIZE_PASS_BEGIN(LiveIntervals, "liveintervals", "Live Interval Analysis", false, false) INITIALIZE_AG_DEPENDENCY(AliasAnalysis) @@ -105,7 +112,19 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) { AllocatableRegs = TRI->getAllocatableSet(fn); ReservedRegs = TRI->getReservedRegs(fn); - computeIntervals(); + // Allocate space for all virtual registers. + VirtRegIntervals.resize(MRI->getNumVirtRegs()); + + if (NewLiveIntervals) { + // This is the new way of computing live intervals. + // It is independent of LiveVariables, and it can run at any time. + computeVirtRegs(); + computeRegMasks(); + } else { + // This is the old way of computing live intervals. + // It depends on LiveVariables. + computeIntervals(); + } computeLiveInRegUnits(); DEBUG(dump()); @@ -238,7 +257,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, // new valno in the killing blocks. assert(vi.AliveBlocks.empty() && "Phi join can't pass through blocks"); DEBUG(dbgs() << " phi-join"); - ValNo->setHasPHIKill(true); } else { // Iterate over all of the blocks that the variable is completely // live in, adding [insrtIndex(begin), instrIndex(end)+4) to the @@ -266,7 +284,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, assert(getInstructionFromIndex(Start) == 0 && "PHI def index points at actual instruction."); ValNo = interval.getNextValue(Start, VNInfoAllocator); - ValNo->setIsPHIDef(true); } LiveRange LR(Start, killIdx, ValNo); interval.addRange(LR); @@ -340,7 +357,6 @@ void LiveIntervals::handleVirtualRegisterDef(MachineBasicBlock *mbb, SlotIndex killIndex = getMBBEndIdx(mbb); LiveRange LR(defIndex, killIndex, ValNo); interval.addRange(LR); - ValNo->setHasPHIKill(true); DEBUG(dbgs() << " phi-join +" << LR); } else { llvm_unreachable("Multiply defined register"); @@ -442,6 +458,49 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) { } +/// computeVirtRegInterval - Compute the live interval of a virtual register, +/// based on defs and uses. 
+void LiveIntervals::computeVirtRegInterval(LiveInterval *LI) { + assert(LRCalc && "LRCalc not initialized."); + assert(LI->empty() && "Should only compute empty intervals."); + LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator()); + LRCalc->createDeadDefs(LI); + LRCalc->extendToUses(LI); +} + +void LiveIntervals::computeVirtRegs() { + for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(i); + if (MRI->reg_nodbg_empty(Reg)) + continue; + LiveInterval *LI = createInterval(Reg); + VirtRegIntervals[Reg] = LI; + computeVirtRegInterval(LI); + } +} + +void LiveIntervals::computeRegMasks() { + RegMaskBlocks.resize(MF->getNumBlockIDs()); + + // Find all instructions with regmask operands. + for (MachineFunction::iterator MBBI = MF->begin(), E = MF->end(); + MBBI != E; ++MBBI) { + MachineBasicBlock *MBB = MBBI; + std::pair<unsigned, unsigned> &RMB = RegMaskBlocks[MBB->getNumber()]; + RMB.first = RegMaskSlots.size(); + for (MachineBasicBlock::iterator MI = MBB->begin(), ME = MBB->end(); + MI != ME; ++MI) + for (MIOperands MO(MI); MO.isValid(); ++MO) { + if (!MO->isRegMask()) + continue; + RegMaskSlots.push_back(Indexes->getInstructionIndex(MI).getRegSlot()); + RegMaskBits.push_back(MO->getRegMask()); + } + // Compute the number of register mask instructions in this block. + RMB.second = RegMaskSlots.size() - RMB.first;; + } +} + //===----------------------------------------------------------------------===// // Register Unit Liveness //===----------------------------------------------------------------------===// @@ -648,7 +707,7 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li, continue; if (VNI->isPHIDef()) { // This is a dead PHI. Remove it. - VNI->setIsUnused(true); + VNI->markUnused(); NewLI.removeRange(*LII); DEBUG(dbgs() << "Dead PHI at " << VNI->def << " may separate interval\n"); CanSeparate = true; @@ -720,6 +779,25 @@ LiveIntervals::intervalIsInOneMBB(const LiveInterval &LI) const { return MBB1 == MBB2 ? MBB1 : NULL; } +bool +LiveIntervals::hasPHIKill(const LiveInterval &LI, const VNInfo *VNI) const { + for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); + I != E; ++I) { + const VNInfo *PHI = *I; + if (PHI->isUnused() || !PHI->isPHIDef()) + continue; + const MachineBasicBlock *PHIMBB = getMBBFromIndex(PHI->def); + // Conservatively return true instead of scanning huge predecessor lists. + if (PHIMBB->pred_size() > 100) + return true; + for (MachineBasicBlock::const_pred_iterator + PI = PHIMBB->pred_begin(), PE = PHIMBB->pred_end(); PI != PE; ++PI) + if (VNI == LI.getVNInfoBefore(Indexes->getMBBEndIdx(*PI))) + return true; + } + return false; +} + float LiveIntervals::getSpillWeight(bool isDef, bool isUse, unsigned loopDepth) { // Limit the loop depth ridiculousness. @@ -744,7 +822,6 @@ LiveRange LiveIntervals::addLiveRangeToEndOfBlock(unsigned reg, VNInfo* VN = Interval.getNextValue( SlotIndex(getInstructionIndex(startInst).getRegSlot()), getVNInfoAllocator()); - VN->setHasPHIKill(true); LiveRange LR( SlotIndex(getInstructionIndex(startInst).getRegSlot()), getMBBEndIdx(startInst->getParent()), VN); diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp index 3e5d7f6c29..d828f25932 100644 --- a/lib/CodeGen/LiveRangeCalc.cpp +++ b/lib/CodeGen/LiveRangeCalc.cpp @@ -54,8 +54,7 @@ void LiveRangeCalc::createDeadDefs(LiveInterval *LI, unsigned Reg) { .getRegSlot(I.getOperand().isEarlyClobber()); // Create the def in LI. This may find an existing def. 
- VNInfo *VNI = LI->createDeadDef(Idx, *Alloc); - VNI->setIsPHIDef(MI->isPHI()); + LI->createDeadDef(Idx, *Alloc); } } @@ -95,7 +94,7 @@ void LiveRangeCalc::extendToUses(LiveInterval *LI, unsigned Reg) { Idx = Idx.getRegSlot(true); } } - extend(LI, Idx); + extend(LI, Idx, Reg); } } @@ -129,7 +128,8 @@ void LiveRangeCalc::updateLiveIns(VNInfo *OverrideVNI) { void LiveRangeCalc::extend(LiveInterval *LI, - SlotIndex Kill) { + SlotIndex Kill, + unsigned PhysReg) { assert(LI && "Missing live range"); assert(Kill.isValid() && "Invalid SlotIndex"); assert(Indexes && "Missing SlotIndexes"); @@ -146,7 +146,7 @@ void LiveRangeCalc::extend(LiveInterval *LI, // multiple values, and we may need to create even more phi-defs to preserve // VNInfo SSA form. Perform a search for all predecessor blocks where we // know the dominating VNInfo. - VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill); + VNInfo *VNI = findReachingDefs(LI, KillMBB, Kill, PhysReg); // When there were multiple different values, we may need new PHIs. if (!VNI) @@ -169,7 +169,8 @@ void LiveRangeCalc::calculateValues() { VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, MachineBasicBlock *KillMBB, - SlotIndex Kill) { + SlotIndex Kill, + unsigned PhysReg) { // Blocks where LI should be live-in. SmallVector<MachineBasicBlock*, 16> WorkList(1, KillMBB); @@ -180,7 +181,22 @@ VNInfo *LiveRangeCalc::findReachingDefs(LiveInterval *LI, // Using Seen as a visited set, perform a BFS for all reaching defs. for (unsigned i = 0; i != WorkList.size(); ++i) { MachineBasicBlock *MBB = WorkList[i]; - assert(!MBB->pred_empty() && "Value live-in to entry block?"); + +#ifndef NDEBUG + if (MBB->pred_empty()) { + MBB->getParent()->verify(); + llvm_unreachable("Use not jointly dominated by defs."); + } + + if (TargetRegisterInfo::isPhysicalRegister(PhysReg) && + !MBB->isLiveIn(PhysReg)) { + MBB->getParent()->verify(); + errs() << "The register needs to be live in to BB#" << MBB->getNumber() + << ", but is missing from the live-in list.\n"; + llvm_unreachable("Invalid global physical register"); + } +#endif + for (MachineBasicBlock::pred_iterator PI = MBB->pred_begin(), PE = MBB->pred_end(); PI != PE; ++PI) { MachineBasicBlock *Pred = *PI; @@ -303,7 +319,6 @@ void LiveRangeCalc::updateSSA() { SlotIndex Start, End; tie(Start, End) = Indexes->getMBBRange(MBB); VNInfo *VNI = I->LI->getNextValue(Start, *Alloc); - VNI->setIsPHIDef(true); I->Value = VNI; // This block is done, we know the final value. I->DomNode = 0; diff --git a/lib/CodeGen/LiveRangeCalc.h b/lib/CodeGen/LiveRangeCalc.h index 3c9e40f0fe..909829b228 100644 --- a/lib/CodeGen/LiveRangeCalc.h +++ b/lib/CodeGen/LiveRangeCalc.h @@ -105,9 +105,12 @@ class LiveRangeCalc { /// to be live-in are added to LiveIn. If a unique reaching def is found, /// its value is returned, if Kill is jointly dominated by multiple values, /// NULL is returned. + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. VNInfo *findReachingDefs(LiveInterval *LI, MachineBasicBlock *KillMBB, - SlotIndex Kill); + SlotIndex Kill, + unsigned PhysReg); /// updateSSA - Compute the values that will be live in to all requested /// blocks in LiveIn. Create PHI-def values as required to preserve SSA form. @@ -157,7 +160,9 @@ public: /// Kill is not dominated by a single existing value, PHI-defs are inserted /// as required to preserve SSA form. If Kill is known to be dominated by a /// single existing value, Alloc may be null. 
- void extend(LiveInterval *LI, SlotIndex Kill); + /// + /// PhysReg, when set, is used to verify live-in lists on basic blocks. + void extend(LiveInterval *LI, SlotIndex Kill, unsigned PhysReg = 0); /// createDeadDefs - Create a dead def in LI for every def operand of Reg. /// Each instruction defining Reg gets a new VNInfo with a corresponding diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp index 261d860e01..b4ce9aa8c1 100644 --- a/lib/CodeGen/LiveRangeEdit.cpp +++ b/lib/CodeGen/LiveRangeEdit.cpp @@ -177,6 +177,19 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI, if (!DefMI || !UseMI) return false; + // Since we're moving the DefMI load, make sure we're not extending any live + // ranges. + if (!allUsesAvailableAt(DefMI, + LIS.getInstructionIndex(DefMI), + LIS.getInstructionIndex(UseMI))) + return false; + + // We also need to make sure it is safe to move the load. + // Assume there are stores between DefMI and UseMI. + bool SawStore = true; + if (!DefMI->isSafeToMove(&TII, 0, SawStore)) + return false; + DEBUG(dbgs() << "Try to fold single def: " << *DefMI << " into single use: " << *UseMI); @@ -226,6 +239,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, // Collect virtual registers to be erased after MI is gone. SmallVector<unsigned, 8> RegsToErase; + bool ReadsPhysRegs = false; // Check for live intervals that may shrink for (MachineInstr::mop_iterator MOI = MI->operands_begin(), @@ -233,8 +247,12 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, if (!MOI->isReg()) continue; unsigned Reg = MOI->getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) + if (!TargetRegisterInfo::isVirtualRegister(Reg)) { + // Check if MI reads any unreserved physregs. + if (Reg && MOI->readsReg() && !LIS.isReserved(Reg)) + ReadsPhysRegs = true; continue; + } LiveInterval &LI = LIS.getInterval(Reg); // Shrink read registers, unless it is likely to be expensive and @@ -258,11 +276,30 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr*> &Dead, } } - if (TheDelegate) - TheDelegate->LRE_WillEraseInstruction(MI); - LIS.RemoveMachineInstrFromMaps(MI); - MI->eraseFromParent(); - ++NumDCEDeleted; + // Currently, we don't support DCE of physreg live ranges. If MI reads + // any unreserved physregs, don't erase the instruction, but turn it into + // a KILL instead. This way, the physreg live ranges don't end up + // dangling. + // FIXME: It would be better to have something like shrinkToUses() for + // physregs. That could potentially enable more DCE and it would free up + // the physreg. It would not happen often, though. + if (ReadsPhysRegs) { + MI->setDesc(TII.get(TargetOpcode::KILL)); + // Remove all operands that aren't physregs. + for (unsigned i = MI->getNumOperands(); i; --i) { + const MachineOperand &MO = MI->getOperand(i-1); + if (MO.isReg() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) + continue; + MI->RemoveOperand(i-1); + } + DEBUG(dbgs() << "Converted physregs to:\t" << *MI); + } else { + if (TheDelegate) + TheDelegate->LRE_WillEraseInstruction(MI); + LIS.RemoveMachineInstrFromMaps(MI); + MI->eraseFromParent(); + ++NumDCEDeleted; + } // Erase any virtregs that are now empty and unused. There may be <undef> // uses around. Keep the empty live range in that case. 
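For illustration (opcode and register names invented): the new ReadsPhysRegs path in eliminateDeadDefs keeps physreg reads alive instead of deleting the instruction outright, roughly:

//   %v7<def,dead> = SOMEOP %v3, %EFLAGS<imp-use>   ; dead, but reads a physreg
// becomes
//   KILL %EFLAGS<imp-use>
// i.e. the opcode is rewritten to KILL and every operand that is not a
// physical register is dropped, so the physreg live range never dangles.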
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index fa971e7c3a..a7c9f46f4d 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -109,7 +109,8 @@ void ilist_traits<MachineInstr>::removeNodeFromList(MachineInstr *N) { assert(N->getParent() != 0 && "machine instruction not in a basic block"); // Remove from the use/def lists. - N->RemoveRegOperandsFromUseLists(); + if (MachineFunction *MF = N->getParent()->getParent()) + N->RemoveRegOperandsFromUseLists(MF->getRegInfo()); N->setParent(0); @@ -311,8 +312,11 @@ void MachineBasicBlock::print(raw_ostream &OS, SlotIndexes *Indexes) const { if (!succ_empty()) { if (Indexes) OS << '\t'; OS << " Successors according to CFG:"; - for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) + for (const_succ_iterator SI = succ_begin(), E = succ_end(); SI != E; ++SI) { OS << " BB#" << (*SI)->getNumber(); + if (!Weights.empty()) + OS << '(' << *getWeightIterator(SI) << ')'; + } OS << '\n'; } } @@ -478,18 +482,42 @@ MachineBasicBlock::removeSuccessor(succ_iterator I) { void MachineBasicBlock::replaceSuccessor(MachineBasicBlock *Old, MachineBasicBlock *New) { - uint32_t weight = 0; - succ_iterator SI = std::find(Successors.begin(), Successors.end(), Old); + if (Old == New) + return; - // If Weight list is empty it means we don't use it (disabled optimization). - if (!Weights.empty()) { - weight_iterator WI = getWeightIterator(SI); - weight = *WI; + succ_iterator E = succ_end(); + succ_iterator NewI = E; + succ_iterator OldI = E; + for (succ_iterator I = succ_begin(); I != E; ++I) { + if (*I == Old) { + OldI = I; + if (NewI != E) + break; + } + if (*I == New) { + NewI = I; + if (OldI != E) + break; + } } + assert(OldI != E && "Old is not a successor of this block"); + Old->removePredecessor(this); - // Update the successor information. - removeSuccessor(SI); - addSuccessor(New, weight); + // If New isn't already a successor, let it take Old's place. + if (NewI == E) { + New->addPredecessor(this); + *OldI = New; + return; + } + + // New is already a successor. + // Update its weight instead of adding a duplicate edge. + if (!Weights.empty()) { + weight_iterator OldWI = getWeightIterator(OldI); + *getWeightIterator(NewI) += *OldWI; + Weights.erase(OldWI); + } + Successors.erase(OldI); } void MachineBasicBlock::addPredecessor(MachineBasicBlock *pred) { @@ -508,14 +536,13 @@ void MachineBasicBlock::transferSuccessors(MachineBasicBlock *fromMBB) { while (!fromMBB->succ_empty()) { MachineBasicBlock *Succ = *fromMBB->succ_begin(); - uint32_t weight = 0; - + uint32_t Weight = 0; // If Weight list is empty it means we don't use it (disabled optimization). if (!fromMBB->Weights.empty()) - weight = *fromMBB->Weights.begin(); + Weight = *fromMBB->Weights.begin(); - addSuccessor(Succ, weight); + addSuccessor(Succ, Weight); fromMBB->removeSuccessor(Succ); } } @@ -527,7 +554,10 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { while (!fromMBB->succ_empty()) { MachineBasicBlock *Succ = *fromMBB->succ_begin(); - addSuccessor(Succ); + uint32_t Weight = 0; + if (!fromMBB->Weights.empty()) + Weight = *fromMBB->Weights.begin(); + addSuccessor(Succ, Weight); fromMBB->removeSuccessor(Succ); // Fix up any PHI nodes in the successor. 
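The rewritten replaceSuccessor no longer removes Old and blindly re-adds New: when New is already a successor, the Old edge is deleted and its weight is folded into New's edge, avoiding a duplicate successor entry. A standalone sketch of that merge, using plain vectors and integer block numbers instead of the MachineBasicBlock API (predecessor-list maintenance is omitted):

#include <cassert>
#include <cstdint>
#include <vector>

struct BlockEdges {
  std::vector<int> Successors;     // successor block numbers
  std::vector<uint32_t> Weights;   // parallel to Successors, may be empty
};

void replaceSuccessor(BlockEdges &B, int Old, int New) {
  if (Old == New)
    return;
  size_t OldI = B.Successors.size(), NewI = B.Successors.size();
  for (size_t i = 0; i != B.Successors.size(); ++i) {
    if (B.Successors[i] == Old) OldI = i;
    if (B.Successors[i] == New) NewI = i;
  }
  assert(OldI != B.Successors.size() && "Old is not a successor");

  // If New isn't already a successor, let it take Old's slot and weight.
  if (NewI == B.Successors.size()) {
    B.Successors[OldI] = New;
    return;
  }
  // New is already a successor: fold Old's weight into it, drop the Old edge.
  if (!B.Weights.empty()) {
    B.Weights[NewI] += B.Weights[OldI];
    B.Weights.erase(B.Weights.begin() + OldI);
  }
  B.Successors.erase(B.Successors.begin() + OldI);
}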
@@ -541,9 +571,12 @@ MachineBasicBlock::transferSuccessorsAndUpdatePHIs(MachineBasicBlock *fromMBB) { } } +bool MachineBasicBlock::isPredecessor(const MachineBasicBlock *MBB) const { + return std::find(pred_begin(), pred_end(), MBB) != pred_end(); +} + bool MachineBasicBlock::isSuccessor(const MachineBasicBlock *MBB) const { - const_succ_iterator I = std::find(Successors.begin(), Successors.end(), MBB); - return I != Successors.end(); + return std::find(succ_begin(), succ_end(), MBB) != succ_end(); } bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const { @@ -674,7 +707,7 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { // Inherit live-ins from the successor for (MachineBasicBlock::livein_iterator I = Succ->livein_begin(), - E = Succ->livein_end(); I != E; ++I) + E = Succ->livein_end(); I != E; ++I) NMBB->addLiveIn(*I); // Update LiveVariables. @@ -910,12 +943,11 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) { /// getSuccWeight - Return weight of the edge from this block to MBB. /// -uint32_t MachineBasicBlock::getSuccWeight(const MachineBasicBlock *succ) const { +uint32_t MachineBasicBlock::getSuccWeight(const_succ_iterator Succ) const { if (Weights.empty()) return 0; - const_succ_iterator I = std::find(Successors.begin(), Successors.end(), succ); - return *getWeightIterator(I); + return *getWeightIterator(Succ); } /// getWeightIterator - Return wight iterator corresonding to the I successor diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 5a15f92a18..c4dca2cd15 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -985,8 +985,22 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // boiler plate. Cond.clear(); MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. - if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) + if (!TII->AnalyzeBranch(*PrevBB, TBB, FBB, Cond)) { + // If PrevBB has a two-way branch, try to re-order the branches + // such that we branch to the successor with higher weight first. + if (TBB && !Cond.empty() && FBB && + MBPI->getEdgeWeight(PrevBB, FBB) > MBPI->getEdgeWeight(PrevBB, TBB) && + !TII->ReverseBranchCondition(Cond)) { + DEBUG(dbgs() << "Reverse order of the two branches: " + << getBlockName(PrevBB) << "\n"); + DEBUG(dbgs() << " Edge weight: " << MBPI->getEdgeWeight(PrevBB, FBB) + << " vs " << MBPI->getEdgeWeight(PrevBB, TBB) << "\n"); + DebugLoc dl; // FIXME: this is nowhere + TII->RemoveBranch(*PrevBB); + TII->InsertBranch(*PrevBB, FBB, TBB, Cond, dl); + } PrevBB->updateTerminator(); + } } // Fixup the last block. @@ -997,29 +1011,63 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Walk through the backedges of the function now that we have fully laid out // the basic blocks and align the destination of each backedge. We don't rely - // on the loop info here so that we can align backedges in unnatural CFGs and - // backedges that were introduced purely because of the loop rotations done - // during this layout pass. - // FIXME: This isn't quite right, we shouldn't align backedges that result - // from blocks being sunken below the exit block for the function. + // exclusively on the loop info here so that we can align backedges in + // unnatural CFGs and backedges that were introduced purely because of the + // loop rotations done during this layout pass. 
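The buildCFGChains hunk above re-orders a two-way terminator so the hotter edge is the conditional-branch target. A minimal sketch of just that decision, with plain integers in place of MachineBranchProbabilityInfo weights and a boolean standing in for a successful ReverseBranchCondition call:

#include <cstdint>

struct TwoWayBranch {
  int TBB;           // target of the conditional branch
  int FBB;           // target of the following unconditional branch
  bool Reversible;   // whether the condition can be inverted
};

// Returns true if the branch targets were swapped.
bool preferHotterFirst(TwoWayBranch &Br, uint32_t WeightTBB,
                       uint32_t WeightFBB) {
  if (!Br.Reversible || WeightFBB <= WeightTBB)
    return false;
  int Tmp = Br.TBB;   // branch to the hotter block first
  Br.TBB = Br.FBB;
  Br.FBB = Tmp;
  return true;
}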
if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize)) return; unsigned Align = TLI->getPrefLoopAlignment(); if (!Align) return; // Don't care about loop alignment. + if (FunctionChain.begin() == FunctionChain.end()) + return; // Empty chain. - SmallPtrSet<MachineBasicBlock *, 16> PreviousBlocks; - for (BlockChain::iterator BI = FunctionChain.begin(), + const BranchProbability ColdProb(1, 5); // 20% + BlockFrequency EntryFreq = MBFI->getBlockFreq(F.begin()); + BlockFrequency WeightedEntryFreq = EntryFreq * ColdProb; + for (BlockChain::iterator BI = llvm::next(FunctionChain.begin()), BE = FunctionChain.end(); BI != BE; ++BI) { - PreviousBlocks.insert(*BI); - // Set alignment on the destination of all the back edges in the new - // ordering. - for (MachineBasicBlock::succ_iterator SI = (*BI)->succ_begin(), - SE = (*BI)->succ_end(); - SI != SE; ++SI) - if (PreviousBlocks.count(*SI)) - (*SI)->setAlignment(Align); + // Don't align non-looping basic blocks. These are unlikely to execute + // enough times to matter in practice. Note that we'll still handle + // unnatural CFGs inside of a natural outer loop (the common case) and + // rotated loops. + MachineLoop *L = MLI->getLoopFor(*BI); + if (!L) + continue; + + // If the block is cold relative to the function entry don't waste space + // aligning it. + BlockFrequency Freq = MBFI->getBlockFreq(*BI); + if (Freq < WeightedEntryFreq) + continue; + + // If the block is cold relative to its loop header, don't align it + // regardless of what edges into the block exist. + MachineBasicBlock *LoopHeader = L->getHeader(); + BlockFrequency LoopHeaderFreq = MBFI->getBlockFreq(LoopHeader); + if (Freq < (LoopHeaderFreq * ColdProb)) + continue; + + // Check for the existence of a non-layout predecessor which would benefit + // from aligning this block. + MachineBasicBlock *LayoutPred = *llvm::prior(BI); + + // Force alignment if all the predecessors are jumps. We already checked + // that the block isn't cold above. + if (!LayoutPred->isSuccessor(*BI)) { + (*BI)->setAlignment(Align); + continue; + } + + // Align this block if the layout predecessor's edge into this block is + // cold relative to the block. When this is true, othe predecessors make up + // all of the hot entries into the block and thus alignment is likely to be + // important. 
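The new alignment logic replaces the old "align every backedge target" approach with a small set of frequency tests. A standalone sketch of those tests, with plain integers in place of BlockFrequency fixed-point values; the 1/5 ratio mirrors the ColdProb(1, 5) constant above:

#include <cstdint>

struct AlignQuery {
  bool InLoop;               // block belongs to some natural loop
  uint64_t Freq;             // frequency of the candidate block
  uint64_t EntryFreq;        // frequency of the function entry block
  uint64_t LoopHeaderFreq;   // frequency of the candidate's loop header
  bool LayoutPredIsCFGPred;  // does the layout predecessor fall through?
  uint64_t LayoutEdgeFreq;   // frequency of the layout-predecessor edge
};

bool shouldAlign(const AlignQuery &Q) {
  const uint64_t Num = 1, Den = 5;              // ColdProb = 20%
  if (!Q.InLoop)
    return false;                               // non-looping blocks: skip
  if (Q.Freq < Q.EntryFreq * Num / Den)
    return false;                               // cold relative to entry
  if (Q.Freq < Q.LoopHeaderFreq * Num / Den)
    return false;                               // cold relative to header
  if (!Q.LayoutPredIsCFGPred)
    return true;                                // reached only by jumps
  return Q.LayoutEdgeFreq <= Q.Freq * Num / Den; // layout edge is cold
}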
+ BranchProbability LayoutProb = MBPI->getEdgeProbability(LayoutPred, *BI); + BlockFrequency LayoutEdgeFreq = MBFI->getBlockFreq(LayoutPred) * LayoutProb; + if (LayoutEdgeFreq <= (Freq * ColdProb)) + (*BI)->setAlignment(Align); } } diff --git a/lib/CodeGen/MachineBranchProbabilityInfo.cpp b/lib/CodeGen/MachineBranchProbabilityInfo.cpp index 0cc1af0795..447921147f 100644 --- a/lib/CodeGen/MachineBranchProbabilityInfo.cpp +++ b/lib/CodeGen/MachineBranchProbabilityInfo.cpp @@ -38,7 +38,7 @@ getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { Scale = 1; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, *I); + uint32_t Weight = getEdgeWeight(MBB, I); Sum += Weight; } @@ -53,22 +53,30 @@ getSumForBlock(const MachineBasicBlock *MBB, uint32_t &Scale) const { Sum = 0; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, *I); + uint32_t Weight = getEdgeWeight(MBB, I); Sum += Weight / Scale; } assert(Sum <= UINT32_MAX); return Sum; } -uint32_t -MachineBranchProbabilityInfo::getEdgeWeight(const MachineBasicBlock *Src, - const MachineBasicBlock *Dst) const { +uint32_t MachineBranchProbabilityInfo:: +getEdgeWeight(const MachineBasicBlock *Src, + MachineBasicBlock::const_succ_iterator Dst) const { uint32_t Weight = Src->getSuccWeight(Dst); if (!Weight) return DEFAULT_WEIGHT; return Weight; } +uint32_t MachineBranchProbabilityInfo:: +getEdgeWeight(const MachineBasicBlock *Src, + const MachineBasicBlock *Dst) const { + // This is a linear search. Try to use the const_succ_iterator version when + // possible. + return getEdgeWeight(Src, std::find(Src->succ_begin(), Src->succ_end(), Dst)); +} + bool MachineBranchProbabilityInfo::isEdgeHot(MachineBasicBlock *Src, MachineBasicBlock *Dst) const { // Hot probability is at least 4/5 = 80% @@ -82,7 +90,7 @@ MachineBranchProbabilityInfo::getHotSucc(MachineBasicBlock *MBB) const { MachineBasicBlock *MaxSucc = 0; for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { - uint32_t Weight = getEdgeWeight(MBB, *I); + uint32_t Weight = getEdgeWeight(MBB, I); if (Weight > MaxWeight) { MaxWeight = Weight; MaxSucc = *I; diff --git a/lib/CodeGen/MachineCSE.cpp b/lib/CodeGen/MachineCSE.cpp index f2539baabd..896461fd19 100644 --- a/lib/CodeGen/MachineCSE.cpp +++ b/lib/CodeGen/MachineCSE.cpp @@ -84,7 +84,7 @@ namespace { bool PerformTrivialCoalescing(MachineInstr *MI, MachineBasicBlock *MBB); bool isPhysDefTriviallyDead(unsigned Reg, MachineBasicBlock::const_iterator I, - MachineBasicBlock::const_iterator E) const ; + MachineBasicBlock::const_iterator E) const; bool hasLivePhysRegDefUses(const MachineInstr *MI, const MachineBasicBlock *MBB, SmallSet<unsigned,8> &PhysRefs, @@ -100,8 +100,7 @@ namespace { void ExitScope(MachineBasicBlock *MBB); bool ProcessBlock(MachineBasicBlock *MBB); void ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap); + DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren); bool PerformCSE(MachineDomTreeNode *Node); }; } // end anonymous namespace @@ -216,8 +215,10 @@ bool MachineCSE::hasLivePhysRegDefUses(const MachineInstr *MI, if (MO.isDef() && (MO.isDead() || isPhysDefTriviallyDead(Reg, I, MBB->end()))) continue; - for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) - 
PhysRefs.insert(*AI); + // Reading constant physregs is ok. + if (!MRI->isConstantPhysReg(Reg, *MBB->getParent())) + for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) + PhysRefs.insert(*AI); if (MO.isDef()) PhysDefs.push_back(Reg); } @@ -325,6 +326,29 @@ bool MachineCSE::isProfitableToCSE(unsigned CSReg, unsigned Reg, MachineInstr *CSMI, MachineInstr *MI) { // FIXME: Heuristics that works around the lack the live range splitting. + // If CSReg is used at all uses of Reg, CSE should not increase register + // pressure of CSReg. + bool MayIncreasePressure = true; + if (TargetRegisterInfo::isVirtualRegister(CSReg) && + TargetRegisterInfo::isVirtualRegister(Reg)) { + MayIncreasePressure = false; + SmallPtrSet<MachineInstr*, 8> CSUses; + for (MachineRegisterInfo::use_nodbg_iterator I =MRI->use_nodbg_begin(CSReg), + E = MRI->use_nodbg_end(); I != E; ++I) { + MachineInstr *Use = &*I; + CSUses.insert(Use); + } + for (MachineRegisterInfo::use_nodbg_iterator I = MRI->use_nodbg_begin(Reg), + E = MRI->use_nodbg_end(); I != E; ++I) { + MachineInstr *Use = &*I; + if (!CSUses.count(Use)) { + MayIncreasePressure = true; + break; + } + } + } + if (!MayIncreasePressure) return true; + // Heuristics #1: Don't CSE "cheap" computation if the def is not local or in // an immediate predecessor. We don't want to increase register pressure and // end up causing other computation to be spilled. @@ -395,6 +419,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { bool Changed = false; SmallVector<std::pair<unsigned, unsigned>, 8> CSEPairs; + SmallVector<unsigned, 2> ImplicitDefsToUpdate; for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E; ) { MachineInstr *MI = &*I; ++I; @@ -436,7 +461,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // used, then it's not safe to replace it with a common subexpression. // It's also not safe if the instruction uses physical registers. bool CrossMBBPhysDef = false; - SmallSet<unsigned,8> PhysRefs; + SmallSet<unsigned, 8> PhysRefs; SmallVector<unsigned, 2> PhysDefs; if (FoundCSE && hasLivePhysRegDefUses(MI, MBB, PhysRefs, PhysDefs)) { FoundCSE = false; @@ -464,21 +489,31 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // Check if it's profitable to perform this CSE. bool DoCSE = true; - unsigned NumDefs = MI->getDesc().getNumDefs(); + unsigned NumDefs = MI->getDesc().getNumDefs() + + MI->getDesc().getNumImplicitDefs(); + for (unsigned i = 0, e = MI->getNumOperands(); NumDefs && i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; unsigned OldReg = MO.getReg(); unsigned NewReg = CSMI->getOperand(i).getReg(); - if (OldReg == NewReg) + + // Go through implicit defs of CSMI and MI, if a def is not dead at MI, + // we should make sure it is not dead at CSMI. + if (MO.isImplicit() && !MO.isDead() && CSMI->getOperand(i).isDead()) + ImplicitDefsToUpdate.push_back(i); + if (OldReg == NewReg) { + --NumDefs; continue; + } assert(TargetRegisterInfo::isVirtualRegister(OldReg) && TargetRegisterInfo::isVirtualRegister(NewReg) && "Do not CSE physical register defs!"); if (!isProfitableToCSE(NewReg, OldReg, CSMI, MI)) { + DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); DoCSE = false; break; } @@ -487,6 +522,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { // within the register class of the new instruction. 
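The MayIncreasePressure heuristic added to isProfitableToCSE rests on a simple set-containment argument: if every instruction that uses Reg already uses CSReg, replacing Reg with CSReg cannot extend CSReg's live range across any new instruction. A simplified sketch with integer instruction ids in place of MachineInstr pointers:

#include <set>
#include <vector>

bool mayIncreasePressure(const std::vector<int> &UsesOfCSReg,
                         const std::vector<int> &UsesOfReg) {
  std::set<int> CSUses(UsesOfCSReg.begin(), UsesOfCSReg.end());
  for (std::vector<int>::const_iterator I = UsesOfReg.begin(),
       E = UsesOfReg.end(); I != E; ++I)
    if (!CSUses.count(*I))
      return true;   // Reg has a user that does not already read CSReg
  return false;
}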
const TargetRegisterClass *OldRC = MRI->getRegClass(OldReg); if (!MRI->constrainRegClass(NewReg, OldRC)) { + DEBUG(dbgs() << "*** Not the same register class, avoid CSE!\n"); DoCSE = false; break; } @@ -502,6 +538,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { MRI->clearKillFlags(CSEPairs[i].second); } + // Go through implicit defs of CSMI and MI, if a def is not dead at MI, + // we should make sure it is not dead at CSMI. + for (unsigned i = 0, e = ImplicitDefsToUpdate.size(); i != e; ++i) + CSMI->getOperand(ImplicitDefsToUpdate[i]).setIsDead(false); + if (CrossMBBPhysDef) { // Add physical register defs now coming in from a predecessor to MBB // livein list. @@ -521,11 +562,11 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { ++NumCommutes; Changed = true; } else { - DEBUG(dbgs() << "*** Not profitable, avoid CSE!\n"); VNT.insert(MI, CurrVN++); Exps.push_back(MI); } CSEPairs.clear(); + ImplicitDefsToUpdate.clear(); } return Changed; @@ -536,8 +577,7 @@ bool MachineCSE::ProcessBlock(MachineBasicBlock *MBB) { /// up the dominator tree to destroy ancestors which are now done. void MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, - DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren, - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> &ParentMap) { + DenseMap<MachineDomTreeNode*, unsigned> &OpenChildren) { if (OpenChildren[Node]) return; @@ -545,7 +585,7 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, ExitScope(Node->getBlock()); // Now traverse upwards to pop ancestors whose offsprings are all done. - while (MachineDomTreeNode *Parent = ParentMap[Node]) { + while (MachineDomTreeNode *Parent = Node->getIDom()) { unsigned Left = --OpenChildren[Parent]; if (Left != 0) break; @@ -557,7 +597,6 @@ MachineCSE::ExitScopeIfDone(MachineDomTreeNode *Node, bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { SmallVector<MachineDomTreeNode*, 32> Scopes; SmallVector<MachineDomTreeNode*, 8> WorkList; - DenseMap<MachineDomTreeNode*, MachineDomTreeNode*> ParentMap; DenseMap<MachineDomTreeNode*, unsigned> OpenChildren; CurrVN = 0; @@ -572,7 +611,6 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { OpenChildren[Node] = NumChildren; for (unsigned i = 0; i != NumChildren; ++i) { MachineDomTreeNode *Child = Children[i]; - ParentMap[Child] = Node; WorkList.push_back(Child); } } while (!WorkList.empty()); @@ -585,7 +623,7 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) { EnterScope(MBB); Changed |= ProcessBlock(MBB); // If it's a leaf node, it's done. Traverse upwards to pop ancestors. 
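ExitScopeIfDone no longer needs a separate ParentMap because the dominator tree already records each node's immediate dominator. A standalone sketch of the remaining bookkeeping, with a minimal DomNode type; the real pass also pops the scope (ExitScope) for each finished node, which is only noted in a comment here:

#include <map>

struct DomNode {
  DomNode *IDom;   // immediate dominator, null for the root
};

void exitScopeIfDone(DomNode *Node,
                     std::map<DomNode*, unsigned> &OpenChildren) {
  if (OpenChildren[Node])
    return;                        // Node still has unfinished children
  // The real pass exits Node's scope here, then walks up through immediate
  // dominators, exiting every ancestor whose last open child just finished.
  while (DomNode *Parent = Node->IDom) {
    if (--OpenChildren[Parent] != 0)
      break;
    Node = Parent;
  }
}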
- ExitScopeIfDone(Node, OpenChildren, ParentMap); + ExitScopeIfDone(Node, OpenChildren); } return Changed; diff --git a/lib/CodeGen/MachineFunctionPrinterPass.cpp b/lib/CodeGen/MachineFunctionPrinterPass.cpp index dbda93e4fd..0102ac708d 100644 --- a/lib/CodeGen/MachineFunctionPrinterPass.cpp +++ b/lib/CodeGen/MachineFunctionPrinterPass.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SlotIndexes.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/Debug.h" @@ -42,7 +43,7 @@ struct MachineFunctionPrinterPass : public MachineFunctionPass { bool runOnMachineFunction(MachineFunction &MF) { OS << "# " << Banner << ":\n"; - MF.print(OS); + MF.print(OS, getAnalysisIfAvailable<SlotIndexes>()); return false; } }; diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 5d16c20460..b166849946 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -47,55 +47,6 @@ using namespace llvm; // MachineOperand Implementation //===----------------------------------------------------------------------===// -/// AddRegOperandToRegInfo - Add this register operand to the specified -/// MachineRegisterInfo. If it is null, then the next/prev fields should be -/// explicitly nulled out. -void MachineOperand::AddRegOperandToRegInfo(MachineRegisterInfo *RegInfo) { - assert(isReg() && "Can only add reg operand to use lists"); - - // If the reginfo pointer is null, just explicitly null out or next/prev - // pointers, to ensure they are not garbage. - if (RegInfo == 0) { - Contents.Reg.Prev = 0; - Contents.Reg.Next = 0; - return; - } - - // Otherwise, add this operand to the head of the registers use/def list. - MachineOperand **Head = &RegInfo->getRegUseDefListHead(getReg()); - - // For SSA values, we prefer to keep the definition at the start of the list. - // we do this by skipping over the definition if it is at the head of the - // list. - if (*Head && (*Head)->isDef()) - Head = &(*Head)->Contents.Reg.Next; - - Contents.Reg.Next = *Head; - if (Contents.Reg.Next) { - assert(getReg() == Contents.Reg.Next->getReg() && - "Different regs on the same list!"); - Contents.Reg.Next->Contents.Reg.Prev = &Contents.Reg.Next; - } - - Contents.Reg.Prev = Head; - *Head = this; -} - -/// RemoveRegOperandFromRegInfo - Remove this register operand from the -/// MachineRegisterInfo it is linked with. -void MachineOperand::RemoveRegOperandFromRegInfo() { - assert(isOnRegUseList() && "Reg operand is not on a use list"); - // Unlink this from the doubly linked list of operands. - MachineOperand *NextOp = Contents.Reg.Next; - *Contents.Reg.Prev = NextOp; - if (NextOp) { - assert(NextOp->getReg() == getReg() && "Corrupt reg use/def chain!"); - NextOp->Contents.Reg.Prev = Contents.Reg.Prev; - } - Contents.Reg.Prev = 0; - Contents.Reg.Next = 0; -} - void MachineOperand::setReg(unsigned Reg) { if (getReg() == Reg) return; // No change. 
@@ -105,9 +56,10 @@ void MachineOperand::setReg(unsigned Reg) { if (MachineInstr *MI = getParent()) if (MachineBasicBlock *MBB = MI->getParent()) if (MachineFunction *MF = MBB->getParent()) { - RemoveRegOperandFromRegInfo(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MRI.removeRegOperandFromUseList(this); SmallContents.RegNo = Reg; - AddRegOperandToRegInfo(&MF->getRegInfo()); + MRI.addRegOperandToUseList(this); return; } @@ -136,15 +88,36 @@ void MachineOperand::substPhysReg(unsigned Reg, const TargetRegisterInfo &TRI) { setReg(Reg); } +/// Change a def to a use, or a use to a def. +void MachineOperand::setIsDef(bool Val) { + assert(isReg() && "Wrong MachineOperand accessor"); + assert((!Val || !isDebug()) && "Marking a debug operation as def"); + if (IsDef == Val) + return; + // MRI may keep uses and defs in different list positions. + if (MachineInstr *MI = getParent()) + if (MachineBasicBlock *MBB = MI->getParent()) + if (MachineFunction *MF = MBB->getParent()) { + MachineRegisterInfo &MRI = MF->getRegInfo(); + MRI.removeRegOperandFromUseList(this); + IsDef = Val; + MRI.addRegOperandToUseList(this); + return; + } + IsDef = Val; +} + /// ChangeToImmediate - Replace this operand with a new immediate operand of /// the specified value. If an operand is known to be an immediate already, /// the setImm method should be used. void MachineOperand::ChangeToImmediate(int64_t ImmVal) { // If this operand is currently a register operand, and if this is in a // function, deregister the operand from the register's use/def list. - if (isReg() && getParent() && getParent()->getParent() && - getParent()->getParent()->getParent()) - RemoveRegOperandFromRegInfo(); + if (isReg() && isOnRegUseList()) + if (MachineInstr *MI = getParent()) + if (MachineBasicBlock *MBB = MI->getParent()) + if (MachineFunction *MF = MBB->getParent()) + MF->getRegInfo().removeRegOperandFromUseList(this); OpKind = MO_Immediate; Contents.ImmVal = ImmVal; @@ -156,24 +129,20 @@ void MachineOperand::ChangeToImmediate(int64_t ImmVal) { void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp, bool isKill, bool isDead, bool isUndef, bool isDebug) { - // If this operand is already a register operand, use setReg to update the + MachineRegisterInfo *RegInfo = 0; + if (MachineInstr *MI = getParent()) + if (MachineBasicBlock *MBB = MI->getParent()) + if (MachineFunction *MF = MBB->getParent()) + RegInfo = &MF->getRegInfo(); + // If this operand is already a register operand, remove it from the // register's use/def lists. - if (isReg()) { - assert(!isEarlyClobber()); - setReg(Reg); - } else { - // Otherwise, change this to a register and set the reg#. - OpKind = MO_Register; - SmallContents.RegNo = Reg; - - // If this operand is embedded in a function, add the operand to the - // register's use/def list. - if (MachineInstr *MI = getParent()) - if (MachineBasicBlock *MBB = MI->getParent()) - if (MachineFunction *MF = MBB->getParent()) - AddRegOperandToRegInfo(&MF->getRegInfo()); - } + if (RegInfo && isReg()) + RegInfo->removeRegOperandFromUseList(this); + // Change this to a register and set the reg#. + OpKind = MO_Register; + SmallContents.RegNo = Reg; + SubReg = 0; IsDef = isDef; IsImp = isImp; IsKill = isKill; @@ -182,11 +151,18 @@ void MachineOperand::ChangeToRegister(unsigned Reg, bool isDef, bool isImp, IsInternalRead = false; IsEarlyClobber = false; IsDebug = isDebug; - SubReg = 0; + // Ensure isOnRegUseList() returns false. 
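Both setReg and the new out-of-line setIsDef follow the same pattern: because an operand's position in the register's use/def list depends on the register and on whether the operand is a def, the operand is unlinked, mutated, and re-inserted. A sketch of that pattern with a hypothetical UseList type standing in for MachineRegisterInfo:

#include <list>

struct Operand {
  unsigned Reg;
  bool IsDef;
};

struct UseList {
  std::list<Operand*> Ops;                 // defs first, then uses
  void remove(Operand *MO) { Ops.remove(MO); }
  void add(Operand *MO) {
    if (MO->IsDef)
      Ops.push_front(MO);                  // defs go to the front
    else
      Ops.push_back(MO);                   // uses go to the back
  }
};

void setIsDef(Operand *MO, UseList &List, bool Val) {
  if (MO->IsDef == Val)
    return;
  List.remove(MO);                         // unlink from the old position
  MO->IsDef = Val;
  List.add(MO);                            // re-insert at the right position
}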
+ Contents.Reg.Prev = 0; + + // If this operand is embedded in a function, add the operand to the + // register's use/def list. + if (RegInfo) + RegInfo->addRegOperandToUseList(this); } /// isIdenticalTo - Return true if this operand is identical to the specified -/// operand. +/// operand. Note that this should stay in sync with the hash_value overload +/// below. bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { if (getType() != Other.getType() || getTargetFlags() != Other.getTargetFlags()) @@ -207,6 +183,7 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { case MachineOperand::MO_FrameIndex: return getIndex() == Other.getIndex(); case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: return getIndex() == Other.getIndex() && getOffset() == Other.getOffset(); case MachineOperand::MO_JumpTableIndex: return getIndex() == Other.getIndex(); @@ -227,6 +204,47 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const { llvm_unreachable("Invalid machine operand type"); } +// Note: this must stay exactly in sync with isIdenticalTo above. +hash_code llvm::hash_value(const MachineOperand &MO) { + switch (MO.getType()) { + case MachineOperand::MO_Register: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getReg(), + MO.getSubReg(), MO.isDef()); + case MachineOperand::MO_Immediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getImm()); + case MachineOperand::MO_CImmediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getCImm()); + case MachineOperand::MO_FPImmediate: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getFPImm()); + case MachineOperand::MO_MachineBasicBlock: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMBB()); + case MachineOperand::MO_FrameIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex()); + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_TargetIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex(), + MO.getOffset()); + case MachineOperand::MO_JumpTableIndex: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIndex()); + case MachineOperand::MO_ExternalSymbol: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getOffset(), + MO.getSymbolName()); + case MachineOperand::MO_GlobalAddress: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getGlobal(), + MO.getOffset()); + case MachineOperand::MO_BlockAddress: + return hash_combine(MO.getType(), MO.getTargetFlags(), + MO.getBlockAddress()); + case MachineOperand::MO_RegisterMask: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask()); + case MachineOperand::MO_Metadata: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMetadata()); + case MachineOperand::MO_MCSymbol: + return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getMCSymbol()); + } + llvm_unreachable("Invalid machine operand type"); +} + /// print - Print the specified machine operand. 
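The comments around the new hash_value overload stress that it must hash exactly the fields isIdenticalTo compares, so that equal operands always produce equal hashes. A standalone illustration of that invariant with a reduced register-operand struct (not the MachineOperand fields themselves):

#include <cstddef>
#include <functional>

struct RegOp {
  unsigned Reg;
  unsigned SubReg;
  bool IsDef;
};

bool isIdenticalTo(const RegOp &A, const RegOp &B) {
  return A.Reg == B.Reg && A.SubReg == B.SubReg && A.IsDef == B.IsDef;
}

// Must stay in sync with isIdenticalTo: combine exactly the compared fields,
// so A == B implies hashValue(A) == hashValue(B).
std::size_t hashValue(const RegOp &MO) {
  std::size_t H = std::hash<unsigned>()(MO.Reg);
  H = H * 31 + std::hash<unsigned>()(MO.SubReg);
  H = H * 31 + std::hash<bool>()(MO.IsDef);
  return H;
}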
/// void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { @@ -312,6 +330,11 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const { if (getOffset()) OS << "+" << getOffset(); OS << '>'; break; + case MachineOperand::MO_TargetIndex: + OS << "<ti#" << getIndex(); + if (getOffset()) OS << "+" << getOffset(); + OS << '>'; + break; case MachineOperand::MO_JumpTableIndex: OS << "<jt#" << getIndex() << '>'; break; @@ -609,24 +632,21 @@ MachineRegisterInfo *MachineInstr::getRegInfo() { /// RemoveRegOperandsFromUseLists - Unlink all of the register operands in /// this instruction from their respective use lists. This requires that the /// operands already be on their use lists. -void MachineInstr::RemoveRegOperandsFromUseLists() { - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { +void MachineInstr::RemoveRegOperandsFromUseLists(MachineRegisterInfo &MRI) { + for (unsigned i = 0, e = Operands.size(); i != e; ++i) if (Operands[i].isReg()) - Operands[i].RemoveRegOperandFromRegInfo(); - } + MRI.removeRegOperandFromUseList(&Operands[i]); } /// AddRegOperandsToUseLists - Add all of the register operands in /// this instruction from their respective use lists. This requires that the /// operands not be on their use lists yet. -void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &RegInfo) { - for (unsigned i = 0, e = Operands.size(); i != e; ++i) { +void MachineInstr::AddRegOperandsToUseLists(MachineRegisterInfo &MRI) { + for (unsigned i = 0, e = Operands.size(); i != e; ++i) if (Operands[i].isReg()) - Operands[i].AddRegOperandToRegInfo(&RegInfo); - } + MRI.addRegOperandToUseList(&Operands[i]); } - /// addOperand - Add the specified operand to the instruction. If it is an /// implicit operand, it is added to the end of the operand list. If it is /// an explicit operand it is added at the end of the explicit operand list @@ -654,13 +674,15 @@ void MachineInstr::addOperand(const MachineOperand &Op) { while (OpNo && Operands[OpNo-1].isReg() && Operands[OpNo-1].isImplicit()) { --OpNo; if (RegInfo) - Operands[OpNo].RemoveRegOperandFromRegInfo(); + RegInfo->removeRegOperandFromUseList(&Operands[OpNo]); } } // OpNo now points as the desired insertion point. Unless this is a variadic // instruction, only implicit regs are allowed beyond MCID->getNumOperands(). - assert((isImpReg || MCID->isVariadic() || OpNo < MCID->getNumOperands()) && + // RegMask operands go between the explicit and implicit operands. + assert((isImpReg || Op.isRegMask() || MCID->isVariadic() || + OpNo < MCID->getNumOperands()) && "Trying to add an operand to a machine instr that is already done!"); // All operands from OpNo have been removed from RegInfo. If the Operands @@ -669,7 +691,7 @@ void MachineInstr::addOperand(const MachineOperand &Op) { if (Reallocate) for (unsigned i = 0; i != OpNo; ++i) if (Operands[i].isReg()) - Operands[i].RemoveRegOperandFromRegInfo(); + RegInfo->removeRegOperandFromUseList(&Operands[i]); // Insert the new operand at OpNo. Operands.insert(Operands.begin() + OpNo, Op); @@ -680,13 +702,15 @@ void MachineInstr::addOperand(const MachineOperand &Op) { if (Reallocate) for (unsigned i = 0; i != OpNo; ++i) if (Operands[i].isReg()) - Operands[i].AddRegOperandToRegInfo(RegInfo); + RegInfo->addRegOperandToUseList(&Operands[i]); // When adding a register operand, tell RegInfo about it. if (Operands[OpNo].isReg()) { - // Add the new operand to RegInfo, even when RegInfo is NULL. - // This will initialize the linked list pointers. 
- Operands[OpNo].AddRegOperandToRegInfo(RegInfo); + // Ensure isOnRegUseList() returns false, regardless of Op's status. + Operands[OpNo].Contents.Reg.Prev = 0; + // Add the new operand to RegInfo. + if (RegInfo) + RegInfo->addRegOperandToUseList(&Operands[OpNo]); // If the register operand is flagged as early, mark the operand as such. if (MCID->getOperandConstraint(OpNo, MCOI::EARLY_CLOBBER) != -1) Operands[OpNo].setIsEarlyClobber(true); @@ -696,7 +720,7 @@ void MachineInstr::addOperand(const MachineOperand &Op) { if (RegInfo) { for (unsigned i = OpNo + 1, e = Operands.size(); i != e; ++i) { assert(Operands[i].isReg() && "Should only be an implicit reg!"); - Operands[i].AddRegOperandToRegInfo(RegInfo); + RegInfo->addRegOperandToUseList(&Operands[i]); } } } @@ -706,12 +730,13 @@ void MachineInstr::addOperand(const MachineOperand &Op) { /// void MachineInstr::RemoveOperand(unsigned OpNo) { assert(OpNo < Operands.size() && "Invalid operand number"); + MachineRegisterInfo *RegInfo = getRegInfo(); // Special case removing the last one. if (OpNo == Operands.size()-1) { // If needed, remove from the reg def/use list. - if (Operands.back().isReg() && Operands.back().isOnRegUseList()) - Operands.back().RemoveRegOperandFromRegInfo(); + if (RegInfo && Operands.back().isReg() && Operands.back().isOnRegUseList()) + RegInfo->removeRegOperandFromUseList(&Operands.back()); Operands.pop_back(); return; @@ -720,11 +745,10 @@ void MachineInstr::RemoveOperand(unsigned OpNo) { // Otherwise, we are removing an interior operand. If we have reginfo to // update, remove all operands that will be shifted down from their reg lists, // move everything down, then re-add them. - MachineRegisterInfo *RegInfo = getRegInfo(); if (RegInfo) { for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) { if (Operands[i].isReg()) - Operands[i].RemoveRegOperandFromRegInfo(); + RegInfo->removeRegOperandFromUseList(&Operands[i]); } } @@ -733,7 +757,7 @@ void MachineInstr::RemoveOperand(unsigned OpNo) { if (RegInfo) { for (unsigned i = OpNo, e = Operands.size(); i != e; ++i) { if (Operands[i].isReg()) - Operands[i].AddRegOperandToRegInfo(RegInfo); + RegInfo->addRegOperandToUseList(&Operands[i]); } } } @@ -1852,52 +1876,16 @@ void MachineInstr::setPhysRegsDeadExcept(ArrayRef<unsigned> UsedRegs, unsigned MachineInstrExpressionTrait::getHashValue(const MachineInstr* const &MI) { // Build up a buffer of hash code components. - // - // FIXME: This is a total hack. We should have a hash_value overload for - // MachineOperand, but currently that doesn't work because there are many - // different ideas of "equality" and thus different sets of information that - // contribute to the hash code. This one happens to want to take a specific - // subset. And it's still not clear that this routine uses the *correct* - // subset of information when computing the hash code. The goal is to use the - // same inputs for the hash code here that MachineInstr::isIdenticalTo uses to - // test for equality when passed the 'IgnoreVRegDefs' filter flag. It would - // be very useful to factor the selection of relevant inputs out of the two - // functions and into a common routine, but it's not clear how that can be - // done. 
SmallVector<size_t, 8> HashComponents; HashComponents.reserve(MI->getNumOperands() + 1); HashComponents.push_back(MI->getOpcode()); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); - switch (MO.getType()) { - default: break; - case MachineOperand::MO_Register: - if (MO.isDef() && TargetRegisterInfo::isVirtualRegister(MO.getReg())) - continue; // Skip virtual register defs. - HashComponents.push_back(hash_combine(MO.getType(), MO.getReg())); - break; - case MachineOperand::MO_Immediate: - HashComponents.push_back(hash_combine(MO.getType(), MO.getImm())); - break; - case MachineOperand::MO_FrameIndex: - case MachineOperand::MO_ConstantPoolIndex: - case MachineOperand::MO_JumpTableIndex: - HashComponents.push_back(hash_combine(MO.getType(), MO.getIndex())); - break; - case MachineOperand::MO_MachineBasicBlock: - HashComponents.push_back(hash_combine(MO.getType(), MO.getMBB())); - break; - case MachineOperand::MO_GlobalAddress: - HashComponents.push_back(hash_combine(MO.getType(), MO.getGlobal())); - break; - case MachineOperand::MO_BlockAddress: - HashComponents.push_back(hash_combine(MO.getType(), - MO.getBlockAddress())); - break; - case MachineOperand::MO_MCSymbol: - HashComponents.push_back(hash_combine(MO.getType(), MO.getMCSymbol())); - break; - } + if (MO.isReg() && MO.isDef() && + TargetRegisterInfo::isVirtualRegister(MO.getReg())) + continue; // Skip virtual register defs. + + HashComponents.push_back(hash_value(MO)); } return hash_combine_range(HashComponents.begin(), HashComponents.end()); } diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp index 82e123528a..5fb938f340 100644 --- a/lib/CodeGen/MachineRegisterInfo.cpp +++ b/lib/CodeGen/MachineRegisterInfo.cpp @@ -102,17 +102,9 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass){ // New virtual register number. unsigned Reg = TargetRegisterInfo::index2VirtReg(getNumVirtRegs()); - - // Add a reg, but keep track of whether the vector reallocated or not. - const unsigned FirstVirtReg = TargetRegisterInfo::index2VirtReg(0); - void *ArrayBase = getNumVirtRegs() == 0 ? 0 : &VRegInfo[FirstVirtReg]; VRegInfo.grow(Reg); VRegInfo[Reg].first = RegClass; RegAllocHints.grow(Reg); - - if (ArrayBase && &VRegInfo[FirstVirtReg] != ArrayBase) - // The vector reallocated, handle this now. - HandleVRegListReallocation(); return Reg; } @@ -126,21 +118,68 @@ void MachineRegisterInfo::clearVirtRegs() { VRegInfo.clear(); } -/// HandleVRegListReallocation - We just added a virtual register to the -/// VRegInfo info list and it reallocated. Update the use/def lists info -/// pointers. -void MachineRegisterInfo::HandleVRegListReallocation() { - // The back pointers for the vreg lists point into the previous vector. - // Update them to point to their correct slots. - for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(i); - MachineOperand *List = VRegInfo[Reg].second; - if (!List) continue; - // Update the back-pointer to be accurate once more. - List->Contents.Reg.Prev = &VRegInfo[Reg].second; +/// Add MO to the linked list of operands for its register. +void MachineRegisterInfo::addRegOperandToUseList(MachineOperand *MO) { + assert(!MO->isOnRegUseList() && "Already on list"); + MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg()); + MachineOperand *const Head = HeadRef; + + // Head points to the first list element. + // Next is NULL on the last list element. 
+ // Prev pointers are circular, so Head->Prev == Last. + + // Head is NULL for an empty list. + if (!Head) { + MO->Contents.Reg.Prev = MO; + MO->Contents.Reg.Next = 0; + HeadRef = MO; + return; + } + assert(MO->getReg() == Head->getReg() && "Different regs on the same list!"); + + // Insert MO between Last and Head in the circular Prev chain. + MachineOperand *Last = Head->Contents.Reg.Prev; + assert(Last && "Inconsistent use list"); + assert(MO->getReg() == Last->getReg() && "Different regs on the same list!"); + Head->Contents.Reg.Prev = MO; + MO->Contents.Reg.Prev = Last; + + // Def operands always precede uses. This allows def_iterator to stop early. + // Insert def operands at the front, and use operands at the back. + if (MO->isDef()) { + // Insert def at the front. + MO->Contents.Reg.Next = Head; + HeadRef = MO; + } else { + // Insert use at the end. + MO->Contents.Reg.Next = 0; + Last->Contents.Reg.Next = MO; } } +/// Remove MO from its use-def list. +void MachineRegisterInfo::removeRegOperandFromUseList(MachineOperand *MO) { + assert(MO->isOnRegUseList() && "Operand not on use list"); + MachineOperand *&HeadRef = getRegUseDefListHead(MO->getReg()); + MachineOperand *const Head = HeadRef; + assert(Head && "List already empty"); + + // Unlink this from the doubly linked list of operands. + MachineOperand *Next = MO->Contents.Reg.Next; + MachineOperand *Prev = MO->Contents.Reg.Prev; + + // Prev links are circular, next link is NULL instead of looping back to Head. + if (MO == Head) + HeadRef = Next; + else + Prev->Contents.Reg.Next = Next; + + (Next ? Next : Head)->Contents.Reg.Prev = Prev; + + MO->Contents.Reg.Prev = 0; + MO->Contents.Reg.Next = 0; +} + /// replaceRegWith - Replace all instances of FromReg with ToReg in the /// machine function. This is like llvm-level X->replaceAllUsesWith(Y), /// except that it also changes any definitions of the register as well. @@ -178,13 +217,6 @@ MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const { return &*I; } -bool MachineRegisterInfo::hasOneUse(unsigned RegNo) const { - use_iterator UI = use_begin(RegNo); - if (UI == use_end()) - return false; - return ++UI == use_end(); -} - bool MachineRegisterInfo::hasOneNonDBGUse(unsigned RegNo) const { use_nodbg_iterator UI = use_nodbg_begin(RegNo); if (UI == use_nodbg_end()) diff --git a/lib/CodeGen/MachineSSAUpdater.cpp b/lib/CodeGen/MachineSSAUpdater.cpp index acb1ee6cb6..076547a5ed 100644 --- a/lib/CodeGen/MachineSSAUpdater.cpp +++ b/lib/CodeGen/MachineSSAUpdater.cpp @@ -42,7 +42,7 @@ MachineSSAUpdater::MachineSSAUpdater(MachineFunction &MF, } MachineSSAUpdater::~MachineSSAUpdater() { - delete &getAvailableVals(AV); + delete static_cast<AvailableValsTy*>(AV); } /// Initialize - Reset this object to get ready for a new set of SSA diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp index 847bf1e76e..a1dc9481c6 100644 --- a/lib/CodeGen/MachineScheduler.cpp +++ b/lib/CodeGen/MachineScheduler.cpp @@ -212,7 +212,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { Scheduler->startBlock(MBB); // Break the block into scheduling regions [I, RegionEnd), and schedule each - // region as soon as it is discovered. RegionEnd points the the scheduling + // region as soon as it is discovered. RegionEnd points the scheduling // boundary at the bottom of the region. The DAG does not include RegionEnd, // but the region does (i.e. the next RegionEnd is above the previous // RegionBegin). 
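The list layout described above is the key to this commit: Next pointers form an ordinary null-terminated chain, Prev pointers are circular so the tail is reachable from the head in constant time, and defs are inserted at the front while uses go to the back so def_iterator can stop at the first use. A standalone model of the two operations with a plain Node struct in place of MachineOperand:

#include <cassert>
#include <cstddef>

struct Node {
  bool IsDef;
  Node *Prev;   // circular: the head's Prev points at the tail
  Node *Next;   // null on the last element
};

void addToUseList(Node *&HeadRef, Node *MO) {
  if (!HeadRef) {
    MO->Prev = MO;                 // single element: Prev points at itself
    MO->Next = 0;
    HeadRef = MO;
    return;
  }
  Node *Head = HeadRef;
  Node *Last = Head->Prev;         // tail reachable in O(1) via Head->Prev
  Head->Prev = MO;
  MO->Prev = Last;
  if (MO->IsDef) {                 // defs precede uses
    MO->Next = Head;
    HeadRef = MO;
  } else {                         // uses go to the back
    MO->Next = 0;
    Last->Next = MO;
  }
}

void removeFromUseList(Node *&HeadRef, Node *MO) {
  Node *const Head = HeadRef;
  assert(Head && "List already empty");
  Node *Next = MO->Next;
  Node *Prev = MO->Prev;
  if (MO == Head)
    HeadRef = Next;
  else
    Prev->Next = Next;
  (Next ? Next : Head)->Prev = Prev;   // keep the circular Prev chain intact
  MO->Prev = MO->Next = 0;
}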
If the current block has no terminator then RegionEnd == @@ -403,7 +403,8 @@ public: /// getIssueWidth - Return the max instructions per scheduling group. unsigned getIssueWidth() const { - return InstrItins ? InstrItins->Props.IssueWidth : 1; + return (InstrItins && InstrItins->SchedModel) + ? InstrItins->SchedModel->IssueWidth : 1; } /// getNumMicroOps - Return the number of issue slots required for this MI. diff --git a/lib/CodeGen/MachineSink.cpp b/lib/CodeGen/MachineSink.cpp index 1ce546b578..bc383cba45 100644 --- a/lib/CodeGen/MachineSink.cpp +++ b/lib/CodeGen/MachineSink.cpp @@ -99,6 +99,16 @@ namespace { bool PerformTrivialForwardCoalescing(MachineInstr *MI, MachineBasicBlock *MBB); }; + + // SuccessorSorter - Sort Successors according to their loop depth. + struct SuccessorSorter { + SuccessorSorter(MachineLoopInfo *LoopInfo) : LI(LoopInfo) {} + bool operator()(const MachineBasicBlock *LHS, + const MachineBasicBlock *RHS) const { + return LI->getLoopDepth(LHS) < LI->getLoopDepth(RHS); + } + MachineLoopInfo *LI; + }; } // end anonymous namespace char MachineSinking::ID = 0; @@ -526,8 +536,11 @@ MachineBasicBlock *MachineSinking::FindSuccToSinkTo(MachineInstr *MI, // Otherwise, we should look at all the successors and decide which one // we should sink to. - for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(), - E = MBB->succ_end(); SI != E; ++SI) { + // We give successors with smaller loop depth higher priority. + SmallVector<MachineBasicBlock*, 4> Succs(MBB->succ_begin(), MBB->succ_end()); + std::stable_sort(Succs.begin(), Succs.end(), SuccessorSorter(LI)); + for (SmallVector<MachineBasicBlock*, 4>::iterator SI = Succs.begin(), + E = Succs.end(); SI != E; ++SI) { MachineBasicBlock *SuccBlock = *SI; bool LocalUse = false; if (AllUsesDominatedByBlock(Reg, SuccBlock, MBB, diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp new file mode 100644 index 0000000000..1a3aa60918 --- /dev/null +++ b/lib/CodeGen/MachineTraceMetrics.cpp @@ -0,0 +1,1153 @@ +//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
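The MachineSink change orders candidate successors by loop depth before the profitability scan, so a successor at a shallower loop depth is tried first; std::stable_sort keeps the original successor order for ties. A standalone sketch with integer block ids and a depth table in place of MachineLoopInfo:

#include <algorithm>
#include <map>
#include <vector>

struct ByLoopDepth {
  const std::map<int, unsigned> &Depth;    // block id -> loop depth
  explicit ByLoopDepth(const std::map<int, unsigned> &D) : Depth(D) {}
  bool operator()(int LHS, int RHS) const {
    return Depth.at(LHS) < Depth.at(RHS);
  }
};

std::vector<int> orderSinkCandidates(std::vector<int> Succs,
                                     const std::map<int, unsigned> &Depth) {
  std::stable_sort(Succs.begin(), Succs.end(), ByLoopDepth(Depth));
  return Succs;
}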
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "machine-trace-metrics" +#include "MachineTraceMetrics.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCInstrItineraries.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SparseSet.h" + +using namespace llvm; + +char MachineTraceMetrics::ID = 0; +char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; + +INITIALIZE_PASS_BEGIN(MachineTraceMetrics, + "machine-trace-metrics", "Machine Trace Metrics", false, true) +INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(MachineTraceMetrics, + "machine-trace-metrics", "Machine Trace Metrics", false, true) + +MachineTraceMetrics::MachineTraceMetrics() + : MachineFunctionPass(ID), MF(0), TII(0), TRI(0), MRI(0), Loops(0) { + std::fill(Ensembles, array_endof(Ensembles), (Ensemble*)0); +} + +void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const { + AU.setPreservesAll(); + AU.addRequired<MachineBranchProbabilityInfo>(); + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + TII = MF->getTarget().getInstrInfo(); + TRI = MF->getTarget().getRegisterInfo(); + ItinData = MF->getTarget().getInstrItineraryData(); + MRI = &MF->getRegInfo(); + Loops = &getAnalysis<MachineLoopInfo>(); + BlockInfo.resize(MF->getNumBlockIDs()); + return false; +} + +void MachineTraceMetrics::releaseMemory() { + MF = 0; + BlockInfo.clear(); + for (unsigned i = 0; i != TS_NumStrategies; ++i) { + delete Ensembles[i]; + Ensembles[i] = 0; + } +} + +//===----------------------------------------------------------------------===// +// Fixed block information +//===----------------------------------------------------------------------===// +// +// The number of instructions in a basic block and the CPU resources used by +// those instructions don't depend on any given trace strategy. + +/// Compute the resource usage in basic block MBB. +const MachineTraceMetrics::FixedBlockInfo* +MachineTraceMetrics::getResources(const MachineBasicBlock *MBB) { + assert(MBB && "No basic block"); + FixedBlockInfo *FBI = &BlockInfo[MBB->getNumber()]; + if (FBI->hasResources()) + return FBI; + + // Compute resource usage in the block. + // FIXME: Compute per-functional unit counts. + FBI->HasCalls = false; + unsigned InstrCount = 0; + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + const MachineInstr *MI = I; + if (MI->isTransient()) + continue; + ++InstrCount; + if (MI->isCall()) + FBI->HasCalls = true; + } + FBI->InstrCount = InstrCount; + return FBI; +} + +//===----------------------------------------------------------------------===// +// Ensemble utility functions +//===----------------------------------------------------------------------===// + +MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct) + : MTM(*ct) { + BlockInfo.resize(MTM.BlockInfo.size()); +} + +// Virtual destructor serves as an anchor. 
+MachineTraceMetrics::Ensemble::~Ensemble() {} + +const MachineLoop* +MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const { + return MTM.Loops->getLoopFor(MBB); +} + +// Update resource-related information in the TraceBlockInfo for MBB. +// Only update resources related to the trace above MBB. +void MachineTraceMetrics::Ensemble:: +computeDepthResources(const MachineBasicBlock *MBB) { + TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; + + // Compute resources from trace above. The top block is simple. + if (!TBI->Pred) { + TBI->InstrDepth = 0; + TBI->Head = MBB->getNumber(); + return; + } + + // Compute from the block above. A post-order traversal ensures the + // predecessor is always computed first. + TraceBlockInfo *PredTBI = &BlockInfo[TBI->Pred->getNumber()]; + assert(PredTBI->hasValidDepth() && "Trace above has not been computed yet"); + const FixedBlockInfo *PredFBI = MTM.getResources(TBI->Pred); + TBI->InstrDepth = PredTBI->InstrDepth + PredFBI->InstrCount; + TBI->Head = PredTBI->Head; +} + +// Update resource-related information in the TraceBlockInfo for MBB. +// Only update resources related to the trace below MBB. +void MachineTraceMetrics::Ensemble:: +computeHeightResources(const MachineBasicBlock *MBB) { + TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; + + // Compute resources for the current block. + TBI->InstrHeight = MTM.getResources(MBB)->InstrCount; + + // The trace tail is done. + if (!TBI->Succ) { + TBI->Tail = MBB->getNumber(); + return; + } + + // Compute from the block below. A post-order traversal ensures the + // predecessor is always computed first. + TraceBlockInfo *SuccTBI = &BlockInfo[TBI->Succ->getNumber()]; + assert(SuccTBI->hasValidHeight() && "Trace below has not been computed yet"); + TBI->InstrHeight += SuccTBI->InstrHeight; + TBI->Tail = SuccTBI->Tail; +} + +// Check if depth resources for MBB are valid and return the TBI. +// Return NULL if the resources have been invalidated. +const MachineTraceMetrics::TraceBlockInfo* +MachineTraceMetrics::Ensemble:: +getDepthResources(const MachineBasicBlock *MBB) const { + const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; + return TBI->hasValidDepth() ? TBI : 0; +} + +// Check if height resources for MBB are valid and return the TBI. +// Return NULL if the resources have been invalidated. +const MachineTraceMetrics::TraceBlockInfo* +MachineTraceMetrics::Ensemble:: +getHeightResources(const MachineBasicBlock *MBB) const { + const TraceBlockInfo *TBI = &BlockInfo[MBB->getNumber()]; + return TBI->hasValidHeight() ? TBI : 0; +} + +//===----------------------------------------------------------------------===// +// Trace Selection Strategies +//===----------------------------------------------------------------------===// +// +// A trace selection strategy is implemented as a sub-class of Ensemble. The +// trace through a block B is computed by two DFS traversals of the CFG +// starting from B. One upwards, and one downwards. During the upwards DFS, +// pickTracePred() is called on the post-ordered blocks. During the downwards +// DFS, pickTraceSucc() is called in a post-order. +// + +// We never allow traces that leave loops, but we do allow traces to enter +// nested loops. We also never allow traces to contain back-edges. +// +// This means that a loop header can never appear above the center block of a +// trace, except as the trace head. Below the center block, loop exiting edges +// are banned. 
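Once pickTracePred and pickTraceSucc have been run, a block's trace is just the chain of chosen predecessors above it and chosen successors below it, and the per-block resources compose additively: InstrDepth counts instructions strictly above the block, InstrHeight counts the block itself and everything below. A standalone sketch of that composition, with blocks modelled as an array indexed by block number (field names follow TraceBlockInfo, the layout is simplified):

#include <vector>

struct BlockCounts {
  int Pred;            // chosen predecessor block number, -1 for trace head
  int Succ;            // chosen successor block number, -1 for trace tail
  unsigned InstrCount; // non-transient instructions in this block
};

unsigned traceInstrCount(const std::vector<BlockCounts> &Blocks, int Center) {
  unsigned Depth = 0;  // instructions strictly above the center block
  for (int B = Blocks[Center].Pred; B != -1; B = Blocks[B].Pred)
    Depth += Blocks[B].InstrCount;
  unsigned Height = 0; // instructions in the center block and below
  for (int B = Center; B != -1; B = Blocks[B].Succ)
    Height += Blocks[B].InstrCount;
  return Depth + Height;
}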
+// +// Return true if an edge from the From loop to the To loop is leaving a loop. +// Either of To and From can be null. +static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) { + return From && !From->contains(To); +} + +// MinInstrCountEnsemble - Pick the trace that executes the least number of +// instructions. +namespace { +class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble { + const char *getName() const { return "MinInstr"; } + const MachineBasicBlock *pickTracePred(const MachineBasicBlock*); + const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*); + +public: + MinInstrCountEnsemble(MachineTraceMetrics *mtm) + : MachineTraceMetrics::Ensemble(mtm) {} +}; +} + +// Select the preferred predecessor for MBB. +const MachineBasicBlock* +MinInstrCountEnsemble::pickTracePred(const MachineBasicBlock *MBB) { + if (MBB->pred_empty()) + return 0; + const MachineLoop *CurLoop = getLoopFor(MBB); + // Don't leave loops, and never follow back-edges. + if (CurLoop && MBB == CurLoop->getHeader()) + return 0; + unsigned CurCount = MTM.getResources(MBB)->InstrCount; + const MachineBasicBlock *Best = 0; + unsigned BestDepth = 0; + for (MachineBasicBlock::const_pred_iterator + I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) { + const MachineBasicBlock *Pred = *I; + const MachineTraceMetrics::TraceBlockInfo *PredTBI = + getDepthResources(Pred); + // Ignore cycles that aren't natural loops. + if (!PredTBI) + continue; + // Pick the predecessor that would give this block the smallest InstrDepth. + unsigned Depth = PredTBI->InstrDepth + CurCount; + if (!Best || Depth < BestDepth) + Best = Pred, BestDepth = Depth; + } + return Best; +} + +// Select the preferred successor for MBB. +const MachineBasicBlock* +MinInstrCountEnsemble::pickTraceSucc(const MachineBasicBlock *MBB) { + if (MBB->pred_empty()) + return 0; + const MachineLoop *CurLoop = getLoopFor(MBB); + const MachineBasicBlock *Best = 0; + unsigned BestHeight = 0; + for (MachineBasicBlock::const_succ_iterator + I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { + const MachineBasicBlock *Succ = *I; + // Don't consider back-edges. + if (CurLoop && Succ == CurLoop->getHeader()) + continue; + // Don't consider successors exiting CurLoop. + if (isExitingLoop(CurLoop, getLoopFor(Succ))) + continue; + const MachineTraceMetrics::TraceBlockInfo *SuccTBI = + getHeightResources(Succ); + // Ignore cycles that aren't natural loops. + if (!SuccTBI) + continue; + // Pick the successor that would give this block the smallest InstrHeight. + unsigned Height = SuccTBI->InstrHeight; + if (!Best || Height < BestHeight) + Best = Succ, BestHeight = Height; + } + return Best; +} + +// Get an Ensemble sub-class for the requested trace strategy. +MachineTraceMetrics::Ensemble * +MachineTraceMetrics::getEnsemble(MachineTraceMetrics::Strategy strategy) { + assert(strategy < TS_NumStrategies && "Invalid trace strategy enum"); + Ensemble *&E = Ensembles[strategy]; + if (E) + return E; + + // Allocate new Ensemble on demand. 
+ switch (strategy) { + case TS_MinInstrCount: return (E = new MinInstrCountEnsemble(this)); + default: llvm_unreachable("Invalid trace strategy enum"); + } +} + +void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) { + DEBUG(dbgs() << "Invalidate traces through BB#" << MBB->getNumber() << '\n'); + BlockInfo[MBB->getNumber()].invalidate(); + for (unsigned i = 0; i != TS_NumStrategies; ++i) + if (Ensembles[i]) + Ensembles[i]->invalidate(MBB); +} + +void MachineTraceMetrics::verifyAnalysis() const { + if (!MF) + return; +#ifndef NDEBUG + assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size"); + for (unsigned i = 0; i != TS_NumStrategies; ++i) + if (Ensembles[i]) + Ensembles[i]->verify(); +#endif +} + +//===----------------------------------------------------------------------===// +// Trace building +//===----------------------------------------------------------------------===// +// +// Traces are built by two CFG traversals. To avoid recomputing too much, use a +// set abstraction that confines the search to the current loop, and doesn't +// revisit blocks. + +namespace { +struct LoopBounds { + MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks; + SmallPtrSet<const MachineBasicBlock*, 8> Visited; + const MachineLoopInfo *Loops; + bool Downward; + LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks, + const MachineLoopInfo *loops) + : Blocks(blocks), Loops(loops), Downward(false) {} +}; +} + +// Specialize po_iterator_storage in order to prune the post-order traversal so +// it is limited to the current loop and doesn't traverse the loop back edges. +namespace llvm { +template<> +class po_iterator_storage<LoopBounds, true> { + LoopBounds &LB; +public: + po_iterator_storage(LoopBounds &lb) : LB(lb) {} + void finishPostorder(const MachineBasicBlock*) {} + + bool insertEdge(const MachineBasicBlock *From, const MachineBasicBlock *To) { + // Skip already visited To blocks. + MachineTraceMetrics::TraceBlockInfo &TBI = LB.Blocks[To->getNumber()]; + if (LB.Downward ? TBI.hasValidHeight() : TBI.hasValidDepth()) + return false; + // From is null once when To is the trace center block. + if (From) { + if (const MachineLoop *FromLoop = LB.Loops->getLoopFor(From)) { + // Don't follow backedges, don't leave FromLoop when going upwards. + if ((LB.Downward ? To : From) == FromLoop->getHeader()) + return false; + // Don't leave FromLoop. + if (isExitingLoop(FromLoop, LB.Loops->getLoopFor(To))) + return false; + } + } + // To is a new block. Mark the block as visited in case the CFG has cycles + // that MachineLoopInfo didn't recognize as a natural loop. + return LB.Visited.insert(To); + } +}; +} + +/// Compute the trace through MBB. +void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) { + DEBUG(dbgs() << "Computing " << getName() << " trace through BB#" + << MBB->getNumber() << '\n'); + // Set up loop bounds for the backwards post-order traversal. + LoopBounds Bounds(BlockInfo, MTM.Loops); + + // Run an upwards post-order search for the trace start. + Bounds.Downward = false; + Bounds.Visited.clear(); + typedef ipo_ext_iterator<const MachineBasicBlock*, LoopBounds> UpwardPO; + for (UpwardPO I = ipo_ext_begin(MBB, Bounds), E = ipo_ext_end(MBB, Bounds); + I != E; ++I) { + DEBUG(dbgs() << " pred for BB#" << I->getNumber() << ": "); + TraceBlockInfo &TBI = BlockInfo[I->getNumber()]; + // All the predecessors have been visited, pick the preferred one. 
+ TBI.Pred = pickTracePred(*I); + DEBUG({ + if (TBI.Pred) + dbgs() << "BB#" << TBI.Pred->getNumber() << '\n'; + else + dbgs() << "null\n"; + }); + // The trace leading to I is now known, compute the depth resources. + computeDepthResources(*I); + } + + // Run a downwards post-order search for the trace end. + Bounds.Downward = true; + Bounds.Visited.clear(); + typedef po_ext_iterator<const MachineBasicBlock*, LoopBounds> DownwardPO; + for (DownwardPO I = po_ext_begin(MBB, Bounds), E = po_ext_end(MBB, Bounds); + I != E; ++I) { + DEBUG(dbgs() << " succ for BB#" << I->getNumber() << ": "); + TraceBlockInfo &TBI = BlockInfo[I->getNumber()]; + // All the successors have been visited, pick the preferred one. + TBI.Succ = pickTraceSucc(*I); + DEBUG({ + if (TBI.Succ) + dbgs() << "BB#" << TBI.Succ->getNumber() << '\n'; + else + dbgs() << "null\n"; + }); + // The trace leaving I is now known, compute the height resources. + computeHeightResources(*I); + } +} + +/// Invalidate traces through BadMBB. +void +MachineTraceMetrics::Ensemble::invalidate(const MachineBasicBlock *BadMBB) { + SmallVector<const MachineBasicBlock*, 16> WorkList; + TraceBlockInfo &BadTBI = BlockInfo[BadMBB->getNumber()]; + + // Invalidate height resources of blocks above MBB. + if (BadTBI.hasValidHeight()) { + BadTBI.invalidateHeight(); + WorkList.push_back(BadMBB); + do { + const MachineBasicBlock *MBB = WorkList.pop_back_val(); + DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName() + << " height.\n"); + // Find any MBB predecessors that have MBB as their preferred successor. + // They are the only ones that need to be invalidated. + for (MachineBasicBlock::const_pred_iterator + I = MBB->pred_begin(), E = MBB->pred_end(); I != E; ++I) { + TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()]; + if (!TBI.hasValidHeight()) + continue; + if (TBI.Succ == MBB) { + TBI.invalidateHeight(); + WorkList.push_back(*I); + continue; + } + // Verify that TBI.Succ is actually a *I successor. + assert((!TBI.Succ || (*I)->isSuccessor(TBI.Succ)) && "CFG changed"); + } + } while (!WorkList.empty()); + } + + // Invalidate depth resources of blocks below MBB. + if (BadTBI.hasValidDepth()) { + BadTBI.invalidateDepth(); + WorkList.push_back(BadMBB); + do { + const MachineBasicBlock *MBB = WorkList.pop_back_val(); + DEBUG(dbgs() << "Invalidate BB#" << MBB->getNumber() << ' ' << getName() + << " depth.\n"); + // Find any MBB successors that have MBB as their preferred predecessor. + // They are the only ones that need to be invalidated. + for (MachineBasicBlock::const_succ_iterator + I = MBB->succ_begin(), E = MBB->succ_end(); I != E; ++I) { + TraceBlockInfo &TBI = BlockInfo[(*I)->getNumber()]; + if (!TBI.hasValidDepth()) + continue; + if (TBI.Pred == MBB) { + TBI.invalidateDepth(); + WorkList.push_back(*I); + continue; + } + // Verify that TBI.Pred is actually a *I predecessor. + assert((!TBI.Pred || (*I)->isPredecessor(TBI.Pred)) && "CFG changed"); + } + } while (!WorkList.empty()); + } + + // Clear any per-instruction data. We only have to do this for BadMBB itself + // because the instructions in that block may change. Other blocks may be + // invalidated, but their instructions will stay the same, so there is no + // need to erase the Cycle entries. They will be overwritten when we + // recompute. 
+ for (MachineBasicBlock::const_iterator I = BadMBB->begin(), E = BadMBB->end(); + I != E; ++I) + Cycles.erase(I); +} + +void MachineTraceMetrics::Ensemble::verify() const { +#ifndef NDEBUG + assert(BlockInfo.size() == MTM.MF->getNumBlockIDs() && + "Outdated BlockInfo size"); + for (unsigned Num = 0, e = BlockInfo.size(); Num != e; ++Num) { + const TraceBlockInfo &TBI = BlockInfo[Num]; + if (TBI.hasValidDepth() && TBI.Pred) { + const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num); + assert(MBB->isPredecessor(TBI.Pred) && "CFG doesn't match trace"); + assert(BlockInfo[TBI.Pred->getNumber()].hasValidDepth() && + "Trace is broken, depth should have been invalidated."); + const MachineLoop *Loop = getLoopFor(MBB); + assert(!(Loop && MBB == Loop->getHeader()) && "Trace contains backedge"); + } + if (TBI.hasValidHeight() && TBI.Succ) { + const MachineBasicBlock *MBB = MTM.MF->getBlockNumbered(Num); + assert(MBB->isSuccessor(TBI.Succ) && "CFG doesn't match trace"); + assert(BlockInfo[TBI.Succ->getNumber()].hasValidHeight() && + "Trace is broken, height should have been invalidated."); + const MachineLoop *Loop = getLoopFor(MBB); + const MachineLoop *SuccLoop = getLoopFor(TBI.Succ); + assert(!(Loop && Loop == SuccLoop && TBI.Succ == Loop->getHeader()) && + "Trace contains backedge"); + } + } +#endif +} + +//===----------------------------------------------------------------------===// +// Data Dependencies +//===----------------------------------------------------------------------===// +// +// Compute the depth and height of each instruction based on data dependencies +// and instruction latencies. These cycle numbers assume that the CPU can issue +// an infinite number of instructions per cycle as long as their dependencies +// are ready. + +// A data dependency is represented as a defining MI and operand numbers on the +// defining and using MI. +namespace { +struct DataDep { + const MachineInstr *DefMI; + unsigned DefOp; + unsigned UseOp; + + DataDep(const MachineInstr *DefMI, unsigned DefOp, unsigned UseOp) + : DefMI(DefMI), DefOp(DefOp), UseOp(UseOp) {} + + /// Create a DataDep from an SSA form virtual register. + DataDep(const MachineRegisterInfo *MRI, unsigned VirtReg, unsigned UseOp) + : UseOp(UseOp) { + assert(TargetRegisterInfo::isVirtualRegister(VirtReg)); + MachineRegisterInfo::def_iterator DefI = MRI->def_begin(VirtReg); + assert(!DefI.atEnd() && "Register has no defs"); + DefMI = &*DefI; + DefOp = DefI.getOperandNo(); + assert((++DefI).atEnd() && "Register has multiple defs"); + } +}; +} + +// Get the input data dependencies that must be ready before UseMI can issue. +// Return true if UseMI has any physreg operands. +static bool getDataDeps(const MachineInstr *UseMI, + SmallVectorImpl<DataDep> &Deps, + const MachineRegisterInfo *MRI) { + bool HasPhysRegs = false; + for (ConstMIOperands MO(UseMI); MO.isValid(); ++MO) { + if (!MO->isReg()) + continue; + unsigned Reg = MO->getReg(); + if (!Reg) + continue; + if (TargetRegisterInfo::isPhysicalRegister(Reg)) { + HasPhysRegs = true; + continue; + } + // Collect virtual register reads. + if (MO->readsReg()) + Deps.push_back(DataDep(MRI, Reg, MO.getOperandNo())); + } + return HasPhysRegs; +} + +// Get the input data dependencies of a PHI instruction, using Pred as the +// preferred predecessor. +// This will add at most one dependency to Deps. 
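+// A machine PHI lists its incoming values as (register, predecessor MBB)
+// operand pairs after the def, so the loop below visits operands 1, 3, 5, ...
+// and reads the incoming block from operand i+1.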
+static void getPHIDeps(const MachineInstr *UseMI, + SmallVectorImpl<DataDep> &Deps, + const MachineBasicBlock *Pred, + const MachineRegisterInfo *MRI) { + // No predecessor at the beginning of a trace. Ignore dependencies. + if (!Pred) + return; + assert(UseMI->isPHI() && UseMI->getNumOperands() % 2 && "Bad PHI"); + for (unsigned i = 1; i != UseMI->getNumOperands(); i += 2) { + if (UseMI->getOperand(i + 1).getMBB() == Pred) { + unsigned Reg = UseMI->getOperand(i).getReg(); + Deps.push_back(DataDep(MRI, Reg, i)); + return; + } + } +} + +// Keep track of physreg data dependencies by recording each live register unit. +// Associate each regunit with an instruction operand. Depending on the +// direction instructions are scanned, it could be the operand that defined the +// regunit, or the highest operand to read the regunit. +namespace { +struct LiveRegUnit { + unsigned RegUnit; + unsigned Cycle; + const MachineInstr *MI; + unsigned Op; + + unsigned getSparseSetIndex() const { return RegUnit; } + + LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(0), Op(0) {} +}; +} + +// Identify physreg dependencies for UseMI, and update the live regunit +// tracking set when scanning instructions downwards. +static void updatePhysDepsDownwards(const MachineInstr *UseMI, + SmallVectorImpl<DataDep> &Deps, + SparseSet<LiveRegUnit> &RegUnits, + const TargetRegisterInfo *TRI) { + SmallVector<unsigned, 8> Kills; + SmallVector<unsigned, 8> LiveDefOps; + + for (ConstMIOperands MO(UseMI); MO.isValid(); ++MO) { + if (!MO->isReg()) + continue; + unsigned Reg = MO->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + // Track live defs and kills for updating RegUnits. + if (MO->isDef()) { + if (MO->isDead()) + Kills.push_back(Reg); + else + LiveDefOps.push_back(MO.getOperandNo()); + } else if (MO->isKill()) + Kills.push_back(Reg); + // Identify dependencies. + if (!MO->readsReg()) + continue; + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units); + if (I == RegUnits.end()) + continue; + Deps.push_back(DataDep(I->MI, I->Op, MO.getOperandNo())); + break; + } + } + + // Update RegUnits to reflect live registers after UseMI. + // First kills. + for (unsigned i = 0, e = Kills.size(); i != e; ++i) + for (MCRegUnitIterator Units(Kills[i], TRI); Units.isValid(); ++Units) + RegUnits.erase(*Units); + + // Second, live defs. + for (unsigned i = 0, e = LiveDefOps.size(); i != e; ++i) { + unsigned DefOp = LiveDefOps[i]; + for (MCRegUnitIterator Units(UseMI->getOperand(DefOp).getReg(), TRI); + Units.isValid(); ++Units) { + LiveRegUnit &LRU = RegUnits[*Units]; + LRU.MI = UseMI; + LRU.Op = DefOp; + } + } +} + +/// The length of the critical path through a trace is the maximum of two path +/// lengths: +/// +/// 1. The maximum height+depth over all instructions in the trace center block. +/// +/// 2. The longest cross-block dependency chain. For small blocks, it is +/// possible that the critical path through the trace doesn't include any +/// instructions in the block. +/// +/// This function computes the second number from the live-in list of the +/// center block. 
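+///
+/// For each virtual register in the live-in list, the chain length is the
+/// depth of its defining instruction plus the height recorded for the live-in:
+/// Len = Depth(DefMI) + LiveIn.Height.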
+unsigned MachineTraceMetrics::Ensemble:: +computeCrossBlockCriticalPath(const TraceBlockInfo &TBI) { + assert(TBI.HasValidInstrDepths && "Missing depth info"); + assert(TBI.HasValidInstrHeights && "Missing height info"); + unsigned MaxLen = 0; + for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) { + const LiveInReg &LIR = TBI.LiveIns[i]; + if (!TargetRegisterInfo::isVirtualRegister(LIR.Reg)) + continue; + const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg); + // Ignore dependencies outside the current trace. + const TraceBlockInfo &DefTBI = BlockInfo[DefMI->getParent()->getNumber()]; + if (!DefTBI.hasValidDepth() || DefTBI.Head != TBI.Head) + continue; + unsigned Len = LIR.Height + Cycles[DefMI].Depth; + MaxLen = std::max(MaxLen, Len); + } + return MaxLen; +} + +/// Compute instruction depths for all instructions above or in MBB in its +/// trace. This assumes that the trace through MBB has already been computed. +void MachineTraceMetrics::Ensemble:: +computeInstrDepths(const MachineBasicBlock *MBB) { + // The top of the trace may already be computed, and HasValidInstrDepths + // implies Head->HasValidInstrDepths, so we only need to start from the first + // block in the trace that needs to be recomputed. + SmallVector<const MachineBasicBlock*, 8> Stack; + do { + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + assert(TBI.hasValidDepth() && "Incomplete trace"); + if (TBI.HasValidInstrDepths) + break; + Stack.push_back(MBB); + MBB = TBI.Pred; + } while (MBB); + + // FIXME: If MBB is non-null at this point, it is the last pre-computed block + // in the trace. We should track any live-out physregs that were defined in + // the trace. This is quite rare in SSA form, typically created by CSE + // hoisting a compare. + SparseSet<LiveRegUnit> RegUnits; + RegUnits.setUniverse(MTM.TRI->getNumRegUnits()); + + // Go through trace blocks in top-down order, stopping after the center block. + SmallVector<DataDep, 8> Deps; + while (!Stack.empty()) { + MBB = Stack.pop_back_val(); + DEBUG(dbgs() << "Depths for BB#" << MBB->getNumber() << ":\n"); + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + TBI.HasValidInstrDepths = true; + TBI.CriticalPath = 0; + + // Also compute the critical path length through MBB when possible. + if (TBI.HasValidInstrHeights) + TBI.CriticalPath = computeCrossBlockCriticalPath(TBI); + + for (MachineBasicBlock::const_iterator I = MBB->begin(), E = MBB->end(); + I != E; ++I) { + const MachineInstr *UseMI = I; + + // Collect all data dependencies. + Deps.clear(); + if (UseMI->isPHI()) + getPHIDeps(UseMI, Deps, TBI.Pred, MTM.MRI); + else if (getDataDeps(UseMI, Deps, MTM.MRI)) + updatePhysDepsDownwards(UseMI, Deps, RegUnits, MTM.TRI); + + // Filter and process dependencies, computing the earliest issue cycle. + unsigned Cycle = 0; + for (unsigned i = 0, e = Deps.size(); i != e; ++i) { + const DataDep &Dep = Deps[i]; + const TraceBlockInfo&DepTBI = + BlockInfo[Dep.DefMI->getParent()->getNumber()]; + // Ignore dependencies from outside the current trace. + if (!DepTBI.hasValidDepth() || DepTBI.Head != TBI.Head) + continue; + assert(DepTBI.HasValidInstrDepths && "Inconsistent dependency"); + unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth; + // Add latency if DefMI is a real instruction. Transients get latency 0. + if (!Dep.DefMI->isTransient()) + DepCycle += MTM.TII->computeOperandLatency(MTM.ItinData, + Dep.DefMI, Dep.DefOp, + UseMI, Dep.UseOp, + /* FindMin = */ false); + Cycle = std::max(Cycle, DepCycle); + } + // Remember the instruction depth. 
+ InstrCycles &MICycles = Cycles[UseMI]; + MICycles.Depth = Cycle; + + if (!TBI.HasValidInstrHeights) { + DEBUG(dbgs() << Cycle << '\t' << *UseMI); + continue; + } + // Update critical path length. + TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Height); + DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *UseMI); + } + } +} + +// Identify physreg dependencies for MI when scanning instructions upwards. +// Return the issue height of MI after considering any live regunits. +// Height is the issue height computed from virtual register dependencies alone. +static unsigned updatePhysDepsUpwards(const MachineInstr *MI, unsigned Height, + SparseSet<LiveRegUnit> &RegUnits, + const InstrItineraryData *ItinData, + const TargetInstrInfo *TII, + const TargetRegisterInfo *TRI) { + SmallVector<unsigned, 8> ReadOps; + for (ConstMIOperands MO(MI); MO.isValid(); ++MO) { + if (!MO->isReg()) + continue; + unsigned Reg = MO->getReg(); + if (!TargetRegisterInfo::isPhysicalRegister(Reg)) + continue; + if (MO->readsReg()) + ReadOps.push_back(MO.getOperandNo()); + if (!MO->isDef()) + continue; + // This is a def of Reg. Remove corresponding entries from RegUnits, and + // update MI Height to consider the physreg dependencies. + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + SparseSet<LiveRegUnit>::iterator I = RegUnits.find(*Units); + if (I == RegUnits.end()) + continue; + unsigned DepHeight = I->Cycle; + if (!MI->isTransient()) { + // We may not know the UseMI of this dependency, if it came from the + // live-in list. + if (I->MI) + DepHeight += TII->computeOperandLatency(ItinData, + MI, MO.getOperandNo(), + I->MI, I->Op); + else + // No UseMI. Just use the MI latency instead. + DepHeight += TII->getInstrLatency(ItinData, MI); + } + Height = std::max(Height, DepHeight); + // This regunit is dead above MI. + RegUnits.erase(I); + } + } + + // Now we know the height of MI. Update any regunits read. + for (unsigned i = 0, e = ReadOps.size(); i != e; ++i) { + unsigned Reg = MI->getOperand(ReadOps[i]).getReg(); + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + LiveRegUnit &LRU = RegUnits[*Units]; + // Set the height to the highest reader of the unit. + if (LRU.Cycle <= Height && LRU.MI != MI) { + LRU.Cycle = Height; + LRU.MI = MI; + LRU.Op = ReadOps[i]; + } + } + } + + return Height; +} + + +typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap; + +// Push the height of DefMI upwards if required to match UseMI. +// Return true if this is the first time DefMI was seen. +static bool pushDepHeight(const DataDep &Dep, + const MachineInstr *UseMI, unsigned UseHeight, + MIHeightMap &Heights, + const InstrItineraryData *ItinData, + const TargetInstrInfo *TII) { + // Adjust height by Dep.DefMI latency. + if (!Dep.DefMI->isTransient()) + UseHeight += TII->computeOperandLatency(ItinData, Dep.DefMI, Dep.DefOp, + UseMI, Dep.UseOp); + + // Update Heights[DefMI] to be the maximum height seen. + MIHeightMap::iterator I; + bool New; + tie(I, New) = Heights.insert(std::make_pair(Dep.DefMI, UseHeight)); + if (New) + return true; + + // DefMI has been pushed before. Give it the max height. + if (I->second < UseHeight) + I->second = UseHeight; + return false; +} + +/// Assuming that DefMI was used by Trace.back(), add it to the live-in lists +/// of all the blocks in Trace. Stop when reaching the block that contains +/// DefMI. 
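+///
+/// Only the register number is recorded here; the live-in heights are filled
+/// in later by computeInstrHeights() once the final heights are known.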
+void MachineTraceMetrics::Ensemble:: +addLiveIns(const MachineInstr *DefMI, + ArrayRef<const MachineBasicBlock*> Trace) { + assert(!Trace.empty() && "Trace should contain at least one block"); + unsigned Reg = DefMI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(Reg)); + const MachineBasicBlock *DefMBB = DefMI->getParent(); + + // Reg is live-in to all blocks in Trace that follow DefMBB. + for (unsigned i = Trace.size(); i; --i) { + const MachineBasicBlock *MBB = Trace[i-1]; + if (MBB == DefMBB) + return; + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + // Just add the register. The height will be updated later. + TBI.LiveIns.push_back(Reg); + } +} + +/// Compute instruction heights in the trace through MBB. This updates MBB and +/// the blocks below it in the trace. It is assumed that the trace has already +/// been computed. +void MachineTraceMetrics::Ensemble:: +computeInstrHeights(const MachineBasicBlock *MBB) { + // The bottom of the trace may already be computed. + // Find the blocks that need updating. + SmallVector<const MachineBasicBlock*, 8> Stack; + do { + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + assert(TBI.hasValidHeight() && "Incomplete trace"); + if (TBI.HasValidInstrHeights) + break; + Stack.push_back(MBB); + TBI.LiveIns.clear(); + MBB = TBI.Succ; + } while (MBB); + + // As we move upwards in the trace, keep track of instructions that are + // required by deeper trace instructions. Map MI -> height required so far. + MIHeightMap Heights; + + // For physregs, the def isn't known when we see the use. + // Instead, keep track of the highest use of each regunit. + SparseSet<LiveRegUnit> RegUnits; + RegUnits.setUniverse(MTM.TRI->getNumRegUnits()); + + // If the bottom of the trace was already precomputed, initialize heights + // from its live-in list. + // MBB is the highest precomputed block in the trace. + if (MBB) { + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) { + LiveInReg LI = TBI.LiveIns[i]; + if (TargetRegisterInfo::isVirtualRegister(LI.Reg)) { + // For virtual registers, the def latency is included. + unsigned &Height = Heights[MTM.MRI->getVRegDef(LI.Reg)]; + if (Height < LI.Height) + Height = LI.Height; + } else { + // For register units, the def latency is not included because we don't + // know the def yet. + RegUnits[LI.Reg].Cycle = LI.Height; + } + } + } + + // Go through the trace blocks in bottom-up order. + SmallVector<DataDep, 8> Deps; + for (;!Stack.empty(); Stack.pop_back()) { + MBB = Stack.back(); + DEBUG(dbgs() << "Heights for BB#" << MBB->getNumber() << ":\n"); + TraceBlockInfo &TBI = BlockInfo[MBB->getNumber()]; + TBI.HasValidInstrHeights = true; + TBI.CriticalPath = 0; + + // Get dependencies from PHIs in the trace successor. + const MachineBasicBlock *Succ = TBI.Succ; + // If MBB is the last block in the trace, and it has a back-edge to the + // loop header, get loop-carried dependencies from PHIs in the header. For + // that purpose, pretend that all the loop header PHIs have height 0. + if (!Succ) + if (const MachineLoop *Loop = getLoopFor(MBB)) + if (MBB->isSuccessor(Loop->getHeader())) + Succ = Loop->getHeader(); + + if (Succ) { + for (MachineBasicBlock::const_iterator I = Succ->begin(), E = Succ->end(); + I != E && I->isPHI(); ++I) { + const MachineInstr *PHI = I; + Deps.clear(); + getPHIDeps(PHI, Deps, MBB, MTM.MRI); + if (!Deps.empty()) { + // Loop header PHI heights are all 0. + unsigned Height = TBI.Succ ? 
Cycles.lookup(PHI).Height : 0; + DEBUG(dbgs() << "pred\t" << Height << '\t' << *PHI); + if (pushDepHeight(Deps.front(), PHI, Height, + Heights, MTM.ItinData, MTM.TII)) + addLiveIns(Deps.front().DefMI, Stack); + } + } + } + + // Go through the block backwards. + for (MachineBasicBlock::const_iterator BI = MBB->end(), BB = MBB->begin(); + BI != BB;) { + const MachineInstr *MI = --BI; + + // Find the MI height as determined by virtual register uses in the + // trace below. + unsigned Cycle = 0; + MIHeightMap::iterator HeightI = Heights.find(MI); + if (HeightI != Heights.end()) { + Cycle = HeightI->second; + // We won't be seeing any more MI uses. + Heights.erase(HeightI); + } + + // Don't process PHI deps. They depend on the specific predecessor, and + // we'll get them when visiting the predecessor. + Deps.clear(); + bool HasPhysRegs = !MI->isPHI() && getDataDeps(MI, Deps, MTM.MRI); + + // There may also be regunit dependencies to include in the height. + if (HasPhysRegs) + Cycle = updatePhysDepsUpwards(MI, Cycle, RegUnits, + MTM.ItinData, MTM.TII, MTM.TRI); + + // Update the required height of any virtual registers read by MI. + for (unsigned i = 0, e = Deps.size(); i != e; ++i) + if (pushDepHeight(Deps[i], MI, Cycle, Heights, MTM.ItinData, MTM.TII)) + addLiveIns(Deps[i].DefMI, Stack); + + InstrCycles &MICycles = Cycles[MI]; + MICycles.Height = Cycle; + if (!TBI.HasValidInstrDepths) { + DEBUG(dbgs() << Cycle << '\t' << *MI); + continue; + } + // Update critical path length. + TBI.CriticalPath = std::max(TBI.CriticalPath, Cycle + MICycles.Depth); + DEBUG(dbgs() << TBI.CriticalPath << '\t' << Cycle << '\t' << *MI); + } + + // Update virtual live-in heights. They were added by addLiveIns() with a 0 + // height because the final height isn't known until now. + DEBUG(dbgs() << "BB#" << MBB->getNumber() << " Live-ins:"); + for (unsigned i = 0, e = TBI.LiveIns.size(); i != e; ++i) { + LiveInReg &LIR = TBI.LiveIns[i]; + const MachineInstr *DefMI = MTM.MRI->getVRegDef(LIR.Reg); + LIR.Height = Heights.lookup(DefMI); + DEBUG(dbgs() << ' ' << PrintReg(LIR.Reg) << '@' << LIR.Height); + } + + // Transfer the live regunits to the live-in list. + for (SparseSet<LiveRegUnit>::const_iterator + RI = RegUnits.begin(), RE = RegUnits.end(); RI != RE; ++RI) { + TBI.LiveIns.push_back(LiveInReg(RI->RegUnit, RI->Cycle)); + DEBUG(dbgs() << ' ' << PrintRegUnit(RI->RegUnit, MTM.TRI) + << '@' << RI->Cycle); + } + DEBUG(dbgs() << '\n'); + + if (!TBI.HasValidInstrDepths) + continue; + // Add live-ins to the critical path length. + TBI.CriticalPath = std::max(TBI.CriticalPath, + computeCrossBlockCriticalPath(TBI)); + DEBUG(dbgs() << "Critical path: " << TBI.CriticalPath << '\n'); + } +} + +MachineTraceMetrics::Trace +MachineTraceMetrics::Ensemble::getTrace(const MachineBasicBlock *MBB) { + // FIXME: Check cache tags, recompute as needed. 
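+ // Make sure the trace through MBB and the per-instruction depths above and
+ // heights below its center block are available; blocks that are still valid
+ // are not recomputed.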
+ computeTrace(MBB); + computeInstrDepths(MBB); + computeInstrHeights(MBB); + return Trace(*this, BlockInfo[MBB->getNumber()]); +} + +unsigned +MachineTraceMetrics::Trace::getInstrSlack(const MachineInstr *MI) const { + assert(MI && "Not an instruction."); + assert(getBlockNum() == unsigned(MI->getParent()->getNumber()) && + "MI must be in the trace center block"); + InstrCycles Cyc = getInstrCycles(MI); + return getCriticalPath() - (Cyc.Depth + Cyc.Height); +} + +unsigned +MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr *PHI) const { + const MachineBasicBlock *MBB = TE.MTM.MF->getBlockNumbered(getBlockNum()); + SmallVector<DataDep, 1> Deps; + getPHIDeps(PHI, Deps, MBB, TE.MTM.MRI); + assert(Deps.size() == 1 && "PHI doesn't have MBB as a predecessor"); + DataDep &Dep = Deps.front(); + unsigned DepCycle = getInstrCycles(Dep.DefMI).Depth; + // Add latency if DefMI is a real instruction. Transients get latency 0. + if (!Dep.DefMI->isTransient()) + DepCycle += TE.MTM.TII->computeOperandLatency(TE.MTM.ItinData, + Dep.DefMI, Dep.DefOp, + PHI, Dep.UseOp, + /* FindMin = */ false); + return DepCycle; +} + +unsigned MachineTraceMetrics::Trace::getResourceDepth(bool Bottom) const { + // For now, we compute the resource depth from instruction count / issue + // width. Eventually, we should compute resource depth per functional unit + // and return the max. + unsigned Instrs = TBI.InstrDepth; + if (Bottom) + Instrs += TE.MTM.BlockInfo[getBlockNum()].InstrCount; + if (const MCSchedModel *Model = TE.MTM.ItinData->SchedModel) + if (Model->IssueWidth != 0) + return Instrs / Model->IssueWidth; + // Assume issue width 1 without a schedule model. + return Instrs; +} + +unsigned MachineTraceMetrics::Trace:: +getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks) const { + unsigned Instrs = TBI.InstrDepth + TBI.InstrHeight; + for (unsigned i = 0, e = Extrablocks.size(); i != e; ++i) + Instrs += TE.MTM.getResources(Extrablocks[i])->InstrCount; + if (const MCSchedModel *Model = TE.MTM.ItinData->SchedModel) + if (Model->IssueWidth != 0) + return Instrs / Model->IssueWidth; + // Assume issue width 1 without a schedule model. 
+ return Instrs; +} + +void MachineTraceMetrics::Ensemble::print(raw_ostream &OS) const { + OS << getName() << " ensemble:\n"; + for (unsigned i = 0, e = BlockInfo.size(); i != e; ++i) { + OS << " BB#" << i << '\t'; + BlockInfo[i].print(OS); + OS << '\n'; + } +} + +void MachineTraceMetrics::TraceBlockInfo::print(raw_ostream &OS) const { + if (hasValidDepth()) { + OS << "depth=" << InstrDepth; + if (Pred) + OS << " pred=BB#" << Pred->getNumber(); + else + OS << " pred=null"; + OS << " head=BB#" << Head; + if (HasValidInstrDepths) + OS << " +instrs"; + } else + OS << "depth invalid"; + OS << ", "; + if (hasValidHeight()) { + OS << "height=" << InstrHeight; + if (Succ) + OS << " succ=BB#" << Succ->getNumber(); + else + OS << " succ=null"; + OS << " tail=BB#" << Tail; + if (HasValidInstrHeights) + OS << " +instrs"; + } else + OS << "height invalid"; + if (HasValidInstrDepths && HasValidInstrHeights) + OS << ", crit=" << CriticalPath; +} + +void MachineTraceMetrics::Trace::print(raw_ostream &OS) const { + unsigned MBBNum = &TBI - &TE.BlockInfo[0]; + + OS << TE.getName() << " trace BB#" << TBI.Head << " --> BB#" << MBBNum + << " --> BB#" << TBI.Tail << ':'; + if (TBI.hasValidHeight() && TBI.hasValidDepth()) + OS << ' ' << getInstrCount() << " instrs."; + if (TBI.HasValidInstrDepths && TBI.HasValidInstrHeights) + OS << ' ' << TBI.CriticalPath << " cycles."; + + const MachineTraceMetrics::TraceBlockInfo *Block = &TBI; + OS << "\nBB#" << MBBNum; + while (Block->hasValidDepth() && Block->Pred) { + unsigned Num = Block->Pred->getNumber(); + OS << " <- BB#" << Num; + Block = &TE.BlockInfo[Num]; + } + + Block = &TBI; + OS << "\n "; + while (Block->hasValidHeight() && Block->Succ) { + unsigned Num = Block->Succ->getNumber(); + OS << " -> BB#" << Num; + Block = &TE.BlockInfo[Num]; + } + OS << '\n'; +} diff --git a/lib/CodeGen/MachineTraceMetrics.h b/lib/CodeGen/MachineTraceMetrics.h new file mode 100644 index 0000000000..c5b86f31db --- /dev/null +++ b/lib/CodeGen/MachineTraceMetrics.h @@ -0,0 +1,341 @@ +//===- lib/CodeGen/MachineTraceMetrics.h - Super-scalar metrics -*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the interface for the MachineTraceMetrics analysis pass +// that estimates CPU resource usage and critical data dependency paths through +// preferred traces. This is useful for super-scalar CPUs where execution speed +// can be limited both by data dependencies and by limited execution resources. +// +// Out-of-order CPUs will often be executing instructions from multiple basic +// blocks at the same time. This makes it difficult to estimate the resource +// usage accurately in a single basic block. Resources can be estimated better +// by looking at a trace through the current basic block. +// +// For every block, the MachineTraceMetrics pass will pick a preferred trace +// that passes through the block. The trace is chosen based on loop structure, +// branch probabilities, and resource usage. The intention is to pick likely +// traces that would be the most affected by code transformations. +// +// It is expensive to compute a full arbitrary trace for every block, so to +// save some computations, traces are chosen to be convergent. 
This means that +// if the traces through basic blocks A and B ever cross when moving away from +// A and B, they never diverge again. This applies in both directions - If the +// traces meet above A and B, they won't diverge when going further back. +// +// Traces tend to align with loops. The trace through a block in an inner loop +// will begin at the loop entry block and end at a back edge. If there are +// nested loops, the trace may begin and end at those instead. +// +// For each trace, we compute the critical path length, which is the number of +// cycles required to execute the trace when execution is limited by data +// dependencies only. We also compute the resource height, which is the number +// of cycles required to execute all instructions in the trace when ignoring +// data dependencies. +// +// Every instruction in the current block has a slack - the number of cycles +// execution of the instruction can be delayed without extending the critical +// path. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MACHINE_TRACE_METRICS_H +#define LLVM_CODEGEN_MACHINE_TRACE_METRICS_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +namespace llvm { + +class InstrItineraryData; +class MachineBasicBlock; +class MachineInstr; +class MachineLoop; +class MachineLoopInfo; +class MachineRegisterInfo; +class TargetInstrInfo; +class TargetRegisterInfo; +class raw_ostream; + +class MachineTraceMetrics : public MachineFunctionPass { + const MachineFunction *MF; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const InstrItineraryData *ItinData; + const MachineRegisterInfo *MRI; + const MachineLoopInfo *Loops; + +public: + class Ensemble; + class Trace; + static char ID; + MachineTraceMetrics(); + void getAnalysisUsage(AnalysisUsage&) const; + bool runOnMachineFunction(MachineFunction&); + void releaseMemory(); + void verifyAnalysis() const; + + friend class Ensemble; + friend class Trace; + + /// Per-basic block information that doesn't depend on the trace through the + /// block. + struct FixedBlockInfo { + /// The number of non-trivial instructions in the block. + /// Doesn't count PHI and COPY instructions that are likely to be removed. + unsigned InstrCount; + + /// True when the block contains calls. + bool HasCalls; + + FixedBlockInfo() : InstrCount(~0u), HasCalls(false) {} + + /// Returns true when resource information for this block has been computed. + bool hasResources() const { return InstrCount != ~0u; } + + /// Invalidate resource information. + void invalidate() { InstrCount = ~0u; } + }; + + /// Get the fixed resource information about MBB. Compute it on demand. + const FixedBlockInfo *getResources(const MachineBasicBlock*); + + /// A virtual register or regunit required by a basic block or its trace + /// successors. + struct LiveInReg { + /// The virtual register required, or a register unit. + unsigned Reg; + + /// For virtual registers: Minimum height of the defining instruction. + /// For regunits: Height of the highest user in the trace. + unsigned Height; + + LiveInReg(unsigned Reg, unsigned Height = 0) : Reg(Reg), Height(Height) {} + }; + + /// Per-basic block information that relates to a specific trace through the + /// block. Convergent traces means that only one of these is required per + /// block in a trace ensemble. + struct TraceBlockInfo { + /// Trace predecessor, or NULL for the first block in the trace. 
+ /// Valid when hasValidDepth().
+ const MachineBasicBlock *Pred;
+
+ /// Trace successor, or NULL for the last block in the trace.
+ /// Valid when hasValidHeight().
+ const MachineBasicBlock *Succ;
+
+ /// The block number of the head of the trace. (When hasValidDepth()).
+ unsigned Head;
+
+ /// The block number of the tail of the trace. (When hasValidHeight()).
+ unsigned Tail;
+
+ /// Accumulated number of instructions in the trace above this block.
+ /// Does not include instructions in this block.
+ unsigned InstrDepth;
+
+ /// Accumulated number of instructions in the trace below this block.
+ /// Includes instructions in this block.
+ unsigned InstrHeight;
+
+ TraceBlockInfo() :
+ Pred(0), Succ(0),
+ InstrDepth(~0u), InstrHeight(~0u),
+ HasValidInstrDepths(false), HasValidInstrHeights(false) {}
+
+ /// Returns true if the depth resources have been computed from the trace
+ /// above this block.
+ bool hasValidDepth() const { return InstrDepth != ~0u; }
+
+ /// Returns true if the height resources have been computed from the trace
+ /// below this block.
+ bool hasValidHeight() const { return InstrHeight != ~0u; }
+
+ /// Invalidate depth resources when some block above this one has changed.
+ void invalidateDepth() { InstrDepth = ~0u; HasValidInstrDepths = false; }
+
+ /// Invalidate height resources when a block below this one has changed.
+ void invalidateHeight() { InstrHeight = ~0u; HasValidInstrHeights = false; }
+
+ // Data-dependency-related information. Per-instruction depth and height
+ // are computed from data dependencies in the current trace, using
+ // itinerary data.
+
+ /// Instruction depths have been computed. This implies hasValidDepth().
+ bool HasValidInstrDepths;
+
+ /// Instruction heights have been computed. This implies hasValidHeight().
+ bool HasValidInstrHeights;
+
+ /// Critical path length. This is the number of cycles in the longest data
+ /// dependency chain through the trace. This is only valid when both
+ /// HasValidInstrDepths and HasValidInstrHeights are set.
+ unsigned CriticalPath;
+
+ /// Live-in registers. These registers are defined above the current block
+ /// and used by this block or a block below it.
+ /// This does not include PHI uses in the current block, but it does
+ /// include PHI uses in deeper blocks.
+ SmallVector<LiveInReg, 4> LiveIns;
+
+ void print(raw_ostream&) const;
+ };
+
+ /// InstrCycles represents the cycle height and depth of an instruction in a
+ /// trace.
+ struct InstrCycles {
+ /// Earliest issue cycle as determined by data dependencies and instruction
+ /// latencies from the beginning of the trace. Data dependencies from
+ /// before the trace are not included.
+ unsigned Depth;
+
+ /// Minimum number of cycles from when this instruction is issued to the end
+ /// of the trace, as determined by data dependencies and instruction latencies.
+ unsigned Height;
+ };
+
+ /// A trace represents a plausible sequence of executed basic blocks that
+ /// passes through the current basic block once. The Trace class serves as a
+ /// handle to internal cached data structures.
+ class Trace {
+ Ensemble &TE;
+ TraceBlockInfo &TBI;
+
+ unsigned getBlockNum() const { return &TBI - &TE.BlockInfo[0]; }
+
+ public:
+ explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {}
+ void print(raw_ostream&) const;
+
+ /// Compute the total number of instructions in the trace.
+ unsigned getInstrCount() const { + return TBI.InstrDepth + TBI.InstrHeight; + } + + /// Return the resource depth of the top/bottom of the trace center block. + /// This is the number of cycles required to execute all instructions from + /// the trace head to the trace center block. The resource depth only + /// considers execution resources, it ignores data dependencies. + /// When Bottom is set, instructions in the trace center block are included. + unsigned getResourceDepth(bool Bottom) const; + + /// Return the resource length of the trace. This is the number of cycles + /// required to execute the instructions in the trace if they were all + /// independent, exposing the maximum instruction-level parallelism. + /// + /// Any blocks in Extrablocks are included as if they were part of the + /// trace. + unsigned getResourceLength(ArrayRef<const MachineBasicBlock*> Extrablocks = + ArrayRef<const MachineBasicBlock*>()) const; + + /// Return the length of the (data dependency) critical path through the + /// trace. + unsigned getCriticalPath() const { return TBI.CriticalPath; } + + /// Return the depth and height of MI. The depth is only valid for + /// instructions in or above the trace center block. The height is only + /// valid for instructions in or below the trace center block. + InstrCycles getInstrCycles(const MachineInstr *MI) const { + return TE.Cycles.lookup(MI); + } + + /// Return the slack of MI. This is the number of cycles MI can be delayed + /// before the critical path becomes longer. + /// MI must be an instruction in the trace center block. + unsigned getInstrSlack(const MachineInstr *MI) const; + + /// Return the Depth of a PHI instruction in a trace center block successor. + /// The PHI does not have to be part of the trace. + unsigned getPHIDepth(const MachineInstr *PHI) const; + }; + + /// A trace ensemble is a collection of traces selected using the same + /// strategy, for example 'minimum resource height'. There is one trace for + /// every block in the function. + class Ensemble { + SmallVector<TraceBlockInfo, 4> BlockInfo; + DenseMap<const MachineInstr*, InstrCycles> Cycles; + friend class Trace; + + void computeTrace(const MachineBasicBlock*); + void computeDepthResources(const MachineBasicBlock*); + void computeHeightResources(const MachineBasicBlock*); + unsigned computeCrossBlockCriticalPath(const TraceBlockInfo&); + void computeInstrDepths(const MachineBasicBlock*); + void computeInstrHeights(const MachineBasicBlock*); + void addLiveIns(const MachineInstr *DefMI, + ArrayRef<const MachineBasicBlock*> Trace); + + protected: + MachineTraceMetrics &MTM; + virtual const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) =0; + virtual const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) =0; + explicit Ensemble(MachineTraceMetrics*); + const MachineLoop *getLoopFor(const MachineBasicBlock*) const; + const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const; + const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const; + + public: + virtual ~Ensemble(); + virtual const char *getName() const =0; + void print(raw_ostream&) const; + void invalidate(const MachineBasicBlock *MBB); + void verify() const; + + /// Get the trace that passes through MBB. + /// The trace is computed on demand. + Trace getTrace(const MachineBasicBlock *MBB); + }; + + /// Strategies for selecting traces. + enum Strategy { + /// Select the trace through a block that has the fewest instructions. 
+ TS_MinInstrCount, + + TS_NumStrategies + }; + + /// Get the trace ensemble representing the given trace selection strategy. + /// The returned Ensemble object is owned by the MachineTraceMetrics analysis, + /// and valid for the lifetime of the analysis pass. + Ensemble *getEnsemble(Strategy); + + /// Invalidate cached information about MBB. This must be called *before* MBB + /// is erased, or the CFG is otherwise changed. + /// + /// This invalidates per-block information about resource usage for MBB only, + /// and it invalidates per-trace information for any trace that passes + /// through MBB. + /// + /// Call Ensemble::getTrace() again to update any trace handles. + void invalidate(const MachineBasicBlock *MBB); + +private: + // One entry per basic block, indexed by block number. + SmallVector<FixedBlockInfo, 4> BlockInfo; + + // One ensemble per strategy. + Ensemble* Ensembles[TS_NumStrategies]; +}; + +inline raw_ostream &operator<<(raw_ostream &OS, + const MachineTraceMetrics::Trace &Tr) { + Tr.print(OS); + return OS; +} + +inline raw_ostream &operator<<(raw_ostream &OS, + const MachineTraceMetrics::Ensemble &En) { + En.print(OS); + return OS; +} +} // end namespace llvm + +#endif diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp index 45ce3ab28b..852c169254 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -73,8 +73,10 @@ namespace { typedef SmallVector<const uint32_t*, 4> RegMaskVector; typedef DenseSet<unsigned> RegSet; typedef DenseMap<unsigned, const MachineInstr*> RegMap; + typedef SmallPtrSet<const MachineBasicBlock*, 8> BlockSet; const MachineInstr *FirstTerminator; + BlockSet FunctionBlocks; BitVector regsReserved; BitVector regsAllocatable; @@ -117,6 +119,9 @@ namespace { // block. This set is disjoint from regsLiveOut. RegSet vregsRequired; + // Set versions of block's predecessor and successor lists. + BlockSet Preds, Succs; + BBInfo() : reachable(false) {} // Add register to vregsPassed if it belongs there. 
Return true if @@ -203,6 +208,10 @@ namespace { void report(const char *msg, const MachineBasicBlock *MBB); void report(const char *msg, const MachineInstr *MI); void report(const char *msg, const MachineOperand *MO, unsigned MONum); + void report(const char *msg, const MachineFunction *MF, + const LiveInterval &LI); + void report(const char *msg, const MachineBasicBlock *MBB, + const LiveInterval &LI); void checkLiveness(const MachineOperand *MO, unsigned MONum); void markReachable(const MachineBasicBlock *MBB); @@ -212,6 +221,10 @@ namespace { void calcRegsRequired(); void verifyLiveVariables(); void verifyLiveIntervals(); + void verifyLiveInterval(const LiveInterval&); + void verifyLiveIntervalValue(const LiveInterval&, VNInfo*); + void verifyLiveIntervalSegment(const LiveInterval&, + LiveInterval::const_iterator); }; struct MachineVerifierPass : public MachineFunctionPass { @@ -350,9 +363,9 @@ void MachineVerifier::report(const char *msg, const MachineFunction *MF) { void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) { assert(MBB); report(msg, MBB->getParent()); - *OS << "- basic block: " << MBB->getName() - << " " << (void*)MBB - << " (BB#" << MBB->getNumber() << ")"; + *OS << "- basic block: BB#" << MBB->getNumber() + << ' ' << MBB->getName() + << " (" << (void*)MBB << ')'; if (Indexes) *OS << " [" << Indexes->getMBBStartIdx(MBB) << ';' << Indexes->getMBBEndIdx(MBB) << ')'; @@ -377,6 +390,28 @@ void MachineVerifier::report(const char *msg, *OS << "\n"; } +void MachineVerifier::report(const char *msg, const MachineFunction *MF, + const LiveInterval &LI) { + report(msg, MF); + *OS << "- interval: "; + if (TargetRegisterInfo::isVirtualRegister(LI.reg)) + *OS << PrintReg(LI.reg, TRI); + else + *OS << PrintRegUnit(LI.reg, TRI); + *OS << ' ' << LI << '\n'; +} + +void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB, + const LiveInterval &LI) { + report(msg, MBB); + *OS << "- interval: "; + if (TargetRegisterInfo::isVirtualRegister(LI.reg)) + *OS << PrintReg(LI.reg, TRI); + else + *OS << PrintRegUnit(LI.reg, TRI); + *OS << ' ' << LI << '\n'; +} + void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { BBInfo &MInfo = MBBInfoMap[MBB]; if (!MInfo.reachable) { @@ -404,6 +439,22 @@ void MachineVerifier::visitMachineFunctionBefore() { regsAllocatable = TRI->getAllocatableSet(*MF); markReachable(&MF->front()); + + // Build a set of the basic blocks in the function. + FunctionBlocks.clear(); + for (MachineFunction::const_iterator + I = MF->begin(), E = MF->end(); I != E; ++I) { + FunctionBlocks.insert(I); + BBInfo &MInfo = MBBInfoMap[I]; + + MInfo.Preds.insert(I->pred_begin(), I->pred_end()); + if (MInfo.Preds.size() != I->pred_size()) + report("MBB has duplicate entries in its predecessor list.", I); + + MInfo.Succs.insert(I->succ_begin(), I->succ_end()); + if (MInfo.Succs.size() != I->succ_size()) + report("MBB has duplicate entries in its successor list.", I); + } } // Does iterator point to a and b as the first two elements? @@ -440,6 +491,25 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { E = MBB->succ_end(); I != E; ++I) { if ((*I)->isLandingPad()) LandingPadSuccs.insert(*I); + if (!FunctionBlocks.count(*I)) + report("MBB has successor that isn't part of the function.", MBB); + if (!MBBInfoMap[*I].Preds.count(MBB)) { + report("Inconsistent CFG", MBB); + *OS << "MBB is not in the predecessor list of the successor BB#" + << (*I)->getNumber() << ".\n"; + } + } + + // Check the predecessor list. 
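+ // This mirrors the successor check above: every predecessor must belong to
+ // the function and must list MBB among its successors.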
+ for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(), + E = MBB->pred_end(); I != E; ++I) { + if (!FunctionBlocks.count(*I)) + report("MBB has predecessor that isn't part of the function.", MBB); + if (!MBBInfoMap[*I].Succs.count(MBB)) { + report("Inconsistent CFG", MBB); + *OS << "MBB is not in the successor list of the predecessor BB#" + << (*I)->getNumber() << ".\n"; + } } const MCAsmInfo *AsmInfo = TM->getMCAsmInfo(); @@ -510,7 +580,15 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { ++MBBI; if (MBBI == MF->end()) { report("MBB conditionally falls through out of function!", MBB); - } if (MBB->succ_size() != 2) { + } if (MBB->succ_size() == 1) { + // A conditional branch with only one successor is weird, but allowed. + if (&*MBBI != TBB) + report("MBB exits via conditional branch/fall-through but only has " + "one CFG successor!", MBB); + else if (TBB != *MBB->succ_begin()) + report("MBB exits via conditional branch/fall-through but the CFG " + "successor don't match the actual successor!", MBB); + } else if (MBB->succ_size() != 2) { report("MBB exits via conditional branch/fall-through but doesn't have " "exactly two CFG successors!", MBB); } else if (!matchPair(MBB->succ_begin(), TBB, MBBI)) { @@ -530,7 +608,15 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { } else if (TBB && FBB) { // Block conditionally branches somewhere, otherwise branches // somewhere else. - if (MBB->succ_size() != 2) { + if (MBB->succ_size() == 1) { + // A conditional branch with only one successor is weird, but allowed. + if (FBB != TBB) + report("MBB exits via conditional branch/branch through but only has " + "one CFG successor!", MBB); + else if (TBB != *MBB->succ_begin()) + report("MBB exits via conditional branch/branch through but the CFG " + "successor don't match the actual successor!", MBB); + } else if (MBB->succ_size() != 2) { report("MBB exits via conditional branch/branch but doesn't have " "exactly two CFG successors!", MBB); } else if (!matchPair(MBB->succ_begin(), TBB, FBB)) { @@ -651,10 +737,10 @@ void MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { const MachineInstr *MI = MO->getParent(); const MCInstrDesc &MCID = MI->getDesc(); - const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; // The first MCID.NumDefs operands must be explicit register defines if (MONum < MCID.getNumDefs()) { + const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; if (!MO->isReg()) report("Explicit definition must be a register", MO, MONum); else if (!MO->isDef() && !MCOI.isOptionalDef()) @@ -662,6 +748,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { else if (MO->isImplicit()) report("Explicit definition marked as implicit", MO, MONum); } else if (MONum < MCID.getNumOperands()) { + const MCOperandInfo &MCOI = MCID.OpInfo[MONum]; // Don't check if it's the last operand in a variadic instruction. See, // e.g., LDM_RET in the arm back end. if (MO->isReg() && @@ -685,6 +772,12 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { if (MRI->tracksLiveness() && !MI->isDebugValue()) checkLiveness(MO, MONum); + // Verify two-address constraints after leaving SSA form. + unsigned DefIdx; + if (!MRI->isSSA() && MO->isUse() && + MI->isRegTiedToDefOperand(MONum, &DefIdx) && + Reg != MI->getOperand(DefIdx).getReg()) + report("Two-address instruction operands must be identical", MO, MONum); // Check register classes. 
if (MONum < MCID.getNumOperands() && !MO->isImplicit()) { @@ -786,20 +879,7 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { if (MO->readsReg()) { regsLiveInButUnused.erase(Reg); - bool isKill = false; - unsigned defIdx; - if (MI->isRegTiedToDefOperand(MONum, &defIdx)) { - // A two-addr use counts as a kill if use and def are the same. - unsigned DefReg = MI->getOperand(defIdx).getReg(); - if (Reg == DefReg) - isKill = true; - else if (TargetRegisterInfo::isPhysicalRegister(Reg)) { - report("Two-address instruction operands must be identical", MO, MONum); - } - } else - isKill = MO->isKill(); - - if (isKill) + if (MO->isKill()) addRegWithSubRegs(regsKilled, Reg); // Check that LiveVars knows this kill. @@ -811,23 +891,44 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { } // Check LiveInts liveness and kill. - if (TargetRegisterInfo::isVirtualRegister(Reg) && - LiveInts && !LiveInts->isNotInMIMap(MI)) { - SlotIndex UseIdx = LiveInts->getInstructionIndex(MI).getRegSlot(true); - if (LiveInts->hasInterval(Reg)) { - const LiveInterval &LI = LiveInts->getInterval(Reg); - if (!LI.liveAt(UseIdx)) { - report("No live range at use", MO, MONum); - *OS << UseIdx << " is not live in " << LI << '\n'; + if (LiveInts && !LiveInts->isNotInMIMap(MI)) { + SlotIndex UseIdx = LiveInts->getInstructionIndex(MI); + // Check the cached regunit intervals. + if (TargetRegisterInfo::isPhysicalRegister(Reg) && !isReserved(Reg)) { + for (MCRegUnitIterator Units(Reg, TRI); Units.isValid(); ++Units) { + if (const LiveInterval *LI = LiveInts->getCachedRegUnit(*Units)) { + LiveRangeQuery LRQ(*LI, UseIdx); + if (!LRQ.valueIn()) { + report("No live range at use", MO, MONum); + *OS << UseIdx << " is not live in " << PrintRegUnit(*Units, TRI) + << ' ' << *LI << '\n'; + } + if (MO->isKill() && !LRQ.isKill()) { + report("Live range continues after kill flag", MO, MONum); + *OS << PrintRegUnit(*Units, TRI) << ' ' << *LI << '\n'; + } + } } - // Check for extra kill flags. - // Note that we allow missing kill flags for now. - if (MO->isKill() && !LI.killedAt(UseIdx.getRegSlot())) { - report("Live range continues after kill flag", MO, MONum); - *OS << "Live range: " << LI << '\n'; + } + + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (LiveInts->hasInterval(Reg)) { + // This is a virtual register interval. + const LiveInterval &LI = LiveInts->getInterval(Reg); + LiveRangeQuery LRQ(LI, UseIdx); + if (!LRQ.valueIn()) { + report("No live range at use", MO, MONum); + *OS << UseIdx << " is not live in " << LI << '\n'; + } + // Check for extra kill flags. + // Note that we allow missing kill flags for now. + if (MO->isKill() && !LRQ.isKill()) { + report("Live range continues after kill flag", MO, MONum); + *OS << "Live range: " << LI << '\n'; + } + } else { + report("Virtual register has no live interval", MO, MONum); } - } else { - report("Virtual register has no Live interval", MO, MONum); } } @@ -837,6 +938,8 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) { // Reserved registers may be used even when 'dead'. 
if (!isReserved(Reg)) report("Using an undefined physical register", MO, MONum); + } else if (MRI->def_empty(Reg)) { + report("Reading virtual register without a def", MO, MONum); } else { BBInfo &MInfo = MBBInfoMap[MI->getParent()]; // We don't know which virtual registers are live in, so only complain @@ -1057,8 +1160,8 @@ void MachineVerifier::visitMachineFunctionAfter() { I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E; ++I) if (MInfo.regsKilled.count(*I)) { - report("Virtual register killed in block, but needed live out.", MFI); - *OS << "Virtual register " << PrintReg(*I) + report("Virtual register killed in block, but needed live out.", MFI); + *OS << "Virtual register " << PrintReg(*I) << " is used after the block.\n"; } } @@ -1122,281 +1225,282 @@ void MachineVerifier::verifyLiveIntervals() { const LiveInterval &LI = LiveInts->getInterval(Reg); assert(Reg == LI.reg && "Invalid reg to interval mapping"); + verifyLiveInterval(LI); + } - for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); - I!=E; ++I) { - VNInfo *VNI = *I; - const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def); + // Verify all the cached regunit intervals. + for (unsigned i = 0, e = TRI->getNumRegUnits(); i != e; ++i) + if (const LiveInterval *LI = LiveInts->getCachedRegUnit(i)) + verifyLiveInterval(*LI); +} - if (!DefVNI) { - if (!VNI->isUnused()) { - report("Valno not live at def and not marked unused", MF); - *OS << "Valno #" << VNI->id << " in " << LI << '\n'; - } - continue; - } +void MachineVerifier::verifyLiveIntervalValue(const LiveInterval &LI, + VNInfo *VNI) { + if (VNI->isUnused()) + return; - if (VNI->isUnused()) - continue; + const VNInfo *DefVNI = LI.getVNInfoAt(VNI->def); - if (DefVNI != VNI) { - report("Live range at def has different valno", MF); - *OS << "Valno #" << VNI->id << " is defined at " << VNI->def - << " where valno #" << DefVNI->id << " is live in " << LI << '\n'; - continue; - } + if (!DefVNI) { + report("Valno not live at def and not marked unused", MF, LI); + *OS << "Valno #" << VNI->id << '\n'; + return; + } - const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); - if (!MBB) { - report("Invalid definition index", MF); - *OS << "Valno #" << VNI->id << " is defined at " << VNI->def - << " in " << LI << '\n'; - continue; - } + if (DefVNI != VNI) { + report("Live range at def has different valno", MF, LI); + *OS << "Valno #" << VNI->id << " is defined at " << VNI->def + << " where valno #" << DefVNI->id << " is live\n"; + return; + } - if (VNI->isPHIDef()) { - if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { - report("PHIDef value is not defined at MBB start", MF); - *OS << "Valno #" << VNI->id << " is defined at " << VNI->def - << ", not at the beginning of BB#" << MBB->getNumber() - << " in " << LI << '\n'; - } - } else { - // Non-PHI def. 
- const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); - if (!MI) { - report("No instruction at def index", MF); - *OS << "Valno #" << VNI->id << " is defined at " << VNI->def - << " in " << LI << '\n'; - continue; - } + const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(VNI->def); + if (!MBB) { + report("Invalid definition index", MF, LI); + *OS << "Valno #" << VNI->id << " is defined at " << VNI->def + << " in " << LI << '\n'; + return; + } - bool hasDef = false; - bool isEarlyClobber = false; - for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || !MOI->isDef()) - continue; - if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { - if (MOI->getReg() != LI.reg) - continue; - } else { - if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) || - !TRI->regsOverlap(LI.reg, MOI->getReg())) - continue; - } - hasDef = true; - if (MOI->isEarlyClobber()) - isEarlyClobber = true; - } + if (VNI->isPHIDef()) { + if (VNI->def != LiveInts->getMBBStartIdx(MBB)) { + report("PHIDef value is not defined at MBB start", MBB, LI); + *OS << "Valno #" << VNI->id << " is defined at " << VNI->def + << ", not at the beginning of BB#" << MBB->getNumber() << '\n'; + } + return; + } - if (!hasDef) { - report("Defining instruction does not modify register", MI); - *OS << "Valno #" << VNI->id << " in " << LI << '\n'; - } + // Non-PHI def. + const MachineInstr *MI = LiveInts->getInstructionFromIndex(VNI->def); + if (!MI) { + report("No instruction at def index", MBB, LI); + *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + return; + } - // Early clobber defs begin at USE slots, but other defs must begin at - // DEF slots. - if (isEarlyClobber) { - if (!VNI->def.isEarlyClobber()) { - report("Early clobber def must be at an early-clobber slot", MF); - *OS << "Valno #" << VNI->id << " is defined at " << VNI->def - << " in " << LI << '\n'; - } - } else if (!VNI->def.isRegister()) { - report("Non-PHI, non-early clobber def must be at a register slot", - MF); - *OS << "Valno #" << VNI->id << " is defined at " << VNI->def - << " in " << LI << '\n'; - } - } + bool hasDef = false; + bool isEarlyClobber = false; + for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) { + if (!MOI->isReg() || !MOI->isDef()) + continue; + if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { + if (MOI->getReg() != LI.reg) + continue; + } else { + if (!TargetRegisterInfo::isPhysicalRegister(MOI->getReg()) || + !TRI->hasRegUnit(MOI->getReg(), LI.reg)) + continue; } + hasDef = true; + if (MOI->isEarlyClobber()) + isEarlyClobber = true; + } - for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I) { - const VNInfo *VNI = I->valno; - assert(VNI && "Live range has no valno"); + if (!hasDef) { + report("Defining instruction does not modify register", MI); + *OS << "Valno #" << VNI->id << " in " << LI << '\n'; + } - if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) { - report("Foreign valno in live range", MF); - I->print(*OS); - *OS << " has a valno not in " << LI << '\n'; - } + // Early clobber defs begin at USE slots, but other defs must begin at + // DEF slots. 
+ if (isEarlyClobber) { + if (!VNI->def.isEarlyClobber()) { + report("Early clobber def must be at an early-clobber slot", MBB, LI); + *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + } + } else if (!VNI->def.isRegister()) { + report("Non-PHI, non-early clobber def must be at a register slot", + MBB, LI); + *OS << "Valno #" << VNI->id << " is defined at " << VNI->def << '\n'; + } +} - if (VNI->isUnused()) { - report("Live range valno is marked unused", MF); - I->print(*OS); - *OS << " in " << LI << '\n'; - } +void +MachineVerifier::verifyLiveIntervalSegment(const LiveInterval &LI, + LiveInterval::const_iterator I) { + const VNInfo *VNI = I->valno; + assert(VNI && "Live range has no valno"); + + if (VNI->id >= LI.getNumValNums() || VNI != LI.getValNumInfo(VNI->id)) { + report("Foreign valno in live range", MF, LI); + *OS << *I << " has a bad valno\n"; + } - const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start); - if (!MBB) { - report("Bad start of live segment, no basic block", MF); - I->print(*OS); - *OS << " in " << LI << '\n'; - continue; - } - SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); - if (I->start != MBBStartIdx && I->start != VNI->def) { - report("Live segment must begin at MBB entry or valno def", MBB); - I->print(*OS); - *OS << " in " << LI << '\n' << "Basic block starts at " - << MBBStartIdx << '\n'; - } + if (VNI->isUnused()) { + report("Live range valno is marked unused", MF, LI); + *OS << *I << '\n'; + } - const MachineBasicBlock *EndMBB = - LiveInts->getMBBFromIndex(I->end.getPrevSlot()); - if (!EndMBB) { - report("Bad end of live segment, no basic block", MF); - I->print(*OS); - *OS << " in " << LI << '\n'; - continue; - } + const MachineBasicBlock *MBB = LiveInts->getMBBFromIndex(I->start); + if (!MBB) { + report("Bad start of live segment, no basic block", MF, LI); + *OS << *I << '\n'; + return; + } + SlotIndex MBBStartIdx = LiveInts->getMBBStartIdx(MBB); + if (I->start != MBBStartIdx && I->start != VNI->def) { + report("Live segment must begin at MBB entry or valno def", MBB, LI); + *OS << *I << '\n'; + } - // No more checks for live-out segments. - if (I->end == LiveInts->getMBBEndIdx(EndMBB)) - continue; + const MachineBasicBlock *EndMBB = + LiveInts->getMBBFromIndex(I->end.getPrevSlot()); + if (!EndMBB) { + report("Bad end of live segment, no basic block", MF, LI); + *OS << *I << '\n'; + return; + } - // The live segment is ending inside EndMBB - const MachineInstr *MI = - LiveInts->getInstructionFromIndex(I->end.getPrevSlot()); - if (!MI) { - report("Live segment doesn't end at a valid instruction", EndMBB); - I->print(*OS); - *OS << " in " << LI << '\n' << "Basic block starts at " - << MBBStartIdx << '\n'; + // No more checks for live-out segments. + if (I->end == LiveInts->getMBBEndIdx(EndMBB)) + return; + + // RegUnit intervals are allowed dead phis. + if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && VNI->isPHIDef() && + I->start == VNI->def && I->end == VNI->def.getDeadSlot()) + return; + + // The live segment is ending inside EndMBB + const MachineInstr *MI = + LiveInts->getInstructionFromIndex(I->end.getPrevSlot()); + if (!MI) { + report("Live segment doesn't end at a valid instruction", EndMBB, LI); + *OS << *I << '\n'; + return; + } + + // The block slot must refer to a basic block boundary. + if (I->end.isBlock()) { + report("Live segment ends at B slot of an instruction", EndMBB, LI); + *OS << *I << '\n'; + } + + if (I->end.isDead()) { + // Segment ends on the dead slot. 
+ // That means there must be a dead def. + if (!SlotIndex::isSameInstr(I->start, I->end)) { + report("Live segment ending at dead slot spans instructions", EndMBB, LI); + *OS << *I << '\n'; + } + } + + // A live segment can only end at an early-clobber slot if it is being + // redefined by an early-clobber def. + if (I->end.isEarlyClobber()) { + if (I+1 == LI.end() || (I+1)->start != I->end) { + report("Live segment ending at early clobber slot must be " + "redefined by an EC def in the same instruction", EndMBB, LI); + *OS << *I << '\n'; + } + } + + // The following checks only apply to virtual registers. Physreg liveness + // is too weird to check. + if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { + // A live range can end with either a redefinition, a kill flag on a + // use, or a dead flag on a def. + bool hasRead = false; + bool hasDeadDef = false; + for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) { + if (!MOI->isReg() || MOI->getReg() != LI.reg) continue; - } + if (MOI->readsReg()) + hasRead = true; + if (MOI->isDef() && MOI->isDead()) + hasDeadDef = true; + } - // The block slot must refer to a basic block boundary. - if (I->end.isBlock()) { - report("Live segment ends at B slot of an instruction", MI); + if (I->end.isDead()) { + if (!hasDeadDef) { + report("Instruction doesn't have a dead def operand", MI); I->print(*OS); *OS << " in " << LI << '\n'; } - - if (I->end.isDead()) { - // Segment ends on the dead slot. - // That means there must be a dead def. - if (!SlotIndex::isSameInstr(I->start, I->end)) { - report("Live segment ending at dead slot spans instructions", MI); - I->print(*OS); - *OS << " in " << LI << '\n'; - } - } - - // A live segment can only end at an early-clobber slot if it is being - // redefined by an early-clobber def. - if (I->end.isEarlyClobber()) { - if (I+1 == E || (I+1)->start != I->end) { - report("Live segment ending at early clobber slot must be " - "redefined by an EC def in the same instruction", MI); - I->print(*OS); - *OS << " in " << LI << '\n'; - } + } else { + if (!hasRead) { + report("Instruction ending live range doesn't read the register", MI); + *OS << *I << " in " << LI << '\n'; } + } + } - // The following checks only apply to virtual registers. Physreg liveness - // is too weird to check. - if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { - // A live range can end with either a redefinition, a kill flag on a - // use, or a dead flag on a def. - bool hasRead = false; - bool hasDeadDef = false; - for (ConstMIBundleOperands MOI(MI); MOI.isValid(); ++MOI) { - if (!MOI->isReg() || MOI->getReg() != LI.reg) - continue; - if (MOI->readsReg()) - hasRead = true; - if (MOI->isDef() && MOI->isDead()) - hasDeadDef = true; - } - - if (I->end.isDead()) { - if (!hasDeadDef) { - report("Instruction doesn't have a dead def operand", MI); - I->print(*OS); - *OS << " in " << LI << '\n'; - } - } else { - if (!hasRead) { - report("Instruction ending live range doesn't read the register", - MI); - I->print(*OS); - *OS << " in " << LI << '\n'; - } - } - } + // Now check all the basic blocks in this live segment. + MachineFunction::const_iterator MFI = MBB; + // Is this live range the beginning of a non-PHIDef VN? + if (I->start == VNI->def && !VNI->isPHIDef()) { + // Not live-in to any blocks. + if (MBB == EndMBB) + return; + // Skip this block. + ++MFI; + } + for (;;) { + assert(LiveInts->isLiveInToMBB(LI, MFI)); + // We don't know how to track physregs into a landing pad. 
+ if (!TargetRegisterInfo::isVirtualRegister(LI.reg) && + MFI->isLandingPad()) { + if (&*MFI == EndMBB) + break; + ++MFI; + continue; + } - // Now check all the basic blocks in this live segment. - MachineFunction::const_iterator MFI = MBB; - // Is this live range the beginning of a non-PHIDef VN? - if (I->start == VNI->def && !VNI->isPHIDef()) { - // Not live-in to any blocks. - if (MBB == EndMBB) - continue; - // Skip this block. - ++MFI; + // Is VNI a PHI-def in the current block? + bool IsPHI = VNI->isPHIDef() && + VNI->def == LiveInts->getMBBStartIdx(MFI); + + // Check that VNI is live-out of all predecessors. + for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(), + PE = MFI->pred_end(); PI != PE; ++PI) { + SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI); + const VNInfo *PVNI = LI.getVNInfoBefore(PEnd); + + // All predecessors must have a live-out value. + if (!PVNI) { + report("Register not marked live out of predecessor", *PI, LI); + *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() + << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before " + << PEnd << '\n'; + continue; } - for (;;) { - assert(LiveInts->isLiveInToMBB(LI, MFI)); - // We don't know how to track physregs into a landing pad. - if (TargetRegisterInfo::isPhysicalRegister(LI.reg) && - MFI->isLandingPad()) { - if (&*MFI == EndMBB) - break; - ++MFI; - continue; - } - // Is VNI a PHI-def in the current block? - bool IsPHI = VNI->isPHIDef() && - VNI->def == LiveInts->getMBBStartIdx(MFI); - - // Check that VNI is live-out of all predecessors. - for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(), - PE = MFI->pred_end(); PI != PE; ++PI) { - SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI); - const VNInfo *PVNI = LI.getVNInfoBefore(PEnd); - - // All predecessors must have a live-out value. - if (!PVNI) { - report("Register not marked live out of predecessor", *PI); - *OS << "Valno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << ", not live before " - << PEnd << " in " << LI << '\n'; - continue; - } - - // Only PHI-defs can take different predecessor values. - if (!IsPHI && PVNI != VNI) { - report("Different value live out of predecessor", *PI); - *OS << "Valno #" << PVNI->id << " live out of BB#" - << (*PI)->getNumber() << '@' << PEnd - << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() - << '@' << LiveInts->getMBBStartIdx(MFI) << " in " - << PrintReg(Reg) << ": " << LI << '\n'; - } - } - if (&*MFI == EndMBB) - break; - ++MFI; + // Only PHI-defs can take different predecessor values. + if (!IsPHI && PVNI != VNI) { + report("Different value live out of predecessor", *PI, LI); + *OS << "Valno #" << PVNI->id << " live out of BB#" + << (*PI)->getNumber() << '@' << PEnd + << "\nValno #" << VNI->id << " live into BB#" << MFI->getNumber() + << '@' << LiveInts->getMBBStartIdx(MFI) << '\n'; } } + if (&*MFI == EndMBB) + break; + ++MFI; + } +} - // Check the LI only has one connected component. 
- if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { - ConnectedVNInfoEqClasses ConEQ(*LiveInts); - unsigned NumComp = ConEQ.Classify(&LI); - if (NumComp > 1) { - report("Multiple connected components in live interval", MF); - *OS << NumComp << " components in " << LI << '\n'; - for (unsigned comp = 0; comp != NumComp; ++comp) { - *OS << comp << ": valnos"; - for (LiveInterval::const_vni_iterator I = LI.vni_begin(), - E = LI.vni_end(); I!=E; ++I) - if (comp == ConEQ.getEqClass(*I)) - *OS << ' ' << (*I)->id; - *OS << '\n'; - } +void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { + for (LiveInterval::const_vni_iterator I = LI.vni_begin(), E = LI.vni_end(); + I!=E; ++I) + verifyLiveIntervalValue(LI, *I); + + for (LiveInterval::const_iterator I = LI.begin(), E = LI.end(); I!=E; ++I) + verifyLiveIntervalSegment(LI, I); + + // Check the LI only has one connected component. + if (TargetRegisterInfo::isVirtualRegister(LI.reg)) { + ConnectedVNInfoEqClasses ConEQ(*LiveInts); + unsigned NumComp = ConEQ.Classify(&LI); + if (NumComp > 1) { + report("Multiple connected components in live interval", MF, LI); + for (unsigned comp = 0; comp != NumComp; ++comp) { + *OS << comp << ": valnos"; + for (LiveInterval::const_vni_iterator I = LI.vni_begin(), + E = LI.vni_end(); I!=E; ++I) + if (comp == ConEQ.getEqClass(*I)) + *OS << ' ' << (*I)->id; + *OS << '\n'; } } } diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index b820578d6c..e6e23da27c 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -194,7 +194,7 @@ static bool isSourceDefinedByImplicitDef(const MachineInstr *MPhi, /// LowerAtomicPHINode - Lower the PHI node at the top of the specified block, -/// under the assuption that it needs to be lowered in a way that supports +/// under the assumption that it needs to be lowered in a way that supports /// atomic execution of PHIs. This lowering method is always correct all of the /// time. /// @@ -355,39 +355,35 @@ void PHIElimination::LowerAtomicPHINode( // add a kill marker in this block saying that it kills the incoming value! if (!ValueIsUsed && !LV->isLiveOut(SrcReg, opBlock)) { // In our final twist, we have to decide which instruction kills the - // register. In most cases this is the copy, however, the first - // terminator instruction at the end of the block may also use the value. - // In this case, we should mark *it* as being the killing block, not the - // copy. - MachineBasicBlock::iterator KillInst; - MachineBasicBlock::iterator Term = opBlock.getFirstTerminator(); - if (Term != opBlock.end() && Term->readsRegister(SrcReg)) { - KillInst = Term; - - // Check that no other terminators use values. -#ifndef NDEBUG - for (MachineBasicBlock::iterator TI = llvm::next(Term); - TI != opBlock.end(); ++TI) { - if (TI->isDebugValue()) - continue; - assert(!TI->readsRegister(SrcReg) && - "Terminator instructions cannot use virtual registers unless" - "they are the first terminator in a block!"); - } -#endif - } else if (reusedIncoming || !IncomingReg) { - // We may have to rewind a bit if we didn't insert a copy this time. - KillInst = Term; - while (KillInst != opBlock.begin()) { - --KillInst; - if (KillInst->isDebugValue()) - continue; - if (KillInst->readsRegister(SrcReg)) - break; + // register. In most cases this is the copy, however, terminator + // instructions at the end of the block may also use the value. In this + // case, we should mark the last such terminator as being the killing + // block, not the copy. 
+ MachineBasicBlock::iterator KillInst = opBlock.end(); + MachineBasicBlock::iterator FirstTerm = opBlock.getFirstTerminator(); + for (MachineBasicBlock::iterator Term = FirstTerm; + Term != opBlock.end(); ++Term) { + if (Term->readsRegister(SrcReg)) + KillInst = Term; + } + + if (KillInst == opBlock.end()) { + // No terminator uses the register. + + if (reusedIncoming || !IncomingReg) { + // We may have to rewind a bit if we didn't insert a copy this time. + KillInst = FirstTerm; + while (KillInst != opBlock.begin()) { + --KillInst; + if (KillInst->isDebugValue()) + continue; + if (KillInst->readsRegister(SrcReg)) + break; + } + } else { + // We just inserted this copy. + KillInst = prior(InsertPos); } - } else { - // We just inserted this copy. - KillInst = prior(InsertPos); } assert(KillInst->readsRegister(SrcReg) && "Cannot find kill instruction"); @@ -427,28 +423,71 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF, if (MBB.empty() || !MBB.front().isPHI() || MBB.isLandingPad()) return false; // Quick exit for basic blocks without PHIs. + const MachineLoop *CurLoop = MLI ? MLI->getLoopFor(&MBB) : 0; + bool IsLoopHeader = CurLoop && &MBB == CurLoop->getHeader(); + bool Changed = false; for (MachineBasicBlock::iterator BBI = MBB.begin(), BBE = MBB.end(); BBI != BBE && BBI->isPHI(); ++BBI) { for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2) { unsigned Reg = BBI->getOperand(i).getReg(); MachineBasicBlock *PreMBB = BBI->getOperand(i+1).getMBB(); - // We break edges when registers are live out from the predecessor block - // (not considering PHI nodes). If the register is live in to this block - // anyway, we would gain nothing from splitting. + // Is there a critical edge from PreMBB to MBB? + if (PreMBB->succ_size() == 1) + continue; + // Avoid splitting backedges of loops. It would introduce small // out-of-line blocks into the loop which is very bad for code placement. - if (PreMBB != &MBB && - !LV.isLiveIn(Reg, MBB) && LV.isLiveOut(Reg, *PreMBB)) { - if (!MLI || - !(MLI->getLoopFor(PreMBB) == MLI->getLoopFor(&MBB) && - MLI->isLoopHeader(&MBB))) { - if (PreMBB->SplitCriticalEdge(&MBB, this)) { - Changed = true; - ++NumCriticalEdgesSplit; - } - } + if (PreMBB == &MBB) + continue; + const MachineLoop *PreLoop = MLI ? MLI->getLoopFor(PreMBB) : 0; + if (IsLoopHeader && PreLoop == CurLoop) + continue; + + // LV doesn't consider a phi use live-out, so isLiveOut only returns true + // when the source register is live-out for some other reason than a phi + // use. That means the copy we will insert in PreMBB won't be a kill, and + // there is a risk it may not be coalesced away. + // + // If the copy would be a kill, there is no need to split the edge. + if (!LV.isLiveOut(Reg, *PreMBB)) + continue; + + DEBUG(dbgs() << PrintReg(Reg) << " live-out before critical edge BB#" + << PreMBB->getNumber() << " -> BB#" << MBB.getNumber() + << ": " << *BBI); + + // If Reg is not live-in to MBB, it means it must be live-in to some + // other PreMBB successor, and we can avoid the interference by splitting + // the edge. + // + // If Reg *is* live-in to MBB, the interference is inevitable and a copy + // is likely to be left after coalescing. If we are looking at a loop + // exiting edge, split it so we won't insert code in the loop, otherwise + // don't bother. + bool ShouldSplit = !LV.isLiveIn(Reg, MBB); + + // Check for a loop exiting edge. 
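      // (Editorial illustration, not part of the patch: if PreMBB sits inside
      // a loop and MBB is outside it, the PHI copy would otherwise be placed
      // in PreMBB and execute on every iteration; splitting the loop-exiting
      // edge moves that copy out of the loop body.)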
+ if (!ShouldSplit && CurLoop != PreLoop) { + DEBUG({ + dbgs() << "Split wouldn't help, maybe avoid loop copies?\n"; + if (PreLoop) dbgs() << "PreLoop: " << *PreLoop; + if (CurLoop) dbgs() << "CurLoop: " << *CurLoop; + }); + // This edge could be entering a loop, exiting a loop, or it could be + // both: Jumping directly from one loop to the header of a sibling + // loop. + // Split unless this edge is entering CurLoop from an outer loop. + ShouldSplit = PreLoop && !PreLoop->contains(CurLoop); + } + if (!ShouldSplit) + continue; + if (!PreMBB->SplitCriticalEdge(&MBB, this)) { + DEBUG(dbgs() << "Failed to split critical edge.\n"); + continue; } + Changed = true; + ++NumCriticalEdgesSplit; } } return Changed; diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 9693780bda..56526f2732 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -49,6 +49,8 @@ static cl::opt<bool> DisableSSC("disable-ssc", cl::Hidden, cl::desc("Disable Stack Slot Coloring")); static cl::opt<bool> DisableMachineDCE("disable-machine-dce", cl::Hidden, cl::desc("Disable Machine Dead Code Elimination")); +static cl::opt<bool> EnableEarlyIfConversion("enable-early-ifcvt", cl::Hidden, + cl::desc("Enable Early If-conversion")); static cl::opt<bool> DisableMachineLICM("disable-machine-licm", cl::Hidden, cl::desc("Disable Machine LICM")); static cl::opt<bool> DisableMachineCSE("disable-machine-cse", cl::Hidden, @@ -86,6 +88,10 @@ PrintMachineInstrs("print-machineinstrs", cl::ValueOptional, cl::desc("Print machine instrs"), cl::value_desc("pass-name"), cl::init("option-unspecified")); +// Experimental option to run live interval analysis early. +static cl::opt<bool> EarlyLiveIntervals("early-live-intervals", cl::Hidden, + cl::desc("Run live interval analysis earlier in the pipeline")); + /// Allow standard passes to be disabled by command line options. This supports /// simple binary flags that either suppress the pass or do nothing. /// i.e. -disable-mypass=false has no effect. @@ -154,6 +160,9 @@ static AnalysisID overridePass(AnalysisID StandardID, AnalysisID TargetID) { if (StandardID == &DeadMachineInstructionElimID) return applyDisable(TargetID, DisableMachineDCE); + if (StandardID == &EarlyIfConverterID) + return applyDisable(TargetID, !EnableEarlyIfConversion); + if (StandardID == &MachineLICMID) return applyDisable(TargetID, DisableMachineLICM); @@ -228,6 +237,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm) substitutePass(&EarlyTailDuplicateID, &TailDuplicateID); substitutePass(&PostRAMachineLICMID, &MachineLICMID); + // Disable early if-conversion. Targets that are ready can enable it. + disablePass(&EarlyIfConverterID); + // Temporarily disable experimental passes. substitutePass(&MachineSchedulerID, 0); } @@ -425,9 +437,6 @@ void TargetPassConfig::addISelPrepare() { /// TODO: We could use a single addPre/Post(ID) hook to allow pass injection /// before/after any target-independent pass. But it's currently overkill. void TargetPassConfig::addMachinePasses() { - // Print the instruction selected machine code... - printAndVerify("After Instruction Selection"); - // Insert a machine instr printer pass after the specified pass. // If -print-machineinstrs specified, print machineinstrs after all passes. if (StringRef(PrintMachineInstrs.getValue()).equals("")) @@ -443,8 +452,12 @@ void TargetPassConfig::addMachinePasses() { insertPass(TID, IID); } + // Print the instruction selected machine code...
+ printAndVerify("After Instruction Selection"); + // Expand pseudo-instructions emitted by ISel. - addPass(&ExpandISelPseudosID); + if (addPass(&ExpandISelPseudosID)) + printAndVerify("After ExpandISelPseudos"); // Add passes that optimize machine instructions in SSA form. if (getOptLevel() != CodeGenOpt::None) { @@ -527,6 +540,7 @@ void TargetPassConfig::addMachineSSAOptimization() { addPass(&DeadMachineInstructionElimID); printAndVerify("After codegen DCE pass"); + addPass(&EarlyIfConverterID); addPass(&MachineLICMID); addPass(&MachineCSEID); addPass(&MachineSinkingID); @@ -639,6 +653,11 @@ void TargetPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) { addPass(&MachineLoopInfoID); addPass(&PHIEliminationID); } + + // Eventually, we want to run LiveIntervals before PHI elimination. + if (EarlyLiveIntervals) + addPass(&LiveIntervalsID); + addPass(&TwoAddressInstructionPassID); if (EnableStrongPHIElim) diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index 91c33c4af4..9099862bd3 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -78,6 +78,8 @@ STATISTIC(NumReuse, "Number of extension results reused"); STATISTIC(NumBitcasts, "Number of bitcasts eliminated"); STATISTIC(NumCmps, "Number of compares eliminated"); STATISTIC(NumImmFold, "Number of move immediate folded"); +STATISTIC(NumLoadFold, "Number of loads folded"); +STATISTIC(NumSelects, "Number of selects optimized"); namespace { class PeepholeOptimizer : public MachineFunctionPass { @@ -108,12 +110,14 @@ namespace { bool optimizeCmpInstr(MachineInstr *MI, MachineBasicBlock *MBB); bool optimizeExtInstr(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &LocalMIs); + bool optimizeSelect(MachineInstr *MI); bool isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); bool foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs); + bool isLoadFoldable(MachineInstr *MI, unsigned &FoldAsLoadDefReg); }; } @@ -384,6 +388,47 @@ bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr *MI, return false; } +/// Optimize a select instruction. +bool PeepholeOptimizer::optimizeSelect(MachineInstr *MI) { + unsigned TrueOp = 0; + unsigned FalseOp = 0; + bool Optimizable = false; + SmallVector<MachineOperand, 4> Cond; + if (TII->analyzeSelect(MI, Cond, TrueOp, FalseOp, Optimizable)) + return false; + if (!Optimizable) + return false; + if (!TII->optimizeSelect(MI)) + return false; + MI->eraseFromParent(); + ++NumSelects; + return true; +} + +/// isLoadFoldable - Check whether MI is a candidate for folding into a later +/// instruction. We only fold loads to virtual registers and the virtual +/// register defined has a single use. +bool PeepholeOptimizer::isLoadFoldable(MachineInstr *MI, + unsigned &FoldAsLoadDefReg) { + if (!MI->canFoldAsLoad() || !MI->mayLoad()) + return false; + const MCInstrDesc &MCID = MI->getDesc(); + if (MCID.getNumDefs() != 1) + return false; + + unsigned Reg = MI->getOperand(0).getReg(); + // To reduce compilation time, we check MRI->hasOneUse when inserting + // loads. It should be checked when processing uses of the load, since + // uses can be removed during peephole. 
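  // (Editorial note, not part of the patch: checking hasOneUse here is a
  // compile-time shortcut; a register with several uses at this point may
  // become single-use after a later peephole deletes one of them, and that
  // late opportunity is deliberately not re-examined.)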
+ if (!MI->getOperand(0).getSubReg() && + TargetRegisterInfo::isVirtualRegister(Reg) && + MRI->hasOneUse(Reg)) { + FoldAsLoadDefReg = Reg; + return true; + } + return false; +} + bool PeepholeOptimizer::isMoveImmediate(MachineInstr *MI, SmallSet<unsigned, 4> &ImmDefRegs, DenseMap<unsigned, MachineInstr*> &ImmDefMIs) { @@ -441,6 +486,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { SmallPtrSet<MachineInstr*, 8> LocalMIs; SmallSet<unsigned, 4> ImmDefRegs; DenseMap<unsigned, MachineInstr*> ImmDefMIs; + unsigned FoldAsLoadDefReg; for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ++I) { MachineBasicBlock *MBB = &*I; @@ -448,37 +494,33 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { LocalMIs.clear(); ImmDefRegs.clear(); ImmDefMIs.clear(); + FoldAsLoadDefReg = 0; - bool First = true; - MachineBasicBlock::iterator PMII; for (MachineBasicBlock::iterator MII = I->begin(), MIE = I->end(); MII != MIE; ) { MachineInstr *MI = &*MII; + // We may be erasing MI below, increment MII now. + ++MII; LocalMIs.insert(MI); + // If there exists an instruction which belongs to the following + // categories, we will discard the load candidate. if (MI->isLabel() || MI->isPHI() || MI->isImplicitDef() || MI->isKill() || MI->isInlineAsm() || MI->isDebugValue() || MI->hasUnmodeledSideEffects()) { - ++MII; + FoldAsLoadDefReg = 0; continue; } - - if (MI->isBitcast()) { - if (optimizeBitcastInstr(MI, MBB)) { - // MI is deleted. - LocalMIs.erase(MI); - Changed = true; - MII = First ? I->begin() : llvm::next(PMII); - continue; - } - } else if (MI->isCompare()) { - if (optimizeCmpInstr(MI, MBB)) { - // MI is deleted. - LocalMIs.erase(MI); - Changed = true; - MII = First ? I->begin() : llvm::next(PMII); - continue; - } + if (MI->mayStore() || MI->isCall()) + FoldAsLoadDefReg = 0; + + if ((MI->isBitcast() && optimizeBitcastInstr(MI, MBB)) || + (MI->isCompare() && optimizeCmpInstr(MI, MBB)) || + (MI->isSelect() && optimizeSelect(MI))) { + // MI is deleted. + LocalMIs.erase(MI); + Changed = true; + continue; } if (isMoveImmediate(MI, ImmDefRegs, ImmDefMIs)) { @@ -489,9 +531,29 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { Changed |= foldImmediate(MI, MBB, ImmDefRegs, ImmDefMIs); } - First = false; - PMII = MII; - ++MII; + // Check whether MI is a load candidate for folding into a later + // instruction. If MI is not a candidate, check whether we can fold an + // earlier load into MI. + if (!isLoadFoldable(MI, FoldAsLoadDefReg) && FoldAsLoadDefReg) { + // We need to fold load after optimizeCmpInstr, since optimizeCmpInstr + // can enable folding by converting SUB to CMP. + MachineInstr *DefMI = 0; + MachineInstr *FoldMI = TII->optimizeLoadInstr(MI, MRI, + FoldAsLoadDefReg, DefMI); + if (FoldMI) { + // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI. + LocalMIs.erase(MI); + LocalMIs.erase(DefMI); + LocalMIs.insert(FoldMI); + MI->eraseFromParent(); + DefMI->eraseFromParent(); + ++NumLoadFold; + + // MI is replaced with FoldMI. + Changed = true; + continue; + } + } } } diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index 8325f20e41..6b3a48eefd 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -201,20 +201,16 @@ int RAFast::getStackSpaceFor(unsigned VirtReg, const TargetRegisterClass *RC) { /// its virtual register, and it is guaranteed to be a block-local register. 
/// bool RAFast::isLastUseOfLocalReg(MachineOperand &MO) { - // Check for non-debug uses or defs following MO. - // This is the most likely way to fail - fast path it. - MachineOperand *Next = &MO; - while ((Next = Next->getNextOperandForReg())) - if (!Next->isDebug()) - return false; - // If the register has ever been spilled or reloaded, we conservatively assume // it is a global register used in multiple blocks. if (StackSlotForVirtReg[MO.getReg()] != -1) return false; // Check that the use/def chain has exactly one operand - MO. - return &MRI->reg_nodbg_begin(MO.getReg()).getOperand() == &MO; + MachineRegisterInfo::reg_nodbg_iterator I = MRI->reg_nodbg_begin(MO.getReg()); + if (&I.getOperand() != &MO) + return false; + return ++I == MRI->reg_nodbg_end(); } /// addKillFlag - Set kill flags on last use of a virtual register. diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp index 6ac5428605..d0cff481cb 100644 --- a/lib/CodeGen/RegAllocGreedy.cpp +++ b/lib/CodeGen/RegAllocGreedy.cpp @@ -1747,7 +1747,7 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg, bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { DEBUG(dbgs() << "********** GREEDY REGISTER ALLOCATION **********\n" << "********** Function: " - << ((Value*)mf.getFunction())->getName() << '\n'); + << mf.getFunction()->getName() << '\n'); MF = &mf; if (VerifyEnabled) diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index 733312fbd0..990633440e 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -460,14 +460,8 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP, IntB.addRange(LiveRange(FillerStart, FillerEnd, BValNo)); // Okay, merge "B1" into the same value number as "B0". - if (BValNo != ValLR->valno) { - // If B1 is killed by a PHI, then the merged live range must also be killed - // by the same PHI, as B0 and B1 can not overlap. - bool HasPHIKill = BValNo->hasPHIKill(); + if (BValNo != ValLR->valno) IntB.MergeValueNumberInto(BValNo, ValLR->valno); - if (HasPHIKill) - ValLR->valno->setHasPHIKill(true); - } DEBUG(dbgs() << " result = " << IntB << '\n'); // If the source instruction was killing the source register before the @@ -494,6 +488,11 @@ bool RegisterCoalescer::hasOtherReachingDefs(LiveInterval &IntA, LiveInterval &IntB, VNInfo *AValNo, VNInfo *BValNo) { + // If AValNo has PHI kills, conservatively assume that IntB defs can reach + // the PHI values. + if (LIS->hasPHIKill(IntA, AValNo)) + return true; + for (LiveInterval::iterator AI = IntA.begin(), AE = IntA.end(); AI != AE; ++AI) { if (AI->valno != AValNo) continue; @@ -558,10 +557,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, // AValNo is the value number in A that defines the copy, A3 in the example. VNInfo *AValNo = IntA.getVNInfoAt(CopyIdx.getRegSlot(true)); assert(AValNo && "COPY source not live"); - - // If other defs can reach uses of this def, then it's not safe to perform - // the optimization. - if (AValNo->isPHIDef() || AValNo->isUnused() || AValNo->hasPHIKill()) + if (AValNo->isPHIDef() || AValNo->isUnused()) return false; MachineInstr *DefMI = LIS->getInstructionFromIndex(AValNo->def); if (!DefMI) @@ -657,6 +653,8 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP, LiveInterval::iterator ULR = IntA.FindLiveRangeContaining(UseIdx); if (ULR == IntA.end() || ULR->valno != AValNo) continue; + // Kill flags are no longer accurate. They are recomputed after RA. 
+ UseMO.setIsKill(false); if (TargetRegisterInfo::isPhysicalRegister(NewReg)) UseMO.substPhysReg(NewReg, *TRI); else @@ -1093,6 +1091,11 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) { // register live range doesn't need to be accurate as long as all the // defs are there. + // Delete the identity copy. + MachineInstr *CopyMI = MRI->getVRegDef(RHS.reg); + LIS->RemoveMachineInstrFromMaps(CopyMI); + CopyMI->eraseFromParent(); + // We don't track kills for reserved registers. MRI->clearKillFlags(CP.getSrcReg()); @@ -1382,24 +1385,6 @@ bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) { ++J; } - // Update kill info. Some live ranges are extended due to copy coalescing. - for (DenseMap<VNInfo*, VNInfo*>::iterator I = LHSValsDefinedFromRHS.begin(), - E = LHSValsDefinedFromRHS.end(); I != E; ++I) { - VNInfo *VNI = I->first; - unsigned LHSValID = LHSValNoAssignments[VNI->id]; - if (VNI->hasPHIKill()) - NewVNInfo[LHSValID]->setHasPHIKill(true); - } - - // Update kill info. Some live ranges are extended due to copy coalescing. - for (DenseMap<VNInfo*, VNInfo*>::iterator I = RHSValsDefinedFromLHS.begin(), - E = RHSValsDefinedFromLHS.end(); I != E; ++I) { - VNInfo *VNI = I->first; - unsigned RHSValID = RHSValNoAssignments[VNI->id]; - if (VNI->hasPHIKill()) - NewVNInfo[RHSValID]->setHasPHIKill(true); - } - // Clear kill flags where live ranges are extended. while (!LHSOldKills.empty()) LHSOldKills.pop_back_val()->clearRegisterKills(LHS.reg, TRI); diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp index 110f478f48..9c1dba355b 100644 --- a/lib/CodeGen/ScheduleDAGInstrs.cpp +++ b/lib/CodeGen/ScheduleDAGInstrs.cpp @@ -411,12 +411,11 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) { const MachineInstr *MI = SU->getInstr(); unsigned Reg = MI->getOperand(OperIdx).getReg(); - // SSA defs do not have output/anti dependencies. + // Singly defined vregs do not have output/anti dependencies. // The current operand is a def, so we have at least one. - // - // FIXME: This optimization is disabled pending PR13112. - //if (llvm::next(MRI.def_begin(Reg)) == MRI.def_end()) - // return; + // Check here if there are any others... + if (MRI.hasOneDef(Reg)) + return; // Add output dependence to the next nearest def of this vreg. // diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp index 7110b7566a..e675366485 100644 --- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp +++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp @@ -72,10 +72,12 @@ ScoreboardHazardRecognizer(const InstrItineraryData *II, ReservedScoreboard.reset(ScoreboardDepth); RequiredScoreboard.reset(ScoreboardDepth); + // If MaxLookAhead is not set above, then we are not enabled. if (!isEnabled()) DEBUG(dbgs() << "Disabled scoreboard hazard recognizer\n"); else { - IssueWidth = ItinData->Props.IssueWidth; + // A nonempty itinerary must have a SchedModel. 
+ IssueWidth = ItinData->SchedModel->IssueWidth; DEBUG(dbgs() << "Using scoreboard hazard recognizer: Depth = " << ScoreboardDepth << '\n'); } diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index f16b1e7f4a..10491d98cb 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -228,6 +228,9 @@ namespace { SDValue visitFP_EXTEND(SDNode *N); SDValue visitFNEG(SDNode *N); SDValue visitFABS(SDNode *N); + SDValue visitFCEIL(SDNode *N); + SDValue visitFTRUNC(SDNode *N); + SDValue visitFFLOOR(SDNode *N); SDValue visitBRCOND(SDNode *N); SDValue visitBR_CC(SDNode *N); SDValue visitLOAD(SDNode *N); @@ -1140,6 +1143,9 @@ SDValue DAGCombiner::visit(SDNode *N) { case ISD::FP_EXTEND: return visitFP_EXTEND(N); case ISD::FNEG: return visitFNEG(N); case ISD::FABS: return visitFABS(N); + case ISD::FFLOOR: return visitFFLOOR(N); + case ISD::FCEIL: return visitFCEIL(N); + case ISD::FTRUNC: return visitFTRUNC(N); case ISD::BRCOND: return visitBRCOND(N); case ISD::BR_CC: return visitBR_CC(N); case ISD::LOAD: return visitLOAD(N); @@ -1639,7 +1645,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) { if (N1.getOpcode() == ISD::ADD && N0C && N1C1) { SDValue NewC = DAG.getConstant((N0C->getAPIntValue() - N1C1->getAPIntValue()), VT); return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC, - N1.getOperand(0)); + N1.getOperand(0)); } // fold ((A+(B+or-C))-B) -> A+or-C if (N0.getOpcode() == ISD::ADD && @@ -2340,7 +2346,7 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) { // We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper // on scalars. if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR) - && Level == AfterLegalizeVectorOps) { + && Level == AfterLegalizeTypes) { SDValue In0 = N0.getOperand(0); SDValue In1 = N1.getOperand(0); EVT In0Ty = In0.getValueType(); @@ -2716,6 +2722,34 @@ SDValue DAGCombiner::visitAND(SDNode *N) { } } + if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL && + VT.getSizeInBits() <= 64) { + if (ConstantSDNode *ADDI = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + APInt ADDC = ADDI->getAPIntValue(); + if (!TLI.isLegalAddImmediate(ADDC.getSExtValue())) { + // Look for (and (add x, c1), (lshr y, c2)). If C1 wasn't a legal + // immediate for an add, but it is legal if its top c2 bits are set, + // transform the ADD so the immediate doesn't need to be materialized + // in a register. + if (ConstantSDNode *SRLI = dyn_cast<ConstantSDNode>(N1.getOperand(1))) { + APInt Mask = APInt::getHighBitsSet(VT.getSizeInBits(), + SRLI->getZExtValue()); + if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) { + ADDC |= Mask; + if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) { + SDValue NewAdd = + DAG.getNode(ISD::ADD, N0.getDebugLoc(), VT, + N0.getOperand(0), DAG.getConstant(ADDC, VT)); + CombineTo(N0.getNode(), NewAdd); + return SDValue(N, 0); // Return N so it doesn't get rechecked! 
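          // (Editorial worked example, not part of the patch; x86-64-style add
          // immediates are assumed.) In
          //   (and (add x, 0x00000000fffffff0), (srl y, 32))
          // the 64-bit add constant is not a sign-extendable 32-bit immediate,
          // but the srl guarantees the top 32 bits of the AND result are zero,
          // so the constant may be widened to 0xfffffffffffffff0 (-16), which
          // is directly encodable and needs no separate materialization.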
+ } + } + } + } + } + } + + return SDValue(); } @@ -5651,7 +5685,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) { if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || DAG.getTarget().Options.UnsafeFPMath) && DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && - TLI.isOperationLegal(ISD::FMA, VT)) { + TLI.isOperationLegalOrCustom(ISD::FMA, VT)) { // fold (fadd (fmul x, y), z) -> (fma x, y, z) if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { @@ -5676,6 +5710,7 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1); EVT VT = N->getValueType(0); + DebugLoc dl = N->getDebugLoc(); // fold vector ops if (VT.isVector()) { @@ -5696,11 +5731,11 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) return GetNegatedExpression(N1, DAG, LegalOperations); if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT)) - return DAG.getNode(ISD::FNEG, N->getDebugLoc(), VT, N1); + return DAG.getNode(ISD::FNEG, dl, VT, N1); } // fold (fsub A, (fneg B)) -> (fadd A, B) if (isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options)) - return DAG.getNode(ISD::FADD, N->getDebugLoc(), VT, N0, + return DAG.getNode(ISD::FADD, dl, VT, N0, GetNegatedExpression(N1, DAG, LegalOperations)); // If 'unsafe math' is enabled, fold @@ -5728,23 +5763,34 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) { if ((DAG.getTarget().Options.AllowFPOpFusion == FPOpFusion::Fast || DAG.getTarget().Options.UnsafeFPMath) && DAG.getTarget().getTargetLowering()->isFMAFasterThanMulAndAdd(VT) && - TLI.isOperationLegal(ISD::FMA, VT)) { + TLI.isOperationLegalOrCustom(ISD::FMA, VT)) { // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z)) if (N0.getOpcode() == ISD::FMUL && N0->hasOneUse()) { - return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, + return DAG.getNode(ISD::FMA, dl, VT, N0.getOperand(0), N0.getOperand(1), - DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, N1)); + DAG.getNode(ISD::FNEG, dl, VT, N1)); } // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x) // Note: Commutes FSUB operands. if (N1.getOpcode() == ISD::FMUL && N1->hasOneUse()) { - return DAG.getNode(ISD::FMA, N->getDebugLoc(), VT, - DAG.getNode(ISD::FNEG, N1->getDebugLoc(), VT, + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, N1.getOperand(0)), N1.getOperand(1), N0); } + + // fold (fsub (-(fmul, x, y)), z) -> (fma (fneg x), y, (fneg z)) + if (N0.getOpcode() == ISD::FNEG && + N0.getOperand(0).getOpcode() == ISD::FMUL && + N0->hasOneUse() && N0.getOperand(0).hasOneUse()) { + SDValue N00 = N0.getOperand(0).getOperand(0); + SDValue N01 = N0.getOperand(0).getOperand(1); + return DAG.getNode(ISD::FMA, dl, VT, + DAG.getNode(ISD::FNEG, dl, VT, N00), N01, + DAG.getNode(ISD::FNEG, dl, VT, N1)); + } } return SDValue(); @@ -5974,6 +6020,38 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), VT, N0); } + // The next optimizations are desireable only if SELECT_CC can be lowered. 
+ // Check against MVT::Other for SELECT_CC, which is a workaround for targets + // having to say they don't support SELECT_CC on every type the DAG knows + // about, since there is no way to mark an opcode illegal at all value types + // (See also visitSELECT) + if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) { + // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) + if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && + !VT.isVector() && + (!LegalOperations || + TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { + SDValue Ops[] = + { N0.getOperand(0), N0.getOperand(1), + DAG.getConstantFP(-1.0, VT) , DAG.getConstantFP(0.0, VT), + N0.getOperand(2) }; + return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5); + } + + // fold (sint_to_fp (zext (setcc x, y, cc))) -> + // (select_cc x, y, 1.0, 0.0,, cc) + if (N0.getOpcode() == ISD::ZERO_EXTEND && + N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() && + (!LegalOperations || + TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { + SDValue Ops[] = + { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1), + DAG.getConstantFP(1.0, VT) , DAG.getConstantFP(0.0, VT), + N0.getOperand(0).getOperand(2) }; + return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5); + } + } + return SDValue(); } @@ -5999,6 +6077,25 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { return DAG.getNode(ISD::SINT_TO_FP, N->getDebugLoc(), VT, N0); } + // The next optimizations are desireable only if SELECT_CC can be lowered. + // Check against MVT::Other for SELECT_CC, which is a workaround for targets + // having to say they don't support SELECT_CC on every type the DAG knows + // about, since there is no way to mark an opcode illegal at all value types + // (See also visitSELECT) + if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, MVT::Other)) { + // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc) + + if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && + (!LegalOperations || + TLI.isOperationLegalOrCustom(llvm::ISD::ConstantFP, VT))) { + SDValue Ops[] = + { N0.getOperand(0), N0.getOperand(1), + DAG.getConstantFP(1.0, VT), DAG.getConstantFP(0.0, VT), + N0.getOperand(2) }; + return DAG.getNode(ISD::SELECT_CC, N->getDebugLoc(), VT, Ops, 5); + } + } + return SDValue(); } @@ -6152,6 +6249,42 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { return SDValue(); } +SDValue DAGCombiner::visitFCEIL(SDNode *N) { + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + EVT VT = N->getValueType(0); + + // fold (fceil c1) -> fceil(c1) + if (N0CFP && VT != MVT::ppcf128) + return DAG.getNode(ISD::FCEIL, N->getDebugLoc(), VT, N0); + + return SDValue(); +} + +SDValue DAGCombiner::visitFTRUNC(SDNode *N) { + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + EVT VT = N->getValueType(0); + + // fold (ftrunc c1) -> ftrunc(c1) + if (N0CFP && VT != MVT::ppcf128) + return DAG.getNode(ISD::FTRUNC, N->getDebugLoc(), VT, N0); + + return SDValue(); +} + +SDValue DAGCombiner::visitFFLOOR(SDNode *N) { + SDValue N0 = N->getOperand(0); + ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0); + EVT VT = N->getValueType(0); + + // fold (ffloor c1) -> ffloor(c1) + if (N0CFP && VT != MVT::ppcf128) + return DAG.getNode(ISD::FFLOOR, N->getDebugLoc(), VT, N0); + + return SDValue(); +} + SDValue DAGCombiner::visitFABS(SDNode *N) { SDValue N0 = N->getOperand(0); ConstantFPSDNode *N0CFP = 
dyn_cast<ConstantFPSDNode>(N0); @@ -7561,6 +7694,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { unsigned NumInScalars = N->getNumOperands(); DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); + + // A vector built entirely of undefs is undef. + if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(VT); + // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR @@ -7568,12 +7706,11 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // using shuffles. EVT SourceType = MVT::Other; bool AllAnyExt = true; - bool AllUndef = true; + for (unsigned i = 0; i != NumInScalars; ++i) { SDValue In = N->getOperand(i); // Ignore undef inputs. if (In.getOpcode() == ISD::UNDEF) continue; - AllUndef = false; bool AnyExt = In.getOpcode() == ISD::ANY_EXTEND; bool ZeroExt = In.getOpcode() == ISD::ZERO_EXTEND; @@ -7601,9 +7738,6 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { AllAnyExt &= AnyExt; } - if (AllUndef) - return DAG.getUNDEF(VT); - // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. @@ -7742,9 +7876,29 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits()) return SDValue(); - // Widen the input vector by adding undef values. - VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, - VecIn1, DAG.getUNDEF(VecIn1.getValueType())); + // If the element type of the input vector is not the same as + // the output element type, make concat_vectors based on input element + // type and then bitcast it to the output vector type. + // + // In another words avoid nodes like this: + // <NODE> v16i8 = concat_vectors v4i16 v4i16 + // Replace it with this one: + // <NODE0> v8i16 = concat_vectors v4i16 v4i16 + // <NODE1> v16i8 = bitcast NODE0 + EVT ItemType = VecIn1.getValueType().getVectorElementType(); + if (ItemType != VT.getVectorElementType()) { + EVT ConcatVT = EVT::getVectorVT(*DAG.getContext(), + ItemType, + VecIn1.getValueType().getVectorNumElements()*2); + // Widen the input vector by adding undef values. + VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, + VecIn1, DAG.getUNDEF(VecIn1.getValueType())); + VecIn1 = DAG.getNode(ISD::BITCAST, dl, VT, VecIn1); + } else + // Widen the input vector by adding undef values. + VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, + VecIn1, DAG.getUNDEF(VecIn1.getValueType())); + } // If VecIn2 is unused then change it to undef. @@ -7779,6 +7933,10 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) { if (N->getNumOperands() == 1) return N->getOperand(0); + // Check if all of the operands are undefs. 
+ if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(N->getValueType(0)); + return SDValue(); } diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 07dc36577f..683fac6744 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -55,6 +55,7 @@ #include "llvm/Analysis/Loads.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Support/ErrorHandling.h" @@ -484,7 +485,7 @@ bool FastISel::SelectGetElementPtr(const User *I) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) continue; // N = N + Offset - TotalOffs += + TotalOffs += TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); if (TotalOffs >= MaxOffs) { N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); @@ -573,7 +574,10 @@ bool FastISel::SelectCall(const User *I) { // At -O0 we don't care about the lifetime intrinsics. case Intrinsic::lifetime_start: case Intrinsic::lifetime_end: + // The donothing intrinsic does, well, nothing. + case Intrinsic::donothing: return true; + case Intrinsic::dbg_declare: { const DbgDeclareInst *DI = cast<DbgDeclareInst>(Call); if (!DIVariable(DI->getVariable()).Verify() || @@ -642,7 +646,7 @@ bool FastISel::SelectCall(const User *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addCImm(CI).addImm(DI->getOffset()) .addMetadata(DI->getVariable()); - else + else BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, II) .addImm(CI->getZExtValue()).addImm(DI->getOffset()) .addMetadata(DI->getVariable()); @@ -786,13 +790,24 @@ FastISel::SelectInstruction(const Instruction *I) { MachineBasicBlock::iterator SavedInsertPt = FuncInfo.InsertPt; + // As a special case, don't handle calls to builtin library functions that + // may be translated directly to target instructions. + if (const CallInst *Call = dyn_cast<CallInst>(I)) { + const Function *F = Call->getCalledFunction(); + LibFunc::Func Func; + if (F && !F->hasLocalLinkage() && F->hasName() && + LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->hasOptimizedCodeGen(Func)) + return false; + } + // First, try doing target-independent selection. if (SelectOperator(I, I->getOpcode())) { ++NumFastIselSuccessIndependent; DL = DebugLoc(); return true; } - // Remove dead code. However, ignore call instructions since we've flushed + // Remove dead code. However, ignore call instructions since we've flushed // the local value map and recomputed the insert point. 
if (!isa<CallInst>(I)) { recomputeInsertPt(); @@ -1037,7 +1052,8 @@ FastISel::SelectOperator(const User *I, unsigned Opcode) { } } -FastISel::FastISel(FunctionLoweringInfo &funcInfo) +FastISel::FastISel(FunctionLoweringInfo &funcInfo, + const TargetLibraryInfo *libInfo) : FuncInfo(funcInfo), MRI(FuncInfo.MF->getRegInfo()), MFI(*FuncInfo.MF->getFrameInfo()), @@ -1046,7 +1062,8 @@ FastISel::FastISel(FunctionLoweringInfo &funcInfo) TD(*TM.getTargetData()), TII(*TM.getInstrInfo()), TLI(*TM.getTargetLowering()), - TRI(*TM.getRegisterInfo()) { + TRI(*TM.getRegisterInfo()), + LibInfo(libInfo) { } FastISel::~FastISel() {} diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index f154271894..4488d2790b 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -48,16 +48,31 @@ unsigned InstrEmitter::CountResults(SDNode *Node) { return N; } -/// CountOperands - The inputs to target nodes have any actual inputs first, +/// countOperands - The inputs to target nodes have any actual inputs first, /// followed by an optional chain operand, then an optional glue operand. /// Compute the number of actual operands that will go into the resulting /// MachineInstr. -unsigned InstrEmitter::CountOperands(SDNode *Node) { +/// +/// Also count physreg RegisterSDNode and RegisterMaskSDNode operands preceding +/// the chain and glue. These operands may be implicit on the machine instr. +static unsigned countOperands(SDNode *Node, unsigned &NumImpUses) { unsigned N = Node->getNumOperands(); while (N && Node->getOperand(N - 1).getValueType() == MVT::Glue) --N; if (N && Node->getOperand(N - 1).getValueType() == MVT::Other) --N; // Ignore chain if it exists. + + // Count RegisterSDNode and RegisterMaskSDNode operands for NumImpUses. + for (unsigned I = N; I; --I) { + if (isa<RegisterMaskSDNode>(Node->getOperand(I - 1))) + continue; + if (RegisterSDNode *RN = dyn_cast<RegisterSDNode>(Node->getOperand(I - 1))) + if (TargetRegisterInfo::isPhysicalRegister(RN->getReg())) + continue; + NumImpUses = N - I; + break; + } + return N; } @@ -337,8 +352,7 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op, /// AddOperand - Add the specified operand to the specified machine instr. II /// specifies the instruction information for the node, and IIOpNum is the -/// operand number (in the II) that we are adding. IIOpNum and II are used for -/// assertions only. +/// operand number (in the II) that we are adding. void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, unsigned IIOpNum, const MCInstrDesc *II, @@ -353,7 +367,11 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, const ConstantFP *CFP = F->getConstantFPValue(); MI->addOperand(MachineOperand::CreateFPImm(CFP)); } else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) { - MI->addOperand(MachineOperand::CreateReg(R->getReg(), false)); + // Turn additional physreg operands into implicit uses on non-variadic + // instructions. This is used by call and return instructions passing + // arguments in registers. 
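    // (Editorial illustration, not part of the patch: a call or return node
    // whose arguments or return value live in physical registers carries
    // RegisterSDNode operands beyond those declared in its .td description;
    // each such extra physreg is added as an implicit use of the emitted
    // MachineInstr instead of an explicit operand.)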
+ bool Imp = II && (IIOpNum >= II->getNumOperands() && !II->isVariadic()); + MI->addOperand(MachineOperand::CreateReg(R->getReg(), false, Imp)); } else if (RegisterMaskSDNode *RM = dyn_cast<RegisterMaskSDNode>(Op)) { MI->addOperand(MachineOperand::CreateRegMask(RM->getRegMask())); } else if (GlobalAddressSDNode *TGA = dyn_cast<GlobalAddressSDNode>(Op)) { @@ -393,6 +411,10 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) { MI->addOperand(MachineOperand::CreateBA(BA->getBlockAddress(), BA->getTargetFlags())); + } else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) { + MI->addOperand(MachineOperand::CreateTargetIndex(TI->getIndex(), + TI->getOffset(), + TI->getTargetFlags())); } else { assert(Op.getValueType() != MVT::Other && Op.getValueType() != MVT::Glue && @@ -461,7 +483,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, unsigned SrcReg, DstReg, DefSubIdx; if (DefMI && TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) && - SubIdx == DefSubIdx) { + SubIdx == DefSubIdx && + TRC == MRI->getRegClass(SrcReg)) { // Optimize these: // r1025 = s/zext r1024, 4 // r1026 = extract_subreg r1025, 4 @@ -696,7 +719,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, const MCInstrDesc &II = TII->get(Opc); unsigned NumResults = CountResults(Node); - unsigned NodeOperands = CountOperands(Node); + unsigned NumImpUses = 0; + unsigned NodeOperands = countOperands(Node, NumImpUses); bool HasPhysRegOuts = NumResults > II.getNumDefs() && II.getImplicitDefs()!=0; #ifndef NDEBUG unsigned NumMIOperands = NodeOperands + NumResults; @@ -705,7 +729,8 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned, "Too few operands for a variadic node!"); else assert(NumMIOperands >= II.getNumOperands() && - NumMIOperands <= II.getNumOperands()+II.getNumImplicitDefs() && + NumMIOperands <= II.getNumOperands() + II.getNumImplicitDefs() + + NumImpUses && "#operands for dag node doesn't match .td file!"); #endif diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h index c081f38be0..9eddee9e33 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -105,12 +105,6 @@ public: /// (which do not go into the machine instrs.) static unsigned CountResults(SDNode *Node); - /// CountOperands - The inputs to target nodes have any actual inputs first, - /// followed by an optional chain operand, then flag operands. Compute - /// the number of actual operands that will go into the resulting - /// MachineInstr. - static unsigned CountOperands(SDNode *Node); - /// EmitDbgValue - Generate machine instruction for a dbg_value node. /// MachineInstr *EmitDbgValue(SDDbgValue *SD, diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index a36137db4a..908ebb9486 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -70,6 +70,9 @@ private: SDValue OptimizeFloatStore(StoreSDNode *ST); + void LegalizeLoadOps(SDNode *Node); + void LegalizeStoreOps(SDNode *Node); + /// PerformInsertVectorEltInMemory - Some target cannot handle a variable /// insertion index for the INSERT_VECTOR_ELT instruction. 
In this case, it /// is necessary to spill the vector being inserted into to memory, perform @@ -425,7 +428,7 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, DebugLoc dl = LD->getDebugLoc(); if (VT.isFloatingPoint() || VT.isVector()) { EVT intVT = EVT::getIntegerVT(*DAG.getContext(), LoadedVT.getSizeInBits()); - if (TLI.isTypeLegal(intVT)) { + if (TLI.isTypeLegal(intVT) && TLI.isTypeLegal(LoadedVT)) { // Expand to a (misaligned) integer load of the same size, // then bitconvert to floating point or vector. SDValue newLoad = DAG.getLoad(intVT, dl, Chain, Ptr, LD->getPointerInfo(), @@ -433,8 +436,9 @@ ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, LD->isNonTemporal(), LD->isInvariant(), LD->getAlignment()); SDValue Result = DAG.getNode(ISD::BITCAST, dl, LoadedVT, newLoad); - if (VT.isFloatingPoint() && LoadedVT != VT) - Result = DAG.getNode(ISD::FP_EXTEND, dl, VT, Result); + if (LoadedVT != VT) + Result = DAG.getNode(VT.isFloatingPoint() ? ISD::FP_EXTEND : + ISD::ANY_EXTEND, dl, VT, Result); ValResult = Result; ChainResult = Chain; @@ -639,9 +643,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { // probably means that we need to integrate dag combiner and legalizer // together. // We generally can't do this one for long doubles. - SDValue Tmp1 = ST->getChain(); - SDValue Tmp2 = ST->getBasePtr(); - SDValue Tmp3; + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); unsigned Alignment = ST->getAlignment(); bool isVolatile = ST->isVolatile(); bool isNonTemporal = ST->isNonTemporal(); @@ -649,19 +652,19 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(ST->getValue())) { if (CFP->getValueType(0) == MVT::f32 && TLI.isTypeLegal(MVT::i32)) { - Tmp3 = DAG.getConstant(CFP->getValueAPF(). + SDValue Con = DAG.getConstant(CFP->getValueAPF(). bitcastToAPInt().zextOrTrunc(32), MVT::i32); - return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); } if (CFP->getValueType(0) == MVT::f64) { // If this target supports 64-bit registers, do a single 64-bit store. if (TLI.isTypeLegal(MVT::i64)) { - Tmp3 = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). + SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt(). 
zextOrTrunc(64), MVT::i64); - return DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), + return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); } @@ -674,11 +677,11 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { SDValue Hi = DAG.getConstant(IntVal.lshr(32).trunc(32), MVT::i32); if (TLI.isBigEndian()) std::swap(Lo, Hi); - Lo = DAG.getStore(Tmp1, dl, Lo, Tmp2, ST->getPointerInfo(), isVolatile, + Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), isVolatile, isNonTemporal, Alignment); - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, DAG.getIntPtrConstant(4)); - Hi = DAG.getStore(Tmp1, dl, Hi, Tmp2, + Hi = DAG.getStore(Chain, dl, Hi, Ptr, ST->getPointerInfo().getWithOffset(4), isVolatile, isNonTemporal, MinAlign(Alignment, 4U)); @@ -689,14 +692,448 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) { return SDValue(0, 0); } +void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) { + StoreSDNode *ST = cast<StoreSDNode>(Node); + SDValue Chain = ST->getChain(); + SDValue Ptr = ST->getBasePtr(); + DebugLoc dl = Node->getDebugLoc(); + + unsigned Alignment = ST->getAlignment(); + bool isVolatile = ST->isVolatile(); + bool isNonTemporal = ST->isNonTemporal(); + + if (!ST->isTruncatingStore()) { + if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { + ReplaceNode(ST, OptStore); + return; + } + + { + SDValue Value = ST->getValue(); + EVT VT = Value.getValueType(); + switch (TLI.getOperationAction(ISD::STORE, VT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned store and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { + Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); + if (ST->getAlignment() < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), + DAG, TLI, this); + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) + ReplaceNode(SDValue(Node, 0), Res); + return; + } + case TargetLowering::Promote: { + assert(VT.isVector() && "Unknown legal promote case!"); + Value = DAG.getNode(ISD::BITCAST, dl, + TLI.getTypeToPromoteTo(ISD::STORE, VT), Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, + ST->getPointerInfo(), isVolatile, + isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + break; + } + } + return; + } + } else { + SDValue Value = ST->getValue(); + + EVT StVT = ST->getMemoryVT(); + unsigned StWidth = StVT.getSizeInBits(); + + if (StWidth != StVT.getStoreSizeInBits()) { + // Promote to a byte-sized store with upper bits zero if not + // storing an integral number of bytes. For example, promote + // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), + StVT.getStoreSizeInBits()); + Value = DAG.getZeroExtendInReg(Value, dl, StVT); + SDValue Result = + DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + NVT, isVolatile, isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + } else if (StWidth & (StWidth - 1)) { + // If not storing a power-of-2 number of bits, expand as two stores. 
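// Illustrative sketch, not part of the patch: the little-endian split implemented
// just below in plain terms. A 24-bit truncating store becomes a 16-bit store of
// the low bits plus an 8-bit store of the remaining bits at offset +2. Widths and
// names are assumptions chosen for the example.
#include <cstdint>

static void storeTrunc24LE(uint8_t *Mem, uint32_t Value) {
  const unsigned RoundWidth = 16;                    // 1 << Log2_32(24)
  const uint16_t Lo = uint16_t(Value);               // bottom RoundWidth bits
  const uint8_t  Hi = uint8_t(Value >> RoundWidth);  // remaining ExtraWidth (8) bits
  Mem[0] = uint8_t(Lo);
  Mem[1] = uint8_t(Lo >> 8);                         // the TRUNCSTORE:i16 at offset 0
  Mem[RoundWidth / 8] = Hi;                          // the TRUNCSTORE:i8 at offset +2
}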
+ assert(!StVT.isVector() && "Unsupported truncstore!"); + unsigned RoundWidth = 1 << Log2_32(StWidth); + assert(RoundWidth < StWidth); + unsigned ExtraWidth = StWidth - RoundWidth; + assert(ExtraWidth < RoundWidth); + assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && + "Store size not an integral number of bytes!"); + EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); + EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); + SDValue Lo, Hi; + unsigned IncrementSize; + + if (TLI.isLittleEndian()) { + // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) + // Store the bottom RoundWidth bits. + Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + RoundVT, + isVolatile, isNonTemporal, Alignment); + + // Store the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value, + DAG.getConstant(RoundWidth, + TLI.getShiftAmountTy(Value.getValueType()))); + Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, + ST->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + } else { + // Big endian - avoid unaligned stores. + // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X + // Store the top RoundWidth bits. + Hi = DAG.getNode(ISD::SRL, dl, Value.getValueType(), Value, + DAG.getConstant(ExtraWidth, + TLI.getShiftAmountTy(Value.getValueType()))); + Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), + RoundVT, isVolatile, isNonTemporal, Alignment); + + // Store the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, + ST->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + } + + // The order of the stores doesn't matter. + SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); + ReplaceNode(SDValue(Node, 0), Result); + } else { + switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned store and the target doesn't support it, + // expand it. 
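// Illustrative sketch, not part of the patch: the condition checked just below,
// stated on its own. An unaligned store is expanded only when the target cannot
// perform the access natively and the store's alignment is below the ABI
// alignment of the stored type.
static bool needsUnalignedStoreExpansion(bool TargetAllowsUnaligned,
                                         unsigned StoreAlign,
                                         unsigned ABIAlign) {
  return !TargetAllowsUnaligned && StoreAlign < ABIAlign;
}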
+ if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { + Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); + if (ST->getAlignment() < ABIAlignment) + ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) + ReplaceNode(SDValue(Node, 0), Res); + return; + } + case TargetLowering::Expand: + assert(!StVT.isVector() && + "Vector Stores are handled in LegalizeVectorOps"); + + // TRUNCSTORE:i16 i32 -> STORE i16 + assert(TLI.isTypeLegal(StVT) && + "Do not know how to expand this store!"); + Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value); + SDValue Result = + DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), + isVolatile, isNonTemporal, Alignment); + ReplaceNode(SDValue(Node, 0), Result); + break; + } + } + } +} + +void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { + LoadSDNode *LD = cast<LoadSDNode>(Node); + SDValue Chain = LD->getChain(); // The chain. + SDValue Ptr = LD->getBasePtr(); // The base pointer. + SDValue Value; // The value returned by the load op. + DebugLoc dl = Node->getDebugLoc(); + + ISD::LoadExtType ExtType = LD->getExtensionType(); + if (ExtType == ISD::NON_EXTLOAD) { + EVT VT = Node->getValueType(0); + SDValue RVal = SDValue(Node, 0); + SDValue RChain = SDValue(Node, 1); + + switch (TLI.getOperationAction(Node->getOpcode(), VT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Legal: + // If this is an unaligned load and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { + Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); + if (LD->getAlignment() < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), + DAG, TLI, RVal, RChain); + } + } + break; + case TargetLowering::Custom: { + SDValue Res = TLI.LowerOperation(RVal, DAG); + if (Res.getNode()) { + RVal = Res; + RChain = Res.getValue(1); + } + break; + } + case TargetLowering::Promote: { + // Only promote a load of vector type to another. + assert(VT.isVector() && "Cannot promote this load!"); + // Change base type to a different vector type. + EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); + + SDValue Res = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment()); + RVal = DAG.getNode(ISD::BITCAST, dl, VT, Res); + RChain = Res.getValue(1); + break; + } + } + if (RChain.getNode() != Node) { + assert(RVal.getNode() != Node && "Load must be completely replaced"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), RVal); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), RChain); + ReplacedNode(Node); + } + return; + } + + EVT SrcVT = LD->getMemoryVT(); + unsigned SrcWidth = SrcVT.getSizeInBits(); + unsigned Alignment = LD->getAlignment(); + bool isVolatile = LD->isVolatile(); + bool isNonTemporal = LD->isNonTemporal(); + + if (SrcWidth != SrcVT.getStoreSizeInBits() && + // Some targets pretend to have an i1 loading operation, and actually + // load an i8. This trick is correct for ZEXTLOAD because the top 7 + // bits are guaranteed to be zero; it helps the optimizers understand + // that these bits are zero. It is also useful for EXTLOAD, since it + // tells the optimizers that those bits are undefined. 
It would be + // nice to have an effective generic way of getting these benefits... + // Until such a way is found, don't insist on promoting i1 here. + (SrcVT != MVT::i1 || + TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { + // Promote to a byte-sized load if not loading an integral number of + // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. + unsigned NewWidth = SrcVT.getStoreSizeInBits(); + EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); + SDValue Ch; + + // The extra bits are guaranteed to be zero, since we stored them that + // way. A zext load from NVT thus automatically gives zext from SrcVT. + + ISD::LoadExtType NewExtType = + ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; + + SDValue Result = + DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), + Chain, Ptr, LD->getPointerInfo(), + NVT, isVolatile, isNonTemporal, Alignment); + + Ch = Result.getValue(1); // The chain. + + if (ExtType == ISD::SEXTLOAD) + // Having the top bits zero doesn't help when sign extending. + Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, + Result.getValueType(), + Result, DAG.getValueType(SrcVT)); + else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) + // All the top bits are guaranteed to be zero - inform the optimizers. + Result = DAG.getNode(ISD::AssertZext, dl, + Result.getValueType(), Result, + DAG.getValueType(SrcVT)); + + Value = Result; + Chain = Ch; + } else if (SrcWidth & (SrcWidth - 1)) { + // If not loading a power-of-2 number of bits, expand as two loads. + assert(!SrcVT.isVector() && "Unsupported extload!"); + unsigned RoundWidth = 1 << Log2_32(SrcWidth); + assert(RoundWidth < SrcWidth); + unsigned ExtraWidth = SrcWidth - RoundWidth; + assert(ExtraWidth < RoundWidth); + assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && + "Load size not an integral number of bytes!"); + EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); + EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); + SDValue Lo, Hi, Ch; + unsigned IncrementSize; + + if (TLI.isLittleEndian()) { + // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) + // Load the bottom RoundWidth bits. + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), + Chain, Ptr, + LD->getPointerInfo(), RoundVT, isVolatile, + isNonTemporal, Alignment); + + // Load the remaining ExtraWidth bits. + IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + + // Build a factor node to remember that this load is independent of + // the other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Move the top bits to the right place. + Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, + DAG.getConstant(RoundWidth, + TLI.getShiftAmountTy(Hi.getValueType()))); + + // Join the hi and lo parts. + Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); + } else { + // Big endian - avoid unaligned loads. + // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 + // Load the top RoundWidth bits. + Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo(), RoundVT, isVolatile, + isNonTemporal, Alignment); + + // Load the remaining ExtraWidth bits. 
+ IncrementSize = RoundWidth / 8; + Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr, + DAG.getIntPtrConstant(IncrementSize)); + Lo = DAG.getExtLoad(ISD::ZEXTLOAD, + dl, Node->getValueType(0), Chain, Ptr, + LD->getPointerInfo().getWithOffset(IncrementSize), + ExtraVT, isVolatile, isNonTemporal, + MinAlign(Alignment, IncrementSize)); + + // Build a factor node to remember that this load is independent of + // the other one. + Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), + Hi.getValue(1)); + + // Move the top bits to the right place. + Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, + DAG.getConstant(ExtraWidth, + TLI.getShiftAmountTy(Hi.getValueType()))); + + // Join the hi and lo parts. + Value = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); + } + + Chain = Ch; + } else { + bool isCustom = false; + switch (TLI.getLoadExtAction(ExtType, SrcVT)) { + default: llvm_unreachable("This action is not supported yet!"); + case TargetLowering::Custom: + isCustom = true; + // FALLTHROUGH + case TargetLowering::Legal: { + Value = SDValue(Node, 0); + Chain = SDValue(Node, 1); + + if (isCustom) { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) { + Value = Res; + Chain = Res.getValue(1); + } + } else { + // If this is an unaligned load and the target doesn't support it, + // expand it. + if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { + Type *Ty = + LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); + unsigned ABIAlignment = + TLI.getTargetData()->getABITypeAlignment(Ty); + if (LD->getAlignment() < ABIAlignment){ + ExpandUnalignedLoad(cast<LoadSDNode>(Node), + DAG, TLI, Value, Chain); + } + } + } + break; + } + case TargetLowering::Expand: + if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) { + SDValue Load = DAG.getLoad(SrcVT, dl, Chain, Ptr, + LD->getPointerInfo(), + LD->isVolatile(), LD->isNonTemporal(), + LD->isInvariant(), LD->getAlignment()); + unsigned ExtendOp; + switch (ExtType) { + case ISD::EXTLOAD: + ExtendOp = (SrcVT.isFloatingPoint() ? + ISD::FP_EXTEND : ISD::ANY_EXTEND); + break; + case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; + case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; + default: llvm_unreachable("Unexpected extend load type!"); + } + Value = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); + Chain = Load.getValue(1); + break; + } + + assert(!SrcVT.isVector() && + "Vector Loads are handled in LegalizeVectorOps"); + + // FIXME: This does not work for vectors on most targets. Sign- and + // zero-extend operations are currently folded into extending loads, + // whether they are legal or not, and then we end up here without any + // support for legalizing them. + assert(ExtType != ISD::EXTLOAD && + "EXTLOAD should always be supported!"); + // Turn the unsupported load into an EXTLOAD followed by an explicit + // zero/sign extend inreg. + SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), + Chain, Ptr, LD->getPointerInfo(), SrcVT, + LD->isVolatile(), LD->isNonTemporal(), + LD->getAlignment()); + SDValue ValRes; + if (ExtType == ISD::SEXTLOAD) + ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, + Result.getValueType(), + Result, DAG.getValueType(SrcVT)); + else + ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); + Value = ValRes; + Chain = Result.getValue(1); + break; + } + } + + // Since loads produce two values, make sure to remember that we legalized + // both of them. 
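// Illustrative sketch, not part of the patch: the little-endian extending-load
// split shown above, mirrored in scalar code. EXTLOAD:i24 is rebuilt from a
// zero-extending 16-bit load plus an 8-bit load at offset +2 shifted into place.
#include <cstdint>

static uint32_t loadExt24LE(const uint8_t *Mem) {
  const unsigned RoundWidth = 16;                           // 1 << Log2_32(24)
  uint32_t Lo = uint32_t(Mem[0]) | (uint32_t(Mem[1]) << 8); // ZEXTLOAD:i16
  uint32_t Hi = uint32_t(Mem[RoundWidth / 8]);              // EXTLOAD@+2:i8
  return Lo | (Hi << RoundWidth);                           // join the hi and lo parts
}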
+ if (Chain.getNode() != Node) { + assert(Value.getNode() != Node && "Load must be completely replaced"); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Value); + DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Chain); + ReplacedNode(Node); + } +} + /// LegalizeOp - Return a legal replacement for the given operation, with /// all legal operands. void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { if (Node->getOpcode() == ISD::TargetConstant) // Allow illegal target nodes. return; - DebugLoc dl = Node->getDebugLoc(); - for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) assert(TLI.getTypeAction(*DAG.getContext(), Node->getValueType(i)) == TargetLowering::TypeLegal && @@ -709,9 +1146,6 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { Node->getOperand(i).getOpcode() == ISD::TargetConstant) && "Unexpected illegal type!"); - SDValue Tmp1, Tmp2, Tmp3, Tmp4; - bool isCustom = false; - // Figure out the correct action; the way to query this varies by opcode TargetLowering::LegalizeAction Action = TargetLowering::Legal; bool SimpleFinishLegalizing = true; @@ -865,19 +1299,19 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { switch (Action) { case TargetLowering::Legal: return; - case TargetLowering::Custom: + case TargetLowering::Custom: { // FIXME: The handling for custom lowering with multiple results is // a complete mess. - Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) { + SDValue Res = TLI.LowerOperation(SDValue(Node, 0), DAG); + if (Res.getNode()) { SmallVector<SDValue, 8> ResultVals; for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) { if (e == 1) - ResultVals.push_back(Tmp1); + ResultVals.push_back(Res); else - ResultVals.push_back(Tmp1.getValue(i)); + ResultVals.push_back(Res.getValue(i)); } - if (Tmp1.getNode() != Node || Tmp1.getResNo() != 0) { + if (Res.getNode() != Node || Res.getResNo() != 0) { DAG.ReplaceAllUsesWith(Node, ResultVals.data()); for (unsigned i = 0, e = Node->getNumValues(); i != e; ++i) DAG.TransferDbgValues(SDValue(Node, i), ResultVals[i]); @@ -885,7 +1319,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { } return; } - + } // FALL THROUGH case TargetLowering::Expand: ExpandNode(Node); @@ -909,429 +1343,10 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) { case ISD::CALLSEQ_END: break; case ISD::LOAD: { - LoadSDNode *LD = cast<LoadSDNode>(Node); - Tmp1 = LD->getChain(); // Legalize the chain. - Tmp2 = LD->getBasePtr(); // Legalize the base pointer. - - ISD::LoadExtType ExtType = LD->getExtensionType(); - if (ExtType == ISD::NON_EXTLOAD) { - EVT VT = Node->getValueType(0); - Tmp3 = SDValue(Node, 0); - Tmp4 = SDValue(Node, 1); - - switch (TLI.getOperationAction(Node->getOpcode(), VT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned load and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = TLI.getTargetData()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Tmp3, Tmp4); - } - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(Tmp3, DAG); - if (Tmp1.getNode()) { - Tmp3 = Tmp1; - Tmp4 = Tmp1.getValue(1); - } - break; - case TargetLowering::Promote: { - // Only promote a load of vector type to another. 
- assert(VT.isVector() && "Cannot promote this load!"); - // Change base type to a different vector type. - EVT NVT = TLI.getTypeToPromoteTo(Node->getOpcode(), VT); - - Tmp1 = DAG.getLoad(NVT, dl, Tmp1, Tmp2, LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); - Tmp3 = DAG.getNode(ISD::BITCAST, dl, VT, Tmp1); - Tmp4 = Tmp1.getValue(1); - break; - } - } - if (Tmp4.getNode() != Node) { - assert(Tmp3.getNode() != Node && "Load must be completely replaced"); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp3); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp4); - ReplacedNode(Node); - } - return; - } - - EVT SrcVT = LD->getMemoryVT(); - unsigned SrcWidth = SrcVT.getSizeInBits(); - unsigned Alignment = LD->getAlignment(); - bool isVolatile = LD->isVolatile(); - bool isNonTemporal = LD->isNonTemporal(); - - if (SrcWidth != SrcVT.getStoreSizeInBits() && - // Some targets pretend to have an i1 loading operation, and actually - // load an i8. This trick is correct for ZEXTLOAD because the top 7 - // bits are guaranteed to be zero; it helps the optimizers understand - // that these bits are zero. It is also useful for EXTLOAD, since it - // tells the optimizers that those bits are undefined. It would be - // nice to have an effective generic way of getting these benefits... - // Until such a way is found, don't insist on promoting i1 here. - (SrcVT != MVT::i1 || - TLI.getLoadExtAction(ExtType, MVT::i1) == TargetLowering::Promote)) { - // Promote to a byte-sized load if not loading an integral number of - // bytes. For example, promote EXTLOAD:i20 -> EXTLOAD:i24. - unsigned NewWidth = SrcVT.getStoreSizeInBits(); - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), NewWidth); - SDValue Ch; - - // The extra bits are guaranteed to be zero, since we stored them that - // way. A zext load from NVT thus automatically gives zext from SrcVT. - - ISD::LoadExtType NewExtType = - ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD; - - SDValue Result = - DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), - Tmp1, Tmp2, LD->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment); - - Ch = Result.getValue(1); // The chain. - - if (ExtType == ISD::SEXTLOAD) - // Having the top bits zero doesn't help when sign extending. - Result = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, - Result.getValueType(), - Result, DAG.getValueType(SrcVT)); - else if (ExtType == ISD::ZEXTLOAD || NVT == Result.getValueType()) - // All the top bits are guaranteed to be zero - inform the optimizers. - Result = DAG.getNode(ISD::AssertZext, dl, - Result.getValueType(), Result, - DAG.getValueType(SrcVT)); - - Tmp1 = Result; - Tmp2 = Ch; - } else if (SrcWidth & (SrcWidth - 1)) { - // If not loading a power-of-2 number of bits, expand as two loads. - assert(!SrcVT.isVector() && "Unsupported extload!"); - unsigned RoundWidth = 1 << Log2_32(SrcWidth); - assert(RoundWidth < SrcWidth); - unsigned ExtraWidth = SrcWidth - RoundWidth; - assert(ExtraWidth < RoundWidth); - assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && - "Load size not an integral number of bytes!"); - EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); - EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); - SDValue Lo, Hi, Ch; - unsigned IncrementSize; - - if (TLI.isLittleEndian()) { - // EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16) - // Load the bottom RoundWidth bits. 
- Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), - Tmp1, Tmp2, - LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment); - - // Load the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - - // Build a factor node to remember that this load is independent of - // the other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Move the top bits to the right place. - Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(RoundWidth, - TLI.getShiftAmountTy(Hi.getValueType()))); - - // Join the hi and lo parts. - Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); - } else { - // Big endian - avoid unaligned loads. - // EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8 - // Load the top RoundWidth bits. - Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo(), RoundVT, isVolatile, - isNonTemporal, Alignment); - - // Load the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getExtLoad(ISD::ZEXTLOAD, - dl, Node->getValueType(0), Tmp1, Tmp2, - LD->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - - // Build a factor node to remember that this load is independent of - // the other one. - Ch = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo.getValue(1), - Hi.getValue(1)); - - // Move the top bits to the right place. - Hi = DAG.getNode(ISD::SHL, dl, Hi.getValueType(), Hi, - DAG.getConstant(ExtraWidth, - TLI.getShiftAmountTy(Hi.getValueType()))); - - // Join the hi and lo parts. - Tmp1 = DAG.getNode(ISD::OR, dl, Node->getValueType(0), Lo, Hi); - } - - Tmp2 = Ch; - } else { - switch (TLI.getLoadExtAction(ExtType, SrcVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Custom: - isCustom = true; - // FALLTHROUGH - case TargetLowering::Legal: - Tmp1 = SDValue(Node, 0); - Tmp2 = SDValue(Node, 1); - - if (isCustom) { - Tmp3 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp3.getNode()) { - Tmp1 = Tmp3; - Tmp2 = Tmp3.getValue(1); - } - } else { - // If this is an unaligned load and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) { - Type *Ty = - LD->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment = - TLI.getTargetData()->getABITypeAlignment(Ty); - if (LD->getAlignment() < ABIAlignment){ - ExpandUnalignedLoad(cast<LoadSDNode>(Node), - DAG, TLI, Tmp1, Tmp2); - } - } - } - break; - case TargetLowering::Expand: - if (!TLI.isLoadExtLegal(ISD::EXTLOAD, SrcVT) && TLI.isTypeLegal(SrcVT)) { - SDValue Load = DAG.getLoad(SrcVT, dl, Tmp1, Tmp2, - LD->getPointerInfo(), - LD->isVolatile(), LD->isNonTemporal(), - LD->isInvariant(), LD->getAlignment()); - unsigned ExtendOp; - switch (ExtType) { - case ISD::EXTLOAD: - ExtendOp = (SrcVT.isFloatingPoint() ? 
- ISD::FP_EXTEND : ISD::ANY_EXTEND); - break; - case ISD::SEXTLOAD: ExtendOp = ISD::SIGN_EXTEND; break; - case ISD::ZEXTLOAD: ExtendOp = ISD::ZERO_EXTEND; break; - default: llvm_unreachable("Unexpected extend load type!"); - } - Tmp1 = DAG.getNode(ExtendOp, dl, Node->getValueType(0), Load); - Tmp2 = Load.getValue(1); - break; - } - - assert(!SrcVT.isVector() && - "Vector Loads are handled in LegalizeVectorOps"); - - // FIXME: This does not work for vectors on most targets. Sign- and - // zero-extend operations are currently folded into extending loads, - // whether they are legal or not, and then we end up here without any - // support for legalizing them. - assert(ExtType != ISD::EXTLOAD && - "EXTLOAD should always be supported!"); - // Turn the unsupported load into an EXTLOAD followed by an explicit - // zero/sign extend inreg. - SDValue Result = DAG.getExtLoad(ISD::EXTLOAD, dl, Node->getValueType(0), - Tmp1, Tmp2, LD->getPointerInfo(), SrcVT, - LD->isVolatile(), LD->isNonTemporal(), - LD->getAlignment()); - SDValue ValRes; - if (ExtType == ISD::SEXTLOAD) - ValRes = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, - Result.getValueType(), - Result, DAG.getValueType(SrcVT)); - else - ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType()); - Tmp1 = ValRes; - Tmp2 = Result.getValue(1); - break; - } - } - - // Since loads produce two values, make sure to remember that we legalized - // both of them. - if (Tmp2.getNode() != Node) { - assert(Tmp1.getNode() != Node && "Load must be completely replaced"); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 0), Tmp1); - DAG.ReplaceAllUsesOfValueWith(SDValue(Node, 1), Tmp2); - ReplacedNode(Node); - } - break; + return LegalizeLoadOps(Node); } case ISD::STORE: { - StoreSDNode *ST = cast<StoreSDNode>(Node); - Tmp1 = ST->getChain(); - Tmp2 = ST->getBasePtr(); - unsigned Alignment = ST->getAlignment(); - bool isVolatile = ST->isVolatile(); - bool isNonTemporal = ST->isNonTemporal(); - - if (!ST->isTruncatingStore()) { - if (SDNode *OptStore = OptimizeFloatStore(ST).getNode()) { - ReplaceNode(ST, OptStore); - break; - } - - { - Tmp3 = ST->getValue(); - EVT VT = Tmp3.getValueType(); - switch (TLI.getOperationAction(ISD::STORE, VT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned store and the target doesn't support it, - // expand it. - if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), - DAG, TLI, this); - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) - ReplaceNode(SDValue(Node, 0), Tmp1); - break; - case TargetLowering::Promote: { - assert(VT.isVector() && "Unknown legal promote case!"); - Tmp3 = DAG.getNode(ISD::BITCAST, dl, - TLI.getTypeToPromoteTo(ISD::STORE, VT), Tmp3); - SDValue Result = - DAG.getStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo(), isVolatile, - isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - break; - } - } - break; - } - } else { - Tmp3 = ST->getValue(); - - EVT StVT = ST->getMemoryVT(); - unsigned StWidth = StVT.getSizeInBits(); - - if (StWidth != StVT.getStoreSizeInBits()) { - // Promote to a byte-sized store with upper bits zero if not - // storing an integral number of bytes. 
For example, promote - // TRUNCSTORE:i1 X -> TRUNCSTORE:i8 (and X, 1) - EVT NVT = EVT::getIntegerVT(*DAG.getContext(), - StVT.getStoreSizeInBits()); - Tmp3 = DAG.getZeroExtendInReg(Tmp3, dl, StVT); - SDValue Result = - DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - NVT, isVolatile, isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - } else if (StWidth & (StWidth - 1)) { - // If not storing a power-of-2 number of bits, expand as two stores. - assert(!StVT.isVector() && "Unsupported truncstore!"); - unsigned RoundWidth = 1 << Log2_32(StWidth); - assert(RoundWidth < StWidth); - unsigned ExtraWidth = StWidth - RoundWidth; - assert(ExtraWidth < RoundWidth); - assert(!(RoundWidth % 8) && !(ExtraWidth % 8) && - "Store size not an integral number of bytes!"); - EVT RoundVT = EVT::getIntegerVT(*DAG.getContext(), RoundWidth); - EVT ExtraVT = EVT::getIntegerVT(*DAG.getContext(), ExtraWidth); - SDValue Lo, Hi; - unsigned IncrementSize; - - if (TLI.isLittleEndian()) { - // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16) - // Store the bottom RoundWidth bits. - Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - RoundVT, - isVolatile, isNonTemporal, Alignment); - - // Store the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3, - DAG.getConstant(RoundWidth, - TLI.getShiftAmountTy(Tmp3.getValueType()))); - Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, - ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - } else { - // Big endian - avoid unaligned stores. - // TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X - // Store the top RoundWidth bits. - Hi = DAG.getNode(ISD::SRL, dl, Tmp3.getValueType(), Tmp3, - DAG.getConstant(ExtraWidth, - TLI.getShiftAmountTy(Tmp3.getValueType()))); - Hi = DAG.getTruncStore(Tmp1, dl, Hi, Tmp2, ST->getPointerInfo(), - RoundVT, isVolatile, isNonTemporal, Alignment); - - // Store the remaining ExtraWidth bits. - IncrementSize = RoundWidth / 8; - Tmp2 = DAG.getNode(ISD::ADD, dl, Tmp2.getValueType(), Tmp2, - DAG.getIntPtrConstant(IncrementSize)); - Lo = DAG.getTruncStore(Tmp1, dl, Tmp3, Tmp2, - ST->getPointerInfo().getWithOffset(IncrementSize), - ExtraVT, isVolatile, isNonTemporal, - MinAlign(Alignment, IncrementSize)); - } - - // The order of the stores doesn't matter. - SDValue Result = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi); - ReplaceNode(SDValue(Node, 0), Result); - } else { - switch (TLI.getTruncStoreAction(ST->getValue().getValueType(), StVT)) { - default: llvm_unreachable("This action is not supported yet!"); - case TargetLowering::Legal: - // If this is an unaligned store and the target doesn't support it, - // expand it. 
- if (!TLI.allowsUnalignedMemoryAccesses(ST->getMemoryVT())) { - Type *Ty = ST->getMemoryVT().getTypeForEVT(*DAG.getContext()); - unsigned ABIAlignment= TLI.getTargetData()->getABITypeAlignment(Ty); - if (ST->getAlignment() < ABIAlignment) - ExpandUnalignedStore(cast<StoreSDNode>(Node), DAG, TLI, this); - } - break; - case TargetLowering::Custom: - Tmp1 = TLI.LowerOperation(SDValue(Node, 0), DAG); - if (Tmp1.getNode()) - ReplaceNode(SDValue(Node, 0), Tmp1); - break; - case TargetLowering::Expand: - assert(!StVT.isVector() && - "Vector Stores are handled in LegalizeVectorOps"); - - // TRUNCSTORE:i16 i32 -> STORE i16 - assert(TLI.isTypeLegal(StVT) && "Do not know how to expand this store!"); - Tmp3 = DAG.getNode(ISD::TRUNCATE, dl, StVT, Tmp3); - SDValue Result = - DAG.getStore(Tmp1, dl, Tmp3, Tmp2, ST->getPointerInfo(), - isVolatile, isNonTemporal, Alignment); - ReplaceNode(SDValue(Node, 0), Result); - break; - } - } - } - break; + return LegalizeStoreOps(Node); } } } diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index 75b31f79ad..e8e968aaef 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -588,18 +588,14 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) { unsigned NumElts = InVT.getVectorNumElements(); assert(NumElts == NVT.getVectorNumElements() && "Dst and Src must have the same number of elements"); - EVT EltVT = InVT.getScalarType(); assert(isPowerOf2_32(NumElts) && "Promoted vector type must be a power of two"); - EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), EltVT, NumElts/2); + SDValue EOp1, EOp2; + GetSplitVector(InOp, EOp1, EOp2); + EVT HalfNVT = EVT::getVectorVT(*DAG.getContext(), NVT.getScalarType(), NumElts/2); - - SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, InOp, - DAG.getIntPtrConstant(0)); - SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HalfVT, InOp, - DAG.getIntPtrConstant(NumElts/2)); EOp1 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp1); EOp2 = DAG.getNode(ISD::TRUNCATE, dl, HalfNVT, EOp2); @@ -2273,9 +2269,9 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, // A divide for UMULO will be faster than a function call. Select to // make sure we aren't using 0. SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT), - RHS, DAG.getConstant(0, VT), ISD::SETNE); + RHS, DAG.getConstant(0, VT), ISD::SETNE); SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero, - DAG.getConstant(1, VT), RHS); + DAG.getConstant(1, VT), RHS); SDValue DIV = DAG.getNode(ISD::UDIV, DL, LHS.getValueType(), MUL, NotZero); SDValue Overflow; Overflow = DAG.getSetCC(DL, N->getValueType(1), DIV, LHS, ISD::SETNE); @@ -2296,8 +2292,8 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue Temp = DAG.CreateStackTemporary(PtrVT); // Temporary for the overflow value, default it to zero. 
SDValue Chain = DAG.getStore(DAG.getEntryNode(), dl, - DAG.getConstant(0, PtrVT), Temp, - MachinePointerInfo(), false, false, 0); + DAG.getConstant(0, PtrVT), Temp, + MachinePointerInfo(), false, false, 0); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -2321,15 +2317,15 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N, SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT); TargetLowering:: CallLoweringInfo CLI(Chain, RetTy, true, false, false, false, - 0, TLI.getLibcallCallingConv(LC), - /*isTailCall=*/false, - /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, - Func, Args, DAG, dl); + 0, TLI.getLibcallCallingConv(LC), + /*isTailCall=*/false, + /*doesNotReturn=*/false, /*isReturnValueUsed=*/true, + Func, Args, DAG, dl); std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI); SplitInteger(CallInfo.first, Lo, Hi); SDValue Temp2 = DAG.getLoad(PtrVT, dl, CallInfo.second, Temp, - MachinePointerInfo(), false, false, false, 0); + MachinePointerInfo(), false, false, false, 0); SDValue Ofl = DAG.getSetCC(dl, N->getValueType(1), Temp2, DAG.getConstant(0, PtrVT), ISD::SETNE); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index 106b086184..94fc9761ec 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -516,6 +516,7 @@ private: SDValue ScalarizeVecRes_InregOp(SDNode *N); SDValue ScalarizeVecRes_BITCAST(SDNode *N); + SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N); SDValue ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N); SDValue ScalarizeVecRes_EXTRACT_SUBVECTOR(SDNode *N); SDValue ScalarizeVecRes_FP_ROUND(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index a8ff7c65ab..06f6bd63b6 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -168,6 +168,7 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, SDValue &Hi) { SDValue OldVec = N->getOperand(0); unsigned OldElts = OldVec.getValueType().getVectorNumElements(); + EVT OldEltVT = OldVec.getValueType().getVectorElementType(); DebugLoc dl = N->getDebugLoc(); // Convert to a vector of the expanded element type, for example @@ -175,6 +176,15 @@ void DAGTypeLegalizer::ExpandRes_EXTRACT_VECTOR_ELT(SDNode *N, SDValue &Lo, EVT OldVT = N->getValueType(0); EVT NewVT = TLI.getTypeToTransformTo(*DAG.getContext(), OldVT); + if (OldVT != OldEltVT) { + // The result of EXTRACT_VECTOR_ELT may be larger than the element type of + // the input vector. If so, extend the elements of the input vector to the + // same bitwidth as the result before expanding. 
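// Illustrative sketch, not part of the patch: the case described above and
// implemented just below. When the extract result (say i64, expanded into two
// i32 halves) is wider than the vector's element type (say i16), the element is
// any-extended to the result width first and only then split into lo/hi parts.
#include <cstdint>

static void extractExpandedElt(const uint16_t *Vec, unsigned Idx,
                               uint32_t &LoPart, uint32_t &HiPart) {
  uint64_t Wide = uint64_t(Vec[Idx]);  // ANY_EXTEND the i16 element to the i64 result
  LoPart = uint32_t(Wide);             // low half of the expanded result
  HiPart = uint32_t(Wide >> 32);       // high half of the expanded result (little endian)
}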
+ assert(OldEltVT.bitsLT(OldVT) && "Result type smaller then element type!"); + EVT NVecVT = EVT::getVectorVT(*DAG.getContext(), OldVT, OldElts); + OldVec = DAG.getNode(ISD::ANY_EXTEND, dl, NVecVT, N->getOperand(0)); + } + SDValue NewVec = DAG.getNode(ISD::BITCAST, dl, EVT::getVectorVT(*DAG.getContext(), NewVT, 2*OldElts), diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index d09411c42f..4709202965 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -48,7 +48,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::MERGE_VALUES: R = ScalarizeVecRes_MERGE_VALUES(N, ResNo);break; case ISD::BITCAST: R = ScalarizeVecRes_BITCAST(N); break; - case ISD::BUILD_VECTOR: R = N->getOperand(0); break; + case ISD::BUILD_VECTOR: R = ScalarizeVecRes_BUILD_VECTOR(N); break; case ISD::CONVERT_RNDSAT: R = ScalarizeVecRes_CONVERT_RNDSAT(N); break; case ISD::EXTRACT_SUBVECTOR: R = ScalarizeVecRes_EXTRACT_SUBVECTOR(N); break; case ISD::FP_ROUND: R = ScalarizeVecRes_FP_ROUND(N); break; @@ -152,6 +152,16 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_BITCAST(SDNode *N) { NewVT, N->getOperand(0)); } +SDValue DAGTypeLegalizer::ScalarizeVecRes_BUILD_VECTOR(SDNode *N) { + EVT EltVT = N->getValueType(0).getVectorElementType(); + SDValue InOp = N->getOperand(0); + // The BUILD_VECTOR operands may be of wider element types and + // we may need to truncate them back to the requested return type. + if (EltVT.isInteger()) + return DAG.getNode(ISD::TRUNCATE, N->getDebugLoc(), EltVT, InOp); + return InOp; +} + SDValue DAGTypeLegalizer::ScalarizeVecRes_CONVERT_RNDSAT(SDNode *N) { EVT NewVT = N->getValueType(0).getVectorElementType(); SDValue Op0 = GetScalarizedVector(N->getOperand(0)); @@ -447,7 +457,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"); SDValue Lo, Hi; - + // See if the target wants to custom expand this node. 
if (CustomLowerNode(N, N->getValueType(ResNo), true)) return; @@ -459,7 +469,8 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to split the result of this operator!"); + report_fatal_error("Do not know how to split the result of this " + "operator!\n"); case ISD::MERGE_VALUES: SplitRes_MERGE_VALUES(N, ResNo, Lo, Hi); break; case ISD::VSELECT: @@ -1007,7 +1018,9 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { N->dump(&DAG); dbgs() << "\n"; #endif - llvm_unreachable("Do not know how to split this operator's operand!"); + report_fatal_error("Do not know how to split this operator's " + "operand!\n"); + case ISD::SETCC: Res = SplitVecOp_VSETCC(N); break; case ISD::BITCAST: Res = SplitVecOp_BITCAST(N); break; case ISD::EXTRACT_SUBVECTOR: Res = SplitVecOp_EXTRACT_SUBVECTOR(N); break; @@ -1233,15 +1246,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_FP_ROUND(SDNode *N) { DebugLoc DL = N->getDebugLoc(); GetSplitVector(N->getOperand(0), Lo, Hi); EVT InVT = Lo.getValueType(); - + EVT OutVT = EVT::getVectorVT(*DAG.getContext(), ResVT.getVectorElementType(), InVT.getVectorNumElements()); - + Lo = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Lo, N->getOperand(1)); Hi = DAG.getNode(ISD::FP_ROUND, DL, OutVT, Hi, N->getOperand(1)); - + return DAG.getNode(ISD::CONCAT_VECTORS, DL, ResVT, Lo, Hi); -} +} @@ -1785,8 +1798,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) { if (InputWidened) InOp = GetWidenedVector(InOp); for (unsigned j=0; j < NumInElts; ++j) - Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, - DAG.getIntPtrConstant(j)); + Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp, + DAG.getIntPtrConstant(j)); } SDValue UndefVal = DAG.getUNDEF(EltVT); for (; Idx < WidenNumElts; ++Idx) @@ -1846,7 +1859,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InWidenVT, InOp, DAG.getIntPtrConstant(0)); return DAG.getConvertRndSat(WidenVT, dl, InOp, DTyOp, STyOp, RndOp, - SatOp, CvtCode); + SatOp, CvtCode); } } @@ -1862,7 +1875,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONVERT_RNDSAT(SDNode *N) { SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp, DAG.getIntPtrConstant(i)); Ops[i] = DAG.getConvertRndSat(WidenVT, dl, ExtVal, DTyOp, STyOp, RndOp, - SatOp, CvtCode); + SatOp, CvtCode); } SDValue UndefVal = DAG.getUNDEF(EltVT); @@ -1966,7 +1979,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) { Cond1 = GetWidenedVector(Cond1); if (Cond1.getValueType() != CondWidenVT) - Cond1 = ModifyToType(Cond1, CondWidenVT); + Cond1 = ModifyToType(Cond1, CondWidenVT); } SDValue InOp1 = GetWidenedVector(N->getOperand(1)); @@ -2232,7 +2245,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) { SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC, DAG.getIntPtrConstant(0)); - return PromoteTargetBoolean(CC, N->getValueType(0)); + return PromoteTargetBoolean(CC, N->getValueType(0)); } @@ -2401,10 +2414,8 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVector<SDValue, 16> &LdChain, NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff); NewVTWidth = NewVT.getSizeInBits(); L = DAG.getLoad(NewVT, dl, Chain, BasePtr, - LD->getPointerInfo().getWithOffset(Offset), - isVolatile, - isNonTemporal, isInvariant, - MinAlign(Align, Increment)); + LD->getPointerInfo().getWithOffset(Offset), isVolatile, + isNonTemporal, isInvariant, 
MinAlign(Align, Increment)); LdChain.push_back(L.getValue(1)); if (L->getValueType(0).isVector()) { SmallVector<SDValue, 16> Loads; @@ -2593,7 +2604,7 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVector<SDValue, 16>& StChain, Offset += Increment; BasePtr = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, DAG.getIntPtrConstant(Increment)); - } while (StWidth != 0 && StWidth >= NewVTWidth); + } while (StWidth != 0 && StWidth >= NewVTWidth); // Restore index back to be relative to the original widen element type Idx = Idx * NewVTWidth / ValEltWidth; } diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index 01622cb295..c3794d5f78 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -318,7 +318,7 @@ void ResourcePriorityQueue::reserveResources(SUnit *SU) { // If packet is now full, reset the state so in the next cycle // we start fresh. - if (Packet.size() >= InstrItins->Props.IssueWidth) { + if (Packet.size() >= InstrItins->SchedModel->IssueWidth) { ResourcesModel->clearResources(); Packet.clear(); } diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 67f42b18ee..bf0a43785b 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -853,7 +853,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) { } /// After backtracking, the hazard checker needs to be restored to a state -/// corresponding the the current cycle. +/// corresponding the current cycle. void ScheduleDAGRRList::RestoreHazardCheckerBottomUp() { HazardRec->Reset(); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 5384576a0c..84e41fc4a1 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -61,6 +61,7 @@ namespace llvm { if (isa<BasicBlockSDNode>(Node)) return true; if (isa<FrameIndexSDNode>(Node)) return true; if (isa<ConstantPoolSDNode>(Node)) return true; + if (isa<TargetIndexSDNode>(Node)) return true; if (isa<JumpTableSDNode>(Node)) return true; if (isa<ExternalSymbolSDNode>(Node)) return true; if (isa<BlockAddressSDNode>(Node)) return true; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index a8dce2176f..f4fe8927f6 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -219,6 +219,22 @@ bool ISD::isScalarToVector(const SDNode *N) { return true; } +/// allOperandsUndef - Return true if the node has at least one operand +/// and all operands of the specified node are ISD::UNDEF. +bool ISD::allOperandsUndef(const SDNode *N) { + // Return false if the node has no operands. + // This is "logically inconsistent" with the definition of "all" but + // is probably the desired behavior. + if (N->getNumOperands() == 0) + return false; + + for (unsigned i = 0, e = N->getNumOperands(); i != e ; ++i) + if (N->getOperand(i).getOpcode() != ISD::UNDEF) + return false; + + return true; +} + /// getSetCCSwappedOperands - Return the operation corresponding to (Y op X) /// when given the operation for (X op Y). 
ISD::CondCode ISD::getSetCCSwappedOperands(ISD::CondCode Operation) { @@ -387,6 +403,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddPointer(GA->getGlobal()); ID.AddInteger(GA->getOffset()); ID.AddInteger(GA->getTargetFlags()); + ID.AddInteger(GA->getAddressSpace()); break; } case ISD::BasicBlock: @@ -422,16 +439,25 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { ID.AddInteger(CP->getTargetFlags()); break; } + case ISD::TargetIndex: { + const TargetIndexSDNode *TI = cast<TargetIndexSDNode>(N); + ID.AddInteger(TI->getIndex()); + ID.AddInteger(TI->getOffset()); + ID.AddInteger(TI->getTargetFlags()); + break; + } case ISD::LOAD: { const LoadSDNode *LD = cast<LoadSDNode>(N); ID.AddInteger(LD->getMemoryVT().getRawBits()); ID.AddInteger(LD->getRawSubclassData()); + ID.AddInteger(LD->getPointerInfo().getAddrSpace()); break; } case ISD::STORE: { const StoreSDNode *ST = cast<StoreSDNode>(N); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); + ID.AddInteger(ST->getPointerInfo().getAddrSpace()); break; } case ISD::ATOMIC_CMP_SWAP: @@ -451,6 +477,12 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { const AtomicSDNode *AT = cast<AtomicSDNode>(N); ID.AddInteger(AT->getMemoryVT().getRawBits()); ID.AddInteger(AT->getRawSubclassData()); + ID.AddInteger(AT->getPointerInfo().getAddrSpace()); + break; + } + case ISD::PREFETCH: { + const MemSDNode *PF = cast<MemSDNode>(N); + ID.AddInteger(PF->getPointerInfo().getAddrSpace()); break; } case ISD::VECTOR_SHUFFLE: { @@ -467,6 +499,10 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) { break; } } // end switch (N->getOpcode()) + + // Target specific memory nodes could also have address spaces to check. 
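// Illustrative sketch, not part of the patch: why the address space is added to
// the FoldingSet profile here and in the hunks below. Two memory nodes that are
// identical except for address space must not be CSE'd into one node. The struct
// and field names are placeholders for the real profile data.
#include <tuple>

struct MemNodeKey {
  unsigned Opcode;
  unsigned RawSubclassData;
  unsigned AddrSpace;  // newly included in the uniquing key
  bool operator==(const MemNodeKey &O) const {
    return std::tie(Opcode, RawSubclassData, AddrSpace) ==
           std::tie(O.Opcode, O.RawSubclassData, O.AddrSpace);
  }
};
// With AddrSpace in the key, a load from addrspace(0) and an otherwise identical
// load from addrspace(1) compare unequal and stay distinct in the CSE map.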
+ if (N->isTargetMemoryOpcode()) + ID.AddInteger(cast<MemSDNode>(N)->getPointerInfo().getAddrSpace()); } /// AddNodeIDNode - Generic routine for adding a nodes info to the NodeID @@ -1084,6 +1120,7 @@ SDValue SelectionDAG::getGlobalAddress(const GlobalValue *GV, DebugLoc DL, ID.AddPointer(GV); ID.AddInteger(Offset); ID.AddInteger(TargetFlags); + ID.AddInteger(GV->getType()->getAddressSpace()); void *IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -1183,6 +1220,24 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT, return SDValue(N, 0); } +SDValue SelectionDAG::getTargetIndex(int Index, EVT VT, int64_t Offset, + unsigned char TargetFlags) { + FoldingSetNodeID ID; + AddNodeIDNode(ID, ISD::TargetIndex, getVTList(VT), 0, 0); + ID.AddInteger(Index); + ID.AddInteger(Offset); + ID.AddInteger(TargetFlags); + void *IP = 0; + if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) + return SDValue(E, 0); + + SDNode *N = new (NodeAllocator) TargetIndexSDNode(Index, VT, Offset, + TargetFlags); + CSEMap.InsertNode(N, IP); + AllNodes.push_back(N); + return SDValue(N, 0); +} + SDValue SelectionDAG::getBasicBlock(MachineBasicBlock *MBB) { FoldingSetNodeID ID; AddNodeIDNode(ID, ISD::BasicBlock, getVTList(MVT::Other), 0, 0); @@ -1949,6 +2004,7 @@ void SelectionDAG::ComputeMaskedBits(SDValue Op, APInt &KnownZero, APInt InMask = APInt::getLowBitsSet(BitWidth, VT.getSizeInBits()); ComputeMaskedBits(Op.getOperand(0), KnownZero, KnownOne, Depth+1); KnownZero |= (~InMask); + KnownOne &= (~KnownZero); return; } case ISD::FGETSIGN: @@ -2427,6 +2483,24 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, case ISD::FABS: V.clearSign(); return getConstantFP(V, VT); + case ISD::FCEIL: { + APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardPositive); + if (fs == APFloat::opOK || fs == APFloat::opInexact) + return getConstantFP(V, VT); + break; + } + case ISD::FTRUNC: { + APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardZero); + if (fs == APFloat::opOK || fs == APFloat::opInexact) + return getConstantFP(V, VT); + break; + } + case ISD::FFLOOR: { + APFloat::opStatus fs = V.roundToIntegral(APFloat::rmTowardNegative); + if (fs == APFloat::opOK || fs == APFloat::opInexact) + return getConstantFP(V, VT); + break; + } case ISD::FP_EXTEND: { bool ignored; // This can return overflow, underflow, or inexact; we don't care. @@ -2674,6 +2748,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL, EVT VT, if (N1 == N2) return N1; break; case ISD::CONCAT_VECTORS: + // Concat of UNDEFs is UNDEF. + if (N1.getOpcode() == ISD::UNDEF && + N2.getOpcode() == ISD::UNDEF) + return getUNDEF(VT); + // A CONCAT_VECTOR with all operands BUILD_VECTOR can be simplified to // one big BUILD_VECTOR. 
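// Illustrative sketch, not part of the patch: the constant folding added above
// for FCEIL/FTRUNC/FFLOOR maps each opcode to an APFloat::roundToIntegral
// rounding direction, which corresponds to the usual libm rounding functions.
#include <cmath>

enum UnaryFPOp { CeilOp, TruncOp, FloorOp };

static double foldRoundingOp(UnaryFPOp Op, double V) {
  switch (Op) {
  case CeilOp:  return std::ceil(V);   // FCEIL  ~ rmTowardPositive
  case TruncOp: return std::trunc(V);  // FTRUNC ~ rmTowardZero
  case FloorOp: return std::floor(V);  // FFLOOR ~ rmTowardNegative
  }
  return V;
}
// e.g. foldRoundingOp(FloorOp, -1.5) == -2.0 while foldRoundingOp(TruncOp, -1.5) == -1.0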
if (N1.getOpcode() == ISD::BUILD_VECTOR && @@ -3879,6 +3958,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT, ID.AddInteger(MemVT.getRawBits()); SDValue Ops[] = {Chain, Ptr, Cmp, Swp}; AddNodeIDNode(ID, Opcode, VTs, Ops, 4); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void* IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<AtomicSDNode>(E)->refineAlignment(MMO); @@ -3951,6 +4031,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT, ID.AddInteger(MemVT.getRawBits()); SDValue Ops[] = {Chain, Ptr, Val}; AddNodeIDNode(ID, Opcode, VTs, Ops, 3); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void* IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<AtomicSDNode>(E)->refineAlignment(MMO); @@ -4007,6 +4088,7 @@ SDValue SelectionDAG::getAtomic(unsigned Opcode, DebugLoc dl, EVT MemVT, ID.AddInteger(MemVT.getRawBits()); SDValue Ops[] = {Chain, Ptr}; AddNodeIDNode(ID, Opcode, VTs, Ops, 2); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void* IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<AtomicSDNode>(E)->refineAlignment(MMO); @@ -4084,6 +4166,7 @@ SelectionDAG::getMemIntrinsicNode(unsigned Opcode, DebugLoc dl, SDVTList VTList, if (VTList.VTs[VTList.NumVTs-1] != MVT::Glue) { FoldingSetNodeID ID; AddNodeIDNode(ID, Opcode, VTList, Ops, NumOps); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<MemIntrinsicSDNode>(E)->refineAlignment(MMO); @@ -4203,6 +4286,7 @@ SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType, ID.AddInteger(encodeMemSDNodeFlags(ExtType, AM, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<LoadSDNode>(E)->refineAlignment(MMO); @@ -4292,6 +4376,7 @@ SDValue SelectionDAG::getStore(SDValue Chain, DebugLoc dl, SDValue Val, ID.AddInteger(VT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(false, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<StoreSDNode>(E)->refineAlignment(MMO); @@ -4359,6 +4444,7 @@ SDValue SelectionDAG::getTruncStore(SDValue Chain, DebugLoc dl, SDValue Val, ID.AddInteger(SVT.getRawBits()); ID.AddInteger(encodeMemSDNodeFlags(true, ISD::UNINDEXED, MMO->isVolatile(), MMO->isNonTemporal(), MMO->isInvariant())); + ID.AddInteger(MMO->getPointerInfo().getAddrSpace()); void *IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) { cast<StoreSDNode>(E)->refineAlignment(MMO); @@ -4383,6 +4469,7 @@ SelectionDAG::getIndexedStore(SDValue OrigStore, DebugLoc dl, SDValue Base, AddNodeIDNode(ID, ISD::STORE, VTs, Ops, 4); ID.AddInteger(ST->getMemoryVT().getRawBits()); ID.AddInteger(ST->getRawSubclassData()); + ID.AddInteger(ST->getPointerInfo().getAddrSpace()); void *IP = 0; if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP)) return SDValue(E, 0); @@ -4659,13 +4746,7 @@ SDVTList SelectionDAG::getVTList(const EVT *VTs, unsigned NumVTs) { if (I->NumVTs != NumVTs || VTs[0] != I->VTs[0] || VTs[1] != I->VTs[1]) continue; - bool NoMatch = false; - for (unsigned i = 2; i != NumVTs; ++i) - if (VTs[i] != I->VTs[i]) { - NoMatch = true; - break; - } - if (!NoMatch) + if (std::equal(&VTs[2], &VTs[NumVTs], &I->VTs[2])) return *I; } diff --git 
a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index e5d24e9d18..e0a061d8be 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -942,7 +942,7 @@ void SelectionDAGBuilder::visit(unsigned Opcode, const User &I) { default: llvm_unreachable("Unknown instruction type encountered!"); // Build the switch statement using the Instruction.def file. #define HANDLE_INST(NUM, OPCODE, CLASS) \ - case Instruction::OPCODE: visit##OPCODE((CLASS&)I); break; + case Instruction::OPCODE: visit##OPCODE((const CLASS&)I); break; #include "llvm/Instruction.def" } @@ -1601,7 +1601,10 @@ void SelectionDAGBuilder::visitSwitchCase(CaseBlock &CB, // Update successor info addSuccessorWithWeight(SwitchBB, CB.TrueBB, CB.TrueWeight); - addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight); + // TrueBB and FalseBB are always different unless the incoming IR is + // degenerate. This only happens when running llc on weird IR. + if (CB.TrueBB != CB.FalseBB) + addSuccessorWithWeight(SwitchBB, CB.FalseBB, CB.FalseWeight); // Set NextBlock to be the MBB immediately after the current one, if any. // This is used to avoid emitting unnecessary branches to the next block. @@ -1833,7 +1836,7 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { visitInlineAsm(&I); else if (Fn && Fn->isIntrinsic()) { assert(Fn->getIntrinsicID() == Intrinsic::donothing); - return; // ignore invokes to @llvm.donothing + // Ignore invokes to @llvm.donothing: jump directly to the next BB. } else LowerCallTo(&I, getValue(Callee), false, LandingPad); @@ -3460,7 +3463,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) { SDValue InChain = getRoot(); - EVT VT = EVT::getEVT(I.getType()); + EVT VT = TLI.getValueType(I.getType()); if (I.getAlignment() * 8 < VT.getSizeInBits()) report_fatal_error("Cannot generate unaligned atomic load"); @@ -3490,7 +3493,7 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) { SDValue InChain = getRoot(); - EVT VT = EVT::getEVT(I.getValueOperand()->getType()); + EVT VT = TLI.getValueType(I.getValueOperand()->getType()); if (I.getAlignment() * 8 < VT.getSizeInBits()) report_fatal_error("Cannot generate unaligned atomic store"); @@ -4929,6 +4932,11 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { getValue(I.getArgOperand(0)).getValueType(), getValue(I.getArgOperand(0)))); return 0; + case Intrinsic::floor: + setValue(&I, DAG.getNode(ISD::FFLOOR, dl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)))); + return 0; case Intrinsic::fma: setValue(&I, DAG.getNode(ISD::FMA, dl, getValue(I.getArgOperand(0)).getValueType(), @@ -5239,9 +5247,9 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee, Outs, TLI); bool CanLowerReturn = TLI.CanLowerReturn(CS.getCallingConv(), - DAG.getMachineFunction(), - FTy->isVarArg(), Outs, - FTy->getContext()); + DAG.getMachineFunction(), + FTy->isVarArg(), Outs, + FTy->getContext()); SDValue DemoteStackSlot; int DemoteStackIdx = -100; @@ -5538,6 +5546,22 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) { return false; } +/// visitUnaryFloatCall - If a call instruction is a unary floating-point +/// operation (as expected), translate it to an SDNode with the specified opcode +/// and return true. +bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I, + unsigned Opcode) { + // Sanity check that it really is a unary floating-point call. 
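// Illustrative sketch, not part of the patch: the conditions the sanity check
// just below requires before a libm call may be lowered to a plain FP node --
// exactly one argument, a floating-point argument type, a result type equal to
// the argument type, and a call that only reads memory (so dropping it is safe).
// The struct is a placeholder standing in for the real CallInst queries.
struct CallShape {
  unsigned NumArgs;
  bool ArgIsFloatingPoint;
  bool ResultMatchesArgType;
  bool OnlyReadsMemory;
};

static bool isUnaryFloatCall(const CallShape &C) {
  return C.NumArgs == 1 && C.ArgIsFloatingPoint &&
         C.ResultMatchesArgType && C.OnlyReadsMemory;
}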
+ if (I.getNumArgOperands() != 1 || + !I.getArgOperand(0)->getType()->isFloatingPointTy() || + I.getType() != I.getArgOperand(0)->getType() || + !I.onlyReadsMemory()) + return false; + + SDValue Tmp = getValue(I.getArgOperand(0)); + setValue(&I, DAG.getNode(Opcode, getCurDebugLoc(), Tmp.getValueType(), Tmp)); + return true; +} void SelectionDAGBuilder::visitCall(const CallInst &I) { // Handle inline assembly differently. @@ -5568,150 +5592,97 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { // Check for well-known libc/libm calls. If the function is internal, it // can't be a library call. - if (!F->hasLocalLinkage() && F->hasName()) { - StringRef Name = F->getName(); - if ((LibInfo->has(LibFunc::copysign) && Name == "copysign") || - (LibInfo->has(LibFunc::copysignf) && Name == "copysignf") || - (LibInfo->has(LibFunc::copysignl) && Name == "copysignl")) { + LibFunc::Func Func; + if (!F->hasLocalLinkage() && F->hasName() && + LibInfo->getLibFunc(F->getName(), Func) && + LibInfo->hasOptimizedCodeGen(Func)) { + switch (Func) { + default: break; + case LibFunc::copysign: + case LibFunc::copysignf: + case LibFunc::copysignl: if (I.getNumArgOperands() == 2 && // Basic sanity checks. I.getArgOperand(0)->getType()->isFloatingPointTy() && I.getType() == I.getArgOperand(0)->getType() && - I.getType() == I.getArgOperand(1)->getType()) { + I.getType() == I.getArgOperand(1)->getType() && + I.onlyReadsMemory()) { SDValue LHS = getValue(I.getArgOperand(0)); SDValue RHS = getValue(I.getArgOperand(1)); setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurDebugLoc(), LHS.getValueType(), LHS, RHS)); return; } - } else if ((LibInfo->has(LibFunc::fabs) && Name == "fabs") || - (LibInfo->has(LibFunc::fabsf) && Name == "fabsf") || - (LibInfo->has(LibFunc::fabsl) && Name == "fabsl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FABS, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::fabs: + case LibFunc::fabsf: + case LibFunc::fabsl: + if (visitUnaryFloatCall(I, ISD::FABS)) return; - } - } else if ((LibInfo->has(LibFunc::sin) && Name == "sin") || - (LibInfo->has(LibFunc::sinf) && Name == "sinf") || - (LibInfo->has(LibFunc::sinl) && Name == "sinl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FSIN, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::sin: + case LibFunc::sinf: + case LibFunc::sinl: + if (visitUnaryFloatCall(I, ISD::FSIN)) return; - } - } else if ((LibInfo->has(LibFunc::cos) && Name == "cos") || - (LibInfo->has(LibFunc::cosf) && Name == "cosf") || - (LibInfo->has(LibFunc::cosl) && Name == "cosl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. 
- I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FCOS, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::cos: + case LibFunc::cosf: + case LibFunc::cosl: + if (visitUnaryFloatCall(I, ISD::FCOS)) return; - } - } else if ((LibInfo->has(LibFunc::sqrt) && Name == "sqrt") || - (LibInfo->has(LibFunc::sqrtf) && Name == "sqrtf") || - (LibInfo->has(LibFunc::sqrtl) && Name == "sqrtl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FSQRT, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::sqrt: + case LibFunc::sqrtf: + case LibFunc::sqrtl: + if (visitUnaryFloatCall(I, ISD::FSQRT)) return; - } - } else if ((LibInfo->has(LibFunc::floor) && Name == "floor") || - (LibInfo->has(LibFunc::floorf) && Name == "floorf") || - (LibInfo->has(LibFunc::floorl) && Name == "floorl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FFLOOR, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::floor: + case LibFunc::floorf: + case LibFunc::floorl: + if (visitUnaryFloatCall(I, ISD::FFLOOR)) return; - } - } else if ((LibInfo->has(LibFunc::nearbyint) && Name == "nearbyint") || - (LibInfo->has(LibFunc::nearbyintf) && Name == "nearbyintf") || - (LibInfo->has(LibFunc::nearbyintl) && Name == "nearbyintl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FNEARBYINT, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::nearbyint: + case LibFunc::nearbyintf: + case LibFunc::nearbyintl: + if (visitUnaryFloatCall(I, ISD::FNEARBYINT)) return; - } - } else if ((LibInfo->has(LibFunc::ceil) && Name == "ceil") || - (LibInfo->has(LibFunc::ceilf) && Name == "ceilf") || - (LibInfo->has(LibFunc::ceill) && Name == "ceill")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FCEIL, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::ceil: + case LibFunc::ceilf: + case LibFunc::ceill: + if (visitUnaryFloatCall(I, ISD::FCEIL)) return; - } - } else if ((LibInfo->has(LibFunc::rint) && Name == "rint") || - (LibInfo->has(LibFunc::rintf) && Name == "rintf") || - (LibInfo->has(LibFunc::rintl) && Name == "rintl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. 
- I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FRINT, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::rint: + case LibFunc::rintf: + case LibFunc::rintl: + if (visitUnaryFloatCall(I, ISD::FRINT)) return; - } - } else if ((LibInfo->has(LibFunc::trunc) && Name == "trunc") || - (LibInfo->has(LibFunc::truncf) && Name == "truncf") || - (LibInfo->has(LibFunc::truncl) && Name == "truncl")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FTRUNC, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::trunc: + case LibFunc::truncf: + case LibFunc::truncl: + if (visitUnaryFloatCall(I, ISD::FTRUNC)) return; - } - } else if ((LibInfo->has(LibFunc::log2) && Name == "log2") || - (LibInfo->has(LibFunc::log2f) && Name == "log2f") || - (LibInfo->has(LibFunc::log2l) && Name == "log2l")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FLOG2, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::log2: + case LibFunc::log2f: + case LibFunc::log2l: + if (visitUnaryFloatCall(I, ISD::FLOG2)) return; - } - } else if ((LibInfo->has(LibFunc::exp2) && Name == "exp2") || - (LibInfo->has(LibFunc::exp2f) && Name == "exp2f") || - (LibInfo->has(LibFunc::exp2l) && Name == "exp2l")) { - if (I.getNumArgOperands() == 1 && // Basic sanity checks. - I.getArgOperand(0)->getType()->isFloatingPointTy() && - I.getType() == I.getArgOperand(0)->getType() && - I.onlyReadsMemory()) { - SDValue Tmp = getValue(I.getArgOperand(0)); - setValue(&I, DAG.getNode(ISD::FEXP2, getCurDebugLoc(), - Tmp.getValueType(), Tmp)); + break; + case LibFunc::exp2: + case LibFunc::exp2f: + case LibFunc::exp2l: + if (visitUnaryFloatCall(I, ISD::FEXP2)) return; - } - } else if (Name == "memcmp") { + break; + case LibFunc::memcmp: if (visitMemCmpCall(I)) return; + break; } } } @@ -6008,11 +5979,11 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { SDISelAsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - std::pair<unsigned, const TargetRegisterClass*> MatchRC = - TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + std::pair<unsigned, const TargetRegisterClass*> MatchRC = + TLI.getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); - std::pair<unsigned, const TargetRegisterClass*> InputRC = - TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, + std::pair<unsigned, const TargetRegisterClass*> InputRC = + TLI.getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || @@ -6805,7 +6776,7 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // Note down frame index. 
if (FrameIndexSDNode *FI = - dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) + dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode())) FuncInfo->setArgumentFrameIndex(I, FI->getIndex()); SDValue Res = DAG.getMergeValues(&ArgValues[0], NumValues, diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index d0fde6f01d..4090002314 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -520,6 +520,7 @@ private: void visitPHI(const PHINode &I); void visitCall(const CallInst &I); bool visitMemCmpCall(const CallInst &I); + bool visitUnaryFloatCall(const CallInst &I, unsigned Opcode); void visitAtomicLoad(const LoadInst &I); void visitAtomicStore(const StoreInst &I); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 5233479701..1c357ef375 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -100,6 +100,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::EH_SJLJ_SETJMP: return "EH_SJLJ_SETJMP"; case ISD::EH_SJLJ_LONGJMP: return "EH_SJLJ_LONGJMP"; case ISD::ConstantPool: return "ConstantPool"; + case ISD::TargetIndex: return "TargetIndex"; case ISD::ExternalSymbol: return "ExternalSymbol"; case ISD::BlockAddress: return "BlockAddress"; case ISD::INTRINSIC_WO_CHAIN: @@ -418,6 +419,10 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const { OS << " " << offset; if (unsigned int TF = CP->getTargetFlags()) OS << " [TF=" << TF << ']'; + } else if (const TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(this)) { + OS << "<" << TI->getIndex() << '+' << TI->getOffset() << ">"; + if (unsigned TF = TI->getTargetFlags()) + OS << " [TF=" << TF << ']'; } else if (const BasicBlockSDNode *BBDN = dyn_cast<BasicBlockSDNode>(this)) { OS << "<"; const Value *LBB = (const Value*)BBDN->getBasicBlock()->getBasicBlock(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index f02e895fcb..9f277fd8b4 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -971,7 +971,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Initialize the Fast-ISel state, if needed. FastISel *FastIS = 0; if (TM.Options.EnableFastISel) - FastIS = TLI.createFastISel(*FuncInfo); + FastIS = TLI.createFastISel(*FuncInfo, LibInfo); // Iterate over all basic blocks in the function. ReversePostOrderTraversal<const Function*> RPOT(&Fn); @@ -2973,6 +2973,7 @@ void SelectionDAGISel::CannotYetSelect(SDNode *N) { N->getOpcode() != ISD::INTRINSIC_WO_CHAIN && N->getOpcode() != ISD::INTRINSIC_VOID) { N->printrFull(Msg, CurDAG); + Msg << "\nIn function: " << MF->getFunction()->getName(); } else { bool HasInputChain = N->getOperand(0).getValueType() == MVT::Other; unsigned iid = diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 6212918d6c..6820175c1b 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -2303,7 +2303,7 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, N0.getOpcode() == ISD::AND) if (ConstantSDNode *AndRHS = dyn_cast<ConstantSDNode>(N0.getOperand(1))) { - EVT ShiftTy = DCI.isBeforeLegalize() ? + EVT ShiftTy = DCI.isBeforeLegalizeOps() ? 
getPointerTy() : getShiftAmountTy(N0.getValueType()); if (Cond == ISD::SETNE && C1 == 0) {// (X & 8) != 0 --> (X & 8) >> 3 // Perform the xform if the AND RHS is a single bit. @@ -2322,6 +2322,55 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } } } + + if (C1.getMinSignedBits() <= 64 && + !isLegalICmpImmediate(C1.getSExtValue())) { + // (X & -256) == 256 -> (X >> 8) == 1 + if ((Cond == ISD::SETEQ || Cond == ISD::SETNE) && + N0.getOpcode() == ISD::AND && N0.hasOneUse()) { + if (ConstantSDNode *AndRHS = + dyn_cast<ConstantSDNode>(N0.getOperand(1))) { + const APInt &AndRHSC = AndRHS->getAPIntValue(); + if ((-AndRHSC).isPowerOf2() && (AndRHSC & C1) == C1) { + unsigned ShiftBits = AndRHSC.countTrailingZeros(); + EVT ShiftTy = DCI.isBeforeLegalizeOps() ? + getPointerTy() : getShiftAmountTy(N0.getValueType()); + EVT CmpTy = N0.getValueType(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0.getOperand(0), + DAG.getConstant(ShiftBits, ShiftTy)); + SDValue CmpRHS = DAG.getConstant(C1.lshr(ShiftBits), CmpTy); + return DAG.getSetCC(dl, VT, Shift, CmpRHS, Cond); + } + } + } else if (Cond == ISD::SETULT || Cond == ISD::SETUGE || + Cond == ISD::SETULE || Cond == ISD::SETUGT) { + bool AdjOne = (Cond == ISD::SETULE || Cond == ISD::SETUGT); + // X < 0x100000000 -> (X >> 32) < 1 + // X >= 0x100000000 -> (X >> 32) >= 1 + // X <= 0x0ffffffff -> (X >> 32) < 1 + // X > 0x0ffffffff -> (X >> 32) >= 1 + unsigned ShiftBits; + APInt NewC = C1; + ISD::CondCode NewCond = Cond; + if (AdjOne) { + ShiftBits = C1.countTrailingOnes(); + NewC = NewC + 1; + NewCond = (Cond == ISD::SETULE) ? ISD::SETULT : ISD::SETUGE; + } else { + ShiftBits = C1.countTrailingZeros(); + } + NewC = NewC.lshr(ShiftBits); + if (ShiftBits && isLegalICmpImmediate(NewC.getSExtValue())) { + EVT ShiftTy = DCI.isBeforeLegalizeOps() ? + getPointerTy() : getShiftAmountTy(N0.getValueType()); + EVT CmpTy = N0.getValueType(); + SDValue Shift = DAG.getNode(ISD::SRL, dl, CmpTy, N0, + DAG.getConstant(ShiftBits, ShiftTy)); + SDValue CmpRHS = DAG.getConstant(NewC, CmpTy); + return DAG.getSetCC(dl, VT, Shift, CmpRHS, NewCond); + } + } + } } if (isa<ConstantFPSDNode>(N0.getNode())) { @@ -2390,25 +2439,33 @@ TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1, } if (N0 == N1) { + // The sext(setcc()) => setcc() optimization relies on the appropriate + // constant being emitted. + uint64_t EqVal; + switch (getBooleanContents(N0.getValueType().isVector())) { + case UndefinedBooleanContent: + case ZeroOrOneBooleanContent: + EqVal = ISD::isTrueWhenEqual(Cond); + break; + case ZeroOrNegativeOneBooleanContent: + EqVal = ISD::isTrueWhenEqual(Cond) ? -1 : 0; + break; + } + // We can always fold X == X for integer setcc's. if (N0.getValueType().isInteger()) { - switch (getBooleanContents(N0.getValueType().isVector())) { - case UndefinedBooleanContent: - case ZeroOrOneBooleanContent: - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); - case ZeroOrNegativeOneBooleanContent: - return DAG.getConstant(ISD::isTrueWhenEqual(Cond) ? -1 : 0, VT); - } + return DAG.getConstant(EqVal, VT); } unsigned UOF = ISD::getUnorderedFlavor(Cond); if (UOF == 2) // FP operators that are undefined on NaNs. - return DAG.getConstant(ISD::isTrueWhenEqual(Cond), VT); + return DAG.getConstant(EqVal, VT); if (UOF == unsigned(ISD::isTrueWhenEqual(Cond))) - return DAG.getConstant(UOF, VT); + return DAG.getConstant(EqVal, VT); // Otherwise, we can't fold it. However, we can simplify it to SETUO/SETO // if it is not already. ISD::CondCode NewCond = UOF == 0 ? 
ISD::SETO : ISD::SETUO; - if (NewCond != Cond) + if (NewCond != Cond && (DCI.isBeforeLegalizeOps() || + getCondCodeAction(NewCond, N0.getValueType()) == Legal)) return DAG.getSetCC(dl, VT, N0, N1, NewCond); } @@ -2977,10 +3034,12 @@ TargetLowering::AsmOperandInfoVector TargetLowering::ParseConstraints( AsmOperandInfo &Input = ConstraintOperands[OpInfo.MatchingInput]; if (OpInfo.ConstraintVT != Input.ConstraintVT) { - std::pair<unsigned, const TargetRegisterClass*> MatchRC = - getRegForInlineAsmConstraint(OpInfo.ConstraintCode, OpInfo.ConstraintVT); - std::pair<unsigned, const TargetRegisterClass*> InputRC = - getRegForInlineAsmConstraint(Input.ConstraintCode, Input.ConstraintVT); + std::pair<unsigned, const TargetRegisterClass*> MatchRC = + getRegForInlineAsmConstraint(OpInfo.ConstraintCode, + OpInfo.ConstraintVT); + std::pair<unsigned, const TargetRegisterClass*> InputRC = + getRegForInlineAsmConstraint(Input.ConstraintCode, + Input.ConstraintVT); if ((OpInfo.ConstraintVT.isInteger() != Input.ConstraintVT.isInteger()) || (MatchRC.second != InputRC.second)) { diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp index 9a751c13b5..4a2b7ec1cf 100644 --- a/lib/CodeGen/SplitKit.cpp +++ b/lib/CodeGen/SplitKit.cpp @@ -652,7 +652,7 @@ void SplitEditor::removeBackCopies(SmallVectorImpl<VNInfo*> &Copies) { // Adjust RegAssign if a register assignment is killed at VNI->def. We // want to avoid calculating the live range of the source register if // possible. - AssignI.find(VNI->def.getPrevSlot()); + AssignI.find(Def.getPrevSlot()); if (!AssignI.valid() || AssignI.start() >= Def) continue; // If MI doesn't kill the assigned register, just leave it. @@ -739,6 +739,8 @@ void SplitEditor::hoistCopiesForSize() { for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end(); VI != VE; ++VI) { VNInfo *VNI = *VI; + if (VNI->isUnused()) + continue; VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def); assert(ParentVNI && "Parent not live at complement def"); @@ -812,6 +814,8 @@ void SplitEditor::hoistCopiesForSize() { for (LiveInterval::vni_iterator VI = LI->vni_begin(), VE = LI->vni_end(); VI != VE; ++VI) { VNInfo *VNI = *VI; + if (VNI->isUnused()) + continue; VNInfo *ParentVNI = Edit->getParent().getVNInfoAt(VNI->def); const DomPair &Dom = NearestDom[ParentVNI->id]; if (!Dom.first || Dom.second == VNI->def) @@ -1047,8 +1051,7 @@ void SplitEditor::finish(SmallVectorImpl<unsigned> *LRMap) { if (ParentVNI->isUnused()) continue; unsigned RegIdx = RegAssign.lookup(ParentVNI->def); - VNInfo *VNI = defValue(RegIdx, ParentVNI, ParentVNI->def); - VNI->setIsPHIDef(ParentVNI->isPHIDef()); + defValue(RegIdx, ParentVNI, ParentVNI->def); // Force rematted values to be recomputed everywhere. // The new live ranges may be truncated. diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 43a6ad8c97..a04ac3fbc1 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -28,15 +28,10 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/ADT/Triple.h" using namespace llvm; -// SSPBufferSize - The lower bound for a buffer to be considered for stack -// smashing protection. 
-static cl::opt<unsigned> -SSPBufferSize("stack-protector-buffer-size", cl::init(8), - cl::desc("Lower bound for a buffer to be considered for " - "stack protection")); - namespace { class StackProtector : public FunctionPass { /// TLI - Keep a pointer of a TargetLowering to consult for determining @@ -46,7 +41,7 @@ namespace { Function *F; Module *M; - DominatorTree* DT; + DominatorTree *DT; /// InsertStackProtectors - Insert code into the prologue and epilogue of /// the function. @@ -60,6 +55,11 @@ namespace { /// check fails. BasicBlock *CreateFailBB(); + /// ContainsProtectableArray - Check whether the type either is an array or + /// contains an array of sufficient size so that we need stack protectors + /// for it. + bool ContainsProtectableArray(Type *Ty, bool InStruct = false) const; + /// RequiresStackProtector - Check whether or not this function needs a /// stack protector based upon the stack protector level. bool RequiresStackProtector() const; @@ -70,8 +70,8 @@ namespace { } StackProtector(const TargetLowering *tli) : FunctionPass(ID), TLI(tli) { - initializeStackProtectorPass(*PassRegistry::getPassRegistry()); - } + initializeStackProtectorPass(*PassRegistry::getPassRegistry()); + } virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved<DominatorTree>(); @@ -95,10 +95,43 @@ bool StackProtector::runOnFunction(Function &Fn) { DT = getAnalysisIfAvailable<DominatorTree>(); if (!RequiresStackProtector()) return false; - + return InsertStackProtectors(); } +/// ContainsProtectableArray - Check whether the type either is an array or +/// contains a char array of sufficient size so that we need stack protectors +/// for it. +bool StackProtector::ContainsProtectableArray(Type *Ty, bool InStruct) const { + if (!Ty) return false; + if (ArrayType *AT = dyn_cast<ArrayType>(Ty)) { + const TargetMachine &TM = TLI->getTargetMachine(); + if (!AT->getElementType()->isIntegerTy(8)) { + Triple Trip(TM.getTargetTriple()); + + // If we're on a non-Darwin platform or we're inside of a structure, don't + // add stack protectors unless the array is a character array. + if (InStruct || !Trip.isOSDarwin()) + return false; + } + + // If an array has more than SSPBufferSize bytes of allocated space, then we + // emit stack protectors. + if (TM.Options.SSPBufferSize <= TLI->getTargetData()->getTypeAllocSize(AT)) + return true; + } + + const StructType *ST = dyn_cast<StructType>(Ty); + if (!ST) return false; + + for (StructType::element_iterator I = ST->element_begin(), + E = ST->element_end(); I != E; ++I) + if (ContainsProtectableArray(*I, true)) + return true; + + return false; +} + /// RequiresStackProtector - Check whether or not this function needs a stack /// protector based upon the stack protector level. The heuristic we use is to /// add a guard variable to functions that call alloca, and functions with @@ -110,8 +143,6 @@ bool StackProtector::RequiresStackProtector() const { if (!F->hasFnAttr(Attribute::StackProtect)) return false; - const TargetData *TD = TLI->getTargetData(); - for (Function::iterator I = F->begin(), E = F->end(); I != E; ++I) { BasicBlock *BB = I; @@ -123,11 +154,8 @@ bool StackProtector::RequiresStackProtector() const { // protectors. return true; - if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) - // If an array has more than SSPBufferSize bytes of allocated space, - // then we emit stack protectors. 
- if (SSPBufferSize <= TD->getTypeAllocSize(AT)) - return true; + if (ContainsProtectableArray(AI->getAllocatedType())) + return true; } } @@ -159,17 +187,17 @@ bool StackProtector::InsertStackProtectors() { // StackGuardSlot = alloca i8* // StackGuard = load __stack_chk_guard // call void @llvm.stackprotect.create(StackGuard, StackGuardSlot) - // + // PointerType *PtrTy = Type::getInt8PtrTy(RI->getContext()); unsigned AddressSpace, Offset; if (TLI->getStackCookieLocation(AddressSpace, Offset)) { Constant *OffsetVal = ConstantInt::get(Type::getInt32Ty(RI->getContext()), Offset); - + StackGuardVar = ConstantExpr::getIntToPtr(OffsetVal, PointerType::get(PtrTy, AddressSpace)); } else { - StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy); + StackGuardVar = M->getOrInsertGlobal("__stack_chk_guard", PtrTy); } BasicBlock &Entry = F->getEntryBlock(); diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp index 1e940b1d07..20da36e8fb 100644 --- a/lib/CodeGen/StackSlotColoring.cpp +++ b/lib/CodeGen/StackSlotColoring.cpp @@ -46,7 +46,6 @@ STATISTIC(NumDead, "Number of trivially dead stack accesses eliminated"); namespace { class StackSlotColoring : public MachineFunctionPass { - bool ColorWithRegs; LiveStacks* LS; MachineFrameInfo *MFI; const TargetInstrInfo *TII; @@ -82,7 +81,7 @@ namespace { public: static char ID; // Pass identification StackSlotColoring() : - MachineFunctionPass(ID), ColorWithRegs(false), NextColor(-1) { + MachineFunctionPass(ID), NextColor(-1) { initializeStackSlotColoringPass(*PassRegistry::getPassRegistry()); } diff --git a/lib/CodeGen/StrongPHIElimination.cpp b/lib/CodeGen/StrongPHIElimination.cpp index c6fdc73824..5b06195046 100644 --- a/lib/CodeGen/StrongPHIElimination.cpp +++ b/lib/CodeGen/StrongPHIElimination.cpp @@ -672,8 +672,8 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI, LiveInterval &SrcInterval = LI->getInterval(SrcReg); SlotIndex PredIndex = LI->getMBBEndIdx(PredBB); VNInfo *SrcVNI = SrcInterval.getVNInfoBefore(PredIndex); + (void)SrcVNI; assert(SrcVNI); - SrcVNI->setHasPHIKill(true); continue; } @@ -744,7 +744,6 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI, SlotIndex PHIIndex = LI->getInstructionIndex(PHI); VNInfo *DestVNI = DestLI.getVNInfoAt(PHIIndex.getRegSlot()); assert(DestVNI); - DestVNI->setIsPHIDef(true); // Prior to PHI elimination, the live ranges of PHIs begin at their defining // instruction. After PHI elimination, PHI instructions are replaced by VNs @@ -777,7 +776,6 @@ void StrongPHIElimination::InsertCopiesForPHI(MachineInstr *PHI, SlotIndex DestCopyIndex = LI->getInstructionIndex(CopyInstr); VNInfo *CopyVNI = CopyLI.getNextValue(MBBStartIndex, LI->getVNInfoAllocator()); - CopyVNI->setIsPHIDef(true); CopyLI.addRange(LiveRange(MBBStartIndex, DestCopyIndex.getRegSlot(), CopyVNI)); diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index 54be88a8bb..ddee6b2401 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -570,12 +570,12 @@ TargetInstrInfoImpl::getNumMicroOps(const InstrItineraryData *ItinData, } /// Return the default expected latency for a def based on it's opcode. 
-unsigned TargetInstrInfo::defaultDefLatency(const InstrItineraryData *ItinData, +unsigned TargetInstrInfo::defaultDefLatency(const MCSchedModel *SchedModel, const MachineInstr *DefMI) const { if (DefMI->mayLoad()) - return ItinData->Props.LoadLatency; + return SchedModel->LoadLatency; if (isHighLatencyDef(DefMI->getOpcode())) - return ItinData->Props.HighLatency; + return SchedModel->HighLatency; return 1; } @@ -629,7 +629,7 @@ static int computeDefOperandLatency( if (FindMin) { // If MinLatency is valid, call getInstrLatency. This uses Stage latency if // it exists before defaulting to MinLatency. - if (ItinData->Props.MinLatency >= 0) + if (ItinData->SchedModel->MinLatency >= 0) return TII->getInstrLatency(ItinData, DefMI); // If MinLatency is invalid, OperandLatency is interpreted as MinLatency. @@ -638,10 +638,10 @@ static int computeDefOperandLatency( return 1; } else if(ItinData->isEmpty()) - return TII->defaultDefLatency(ItinData, DefMI); + return TII->defaultDefLatency(ItinData->SchedModel, DefMI); // ...operand lookup required -return -1; + return -1; } /// computeOperandLatency - Compute and return the latency of the given data @@ -669,7 +669,8 @@ computeOperandLatency(const InstrItineraryData *ItinData, // Expected latency is the max of the stage latency and itinerary props. if (!FindMin) - InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + InstrLatency = std::max(InstrLatency, + defaultDefLatency(ItinData->SchedModel, DefMI)); return InstrLatency; } @@ -742,6 +743,7 @@ computeOperandLatency(const InstrItineraryData *ItinData, // Expected latency is the max of the stage latency and itinerary props. if (!FindMin) - InstrLatency = std::max(InstrLatency, defaultDefLatency(ItinData, DefMI)); + InstrLatency = std::max(InstrLatency, + defaultDefLatency(ItinData->SchedModel, DefMI)); return InstrLatency; } diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index babfacb782..75c9467a4f 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -109,8 +109,9 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) { // N.B.: The defaults used in here are no the same ones used in MC. // We follow gcc, MC follows gas. For example, given ".section .eh_frame", // both gas and MC will produce a section with no flags. Given - // section(".eh_frame") gcc will produce - // .section .eh_frame,"a",@progbits + // section(".eh_frame") gcc will produce: + // + // .section .eh_frame,"a",@progbits if (Name.empty() || Name[0] != '.') return K; // Some lame default implementation based on some magic section names. 
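Aside: the TargetLowering::SimplifySetCC hunk above rewrites comparisons whose immediate is not a legal icmp immediate for the target, e.g. (X & -256) == 256 becomes (X >> 8) == 1, and X < 0x100000000 becomes (X >> 32) < 1. The snippet below is a standalone illustration of those arithmetic identities, not LLVM code; the variable name and the sample inputs are arbitrary, and it only exercises the unsigned cases the patch handles.

#include <cassert>
#include <initializer_list>

int main() {
  for (unsigned long long x : {0ULL, 1ULL, 255ULL, 256ULL, 257ULL, 511ULL, 512ULL,
                               0xFFFFFFFFULL, 0x100000000ULL, 0x1FFFFFFFFULL}) {
    // (X & -256) == 256  <=>  (X >> 8) == 1: the mask (-256, written ~0xFF here)
    // is a power-of-two complement and 256 lies inside the mask, so the low
    // 8 bits can never affect the result.
    assert(((x & ~0xFFULL) == 256) == ((x >> 8) == 1));
    // X u< 0x100000000  <=>  (X >> 32) u< 1: the constant has 32 trailing zeros.
    assert((x < 0x100000000ULL) == ((x >> 32) < 1));
    // X u<= 0xFFFFFFFF  <=>  (X >> 32) u< 1: the SETULE case adds one first.
    assert((x <= 0xFFFFFFFFULL) == ((x >> 32) < 1));
  }
  return 0;
}

The rewrites only fire when the original constant is not accepted by isLegalICmpImmediate, and the shift amount is taken from countTrailingZeros/countTrailingOnes of the constants involved, as the hunk shows.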
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 153f8711d8..aa601af21b 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -30,6 +30,7 @@ #define DEBUG_TYPE "twoaddrinstr" #include "llvm/CodeGen/Passes.h" #include "llvm/Function.h" +#include "llvm/CodeGen/LiveIntervalAnalysis.h" #include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -55,18 +56,19 @@ STATISTIC(NumCommuted , "Number of instructions commuted to coalesce"); STATISTIC(NumAggrCommuted , "Number of instructions aggressively commuted"); STATISTIC(NumConvertedTo3Addr, "Number of instructions promoted to 3-address"); STATISTIC(Num3AddrSunk, "Number of 3-address instructions sunk"); -STATISTIC(NumReMats, "Number of instructions re-materialized"); -STATISTIC(NumDeletes, "Number of dead instructions deleted"); STATISTIC(NumReSchedUps, "Number of instructions re-scheduled up"); STATISTIC(NumReSchedDowns, "Number of instructions re-scheduled down"); namespace { class TwoAddressInstructionPass : public MachineFunctionPass { + MachineFunction *MF; const TargetInstrInfo *TII; const TargetRegisterInfo *TRI; const InstrItineraryData *InstrItins; MachineRegisterInfo *MRI; LiveVariables *LV; + SlotIndexes *Indexes; + LiveIntervals *LIS; AliasAnalysis *AA; CodeGenOpt::Level OptLevel; @@ -92,16 +94,9 @@ namespace { unsigned Reg, MachineBasicBlock::iterator OldPos); - bool isProfitableToReMat(unsigned Reg, const TargetRegisterClass *RC, - MachineInstr *MI, MachineInstr *DefMI, - MachineBasicBlock *MBB, unsigned Loc); - bool NoUseAfterLastDef(unsigned Reg, MachineBasicBlock *MBB, unsigned Dist, unsigned &LastDef); - MachineInstr *FindLastUseInMBB(unsigned Reg, MachineBasicBlock *MBB, - unsigned Dist); - bool isProfitableToCommute(unsigned regA, unsigned regB, unsigned regC, MachineInstr *MI, MachineBasicBlock *MBB, unsigned Dist); @@ -117,14 +112,6 @@ namespace { MachineFunction::iterator &mbbi, unsigned RegA, unsigned RegB, unsigned Dist); - typedef std::pair<std::pair<unsigned, bool>, MachineInstr*> NewKill; - bool canUpdateDeletedKills(SmallVector<unsigned, 4> &Kills, - SmallVector<NewKill, 4> &NewKills, - MachineBasicBlock *MBB, unsigned Dist); - bool DeleteUnusedInstr(MachineBasicBlock::iterator &mi, - MachineBasicBlock::iterator &nmi, - MachineFunction::iterator &mbbi, unsigned Dist); - bool isDefTooClose(unsigned Reg, unsigned Dist, MachineInstr *MI, MachineBasicBlock *MBB); @@ -150,6 +137,11 @@ namespace { void ProcessCopy(MachineInstr *MI, MachineBasicBlock *MBB, SmallPtrSet<MachineInstr*, 8> &Processed); + typedef SmallVector<std::pair<unsigned, unsigned>, 4> TiedPairList; + typedef SmallDenseMap<unsigned, TiedPairList> TiedOperandMap; + bool collectTiedOperands(MachineInstr *MI, TiedOperandMap&); + void processTiedPairs(MachineInstr *MI, TiedPairList&, unsigned &Dist); + void CoalesceExtSubRegs(SmallVector<unsigned,4> &Srcs, unsigned DstReg); /// EliminateRegSequences - Eliminate REG_SEQUENCE instructions as part @@ -167,6 +159,8 @@ namespace { AU.setPreservesCFG(); AU.addRequired<AliasAnalysis>(); AU.addPreserved<LiveVariables>(); + AU.addPreserved<SlotIndexes>(); + AU.addPreserved<LiveIntervals>(); AU.addPreservedID(MachineLoopInfoID); AU.addPreservedID(MachineDominatorsID); MachineFunctionPass::getAnalysisUsage(AU); @@ -241,7 +235,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB, // appropriate location, we can try to sink 
the current instruction // past it. if (!KillMI || KillMI->getParent() != MBB || KillMI == MI || - KillMI->isTerminator()) + KillMI == OldPos || KillMI->isTerminator()) return false; // If any of the definitions are used by another instruction between the @@ -284,6 +278,7 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB, } } } + assert(KillMO && "Didn't find kill"); // Update kill and LV information. KillMO->setIsKill(false); @@ -297,59 +292,13 @@ bool TwoAddressInstructionPass::Sink3AddrInstruction(MachineBasicBlock *MBB, MBB->remove(MI); MBB->insert(KillPos, MI); + if (LIS) + LIS->handleMove(MI); + ++Num3AddrSunk; return true; } -/// isTwoAddrUse - Return true if the specified MI is using the specified -/// register as a two-address operand. -static bool isTwoAddrUse(MachineInstr *UseMI, unsigned Reg) { - const MCInstrDesc &MCID = UseMI->getDesc(); - for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) { - MachineOperand &MO = UseMI->getOperand(i); - if (MO.isReg() && MO.getReg() == Reg && - (MO.isDef() || UseMI->isRegTiedToDefOperand(i))) - // Earlier use is a two-address one. - return true; - } - return false; -} - -/// isProfitableToReMat - Return true if the heuristics determines it is likely -/// to be profitable to re-materialize the definition of Reg rather than copy -/// the register. -bool -TwoAddressInstructionPass::isProfitableToReMat(unsigned Reg, - const TargetRegisterClass *RC, - MachineInstr *MI, MachineInstr *DefMI, - MachineBasicBlock *MBB, unsigned Loc) { - bool OtherUse = false; - for (MachineRegisterInfo::use_nodbg_iterator UI = MRI->use_nodbg_begin(Reg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineOperand &UseMO = UI.getOperand(); - MachineInstr *UseMI = UseMO.getParent(); - MachineBasicBlock *UseMBB = UseMI->getParent(); - if (UseMBB == MBB) { - DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(UseMI); - if (DI != DistanceMap.end() && DI->second == Loc) - continue; // Current use. - OtherUse = true; - // There is at least one other use in the MBB that will clobber the - // register. - if (isTwoAddrUse(UseMI, Reg)) - return true; - } - } - - // If other uses in MBB are not two-address uses, then don't remat. - if (OtherUse) - return false; - - // No other uses in the same block, remat if it's defined in the same - // block so it does not unnecessarily extend the live range. - return MBB == DefMI->getParent(); -} - /// NoUseAfterLastDef - Return true if there are no intervening uses between the /// last instruction in the MBB that defines the specified register and the /// two-address instruction which is being processed. 
It also returns the last @@ -377,31 +326,6 @@ bool TwoAddressInstructionPass::NoUseAfterLastDef(unsigned Reg, return !(LastUse > LastDef && LastUse < Dist); } -MachineInstr *TwoAddressInstructionPass::FindLastUseInMBB(unsigned Reg, - MachineBasicBlock *MBB, - unsigned Dist) { - unsigned LastUseDist = 0; - MachineInstr *LastUse = 0; - for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Reg), - E = MRI->reg_end(); I != E; ++I) { - MachineOperand &MO = I.getOperand(); - MachineInstr *MI = MO.getParent(); - if (MI->getParent() != MBB || MI->isDebugValue()) - continue; - DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); - if (DI == DistanceMap.end()) - continue; - if (DI->second >= Dist) - continue; - - if (MO.isUse() && DI->second > LastUseDist) { - LastUse = DI->first; - LastUseDist = DI->second; - } - } - return LastUse; -} - /// isCopyToReg - Return true if the specified MI is a copy instruction or /// a extract_subreg instruction. It also returns the source and destination /// registers and whether they are physical registers by reference. @@ -483,32 +407,6 @@ static bool isTwoAddrUse(MachineInstr &MI, unsigned Reg, unsigned &DstReg) { return false; } -/// findLocalKill - Look for an instruction below MI in the MBB that kills the -/// specified register. Returns null if there are any other Reg use between the -/// instructions. -static -MachineInstr *findLocalKill(unsigned Reg, MachineBasicBlock *MBB, - MachineInstr *MI, MachineRegisterInfo *MRI, - DenseMap<MachineInstr*, unsigned> &DistanceMap) { - MachineInstr *KillMI = 0; - for (MachineRegisterInfo::use_nodbg_iterator - UI = MRI->use_nodbg_begin(Reg), - UE = MRI->use_nodbg_end(); UI != UE; ++UI) { - MachineInstr *UseMI = &*UI; - if (UseMI == MI || UseMI->getParent() != MBB) - continue; - if (DistanceMap.count(UseMI)) - continue; - if (!UI.getOperand().isKill()) - return 0; - if (KillMI) - return 0; // -O0 kill markers cannot be trusted? - KillMI = UseMI; - } - - return KillMI; -} - /// findOnlyInterestingUse - Given a register, if has a single in-basic block /// use, return the use instruction if it's a copy or a two-address use. static @@ -564,7 +462,7 @@ regsAreCompatible(unsigned RegA, unsigned RegB, const TargetRegisterInfo *TRI) { } -/// isProfitableToReMat - Return true if it's potentially profitable to commute +/// isProfitableToCommute - Return true if it's potentially profitable to commute /// the two-address instruction that's being processed. bool TwoAddressInstructionPass::isProfitableToCommute(unsigned regA, unsigned regB, @@ -654,6 +552,8 @@ TwoAddressInstructionPass::CommuteInstruction(MachineBasicBlock::iterator &mi, if (LV) // Update live variables LV->replaceKillInstruction(RegC, MI, NewMI); + if (Indexes) + Indexes->replaceMachineInstrInMaps(MI, NewMI); mbbi->insert(mi, NewMI); // Insert the new inst mbbi->erase(mi); // Nuke the old inst. @@ -702,6 +602,9 @@ TwoAddressInstructionPass::ConvertInstTo3Addr(MachineBasicBlock::iterator &mi, DEBUG(dbgs() << "2addr: TO 3-ADDR: " << *NewMI); bool Sunk = false; + if (Indexes) + Indexes->replaceMachineInstrInMaps(mi, NewMI); + if (NewMI->findRegisterUseOperand(RegB, false, TRI)) // FIXME: Temporary workaround. If the new instruction doesn't // uses RegB, convertToThreeAddress must have created more @@ -811,92 +714,6 @@ void TwoAddressInstructionPass::ProcessCopy(MachineInstr *MI, return; } -/// isSafeToDelete - If the specified instruction does not produce any side -/// effects and all of its defs are dead, then it's safe to delete. 
-static bool isSafeToDelete(MachineInstr *MI, - const TargetInstrInfo *TII, - SmallVector<unsigned, 4> &Kills) { - if (MI->mayStore() || MI->isCall()) - return false; - if (MI->isTerminator() || MI->hasUnmodeledSideEffects()) - return false; - - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = MI->getOperand(i); - if (!MO.isReg()) - continue; - if (MO.isDef() && !MO.isDead()) - return false; - if (MO.isUse() && MO.isKill()) - Kills.push_back(MO.getReg()); - } - return true; -} - -/// canUpdateDeletedKills - Check if all the registers listed in Kills are -/// killed by instructions in MBB preceding the current instruction at -/// position Dist. If so, return true and record information about the -/// preceding kills in NewKills. -bool TwoAddressInstructionPass:: -canUpdateDeletedKills(SmallVector<unsigned, 4> &Kills, - SmallVector<NewKill, 4> &NewKills, - MachineBasicBlock *MBB, unsigned Dist) { - while (!Kills.empty()) { - unsigned Kill = Kills.back(); - Kills.pop_back(); - if (TargetRegisterInfo::isPhysicalRegister(Kill)) - return false; - - MachineInstr *LastKill = FindLastUseInMBB(Kill, MBB, Dist); - if (!LastKill) - return false; - - bool isModRef = LastKill->definesRegister(Kill); - NewKills.push_back(std::make_pair(std::make_pair(Kill, isModRef), - LastKill)); - } - return true; -} - -/// DeleteUnusedInstr - If an instruction with a tied register operand can -/// be safely deleted, just delete it. -bool -TwoAddressInstructionPass::DeleteUnusedInstr(MachineBasicBlock::iterator &mi, - MachineBasicBlock::iterator &nmi, - MachineFunction::iterator &mbbi, - unsigned Dist) { - // Check if the instruction has no side effects and if all its defs are dead. - SmallVector<unsigned, 4> Kills; - if (!isSafeToDelete(mi, TII, Kills)) - return false; - - // If this instruction kills some virtual registers, we need to - // update the kill information. If it's not possible to do so, - // then bail out. - SmallVector<NewKill, 4> NewKills; - if (!canUpdateDeletedKills(Kills, NewKills, &*mbbi, Dist)) - return false; - - if (LV) { - while (!NewKills.empty()) { - MachineInstr *NewKill = NewKills.back().second; - unsigned Kill = NewKills.back().first.first; - bool isDead = NewKills.back().first.second; - NewKills.pop_back(); - if (LV->removeVirtualRegisterKilled(Kill, mi)) { - if (isDead) - LV->addVirtualRegisterDead(Kill, NewKill); - else - LV->addVirtualRegisterKilled(Kill, NewKill); - } - } - } - - mbbi->erase(mi); // Nuke the old inst. - mi = nmi; - return true; -} - /// RescheduleMIBelowKill - If there is one more local instruction that reads /// 'Reg' and it kills 'Reg, consider moving the instruction below the kill /// instruction in order to eliminate the need for the copy. @@ -905,14 +722,19 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { + // Bail immediately if we don't have LV available. We use it to find kills + // efficiently. + if (!LV) + return false; + MachineInstr *MI = &*mi; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. 
return false; - MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); - if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -999,6 +821,12 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, ((MO.isKill() && Uses.count(MOReg)) || Kills.count(MOReg))) // Don't want to extend other live ranges and update kills. return false; + if (MOReg == Reg && !MO.isKill()) + // We can't schedule across a use of the register in question. + return false; + // Ensure that if this is register in question, its the kill we expect. + assert((MOReg != Reg || OtherMI == KillMI) && + "Found multiple kills of a register in a basic block"); } } } @@ -1012,20 +840,13 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, MBB->splice(KillPos, MBB, From, To); DistanceMap.erase(DI); - if (LV) { - // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - } else { - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = KillMI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - MO.setIsKill(false); - } - MI->addRegisterKilled(Reg, 0); - } + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + if (LIS) + LIS->handleMove(MI); + DEBUG(dbgs() << "\trescheduled below kill: " << *KillMI); return true; } @@ -1061,14 +882,19 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, MachineBasicBlock::iterator &mi, MachineBasicBlock::iterator &nmi, unsigned Reg) { + // Bail immediately if we don't have LV available. We use it to find kills + // efficiently. + if (!LV) + return false; + MachineInstr *MI = &*mi; DenseMap<MachineInstr*, unsigned>::iterator DI = DistanceMap.find(MI); if (DI == DistanceMap.end()) // Must be created from unfolded load. Don't waste time trying this. return false; - MachineInstr *KillMI = findLocalKill(Reg, MBB, mi, MRI, DistanceMap); - if (!KillMI || KillMI->isCopy() || KillMI->isCopyLike()) + MachineInstr *KillMI = LV->getVarInfo(Reg).findKill(MBB); + if (!KillMI || MI == KillMI || KillMI->isCopy() || KillMI->isCopyLike()) // Don't mess with copies, they may be coalesced later. return false; @@ -1094,6 +920,8 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, continue; if (isDefTooClose(MOReg, DI->second, MI, MBB)) return false; + if (MOReg == Reg && !MO.isKill()) + return false; Uses.insert(MOReg); if (MO.isKill() && MOReg != Reg) Kills.insert(MOReg); @@ -1135,6 +963,9 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, if (Kills.count(MOReg)) // Don't want to extend other live ranges and update kills. return false; + if (OtherMI != MI && MOReg == Reg && !MO.isKill()) + // We can't schedule across a use of the register in question. + return false; } else { OtherDefs.push_back(MOReg); } @@ -1165,19 +996,13 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, nmi = llvm::prior(InsertPos); // Backtrack so we process the moved instr. 
DistanceMap.erase(DI); - if (LV) { - // Update live variables - LV->removeVirtualRegisterKilled(Reg, KillMI); - LV->addVirtualRegisterKilled(Reg, MI); - } else { - for (unsigned i = 0, e = KillMI->getNumOperands(); i != e; ++i) { - MachineOperand &MO = KillMI->getOperand(i); - if (!MO.isReg() || !MO.isUse() || MO.getReg() != Reg) - continue; - MO.setIsKill(false); - } - MI->addRegisterKilled(Reg, 0); - } + // Update live variables + LV->removeVirtualRegisterKilled(Reg, KillMI); + LV->addVirtualRegisterKilled(Reg, MI); + if (LIS) + LIS->handleMove(KillMI); + + DEBUG(dbgs() << "\trescheduled kill: " << *KillMI); return true; } @@ -1202,15 +1027,7 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, assert(TargetRegisterInfo::isVirtualRegister(regB) && "cannot make instruction into two-address form"); - - // If regA is dead and the instruction can be deleted, just delete - // it so it doesn't clobber regB. bool regBKilled = isKilled(MI, regB, MRI, TII); - if (!regBKilled && MI.getOperand(DstIdx).isDead() && - DeleteUnusedInstr(mi, nmi, mbbi, Dist)) { - ++NumDeletes; - return true; // Done with this instruction. - } if (TargetRegisterInfo::isVirtualRegister(regA)) ScanUses(regA, &*mbbi, Processed); @@ -1294,16 +1111,14 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, if (NewOpc != 0) { const MCInstrDesc &UnfoldMCID = TII->get(NewOpc); if (UnfoldMCID.getNumDefs() == 1) { - MachineFunction &MF = *mbbi->getParent(); - // Unfold the load. DEBUG(dbgs() << "2addr: UNFOLDING: " << MI); const TargetRegisterClass *RC = TRI->getAllocatableClass( - TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, MF)); + TII->getRegClass(UnfoldMCID, LoadRegIndex, TRI, *MF)); unsigned Reg = MRI->createVirtualRegister(RC); SmallVector<MachineInstr *, 2> NewMIs; - if (!TII->unfoldMemoryOperand(MF, &MI, Reg, + if (!TII->unfoldMemoryOperand(*MF, &MI, Reg, /*UnfoldLoad=*/true,/*UnfoldStore=*/false, NewMIs)) { DEBUG(dbgs() << "2addr: ABANDONING UNFOLD\n"); @@ -1380,15 +1195,177 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, return false; } +// Collect tied operands of MI that need to be handled. +// Rewrite trivial cases immediately. +// Return true if any tied operands where found, including the trivial ones. +bool TwoAddressInstructionPass:: +collectTiedOperands(MachineInstr *MI, TiedOperandMap &TiedOperands) { + const MCInstrDesc &MCID = MI->getDesc(); + bool AnyOps = false; + unsigned NumOps = MI->isInlineAsm() ? + MI->getNumOperands() : MCID.getNumOperands(); + + for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) { + unsigned DstIdx = 0; + if (!MI->isRegTiedToDefOperand(SrcIdx, &DstIdx)) + continue; + AnyOps = true; + MachineOperand &SrcMO = MI->getOperand(SrcIdx); + MachineOperand &DstMO = MI->getOperand(DstIdx); + unsigned SrcReg = SrcMO.getReg(); + unsigned DstReg = DstMO.getReg(); + // Tied constraint already satisfied? + if (SrcReg == DstReg) + continue; + + assert(SrcReg && SrcMO.isUse() && "two address instruction invalid"); + + // Deal with <undef> uses immediately - simply rewrite the src operand. + if (SrcMO.isUndef()) { + // Constrain the DstReg register class if required. 
+ if (TargetRegisterInfo::isVirtualRegister(DstReg)) + if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx, + TRI, *MF)) + MRI->constrainRegClass(DstReg, RC); + SrcMO.setReg(DstReg); + DEBUG(dbgs() << "\t\trewrite undef:\t" << *MI); + continue; + } + TiedOperands[SrcReg].push_back(std::make_pair(SrcIdx, DstIdx)); + } + return AnyOps; +} + +// Process a list of tied MI operands that all use the same source register. +// The tied pairs are of the form (SrcIdx, DstIdx). +void +TwoAddressInstructionPass::processTiedPairs(MachineInstr *MI, + TiedPairList &TiedPairs, + unsigned &Dist) { + bool IsEarlyClobber = false; + bool RemovedKillFlag = false; + bool AllUsesCopied = true; + unsigned LastCopiedReg = 0; + unsigned RegB = 0; + for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) { + unsigned SrcIdx = TiedPairs[tpi].first; + unsigned DstIdx = TiedPairs[tpi].second; + + const MachineOperand &DstMO = MI->getOperand(DstIdx); + unsigned RegA = DstMO.getReg(); + IsEarlyClobber |= DstMO.isEarlyClobber(); + + // Grab RegB from the instruction because it may have changed if the + // instruction was commuted. + RegB = MI->getOperand(SrcIdx).getReg(); + + if (RegA == RegB) { + // The register is tied to multiple destinations (or else we would + // not have continued this far), but this use of the register + // already matches the tied destination. Leave it. + AllUsesCopied = false; + continue; + } + LastCopiedReg = RegA; + + assert(TargetRegisterInfo::isVirtualRegister(RegB) && + "cannot make instruction into two-address form"); + +#ifndef NDEBUG + // First, verify that we don't have a use of "a" in the instruction + // (a = b + a for example) because our transformation will not + // work. This should never occur because we are in SSA form. + for (unsigned i = 0; i != MI->getNumOperands(); ++i) + assert(i == DstIdx || + !MI->getOperand(i).isReg() || + MI->getOperand(i).getReg() != RegA); +#endif + + // Emit a copy. + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(TargetOpcode::COPY), RegA).addReg(RegB); + + // Update DistanceMap. + MachineBasicBlock::iterator PrevMI = MI; + --PrevMI; + DistanceMap.insert(std::make_pair(PrevMI, Dist)); + DistanceMap[MI] = ++Dist; + + SlotIndex CopyIdx; + if (Indexes) + CopyIdx = Indexes->insertMachineInstrInMaps(PrevMI).getRegSlot(); + + DEBUG(dbgs() << "\t\tprepend:\t" << *PrevMI); + + MachineOperand &MO = MI->getOperand(SrcIdx); + assert(MO.isReg() && MO.getReg() == RegB && MO.isUse() && + "inconsistent operand info for 2-reg pass"); + if (MO.isKill()) { + MO.setIsKill(false); + RemovedKillFlag = true; + } + + // Make sure regA is a legal regclass for the SrcIdx operand. + if (TargetRegisterInfo::isVirtualRegister(RegA) && + TargetRegisterInfo::isVirtualRegister(RegB)) + MRI->constrainRegClass(RegA, MRI->getRegClass(RegB)); + + MO.setReg(RegA); + + // Propagate SrcRegMap. + SrcRegMap[RegA] = RegB; + } + + + if (AllUsesCopied) { + if (!IsEarlyClobber) { + // Replace other (un-tied) uses of regB with LastCopiedReg. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { + if (MO.isKill()) { + MO.setIsKill(false); + RemovedKillFlag = true; + } + MO.setReg(LastCopiedReg); + } + } + } + + // Update live variables for regB. 
+ if (RemovedKillFlag && LV && LV->getVarInfo(RegB).removeKill(MI)) { + MachineBasicBlock::iterator PrevMI = MI; + --PrevMI; + LV->addVirtualRegisterKilled(RegB, PrevMI); + } + + } else if (RemovedKillFlag) { + // Some tied uses of regB matched their destination registers, so + // regB is still used in this instruction, but a kill flag was + // removed from a different tied use of regB, so now we need to add + // a kill flag to one of the remaining uses of regB. + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (MO.isReg() && MO.getReg() == RegB && MO.isUse()) { + MO.setIsKill(true); + break; + } + } + } +} + /// runOnMachineFunction - Reduce two-address instructions to two operands. /// -bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { - const TargetMachine &TM = MF.getTarget(); - MRI = &MF.getRegInfo(); +bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { + MF = &Func; + const TargetMachine &TM = MF->getTarget(); + MRI = &MF->getRegInfo(); TII = TM.getInstrInfo(); TRI = TM.getRegisterInfo(); InstrItins = TM.getInstrItineraryData(); + Indexes = getAnalysisIfAvailable<SlotIndexes>(); LV = getAnalysisIfAvailable<LiveVariables>(); + LIS = getAnalysisIfAvailable<LiveIntervals>(); AA = &getAnalysis<AliasAnalysis>(); OptLevel = TM.getOptLevel(); @@ -1396,20 +1373,15 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********** REWRITING TWO-ADDR INSTRS **********\n"); DEBUG(dbgs() << "********** Function: " - << MF.getFunction()->getName() << '\n'); + << MF->getFunction()->getName() << '\n'); // This pass takes the function out of SSA form. MRI->leaveSSA(); - // ReMatRegs - Keep track of the registers whose def's are remat'ed. - BitVector ReMatRegs(MRI->getNumVirtRegs()); - - typedef DenseMap<unsigned, SmallVector<std::pair<unsigned, unsigned>, 4> > - TiedOperandMap; - TiedOperandMap TiedOperands(4); + TiedOperandMap TiedOperands; SmallPtrSet<MachineInstr*, 8> Processed; - for (MachineFunction::iterator mbbi = MF.begin(), mbbe = MF.end(); + for (MachineFunction::iterator mbbi = MF->begin(), mbbe = MF->end(); mbbi != mbbe; ++mbbi) { unsigned Dist = 0; DistanceMap.clear(); @@ -1428,199 +1400,48 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { if (mi->isRegSequence()) RegSequences.push_back(&*mi); - const MCInstrDesc &MCID = mi->getDesc(); - bool FirstTied = true; - DistanceMap.insert(std::make_pair(mi, ++Dist)); ProcessCopy(&*mi, &*mbbi, Processed); // First scan through all the tied register uses in this instruction // and record a list of pairs of tied operands for each register. - unsigned NumOps = mi->isInlineAsm() - ? mi->getNumOperands() : MCID.getNumOperands(); - for (unsigned SrcIdx = 0; SrcIdx < NumOps; ++SrcIdx) { - unsigned DstIdx = 0; - if (!mi->isRegTiedToDefOperand(SrcIdx, &DstIdx)) - continue; - - if (FirstTied) { - FirstTied = false; - ++NumTwoAddressInstrs; - DEBUG(dbgs() << '\t' << *mi); - } - - assert(mi->getOperand(SrcIdx).isReg() && - mi->getOperand(SrcIdx).getReg() && - mi->getOperand(SrcIdx).isUse() && - "two address instruction invalid"); - - unsigned regB = mi->getOperand(SrcIdx).getReg(); - - // Deal with <undef> uses immediately - simply rewrite the src operand. - if (mi->getOperand(SrcIdx).isUndef()) { - unsigned DstReg = mi->getOperand(DstIdx).getReg(); - // Constrain the DstReg register class if required. 
- if (TargetRegisterInfo::isVirtualRegister(DstReg)) - if (const TargetRegisterClass *RC = TII->getRegClass(MCID, SrcIdx, - TRI, MF)) - MRI->constrainRegClass(DstReg, RC); - mi->getOperand(SrcIdx).setReg(DstReg); - DEBUG(dbgs() << "\t\trewrite undef:\t" << *mi); - continue; - } - TiedOperands[regB].push_back(std::make_pair(SrcIdx, DstIdx)); + if (!collectTiedOperands(mi, TiedOperands)) { + mi = nmi; + continue; } - // Now iterate over the information collected above. - for (TiedOperandMap::iterator OI = TiedOperands.begin(), - OE = TiedOperands.end(); OI != OE; ++OI) { - SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs = OI->second; - - // If the instruction has a single pair of tied operands, try some - // transformations that may either eliminate the tied operands or - // improve the opportunities for coalescing away the register copy. - if (TiedOperands.size() == 1 && TiedPairs.size() == 1) { + ++NumTwoAddressInstrs; + MadeChange = true; + DEBUG(dbgs() << '\t' << *mi); + + // If the instruction has a single pair of tied operands, try some + // transformations that may either eliminate the tied operands or + // improve the opportunities for coalescing away the register copy. + if (TiedOperands.size() == 1) { + SmallVector<std::pair<unsigned, unsigned>, 4> &TiedPairs + = TiedOperands.begin()->second; + if (TiedPairs.size() == 1) { unsigned SrcIdx = TiedPairs[0].first; unsigned DstIdx = TiedPairs[0].second; - - // If the registers are already equal, nothing needs to be done. - if (mi->getOperand(SrcIdx).getReg() == - mi->getOperand(DstIdx).getReg()) - break; // Done with this instruction. - - if (TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist, - Processed)) - break; // The tied operands have been eliminated. - } - - bool IsEarlyClobber = false; - bool RemovedKillFlag = false; - bool AllUsesCopied = true; - unsigned LastCopiedReg = 0; - unsigned regB = OI->first; - for (unsigned tpi = 0, tpe = TiedPairs.size(); tpi != tpe; ++tpi) { - unsigned SrcIdx = TiedPairs[tpi].first; - unsigned DstIdx = TiedPairs[tpi].second; - - const MachineOperand &DstMO = mi->getOperand(DstIdx); - unsigned regA = DstMO.getReg(); - IsEarlyClobber |= DstMO.isEarlyClobber(); - - // Grab regB from the instruction because it may have changed if the - // instruction was commuted. - regB = mi->getOperand(SrcIdx).getReg(); - - if (regA == regB) { - // The register is tied to multiple destinations (or else we would - // not have continued this far), but this use of the register - // already matches the tied destination. Leave it. - AllUsesCopied = false; + unsigned SrcReg = mi->getOperand(SrcIdx).getReg(); + unsigned DstReg = mi->getOperand(DstIdx).getReg(); + if (SrcReg != DstReg && + TryInstructionTransform(mi, nmi, mbbi, SrcIdx, DstIdx, Dist, + Processed)) { + // The tied operands have been eliminated or shifted further down the + // block to ease elimination. Continue processing with 'nmi'. + TiedOperands.clear(); + mi = nmi; continue; } - LastCopiedReg = regA; - - assert(TargetRegisterInfo::isVirtualRegister(regB) && - "cannot make instruction into two-address form"); - -#ifndef NDEBUG - // First, verify that we don't have a use of "a" in the instruction - // (a = b + a for example) because our transformation will not - // work. This should never occur because we are in SSA form. - for (unsigned i = 0; i != mi->getNumOperands(); ++i) - assert(i == DstIdx || - !mi->getOperand(i).isReg() || - mi->getOperand(i).getReg() != regA); -#endif - - // Emit a copy or rematerialize the definition. 
- bool isCopy = false; - const TargetRegisterClass *rc = MRI->getRegClass(regB); - MachineInstr *DefMI = MRI->getUniqueVRegDef(regB); - // If it's safe and profitable, remat the definition instead of - // copying it. - if (DefMI && - DefMI->isAsCheapAsAMove() && - DefMI->isSafeToReMat(TII, AA, regB) && - isProfitableToReMat(regB, rc, mi, DefMI, mbbi, Dist)){ - DEBUG(dbgs() << "2addr: REMATTING : " << *DefMI << "\n"); - unsigned regASubIdx = mi->getOperand(DstIdx).getSubReg(); - TII->reMaterialize(*mbbi, mi, regA, regASubIdx, DefMI, *TRI); - ReMatRegs.set(TargetRegisterInfo::virtReg2Index(regB)); - ++NumReMats; - } else { - BuildMI(*mbbi, mi, mi->getDebugLoc(), TII->get(TargetOpcode::COPY), - regA).addReg(regB); - isCopy = true; - } - - // Update DistanceMap. - MachineBasicBlock::iterator prevMI = prior(mi); - DistanceMap.insert(std::make_pair(prevMI, Dist)); - DistanceMap[mi] = ++Dist; - - DEBUG(dbgs() << "\t\tprepend:\t" << *prevMI); - - MachineOperand &MO = mi->getOperand(SrcIdx); - assert(MO.isReg() && MO.getReg() == regB && MO.isUse() && - "inconsistent operand info for 2-reg pass"); - if (MO.isKill()) { - MO.setIsKill(false); - RemovedKillFlag = true; - } - - // Make sure regA is a legal regclass for the SrcIdx operand. - if (TargetRegisterInfo::isVirtualRegister(regA) && - TargetRegisterInfo::isVirtualRegister(regB)) - MRI->constrainRegClass(regA, MRI->getRegClass(regB)); - - MO.setReg(regA); - - if (isCopy) - // Propagate SrcRegMap. - SrcRegMap[regA] = regB; - } - - if (AllUsesCopied) { - if (!IsEarlyClobber) { - // Replace other (un-tied) uses of regB with LastCopiedReg. - for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { - MachineOperand &MO = mi->getOperand(i); - if (MO.isReg() && MO.getReg() == regB && MO.isUse()) { - if (MO.isKill()) { - MO.setIsKill(false); - RemovedKillFlag = true; - } - MO.setReg(LastCopiedReg); - } - } - } - - // Update live variables for regB. - if (RemovedKillFlag && LV && LV->getVarInfo(regB).removeKill(mi)) - LV->addVirtualRegisterKilled(regB, prior(mi)); - - } else if (RemovedKillFlag) { - // Some tied uses of regB matched their destination registers, so - // regB is still used in this instruction, but a kill flag was - // removed from a different tied use of regB, so now we need to add - // a kill flag to one of the remaining uses of regB. - for (unsigned i = 0, e = mi->getNumOperands(); i != e; ++i) { - MachineOperand &MO = mi->getOperand(i); - if (MO.isReg() && MO.getReg() == regB && MO.isUse()) { - MO.setIsKill(true); - break; - } - } } + } - // Schedule the source copy / remat inserted to form two-address - // instruction. FIXME: Does it matter the distance map may not be - // accurate after it's scheduled? - TII->scheduleTwoAddrSource(prior(mi), mi, *TRI); - - MadeChange = true; - + // Now iterate over the information collected above. + for (TiedOperandMap::iterator OI = TiedOperands.begin(), + OE = TiedOperands.end(); OI != OE; ++OI) { + processTiedPairs(mi, OI->second, Dist); DEBUG(dbgs() << "\t\trewrite to:\t" << *mi); } @@ -1645,15 +1466,6 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { } } - // Some remat'ed instructions are dead. - for (int i = ReMatRegs.find_first(); i != -1; i = ReMatRegs.find_next(i)) { - unsigned VReg = TargetRegisterInfo::index2VirtReg(i); - if (MRI->use_nodbg_empty(VReg)) { - MachineInstr *DefMI = MRI->getVRegDef(VReg); - DefMI->eraseFromParent(); - } - } - // Eliminate REG_SEQUENCE instructions. Their whole purpose was to preseve // SSA form. It's now safe to de-SSA. 
  MadeChange |= EliminateRegSequences();
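
The refactoring above pulls the old monolithic runOnMachineFunction() loop apart into two helpers: collectTiedOperands() gathers, per tied source register, the (use index, def index) pairs, and processTiedPairs() then prepends the copies and repairs kill flags. The following standalone C++ sketch uses a deliberately simplified operand model (the struct layout, field names, and printed output are assumptions for illustration only, not the LLVM MachineInstr/MachineOperand API) to show the shape of that split:

// A minimal, self-contained sketch (illustrative names, simplified operand
// model -- not the real MachineInstr/MachineOperand API) of the
// collectTiedOperands / processTiedPairs split introduced above.
#include <cstdio>
#include <map>
#include <utility>
#include <vector>

struct Operand {
  unsigned Reg;   // virtual register number
  bool IsUse;     // use vs. def
  bool IsKill;    // last use of Reg in this instruction
  int TiedDefIdx; // index of the tied def operand, or -1 if untied
};

struct Instr {
  const char *Name;
  std::vector<Operand> Ops;
};

// Map each tied source register to its (use index, def index) pairs,
// mirroring the TiedOperandMap that collectTiedOperands() fills in.
using TiedOperandMap =
    std::map<unsigned, std::vector<std::pair<unsigned, unsigned>>>;

static bool collectTiedOperands(const Instr &MI, TiedOperandMap &Tied) {
  bool AnyPairs = false;
  for (unsigned i = 0, e = MI.Ops.size(); i != e; ++i) {
    const Operand &MO = MI.Ops[i];
    if (!MO.IsUse || MO.TiedDefIdx < 0)
      continue;
    Tied[MO.Reg].push_back({i, (unsigned)MO.TiedDefIdx});
    AnyPairs = true;
  }
  return AnyPairs;
}

// For each tied pair, prepend a copy DstReg = SrcReg and rewrite the tied
// use to read the destination register; if the use carried a kill flag,
// the kill conceptually moves to the prepended copy.
static void processTiedPairs(Instr &MI,
    const std::vector<std::pair<unsigned, unsigned>> &Pairs) {
  bool RemovedKillFlag = false;
  unsigned RegB = 0;
  for (const auto &P : Pairs) {
    unsigned SrcIdx = P.first, DstIdx = P.second;
    RegB = MI.Ops[SrcIdx].Reg;
    unsigned RegA = MI.Ops[DstIdx].Reg;
    if (RegA == RegB)
      continue; // This use already matches its tied def; leave it alone.
    std::printf("  prepend: COPY %%vreg%u = %%vreg%u\n", RegA, RegB);
    if (MI.Ops[SrcIdx].IsKill) {
      MI.Ops[SrcIdx].IsKill = false;
      RemovedKillFlag = true;
    }
    MI.Ops[SrcIdx].Reg = RegA; // Tied use now reads the destination register.
  }
  if (RemovedKillFlag)
    std::printf("  kill of %%vreg%u now belongs to the prepended COPY\n", RegB);
}

int main() {
  // Pseudo two-address instruction: %vreg2 = ADD %vreg0<kill,tied>, %vreg1
  // Operand 0 is the def (%vreg2); operand 1 (%vreg0) is tied to it.
  Instr Add{"ADD", {{2, false, false, -1},
                    {0, true, true, 0},
                    {1, true, false, -1}}};
  TiedOperandMap TiedOperands;
  if (collectTiedOperands(Add, TiedOperands)) {
    std::printf("%s has %zu tied register(s)\n", Add.Name, TiedOperands.size());
    for (const auto &Entry : TiedOperands)
      processTiedPairs(Add, Entry.second);
  }
  return 0;
}

Keeping collection separate from rewriting is what lets the pass in this patch skip instructions with no tied operands early (collectTiedOperands() returning false) and handle the single-tied-pair TryInstructionTransform special case before any copies are inserted.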