-rw-r--r--  Makefile  4
-rw-r--r--  docs/CommandGuide/FileCheck.rst  10
-rw-r--r--  docs/Phabricator.rst  4
-rw-r--r--  docs/TestingGuide.html  4
-rw-r--r--  include/llvm/CodeGen/MachineScheduler.h  9
-rw-r--r--  include/llvm/CodeGen/RegisterPressure.h  3
-rw-r--r--  include/llvm/CodeGen/ScheduleDAG.h  72
-rw-r--r--  include/llvm/CodeGen/ScheduleDAGInstrs.h  7
-rw-r--r--  include/llvm/CodeGen/TargetSchedule.h  56
-rw-r--r--  include/llvm/DebugInfo/DIContext.h  11
-rw-r--r--  include/llvm/ExecutionEngine/ExecutionEngine.h  7
-rw-r--r--  include/llvm/ExecutionEngine/JITEventListener.h  15
-rw-r--r--  include/llvm/IntrinsicsX86.td  12
-rw-r--r--  include/llvm/MC/MCSchedule.h  4
-rw-r--r--  include/llvm/Object/RelocVisitor.h  131
-rw-r--r--  include/llvm/Support/DataExtractor.h  1
-rw-r--r--  include/llvm/Target/TargetTransformImpl.h  6
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp  78
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp  920
-rw-r--r--  lib/CodeGen/RegAllocFast.cpp  5
-rw-r--r--  lib/CodeGen/RegisterPressure.cpp  21
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp  57
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp  21
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp  36
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp  11
-rw-r--r--  lib/CodeGen/TargetSchedule.cpp  39
-rw-r--r--  lib/DebugInfo/DIContext.cpp  5
-rw-r--r--  lib/DebugInfo/DWARFContext.h  11
-rw-r--r--  lib/DebugInfo/DWARFFormValue.cpp  25
-rw-r--r--  lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp  10
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJIT.cpp  50
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJIT.h  10
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp  16
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp  4
-rw-r--r--  lib/Support/MemoryBuffer.cpp  48
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp  5
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.h  1
-rw-r--r--  lib/Target/ARM/ARMCallingConv.td  4
-rw-r--r--  lib/Target/ARM/ARMExpandPseudoInsts.cpp  4
-rw-r--r--  lib/Target/ARM/ARMFastISel.cpp  5
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp  17
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td  18
-rw-r--r--  lib/Target/ARM/ARMInstrThumb.td  4
-rw-r--r--  lib/Target/Hexagon/HexagonMachineScheduler.cpp  3
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp  32
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h  1
-rw-r--r--  lib/Target/Mips/MipsMachineFunction.h  11
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp  18
-rw-r--r--  lib/Target/PowerPC/PPCCallingConv.td  7
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp  61
-rw-r--r--  lib/Target/TargetTransformImpl.cpp  54
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmLexer.cpp  43
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h  16
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp  15
-rw-r--r--  lib/Target/X86/X86.td  5
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp  309
-rw-r--r--  lib/Target/X86/X86ISelLowering.h  11
-rw-r--r--  lib/Target/X86/X86InstrFormats.td  17
-rw-r--r--  lib/Target/X86/X86InstrInfo.td  3
-rw-r--r--  lib/Target/X86/X86InstrSSE.td  8
-rw-r--r--  lib/Target/X86/X86InstrTSX.td  32
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp  5
-rw-r--r--  lib/Target/X86/X86Subtarget.h  4
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp  5
-rw-r--r--  lib/Transforms/Scalar/SimplifyLibCalls.cpp  35
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp  3
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp  31
-rw-r--r--  test/Analysis/CostModel/X86/arith.ll  2
-rw-r--r--  test/Analysis/CostModel/X86/cast.ll  69
-rw-r--r--  test/Analysis/CostModel/X86/cmp.ll  42
-rw-r--r--  test/Analysis/CostModel/X86/i32.ll  9
-rw-r--r--  test/Analysis/CostModel/X86/insert-extract-at-zero.ll  7
-rw-r--r--  test/CodeGen/Mips/eh-dwarf-cfa.ll  63
-rw-r--r--  test/CodeGen/PowerPC/asm-Zy.ll  14
-rw-r--r--  test/CodeGen/PowerPC/coalesce-ext.ll  3
-rw-r--r--  test/CodeGen/PowerPC/ppc64-abi-extend.ll  97
-rw-r--r--  test/CodeGen/X86/misched-balance.ll  230
-rw-r--r--  test/CodeGen/X86/rtm.ll  30
-rw-r--r--  test/DebugInfo/X86/elf-names.ll  109
-rw-r--r--  test/MC/MachO/gen-dwarf-macro-cpp.s  17
-rw-r--r--  test/MC/X86/x86_64-rtm-encoding.s  13
-rw-r--r--  test/Transforms/InstCombine/strspn-1.ll  56
-rw-r--r--  test/Transforms/LoopVectorize/X86/conversion-cost.ll  4
-rw-r--r--  test/Transforms/LoopVectorize/X86/cost-model.ll  2
-rw-r--r--  test/Transforms/SimplifyCFG/X86/lit.local.cfg  6
-rw-r--r--  test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll (renamed from test/Transforms/SimplifyCFG/switch_to_lookup_table.ll)  4
-rw-r--r--  test/Transforms/SimplifyCFG/switch_create.ll  5
-rw-r--r--  test/Transforms/SimplifyLibCalls/StrSpn.ll  16
-rw-r--r--  tools/lli/CMakeLists.txt  2
-rw-r--r--  tools/lli/LLVMBuild.txt  2
-rw-r--r--  tools/lli/Makefile  2
-rw-r--r--  tools/lli/lli.cpp  5
-rw-r--r--  tools/llvm-dwarfdump/llvm-dwarfdump.cpp  59
-rw-r--r--  utils/TableGen/X86RecognizableInstr.cpp  17
94 files changed, 2784 insertions, 611 deletions
diff --git a/Makefile b/Makefile
index 3bc1f5da52..f8d89ebd62 100644
--- a/Makefile
+++ b/Makefile
@@ -74,12 +74,10 @@ endif
ifeq ($(MAKECMDGOALS),install-clang)
DIRS := tools/clang/tools/driver tools/clang/lib/Headers \
tools/clang/tools/libclang \
+ tools/clang/tools/c-index-test \
tools/clang/include/clang-c \
tools/clang/runtime tools/clang/docs \
tools/lto runtime
- ifneq ($(BUILD_CLANG_ONLY),YES)
- DIRS += tools/clang/tools/c-index-test
- endif
OPTIONAL_DIRS :=
NO_INSTALL = 1
endif
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 51a9bf6293..1d7a462bd7 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -45,6 +45,11 @@ OPTIONS
+**--input-file** *filename*
+
+ File to check (defaults to stdin).
+
+
**--strict-whitespace**
By default, FileCheck canonicalizes input horizontal whitespace (spaces and
@@ -271,8 +276,9 @@ simple example:
The first check line matches a regex (**%[a-z]+**) and captures it into
the variable "REGISTER". The second line verifies that whatever is in REGISTER
occurs later in the file after an "andw". FileCheck variable references are
-always contained in **[[ ]]** pairs, are named, and their names can be
-name, then it is a definition of the variable, if not, it is a use.
+always contained in **[[ ]]** pairs, and their names can be formed with the
+regex **[a-zA-Z][a-zA-Z0-9]***. If a colon follows the name, then it is a
+definition of the variable; otherwise, it is a use.
FileCheck variables can be defined multiple times, and uses always get the
latest value. Note that variables are all read at the start of a "CHECK" line
diff --git a/docs/Phabricator.rst b/docs/Phabricator.rst
index 13ef9eddd3..b45449793e 100644
--- a/docs/Phabricator.rst
+++ b/docs/Phabricator.rst
@@ -50,8 +50,8 @@ reviewer understand your code.
To get a full diff, use one of the following commands (or just use Arcanist
to upload your patch):
-* git diff -U999999 other-branch
-* svn diff --diff-cmd=diff -x -U999999
+* ``git diff -U999999 other-branch``
+* ``svn diff --diff-cmd=diff -x -U999999``
To upload a new patch:
diff --git a/docs/TestingGuide.html b/docs/TestingGuide.html
index ae2643fe4e..d90c8ad1c3 100644
--- a/docs/TestingGuide.html
+++ b/docs/TestingGuide.html
@@ -218,11 +218,11 @@ you can run the LLVM and Clang tests simultaneously using:</p>
<p>To run individual tests or subsets of tests, you can use the 'llvm-lit'
script which is built as part of LLVM. For example, to run the
-'Integer/BitCast.ll' test by itself you can run:</p>
+'Integer/BitPacked.ll' test by itself you can run:</p>
<div class="doc_code">
<pre>
-% llvm-lit ~/llvm/test/Integer/BitCast.ll
+% llvm-lit ~/llvm/test/Integer/BitPacked.ll
</pre>
</div>
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 2b96c7abe4..31bd606f93 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -154,6 +154,8 @@ public:
bool empty() const { return Queue.empty(); }
+ void clear() { Queue.clear(); }
+
unsigned size() const { return Queue.size(); }
typedef std::vector<SUnit*>::iterator iterator;
@@ -171,10 +173,12 @@ public:
SU->NodeQueueId |= ID;
}
- void remove(iterator I) {
+ iterator remove(iterator I) {
(*I)->NodeQueueId &= ~ID;
*I = Queue.back();
+ unsigned idx = I - Queue.begin();
Queue.pop_back();
+ return Queue.begin() + idx;
}
#ifndef NDEBUG
@@ -306,6 +310,9 @@ protected:
/// Reinsert debug_values recorded in ScheduleDAGInstrs::DbgValues.
void placeDebugValues();
+ /// \brief Dump the scheduled sequence.
+ void dumpSchedule() const;
+
// Lesser helpers...
void initRegPressure();
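
The new `remove(iterator)` returns an iterator precisely so callers can filter the queue while iterating over it (pickOnlyChoice in MachineScheduler.cpp below does exactly this). A minimal standalone sketch of the idiom, with illustrative names rather than the LLVM types:

```cpp
// Sketch of the erase-while-iterating idiom the new remove() enables.
// Order is not preserved: the back element is swapped into the erased slot.
#include <vector>

template <typename T, typename Pred>
void filterQueue(std::vector<T> &Queue, Pred ShouldDefer) {
  for (typename std::vector<T>::iterator I = Queue.begin();
       I != Queue.end();) {
    if (ShouldDefer(*I)) {
      *I = Queue.back();                 // overwrite with the last element
      unsigned Idx = I - Queue.begin();  // pop_back() invalidates I...
      Queue.pop_back();
      I = Queue.begin() + Idx;           // ...so recompute, then re-test
      continue;
    }
    ++I;
  }
}
```

Recomputing the iterator through an index mirrors the patch: an iterator that happened to point at the popped back element would otherwise be left dangling.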
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index 2043155bc5..30326d05df 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -43,7 +43,7 @@ struct RegisterPressure {
/// class. This is only useful to account for spilling or rematerialization.
void decrease(const TargetRegisterClass *RC, const TargetRegisterInfo *TRI);
- void dump(const TargetRegisterInfo *TRI);
+ void dump(const TargetRegisterInfo *TRI) const;
};
/// RegisterPressure computed within a region of instructions delimited by
@@ -197,6 +197,7 @@ public:
/// This result is complete if either advance() or recede() has returned true,
/// or if closeRegion() was explicitly invoked.
RegisterPressure &getPressure() { return P; }
+ const RegisterPressure &getPressure() const { return P; }
/// Get the register set pressure at the current position, which may be less
/// than the pressure across the traversed region.
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index 05b74b09cb..7e0ca1478e 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -31,6 +31,7 @@ namespace llvm {
class MachineFunction;
class MachineRegisterInfo;
class MachineInstr;
+ struct MCSchedClassDesc;
class TargetRegisterInfo;
class ScheduleDAG;
class SDNode;
@@ -52,6 +53,13 @@ namespace llvm {
Order ///< Any other ordering dependency.
};
+ enum OrderKind {
+ Barrier, ///< An unknown scheduling barrier.
+      MayAliasMem,  ///< Nonvolatile load/store instructions that may alias.
+      MustAliasMem, ///< Nonvolatile load/store instructions that must alias.
+ Artificial ///< Arbitrary weak DAG edge (no actual dependence).
+ };
+
private:
/// Dep - A pointer to the depending/depended-on SUnit, and an enum
/// indicating the kind of the dependency.
@@ -65,20 +73,7 @@ namespace llvm {
unsigned Reg;
/// Order - Additional information about Order dependencies.
- struct {
- /// isNormalMemory - True if both sides of the dependence
- /// access memory in non-volatile and fully modeled ways.
- bool isNormalMemory : 1;
-
- /// isMustAlias - True if both sides of the dependence are known to
- /// access the same memory.
- bool isMustAlias : 1;
-
- /// isArtificial - True if this is an artificial dependency, meaning
- /// it is not necessary for program correctness, and may be safely
- /// deleted if necessary.
- bool isArtificial : 1;
- } Order;
+ unsigned OrdKind; // enum OrderKind
} Contents;
/// Latency - The time associated with this edge. Often this is just
@@ -86,6 +81,9 @@ namespace llvm {
/// models may provide additional information about specific edges.
unsigned Latency;
/// Record MinLatency separately from "expected" Latency.
+ ///
+ /// FIXME: this field is not packed on LP64. Convert to 16-bit DAG edge
+ /// latency after introducing saturating truncation.
unsigned MinLatency;
public:
@@ -95,28 +93,28 @@ namespace llvm {
SDep() : Dep(0, Data) {}
/// SDep - Construct an SDep with the specified values.
- SDep(SUnit *S, Kind kind, unsigned latency = 1, unsigned Reg = 0,
- bool isNormalMemory = false, bool isMustAlias = false,
- bool isArtificial = false)
- : Dep(S, kind), Contents(), Latency(latency), MinLatency(latency) {
+ SDep(SUnit *S, Kind kind, unsigned Reg)
+ : Dep(S, kind), Contents() {
switch (kind) {
+ default:
+ llvm_unreachable("Reg given for non-register dependence!");
case Anti:
case Output:
assert(Reg != 0 &&
"SDep::Anti and SDep::Output must use a non-zero Reg!");
- // fall through
- case Data:
- assert(!isMustAlias && "isMustAlias only applies with SDep::Order!");
- assert(!isArtificial && "isArtificial only applies with SDep::Order!");
Contents.Reg = Reg;
+ Latency = 0;
break;
- case Order:
- assert(Reg == 0 && "Reg given for non-register dependence!");
- Contents.Order.isNormalMemory = isNormalMemory;
- Contents.Order.isMustAlias = isMustAlias;
- Contents.Order.isArtificial = isArtificial;
+ case Data:
+ Contents.Reg = Reg;
+ Latency = 1;
break;
}
+ MinLatency = Latency;
+ }
+ SDep(SUnit *S, OrderKind kind)
+ : Dep(S, Order), Contents(), Latency(0), MinLatency(0) {
+ Contents.OrdKind = kind;
}
/// Return true if the specified SDep is equivalent except for latency.
@@ -128,10 +126,7 @@ namespace llvm {
case Output:
return Contents.Reg == Other.Contents.Reg;
case Order:
- return Contents.Order.isNormalMemory ==
- Other.Contents.Order.isNormalMemory &&
- Contents.Order.isMustAlias == Other.Contents.Order.isMustAlias &&
- Contents.Order.isArtificial == Other.Contents.Order.isArtificial;
+ return Contents.OrdKind == Other.Contents.OrdKind;
}
llvm_unreachable("Invalid dependency kind!");
}
@@ -194,20 +189,21 @@ namespace llvm {
/// memory accesses where both sides of the dependence access memory
/// in non-volatile and fully modeled ways.
bool isNormalMemory() const {
- return getKind() == Order && Contents.Order.isNormalMemory;
+ return getKind() == Order && (Contents.OrdKind == MayAliasMem
+ || Contents.OrdKind == MustAliasMem);
}
/// isMustAlias - Test if this is an Order dependence that is marked
/// as "must alias", meaning that the SUnits at either end of the edge
/// have a memory dependence on a known memory location.
bool isMustAlias() const {
- return getKind() == Order && Contents.Order.isMustAlias;
+ return getKind() == Order && Contents.OrdKind == MustAliasMem;
}
/// isArtificial - Test if this is an Order dependence that is marked
/// as "artificial", meaning it isn't necessary for correctness.
bool isArtificial() const {
- return getKind() == Order && Contents.Order.isArtificial;
+ return getKind() == Order && Contents.OrdKind == Artificial;
}
/// isAssignedRegDep - Test if this is a Data dependence that is
@@ -254,6 +250,8 @@ namespace llvm {
// this node was cloned.
// (SD scheduling only)
+ const MCSchedClassDesc *SchedClass; // NULL or resolved SchedClass.
+
// Preds/Succs - The SUnits before/after us in the graph.
SmallVector<SDep, 4> Preds; // All sunit predecessors.
SmallVector<SDep, 4> Succs; // All sunit successors.
@@ -301,7 +299,7 @@ namespace llvm {
/// SUnit - Construct an SUnit for pre-regalloc scheduling to represent
/// an SDNode and any nodes flagged to it.
SUnit(SDNode *node, unsigned nodenum)
- : Node(node), Instr(0), OrigNode(0), NodeNum(nodenum),
+ : Node(node), Instr(0), OrigNode(0), SchedClass(0), NodeNum(nodenum),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
@@ -315,7 +313,7 @@ namespace llvm {
/// SUnit - Construct an SUnit for post-regalloc scheduling to represent
/// a MachineInstr.
SUnit(MachineInstr *instr, unsigned nodenum)
- : Node(0), Instr(instr), OrigNode(0), NodeNum(nodenum),
+ : Node(0), Instr(instr), OrigNode(0), SchedClass(0), NodeNum(nodenum),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
@@ -328,7 +326,7 @@ namespace llvm {
/// SUnit - Construct a placeholder SUnit.
SUnit()
- : Node(0), Instr(0), OrigNode(0), NodeNum(~0u),
+ : Node(0), Instr(0), OrigNode(0), SchedClass(0), NodeNum(~0u),
NodeQueueId(0), NumPreds(0), NumSuccs(0), NumPredsLeft(0),
NumSuccsLeft(0), NumRegDefsLeft(0), Latency(0),
isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
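
The SDep refactoring above replaces three mutually exclusive bools with a single OrderKind value, which shrinks the union member and makes invalid flag combinations unrepresentable. A small sketch of the same encoding, with illustrative types rather than the LLVM definitions:

```cpp
// Sketch: encode mutually exclusive edge flavors as one enum value
// instead of three bit-flags; predicates recover the old boolean views.
struct OrderEdge {
  enum OrderKind { Barrier, MayAliasMem, MustAliasMem, Artificial };
  unsigned OrdKind; // holds an OrderKind

  explicit OrderEdge(OrderKind K) : OrdKind(K) {}

  // "Normal memory" covers both may-alias and must-alias memory edges.
  bool isNormalMemory() const {
    return OrdKind == MayAliasMem || OrdKind == MustAliasMem;
  }
  bool isMustAlias() const { return OrdKind == MustAliasMem; }
  bool isArtificial() const { return OrdKind == Artificial; }
};
```

Storing the kind as `unsigned` keeps the union member trivially copyable while the predicates derive the old flags from the single stored value.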
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index b4496a0515..4bcd35a834 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -189,6 +189,13 @@ namespace llvm {
/// \brief Get the machine model for instruction scheduling.
const TargetSchedModel *getSchedModel() const { return &SchedModel; }
+ /// \brief Resolve and cache a resolved scheduling class for an SUnit.
+ const MCSchedClassDesc *getSchedClass(SUnit *SU) const {
+ if (!SU->SchedClass)
+ SU->SchedClass = SchedModel.resolveSchedClass(SU->getInstr());
+ return SU->SchedClass;
+ }
+
/// begin - Return an iterator to the top of the current scheduling region.
MachineBasicBlock::iterator begin() const { return RegionBegin; }
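
getSchedClass() is a resolve-once cache: resolving a scheduling class may walk variant transition tables, so the result is memoized on the SUnit. A generic sketch of the pattern, with stand-in names rather than the LLVM types:

```cpp
// Sketch of resolve-on-first-use caching: the expensive resolution runs
// once per node; later queries hit the cached pointer.
struct Descr;
struct Node {
  const Descr *Cached; // null until the first query
  Node() : Cached(0) {}
};

const Descr *getDescr(Node *N, const Descr *(*Resolve)(const Node *)) {
  if (!N->Cached)
    N->Cached = Resolve(N); // may walk variant tables; do it only once
  return N->Cached;
}
```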
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
index 3c55f1cbc5..88e6105a7d 100644
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -16,8 +16,10 @@
#ifndef LLVM_TARGET_TARGETSCHEDMODEL_H
#define LLVM_TARGET_TARGETSCHEDMODEL_H
+#include "llvm/Target/TargetSubtargetInfo.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/ADT/SmallVector.h"
namespace llvm {
@@ -34,6 +36,10 @@ class TargetSchedModel {
InstrItineraryData InstrItins;
const TargetSubtargetInfo *STI;
const TargetInstrInfo *TII;
+
+ SmallVector<unsigned, 16> ResourceFactors;
+ unsigned MicroOpFactor; // Multiply to normalize microops to resource units.
+ unsigned ResourceLCM; // Resource units per cycle. Latency normalization factor.
public:
TargetSchedModel(): STI(0), TII(0) {}
@@ -45,6 +51,9 @@ public:
void init(const MCSchedModel &sm, const TargetSubtargetInfo *sti,
const TargetInstrInfo *tii);
+ /// Return the MCSchedClassDesc for this instruction.
+ const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
+
/// \brief TargetInstrInfo getter.
const TargetInstrInfo *getInstrInfo() const { return TII; }
@@ -76,7 +85,48 @@ public:
unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
/// \brief Return the number of issue slots required for this MI.
- unsigned getNumMicroOps(MachineInstr *MI) const;
+ unsigned getNumMicroOps(const MachineInstr *MI,
+ const MCSchedClassDesc *SC = 0) const;
+
+ /// \brief Get the number of kinds of resources for this target.
+ unsigned getNumProcResourceKinds() const {
+ return SchedModel.getNumProcResourceKinds();
+ }
+
+ /// \brief Get a processor resource by ID for convenience.
+ const MCProcResourceDesc *getProcResource(unsigned PIdx) const {
+ return SchedModel.getProcResource(PIdx);
+ }
+
+ typedef const MCWriteProcResEntry *ProcResIter;
+
+  /// \brief Get an iterator into the processor resources consumed by this
+  /// scheduling class.
+ ProcResIter getWriteProcResBegin(const MCSchedClassDesc *SC) const {
+ // The subtarget holds a single resource table for all processors.
+ return STI->getWriteProcResBegin(SC);
+ }
+ ProcResIter getWriteProcResEnd(const MCSchedClassDesc *SC) const {
+ return STI->getWriteProcResEnd(SC);
+ }
+
+ /// \brief Multiply the number of units consumed for a resource by this factor
+ /// to normalize it relative to other resources.
+ unsigned getResourceFactor(unsigned ResIdx) const {
+ return ResourceFactors[ResIdx];
+ }
+
+ /// \brief Multiply number of micro-ops by this factor to normalize it
+ /// relative to other resources.
+ unsigned getMicroOpFactor() const {
+ return MicroOpFactor;
+ }
+
+ /// \brief Multiply cycle count by this factor to normalize it relative to
+ /// other resources. This is the number of resource units per cycle.
+ unsigned getLatencyFactor() const {
+ return ResourceLCM;
+ }
/// \brief Compute operand latency based on the available machine model.
///
@@ -105,15 +155,11 @@ public:
unsigned computeOutputLatency(const MachineInstr *DefMI, unsigned DefIdx,
const MachineInstr *DepMI) const;
-
private:
/// getDefLatency is a helper for computeOperandLatency. Return the
/// instruction's latency if operand lookup is not required.
/// Otherwise return -1.
int getDefLatency(const MachineInstr *DefMI, bool FindMin) const;
-
- /// Return the MCSchedClassDesc for this instruction.
- const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
};
} // namespace llvm
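
The factor getters above rely on a normalization scheme: each resource's per-cycle unit count and the issue width are scaled to a common denominator so that counts of different resources, micro-ops, and elapsed cycles become directly comparable. A hedged sketch of the arithmetic under an assumed machine (this mirrors the idea; the actual init() code is not shown in this patch):

```cpp
// Sketch of the normalization behind these factors, under an assumed
// machine: IssueWidth = 4 micro-ops/cycle, 2 ALU units, 1 load/store unit.
// Illustrative arithmetic, not the actual TargetSchedModel::init() code.
static unsigned gcd(unsigned A, unsigned B) { return B ? gcd(B, A % B) : A; }
static unsigned lcm(unsigned A, unsigned B) { return A / gcd(A, B) * B; }

void computeFactors() {
  unsigned IssueWidth = 4, ALUUnits = 2, LSUnits = 1;
  unsigned LCM = lcm(lcm(IssueWidth, ALUUnits), LSUnits); // 4
  unsigned MicroOpFactor = LCM / IssueWidth;  // 1 unit per micro-op
  unsigned ALUFactor     = LCM / ALUUnits;    // 2 units per ALU-cycle
  unsigned LSFactor      = LCM / LSUnits;     // 4 units per LS-cycle
  // 3 ALU-cycles (3*2 = 6 units) now compares directly against
  // 2 LS-cycles (2*4 = 8 units); getLatencyFactor() == LCM converts
  // elapsed cycles to the same normalized scale.
  (void)MicroOpFactor; (void)ALUFactor; (void)LSFactor;
}
```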
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index 8d6054aa45..26bd1f6275 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h
@@ -15,6 +15,7 @@
#ifndef LLVM_DEBUGINFO_DICONTEXT_H
#define LLVM_DEBUGINFO_DICONTEXT_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
@@ -89,6 +90,13 @@ public:
}
};
+// Instead of applying the relocations to the data we've read from disk, we
+// keep a separate mapping table to the side and consult it at the locations
+// in the DWARF where we expect relocated values. This adds a bit of
+// complexity to the DWARF parsing/extraction, with the benefit of not
+// allocating memory for the entire size of the debug info sections.
+typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;
+
class DIContext {
public:
virtual ~DIContext();
@@ -100,7 +108,8 @@ public:
StringRef aRangeSection = StringRef(),
StringRef lineSection = StringRef(),
StringRef stringSection = StringRef(),
- StringRef rangeSection = StringRef());
+ StringRef rangeSection = StringRef(),
+ const RelocAddrMap &Map = RelocAddrMap());
virtual void dump(raw_ostream &OS) = 0;
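
A minimal sketch of the side-table approach the comment describes: leave the section bytes untouched and consult the map, keyed by section offset, whenever an address is extracted. The reader below is illustrative, and the value pair is taken to be (width, relocated value) per the typedef above:

```cpp
// Sketch: extract an address via a look-aside relocation map instead of
// patching the raw section bytes. Illustrative, not the DWARF parser.
#include "llvm/ADT/DenseMap.h"
#include "llvm/Support/DataTypes.h"
#include <utility>

typedef llvm::DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;

uint64_t readAddress(const unsigned char *Data, uint64_t Offset,
                     const RelocAddrMap &Relocs) {
  RelocAddrMap::const_iterator I = Relocs.find(Offset);
  if (I != Relocs.end())
    return I->second.second; // use the precomputed relocated value
  // No relocation applies here: read the raw little-endian 8-byte value.
  uint64_t V = 0;
  for (unsigned i = 0; i != 8; ++i)
    V |= uint64_t(Data[Offset + i]) << (8 * i);
  return V;
}
```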
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index 04756ca601..8073d8f92c 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -249,6 +249,13 @@ public:
"EE!");
}
+ // finalizeObject - This method should be called after sections within an
+ // object have been relocated using mapSectionAddress. When this method is
+ // called the MCJIT execution engine will reapply relocations for a loaded
+ // object. This method has no effect for the legacy JIT engine or the
+  // interpreter.
+ virtual void finalizeObject() {}
+
/// runStaticConstructorsDestructors - This method is used to execute all of
/// the static constructors or destructors for a program.
///
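
A hedged sketch of the call sequence this enables for relocated execution under MCJIT: remap one or more sections, then reapply relocations before running code. The engine, section pointer, and target address are assumed inputs, not part of the patch:

```cpp
// Sketch of the mapSectionAddress/finalizeObject flow described above.
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
#include <vector>

void relocateAndRun(llvm::ExecutionEngine *EE, void *SectionLocalAddr,
                    uint64_t TargetAddr, llvm::Function *Entry) {
  // Tell the engine where the loaded section will really live...
  EE->mapSectionAddress(SectionLocalAddr, TargetAddr);
  // ...then reapply relocations against the remapped addresses.
  EE->finalizeObject();
  std::vector<llvm::GenericValue> NoArgs;
  EE->runFunction(Entry, NoArgs);
}
```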
diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h
index eea603fcee..e6586e778c 100644
--- a/include/llvm/ExecutionEngine/JITEventListener.h
+++ b/include/llvm/ExecutionEngine/JITEventListener.h
@@ -26,6 +26,7 @@ class Function;
class MachineFunction;
class OProfileWrapper;
class IntelJITEventsWrapper;
+class ObjectImage;
/// JITEvent_EmittedFunctionDetails - Helper struct for containing information
/// about a generated machine code function.
@@ -76,6 +77,20 @@ public:
/// matching NotifyFreeingMachineCode call.
virtual void NotifyFreeingMachineCode(void *) {}
+ /// NotifyObjectEmitted - Called after an object has been successfully
+ /// emitted to memory. NotifyFunctionEmitted will not be called for
+ /// individual functions in the object.
+ ///
+  /// ELF-specific information: the ObjectImage contains the generated
+  /// object image with section headers updated to reflect the addresses at
+  /// which sections were loaded, and with relocations performed in-place on
+  /// debug sections.
+ virtual void NotifyObjectEmitted(const ObjectImage &Obj) {}
+
+ /// NotifyFreeingObject - Called just before the memory associated with
+ /// a previously emitted object is released.
+ virtual void NotifyFreeingObject(const ObjectImage &Obj) {}
+
#if LLVM_USE_INTEL_JITEVENTS
// Construct an IntelJITEventListener
static JITEventListener *createIntelJITEventListener();
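
A sketch of a listener using the new whole-object hooks; registration goes through ExecutionEngine::RegisterJITEventListener, as with the existing function-level hooks. The logging body is illustrative:

```cpp
// Sketch: a JITEventListener observing whole-object emission.
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/Support/raw_ostream.h"

class ObjectLoggingListener : public llvm::JITEventListener {
public:
  virtual void NotifyObjectEmitted(const llvm::ObjectImage &Obj) {
    llvm::errs() << "object emitted\n"; // debug sections already relocated
  }
  virtual void NotifyFreeingObject(const llvm::ObjectImage &Obj) {
    llvm::errs() << "object about to be freed\n";
  }
};
// Usage: EE->RegisterJITEventListener(new ObjectLoggingListener());
```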
diff --git a/include/llvm/IntrinsicsX86.td b/include/llvm/IntrinsicsX86.td
index 5ff085633e..d2463c0efa 100644
--- a/include/llvm/IntrinsicsX86.td
+++ b/include/llvm/IntrinsicsX86.td
@@ -2559,3 +2559,15 @@ let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
def int_x86_rdrand_32 : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [], []>;
def int_x86_rdrand_64 : Intrinsic<[llvm_i64_ty, llvm_i32_ty], [], []>;
}
+
+//===----------------------------------------------------------------------===//
+// RTM intrinsics. Transactional Memory support.
+
+let TargetPrefix = "x86" in { // All intrinsics start with "llvm.x86.".
+ def int_x86_xbegin : GCCBuiltin<"__builtin_ia32_xbegin">,
+ Intrinsic<[llvm_i32_ty], [], []>;
+ def int_x86_xend : GCCBuiltin<"__builtin_ia32_xend">,
+ Intrinsic<[], [], []>;
+ def int_x86_xabort : GCCBuiltin<"__builtin_ia32_xabort">,
+ Intrinsic<[], [llvm_i8_ty], [IntrNoReturn]>;
+}
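
For context, a sketch of how these intrinsics surface at the C level through the named GCC builtins when compiling with RTM support (e.g. -mrtm). The fallback policy and lock helpers are illustrative; __builtin_ia32_xbegin() reports a successful transaction start by returning ~0u:

```cpp
// Sketch: using the RTM builtins these intrinsics back. Illustrative.
#define XBEGIN_STARTED (~0u)

extern void take_fallback_lock(void);
extern void release_fallback_lock(void);

void transactional_update(int *Counter) {
  unsigned Status = __builtin_ia32_xbegin();
  if (Status == XBEGIN_STARTED) {
    ++*Counter;                 // speculative region
    __builtin_ia32_xend();      // commit the transaction
  } else {
    take_fallback_lock();       // aborted: fall back to a lock
    ++*Counter;
    release_fallback_lock();
  }
}
```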
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index c9a060c79b..0c71ee5135 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -219,6 +219,10 @@ public:
/// Does this machine model include instruction-level scheduling.
bool hasInstrSchedModel() const { return SchedClassTable; }
+ unsigned getNumProcResourceKinds() const {
+ return NumProcResourceKinds;
+ }
+
const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
assert(hasInstrSchedModel() && "No scheduling machine model");
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
new file mode 100644
index 0000000000..7668bdedb7
--- /dev/null
+++ b/include/llvm/Object/RelocVisitor.h
@@ -0,0 +1,131 @@
+//===-- RelocVisitor.h - Visitor for object file relocations -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides a wrapper around all the different types of relocations
+// in different file formats, such that a client can handle them in a unified
+// manner by only implementing a minimal number of functions.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LLVM_OBJECT_RELOCVISITOR
+#define _LLVM_OBJECT_RELOCVISITOR
+
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/ELF.h"
+#include "llvm/ADT/StringRef.h"
+
+namespace llvm {
+namespace object {
+
+struct RelocToApply {
+ // The computed value after applying the relevant relocations.
+ int64_t Value;
+
+ // The width of the value; how many bytes to touch when applying the
+ // relocation.
+ char Width;
+ RelocToApply(const RelocToApply &In) : Value(In.Value), Width(In.Width) {}
+ RelocToApply(int64_t Value, char Width) : Value(Value), Width(Width) {}
+ RelocToApply() : Value(0), Width(0) {}
+};
+
+/// @brief Base class for object file relocation visitors.
+class RelocVisitor {
+public:
+ explicit RelocVisitor(llvm::StringRef FileFormat)
+ : FileFormat(FileFormat), HasError(false) {}
+
+ // TODO: Should handle multiple applied relocations via either passing in the
+ // previously computed value or just count paired relocations as a single
+ // visit.
+ RelocToApply visit(uint32_t RelocType, RelocationRef R, uint64_t SecAddr = 0,
+ uint64_t Value = 0) {
+ if (FileFormat == "ELF64-x86-64") {
+ switch (RelocType) {
+ case llvm::ELF::R_X86_64_NONE:
+ return visitELF_X86_64_NONE(R);
+ case llvm::ELF::R_X86_64_64:
+ return visitELF_X86_64_64(R, Value);
+ case llvm::ELF::R_X86_64_PC32:
+ return visitELF_X86_64_PC32(R, Value, SecAddr);
+ case llvm::ELF::R_X86_64_32:
+ return visitELF_X86_64_32(R, Value);
+ case llvm::ELF::R_X86_64_32S:
+ return visitELF_X86_64_32S(R, Value);
+ default:
+ HasError = true;
+ return RelocToApply();
+ }
+ }
+ return RelocToApply();
+ }
+
+ bool error() { return HasError; }
+
+private:
+ llvm::StringRef FileFormat;
+ bool HasError;
+
+ /// Operations
+
+ // Width is the width in bytes of the extend.
+  RelocToApply zeroExtend(RelocToApply r, char Width) {
+    if (Width == r.Width)
+      return r;
+    // Shift a 64-bit one: a plain int shift overflows for Width >= 4.
+    r.Value &= (uint64_t(1) << (Width * 8)) - 1;
+    return r;
+  }
+  RelocToApply signExtend(RelocToApply r, char Width) {
+    if (Width == r.Width)
+      return r;
+    bool SignBit = r.Value & (uint64_t(1) << (Width * 8 - 1));
+    if (SignBit) {
+      r.Value |= ~((uint64_t(1) << (Width * 8)) - 1);
+    } else {
+      r.Value &= (uint64_t(1) << (Width * 8)) - 1;
+    }
+    return r;
+  }
+
+ /// X86-64 ELF
+ RelocToApply visitELF_X86_64_NONE(RelocationRef R) {
+ return RelocToApply(0, 0);
+ }
+ RelocToApply visitELF_X86_64_64(RelocationRef R, uint64_t Value) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ return RelocToApply(Value + Addend, 8);
+ }
+ RelocToApply visitELF_X86_64_PC32(RelocationRef R, uint64_t Value,
+ uint64_t SecAddr) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ uint64_t Address;
+ R.getAddress(Address);
+ return RelocToApply(Value + Addend - Address, 4);
+ }
+ RelocToApply visitELF_X86_64_32(RelocationRef R, uint64_t Value) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ uint32_t Res = (Value + Addend) & 0xFFFFFFFF;
+ return RelocToApply(Res, 4);
+ }
+ RelocToApply visitELF_X86_64_32S(RelocationRef R, uint64_t Value) {
+ int64_t Addend;
+ R.getAdditionalInfo(Addend);
+ int32_t Res = (Value + Addend) & 0xFFFFFFFF;
+ return RelocToApply(Res, 4);
+ }
+};
+
+}
+}
+#endif
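
A sketch of the intended client pattern: walk a section's relocations, feed each to the visitor, and record the computed value in a side table keyed by offset. Error handling and the symbol-value/section-address inputs to visit() are elided, and the accessor names follow the object::RelocationRef API of this era, so treat this as an assumption-laden usage outline rather than tested code:

```cpp
// Sketch: drive RelocVisitor over one section's relocations and fill a
// RelocAddrMap-style side table. Error codes are ignored for brevity.
#include "llvm/ADT/DenseMap.h"
#include "llvm/Object/RelocVisitor.h"
#include <utility>

using namespace llvm;
using namespace llvm::object;

void collectRelocs(ObjectFile *Obj, section_iterator Sec,
                   DenseMap<uint64_t, std::pair<uint8_t, int64_t> > &Map) {
  RelocVisitor V(Obj->getFileFormatName());
  error_code EC;
  for (relocation_iterator RI = Sec->begin_relocations(),
         RE = Sec->end_relocations(); RI != RE; RI.increment(EC)) {
    uint64_t Type = 0, Offset = 0;
    RI->getType(Type);
    RI->getAddress(Offset);
    RelocToApply R = V.visit((uint32_t)Type, *RI /*, SecAddr, SymValue */);
    if (!V.error())
      Map[Offset] = std::make_pair((uint8_t)R.Width, R.Value);
  }
}
```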
diff --git a/include/llvm/Support/DataExtractor.h b/include/llvm/Support/DataExtractor.h
index 8d880fd5e8..a3ae782040 100644
--- a/include/llvm/Support/DataExtractor.h
+++ b/include/llvm/Support/DataExtractor.h
@@ -10,6 +10,7 @@
#ifndef LLVM_SUPPORT_DATAEXTRACTOR_H
#define LLVM_SUPPORT_DATAEXTRACTOR_H
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataTypes.h"
diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h
index 625be7208a..7ea2396076 100644
--- a/include/llvm/Target/TargetTransformImpl.h
+++ b/include/llvm/Target/TargetTransformImpl.h
@@ -55,13 +55,15 @@ protected:
const TargetLowering *TLI;
/// Estimate the cost of type-legalization and the legalized type.
- std::pair<unsigned, EVT>
- getTypeLegalizationCost(LLVMContext &C, EVT Ty) const;
+ std::pair<unsigned, MVT> getTypeLegalizationCost(Type *Ty) const;
/// Estimate the overhead of scalarizing an instruction. Insert and Extract
/// are set if the result needs to be inserted and/or extracted from vectors.
unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const;
+ // Get the ISD node that corresponds to the Instruction class opcode.
+ int InstructionOpcodeToISD(unsigned Opcode) const;
+
public:
explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {}
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 64d6186d91..22535fe5b4 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -307,47 +307,51 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
assert(SPDie && "Unable to find subprogram DIE!");
DISubprogram SP(SPNode);
- DISubprogram SPDecl = SP.getFunctionDeclaration();
- if (!SPDecl.isSubprogram()) {
- // There is not any need to generate specification DIE for a function
- // defined at compile unit level. If a function is defined inside another
- // function then gdb prefers the definition at top level and but does not
- // expect specification DIE in parent function. So avoid creating
- // specification DIE for a function defined inside a function.
- if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
- !SP.getContext().isFile() &&
- !isSubprogramContext(SP.getContext())) {
- SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
-
- // Add arguments.
- DICompositeType SPTy = SP.getType();
- DIArray Args = SPTy.getTypeArray();
- unsigned SPTag = SPTy.getTag();
- if (SPTag == dwarf::DW_TAG_subroutine_type)
- for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
- DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
- DIType ATy = DIType(Args.getElement(i));
- SPCU->addType(Arg, ATy);
- if (ATy.isArtificial())
- SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
- if (ATy.isObjectPointer())
- SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
- dwarf::DW_FORM_ref4, Arg);
- SPDie->addChild(Arg);
- }
- DIE *SPDeclDie = SPDie;
- SPDie = new DIE(dwarf::DW_TAG_subprogram);
- SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
- SPDeclDie);
- SPCU->addDie(SPDie);
- }
- }
- // Pick up abstract subprogram DIE.
+ // If we're updating an abstract DIE, then we will be adding the children and
+ // object pointer later on. But what we don't want to do is process the
+ // concrete DIE twice.
if (DIE *AbsSPDIE = AbstractSPDies.lookup(SPNode)) {
+ // Pick up abstract subprogram DIE.
SPDie = new DIE(dwarf::DW_TAG_subprogram);
SPCU->addDIEEntry(SPDie, dwarf::DW_AT_abstract_origin,
dwarf::DW_FORM_ref4, AbsSPDIE);
SPCU->addDie(SPDie);
+ } else {
+ DISubprogram SPDecl = SP.getFunctionDeclaration();
+ if (!SPDecl.isSubprogram()) {
+      // There is no need to generate a specification DIE for a function
+      // defined at the compile unit level. If a function is defined inside
+      // another function then gdb prefers the definition at the top level,
+      // but does not expect a specification DIE in the parent function. So
+      // avoid creating a specification DIE for a function defined inside a
+      // function.
+ if (SP.isDefinition() && !SP.getContext().isCompileUnit() &&
+ !SP.getContext().isFile() &&
+ !isSubprogramContext(SP.getContext())) {
+ SPCU->addFlag(SPDie, dwarf::DW_AT_declaration);
+
+ // Add arguments.
+ DICompositeType SPTy = SP.getType();
+ DIArray Args = SPTy.getTypeArray();
+ unsigned SPTag = SPTy.getTag();
+ if (SPTag == dwarf::DW_TAG_subroutine_type)
+ for (unsigned i = 1, N = Args.getNumElements(); i < N; ++i) {
+ DIE *Arg = new DIE(dwarf::DW_TAG_formal_parameter);
+ DIType ATy = DIType(Args.getElement(i));
+ SPCU->addType(Arg, ATy);
+ if (ATy.isArtificial())
+ SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
+ if (ATy.isObjectPointer())
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer,
+ dwarf::DW_FORM_ref4, Arg);
+ SPDie->addChild(Arg);
+ }
+ DIE *SPDeclDie = SPDie;
+ SPDie = new DIE(dwarf::DW_TAG_subprogram);
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_specification, dwarf::DW_FORM_ref4,
+ SPDeclDie);
+ SPCU->addDie(SPDie);
+ }
+ }
}
SPCU->addLabel(SPDie, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr,
@@ -831,7 +835,7 @@ void DwarfDebug::endModule() {
LexicalScope *Scope =
new LexicalScope(NULL, DIDescriptor(SP), NULL, false);
DeadFnScopeMap[SP] = Scope;
-
+
// Construct subprogram DIE and add variables DIEs.
CompileUnit *SPCU = CUMap.lookup(TheCU);
assert(SPCU && "Unable to find Compile Unit!");
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index c55e8b7898..de16932c06 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -49,6 +49,15 @@ static cl::opt<unsigned> MISchedCutoff("misched-cutoff", cl::Hidden,
static bool ViewMISchedDAGs = false;
#endif // NDEBUG
+// Threshold to very roughly model an out-of-order processor's instruction
+// buffers. If the actual value of this threshold matters much in practice, then
+// it can be specified by the machine model. For now, it's an experimental
+// tuning knob to determine when and if it matters.
+static cl::opt<unsigned> ILPWindow("ilp-window", cl::Hidden,
+ cl::desc("Allow expected latency to exceed the critical path by N cycles "
+ "before attempting to balance ILP"),
+ cl::init(10U));
+
//===----------------------------------------------------------------------===//
// Machine Instruction Scheduling Pass and Registry
//===----------------------------------------------------------------------===//
@@ -220,7 +229,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// The Scheduler may insert instructions during either schedule() or
// exitRegion(), even for empty regions. So the local iterators 'I' and
// 'RegionEnd' are invalid across these calls.
- unsigned RemainingCount = MBB->size();
+ unsigned RemainingInstrs = MBB->size();
for(MachineBasicBlock::iterator RegionEnd = MBB->end();
RegionEnd != MBB->begin(); RegionEnd = Scheduler->begin()) {
@@ -229,19 +238,19 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
|| TII->isSchedulingBoundary(llvm::prior(RegionEnd), MBB, *MF)) {
--RegionEnd;
// Count the boundary instruction.
- --RemainingCount;
+ --RemainingInstrs;
}
// The next region starts above the previous region. Look backward in the
// instruction stream until we find the nearest boundary.
MachineBasicBlock::iterator I = RegionEnd;
- for(;I != MBB->begin(); --I, --RemainingCount) {
+ for(;I != MBB->begin(); --I, --RemainingInstrs) {
if (TII->isSchedulingBoundary(llvm::prior(I), MBB, *MF))
break;
}
// Notify the scheduler of the region, even if we may skip scheduling
// it. Perhaps it still needs to be bundled.
- Scheduler->enterRegion(MBB, I, RegionEnd, RemainingCount);
+ Scheduler->enterRegion(MBB, I, RegionEnd, RemainingInstrs);
// Skip empty scheduling regions (0 or 1 schedulable instructions).
if (I == RegionEnd || I == llvm::prior(RegionEnd)) {
@@ -255,7 +264,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
<< ":BB#" << MBB->getNumber() << "\n From: " << *I << " To: ";
if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
else dbgs() << "End";
- dbgs() << " Remaining: " << RemainingCount << "\n");
+ dbgs() << " Remaining: " << RemainingInstrs << "\n");
// Schedule a region: possibly reorder instructions.
// This invalidates 'RegionEnd' and 'I'.
@@ -268,7 +277,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) {
// scheduler for the top of it's scheduled region.
RegionEnd = Scheduler->begin();
}
- assert(RemainingCount == 0 && "Instruction count mismatch!");
+ assert(RemainingInstrs == 0 && "Instruction count mismatch!");
Scheduler->finishBlock();
}
Scheduler->finalizeSchedule();
@@ -487,6 +496,13 @@ void ScheduleDAGMI::schedule() {
assert(CurrentTop == CurrentBottom && "Nonempty unscheduled zone.");
placeDebugValues();
+
+ DEBUG({
+ unsigned BBNum = top()->getParent()->getNumber();
+ dbgs() << "*** Final schedule for BB#" << BBNum << " ***\n";
+ dumpSchedule();
+ dbgs() << '\n';
+ });
}
/// Build the DAG and setup three register pressure trackers.
@@ -627,6 +643,17 @@ void ScheduleDAGMI::placeDebugValues() {
FirstDbgValue = NULL;
}
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void ScheduleDAGMI::dumpSchedule() const {
+ for (MachineBasicBlock::iterator MI = begin(), ME = end(); MI != ME; ++MI) {
+ if (SUnit *SU = getSUnit(&(*MI)))
+ SU->dump(this);
+ else
+ dbgs() << "Missing SUnit\n";
+ }
+}
+#endif
+
//===----------------------------------------------------------------------===//
// ConvergingScheduler - Implementation of the standard MachineSchedStrategy.
//===----------------------------------------------------------------------===//
@@ -635,33 +662,127 @@ namespace {
/// ConvergingScheduler shrinks the unscheduled zone using heuristics to balance
/// the schedule.
class ConvergingScheduler : public MachineSchedStrategy {
+public:
+ /// Represent the type of SchedCandidate found within a single queue.
+ /// pickNodeBidirectional depends on these listed by decreasing priority.
+ enum CandReason {
+ NoCand, SingleExcess, SingleCritical, ResourceReduce, ResourceDemand,
+ BotHeightReduce, BotPathReduce, TopDepthReduce, TopPathReduce,
+ SingleMax, MultiPressure, NextDefUse, NodeOrder};
+
+#ifndef NDEBUG
+ static const char *getReasonStr(ConvergingScheduler::CandReason Reason);
+#endif
+
+ /// Policy for scheduling the next instruction in the candidate's zone.
+ struct CandPolicy {
+ bool ReduceLatency;
+ unsigned ReduceResIdx;
+ unsigned DemandResIdx;
+
+ CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
+ };
+
+ /// Status of an instruction's critical resource consumption.
+ struct SchedResourceDelta {
+ // Count critical resources in the scheduled region required by SU.
+ unsigned CritResources;
+
+ // Count critical resources from another region consumed by SU.
+ unsigned DemandedResources;
+
+ SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
+
+ bool operator==(const SchedResourceDelta &RHS) const {
+ return CritResources == RHS.CritResources
+ && DemandedResources == RHS.DemandedResources;
+ }
+ bool operator!=(const SchedResourceDelta &RHS) const {
+ return !operator==(RHS);
+ }
+ };
/// Store the state used by ConvergingScheduler heuristics, required for the
/// lifetime of one invocation of pickNode().
struct SchedCandidate {
+ CandPolicy Policy;
+
// The best SUnit candidate.
SUnit *SU;
+ // The reason for this candidate.
+ CandReason Reason;
+
// Register pressure values for the best candidate.
RegPressureDelta RPDelta;
- SchedCandidate(): SU(NULL) {}
+ // Critical resource consumption of the best candidate.
+ SchedResourceDelta ResDelta;
+
+ SchedCandidate(const CandPolicy &policy)
+ : Policy(policy), SU(NULL), Reason(NoCand) {}
+
+ bool isValid() const { return SU; }
+
+ // Copy the status of another candidate without changing policy.
+ void setBest(SchedCandidate &Best) {
+ assert(Best.Reason != NoCand && "uninitialized Sched candidate");
+ SU = Best.SU;
+ Reason = Best.Reason;
+ RPDelta = Best.RPDelta;
+ ResDelta = Best.ResDelta;
+ }
+
+ void initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel);
+ };
+
+ /// Summarize the unscheduled region.
+ struct SchedRemainder {
+ // Critical path through the DAG in expected latency.
+ unsigned CriticalPath;
+
+ // Unscheduled resources
+ SmallVector<unsigned, 16> RemainingCounts;
+ // Critical resource for the unscheduled zone.
+ unsigned CritResIdx;
+ // Number of micro-ops left to schedule.
+ unsigned RemainingMicroOps;
+ // Is the unscheduled zone resource limited.
+ bool IsResourceLimited;
+
+ unsigned MaxRemainingCount;
+
+ void reset() {
+ CriticalPath = 0;
+ RemainingCounts.clear();
+ CritResIdx = 0;
+ RemainingMicroOps = 0;
+ IsResourceLimited = false;
+ MaxRemainingCount = 0;
+ }
+
+ SchedRemainder() { reset(); }
+
+ void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
};
- /// Represent the type of SchedCandidate found within a single queue.
- enum CandResult {
- NoCand, NodeOrder, SingleExcess, SingleCritical, SingleMax, MultiPressure };
/// Each Scheduling boundary is associated with ready queues. It tracks the
- /// current cycle in whichever direction at has moved, and maintains the state
+ /// current cycle in the direction of movement, and maintains the state
/// of "hazards" and other interlocks at the current cycle.
struct SchedBoundary {
ScheduleDAGMI *DAG;
const TargetSchedModel *SchedModel;
+ SchedRemainder *Rem;
ReadyQueue Available;
ReadyQueue Pending;
bool CheckPending;
+ // For heuristics, keep a list of the nodes that immediately depend on the
+ // most recently scheduled node.
+ SmallPtrSet<const SUnit*, 8> NextSUs;
+
ScheduleHazardRecognizer *HazardRec;
unsigned CurrCycle;
@@ -670,34 +791,88 @@ class ConvergingScheduler : public MachineSchedStrategy {
/// MinReadyCycle - Cycle of the soonest available instruction.
unsigned MinReadyCycle;
+ // The expected latency of the critical path in this scheduled zone.
+ unsigned ExpectedLatency;
+
+ // Resources used in the scheduled zone beyond this boundary.
+ SmallVector<unsigned, 16> ResourceCounts;
+
+ // Cache the critical resources ID in this scheduled zone.
+ unsigned CritResIdx;
+
+ // Is the scheduled region resource limited vs. latency limited.
+ bool IsResourceLimited;
+
+ unsigned ExpectedCount;
+
+ // Policy flag: attempt to find ILP until expected latency is covered.
+ bool ShouldIncreaseILP;
+
+#ifndef NDEBUG
// Remember the greatest min operand latency.
unsigned MaxMinLatency;
+#endif
+
+ void reset() {
+ Available.clear();
+ Pending.clear();
+ CheckPending = false;
+ NextSUs.clear();
+ HazardRec = 0;
+ CurrCycle = 0;
+ IssueCount = 0;
+ MinReadyCycle = UINT_MAX;
+ ExpectedLatency = 0;
+ ResourceCounts.resize(1);
+ assert(!ResourceCounts[0] && "nonzero count for bad resource");
+ CritResIdx = 0;
+ IsResourceLimited = false;
+ ExpectedCount = 0;
+ ShouldIncreaseILP = false;
+#ifndef NDEBUG
+ MaxMinLatency = 0;
+#endif
+ // Reserve a zero-count for invalid CritResIdx.
+ ResourceCounts.resize(1);
+ }
/// Pending queues extend the ready queues with the same ID and the
/// PendingFlag set.
SchedBoundary(unsigned ID, const Twine &Name):
- DAG(0), SchedModel(0), Available(ID, Name+".A"),
- Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P"),
- CheckPending(false), HazardRec(0), CurrCycle(0), IssueCount(0),
- MinReadyCycle(UINT_MAX), MaxMinLatency(0) {}
+ DAG(0), SchedModel(0), Rem(0), Available(ID, Name+".A"),
+ Pending(ID << ConvergingScheduler::LogMaxQID, Name+".P") {
+ reset();
+ }
~SchedBoundary() { delete HazardRec; }
- void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel) {
- DAG = dag;
- SchedModel = smodel;
- }
+ void init(ScheduleDAGMI *dag, const TargetSchedModel *smodel,
+ SchedRemainder *rem);
bool isTop() const {
return Available.getID() == ConvergingScheduler::TopQID;
}
+ unsigned getUnscheduledLatency(SUnit *SU) const {
+ if (isTop())
+ return SU->getHeight();
+ return SU->getDepth();
+ }
+
+ unsigned getCriticalCount() const {
+ return ResourceCounts[CritResIdx];
+ }
+
bool checkHazard(SUnit *SU);
+ void checkILPPolicy();
+
void releaseNode(SUnit *SU, unsigned ReadyCycle);
void bumpCycle();
+ void countResource(unsigned PIdx, unsigned Cycles);
+
void bumpNode(SUnit *SU);
void releasePending();
@@ -707,11 +882,13 @@ class ConvergingScheduler : public MachineSchedStrategy {
SUnit *pickOnlyChoice();
};
+private:
ScheduleDAGMI *DAG;
const TargetSchedModel *SchedModel;
const TargetRegisterInfo *TRI;
// State of the top and bottom scheduled instruction boundaries.
+ SchedRemainder Rem;
SchedBoundary Top;
SchedBoundary Bot;
@@ -736,25 +913,75 @@ public:
virtual void releaseBottomNode(SUnit *SU);
+ virtual void registerRoots();
+
protected:
- SUnit *pickNodeBidrectional(bool &IsTopNode);
+ void balanceZones(
+ ConvergingScheduler::SchedBoundary &CriticalZone,
+ ConvergingScheduler::SchedCandidate &CriticalCand,
+ ConvergingScheduler::SchedBoundary &OppositeZone,
+ ConvergingScheduler::SchedCandidate &OppositeCand);
+
+ void checkResourceLimits(ConvergingScheduler::SchedCandidate &TopCand,
+ ConvergingScheduler::SchedCandidate &BotCand);
+
+ void tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker);
+
+ SUnit *pickNodeBidirectional(bool &IsTopNode);
+
+ void pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Candidate);
- CandResult pickNodeFromQueue(ReadyQueue &Q,
- const RegPressureTracker &RPTracker,
- SchedCandidate &Candidate);
#ifndef NDEBUG
- void traceCandidate(const char *Label, const ReadyQueue &Q, SUnit *SU,
- PressureElement P = PressureElement());
+ void traceCandidate(const SchedCandidate &Cand, const SchedBoundary &Zone);
#endif
};
} // namespace
+void ConvergingScheduler::SchedRemainder::
+init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel) {
+ reset();
+ if (!SchedModel->hasInstrSchedModel())
+ return;
+ RemainingCounts.resize(SchedModel->getNumProcResourceKinds());
+ for (std::vector<SUnit>::iterator
+ I = DAG->SUnits.begin(), E = DAG->SUnits.end(); I != E; ++I) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(&*I);
+ RemainingMicroOps += SchedModel->getNumMicroOps(I->getInstr(), SC);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ unsigned PIdx = PI->ProcResourceIdx;
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ RemainingCounts[PIdx] += (Factor * PI->Cycles);
+ }
+ }
+}
+
+void ConvergingScheduler::SchedBoundary::
+init(ScheduleDAGMI *dag, const TargetSchedModel *smodel, SchedRemainder *rem) {
+ reset();
+ DAG = dag;
+ SchedModel = smodel;
+ Rem = rem;
+ if (SchedModel->hasInstrSchedModel())
+ ResourceCounts.resize(SchedModel->getNumProcResourceKinds());
+}
+
void ConvergingScheduler::initialize(ScheduleDAGMI *dag) {
DAG = dag;
SchedModel = DAG->getSchedModel();
TRI = DAG->TRI;
- Top.init(DAG, SchedModel);
- Bot.init(DAG, SchedModel);
+ Rem.init(DAG, SchedModel);
+ Top.init(DAG, SchedModel, &Rem);
+ Bot.init(DAG, SchedModel, &Rem);
+
+ // Initialize resource counts.
// Initialize the HazardRecognizers. If itineraries don't exist, are empty, or
// are disabled, then these HazardRecs will be disabled.
@@ -803,6 +1030,17 @@ void ConvergingScheduler::releaseBottomNode(SUnit *SU) {
Bot.releaseNode(SU, SU->BotReadyCycle);
}
+void ConvergingScheduler::registerRoots() {
+ Rem.CriticalPath = DAG->ExitSU.getDepth();
+ // Some roots may not feed into ExitSU. Check all of them in case.
+ for (std::vector<SUnit*>::const_iterator
+ I = Bot.Available.begin(), E = Bot.Available.end(); I != E; ++I) {
+ if ((*I)->getDepth() > Rem.CriticalPath)
+ Rem.CriticalPath = (*I)->getDepth();
+ }
+ DEBUG(dbgs() << "Critical Path: " << Rem.CriticalPath << '\n');
+}
+
/// Does this SU have a hazard within the current instruction group.
///
/// The scheduler supports two modes of hazard recognition. The first is the
@@ -821,14 +1059,26 @@ bool ConvergingScheduler::SchedBoundary::checkHazard(SUnit *SU) {
return HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard;
unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
- if (IssueCount + uops > SchedModel->getIssueWidth())
+ if ((IssueCount > 0) && (IssueCount + uops > SchedModel->getIssueWidth())) {
+ DEBUG(dbgs() << " SU(" << SU->NodeNum << ") uops="
+ << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
return true;
-
+ }
return false;
}
+/// If expected latency is covered, disable ILP policy.
+void ConvergingScheduler::SchedBoundary::checkILPPolicy() {
+ if (ShouldIncreaseILP
+ && (IsResourceLimited || ExpectedLatency <= CurrCycle)) {
+ ShouldIncreaseILP = false;
+ DEBUG(dbgs() << "Disable ILP: " << Available.getName() << '\n');
+ }
+}
+
void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
unsigned ReadyCycle) {
+
if (ReadyCycle < MinReadyCycle)
MinReadyCycle = ReadyCycle;
@@ -838,6 +1088,18 @@ void ConvergingScheduler::SchedBoundary::releaseNode(SUnit *SU,
Pending.push(SU);
else
Available.push(SU);
+
+ // Record this node as an immediate dependent of the scheduled node.
+ NextSUs.insert(SU);
+
+ // If CriticalPath has been computed, then check if the unscheduled nodes
+ // exceed the ILP window. Before registerRoots, CriticalPath==0.
+ if (Rem->CriticalPath && (ExpectedLatency + getUnscheduledLatency(SU)
+ > Rem->CriticalPath + ILPWindow)) {
+ ShouldIncreaseILP = true;
+ DEBUG(dbgs() << "Increase ILP: " << Available.getName() << " "
+ << ExpectedLatency << " + " << getUnscheduledLatency(SU) << '\n');
+ }
}
/// Move the boundary of scheduled code by one cycle.
@@ -845,8 +1107,12 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() {
unsigned Width = SchedModel->getIssueWidth();
IssueCount = (IssueCount <= Width) ? 0 : IssueCount - Width;
+ unsigned NextCycle = CurrCycle + 1;
assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
- unsigned NextCycle = std::max(CurrCycle + 1, MinReadyCycle);
+ if (MinReadyCycle > NextCycle) {
+ IssueCount = 0;
+ NextCycle = MinReadyCycle;
+ }
if (!HazardRec->isEnabled()) {
// Bypass HazardRec virtual calls.
@@ -862,11 +1128,39 @@ void ConvergingScheduler::SchedBoundary::bumpCycle() {
}
}
CheckPending = true;
+ IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
- DEBUG(dbgs() << "*** " << Available.getName() << " cycle "
+ DEBUG(dbgs() << " *** " << Available.getName() << " cycle "
<< CurrCycle << '\n');
}
+/// Add the given processor resource to this scheduled zone.
+void ConvergingScheduler::SchedBoundary::countResource(unsigned PIdx,
+ unsigned Cycles) {
+ unsigned Factor = SchedModel->getResourceFactor(PIdx);
+ DEBUG(dbgs() << " " << SchedModel->getProcResource(PIdx)->Name
+ << " +(" << Cycles << "x" << Factor
+ << ") / " << SchedModel->getLatencyFactor() << '\n');
+
+ unsigned Count = Factor * Cycles;
+ ResourceCounts[PIdx] += Count;
+ assert(Rem->RemainingCounts[PIdx] >= Count && "resource double counted");
+ Rem->RemainingCounts[PIdx] -= Count;
+
+ // Reset MaxRemainingCount for sanity.
+ Rem->MaxRemainingCount = 0;
+
+ // Check if this resource exceeds the current critical resource by a full
+ // cycle. If so, it becomes the critical resource.
+ if ((int)(ResourceCounts[PIdx] - ResourceCounts[CritResIdx])
+ >= (int)SchedModel->getLatencyFactor()) {
+ CritResIdx = PIdx;
+ DEBUG(dbgs() << " *** Critical resource "
+ << SchedModel->getProcResource(PIdx)->Name << " x"
+ << ResourceCounts[PIdx] << '\n');
+ }
+}
+
/// Move the boundary of scheduled code by one SUnit.
void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
// Update the reservation table.
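
To make countResource()'s normalized arithmetic concrete, a worked example under assumed factors, consistent with the TargetSchedule.h sketch earlier (LatencyFactor 4, ALU factor 2); the numbers are illustrative only:

```cpp
// Worked example of countResource()'s normalized arithmetic.
unsigned exampleNormalizedCount() {
  unsigned Factor = 2;               // getResourceFactor(ALU)
  unsigned Cycles = 3;               // instruction occupies the ALU 3 cycles
  unsigned Count = Factor * Cycles;  // 6 normalized resource units
  // The ALU becomes the critical resource once its running count leads the
  // current critical resource by >= getLatencyFactor() (4) units, i.e. by
  // a full cycle's worth of normalized work.
  return Count;
}
```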
@@ -878,11 +1172,38 @@ void ConvergingScheduler::SchedBoundary::bumpNode(SUnit *SU) {
}
HazardRec->EmitInstruction(SU);
}
+ // Update resource counts and critical resource.
+ if (SchedModel->hasInstrSchedModel()) {
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ Rem->RemainingMicroOps -= SchedModel->getNumMicroOps(SU->getInstr(), SC);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ countResource(PI->ProcResourceIdx, PI->Cycles);
+ }
+ }
+ if (isTop()) {
+ if (SU->getDepth() > ExpectedLatency)
+ ExpectedLatency = SU->getDepth();
+ }
+ else {
+ if (SU->getHeight() > ExpectedLatency)
+ ExpectedLatency = SU->getHeight();
+ }
+
+ IsResourceLimited = getCriticalCount() > std::max(ExpectedLatency, CurrCycle);
+
// Check the instruction group dispatch limit.
// TODO: Check if this SU must end a dispatch group.
IssueCount += SchedModel->getNumMicroOps(SU->getInstr());
+
+ // checkHazard prevents scheduling multiple instructions per cycle that exceed
+ // issue width. However, we commonly reach the maximum. In this case
+ // opportunistically bump the cycle to avoid uselessly checking everything in
+ // the readyQ. Furthermore, a single instruction may produce more than one
+ // cycle's worth of micro-ops.
if (IssueCount >= SchedModel->getIssueWidth()) {
- DEBUG(dbgs() << "*** Max instrs at cycle " << CurrCycle << '\n');
+ DEBUG(dbgs() << " *** Max instrs at cycle " << CurrCycle << '\n');
bumpCycle();
}
}
@@ -913,6 +1234,7 @@ void ConvergingScheduler::SchedBoundary::releasePending() {
Pending.remove(Pending.begin()+i);
--i; --e;
}
+ DEBUG(if (!Pending.empty()) Pending.dump());
CheckPending = false;
}
@@ -927,12 +1249,23 @@ void ConvergingScheduler::SchedBoundary::removeReady(SUnit *SU) {
}
/// If this queue only has one ready candidate, return it. As a side effect,
-/// advance the cycle until at least one node is ready. If multiple instructions
-/// are ready, return NULL.
+/// defer any nodes that now hit a hazard, and advance the cycle until at least
+/// one node is ready. If multiple instructions are ready, return NULL.
SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
+ if (IssueCount > 0) {
+ // Defer any ready instrs that now have a hazard.
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+ if (checkHazard(*I)) {
+ Pending.push(*I);
+ I = Available.remove(I);
+ continue;
+ }
+ ++I;
+ }
+ }
for (unsigned i = 0; Available.empty(); ++i) {
assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
"permanent hazard"); (void)i;
@@ -944,18 +1277,262 @@ SUnit *ConvergingScheduler::SchedBoundary::pickOnlyChoice() {
return NULL;
}
-#ifndef NDEBUG
-void ConvergingScheduler::traceCandidate(const char *Label, const ReadyQueue &Q,
- SUnit *SU, PressureElement P) {
- dbgs() << Label << " " << Q.getName() << " ";
- if (P.isValid())
- dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
- << " ";
- else
- dbgs() << " ";
- SU->dump(DAG);
+/// Record the candidate policy for opposite zones with different critical
+/// resources.
+///
+/// If the CriticalZone is latency limited, don't force a policy for the
+/// candidates here. Instead, when releasing each candidate, releaseNode
+/// compares the region's critical path to the candidate's height or depth
+/// and the scheduled zone's expected latency, and then sets ShouldIncreaseILP.
+void ConvergingScheduler::balanceZones(
+ ConvergingScheduler::SchedBoundary &CriticalZone,
+ ConvergingScheduler::SchedCandidate &CriticalCand,
+ ConvergingScheduler::SchedBoundary &OppositeZone,
+ ConvergingScheduler::SchedCandidate &OppositeCand) {
+
+ if (!CriticalZone.IsResourceLimited)
+ return;
+
+ SchedRemainder *Rem = CriticalZone.Rem;
+
+ // If the critical zone is overconsuming a resource relative to the
+ // remainder, try to reduce it.
+ unsigned RemainingCritCount =
+ Rem->RemainingCounts[CriticalZone.CritResIdx];
+ if ((int)(Rem->MaxRemainingCount - RemainingCritCount)
+ > (int)SchedModel->getLatencyFactor()) {
+ CriticalCand.Policy.ReduceResIdx = CriticalZone.CritResIdx;
+ DEBUG(dbgs() << "Balance " << CriticalZone.Available.getName() << " reduce "
+ << SchedModel->getProcResource(CriticalZone.CritResIdx)->Name
+ << '\n');
+ }
+ // If the other zone is underconsuming a resource relative to the full zone,
+ // try to increase it.
+ unsigned OppositeCount =
+ OppositeZone.ResourceCounts[CriticalZone.CritResIdx];
+ if ((int)(OppositeZone.ExpectedCount - OppositeCount)
+ > (int)SchedModel->getLatencyFactor()) {
+ OppositeCand.Policy.DemandResIdx = CriticalZone.CritResIdx;
+ DEBUG(dbgs() << "Balance " << OppositeZone.Available.getName() << " demand "
+ << SchedModel->getProcResource(OppositeZone.CritResIdx)->Name
+ << '\n');
+ }
+}
+
+/// Determine if the scheduled zones exceed resource limits or critical path and
+/// set each candidate's ReduceLatency policy accordingly.
+void ConvergingScheduler::checkResourceLimits(
+ ConvergingScheduler::SchedCandidate &TopCand,
+ ConvergingScheduler::SchedCandidate &BotCand) {
+
+ Bot.checkILPPolicy();
+ Top.checkILPPolicy();
+ if (Bot.ShouldIncreaseILP)
+ BotCand.Policy.ReduceLatency = true;
+ if (Top.ShouldIncreaseILP)
+ TopCand.Policy.ReduceLatency = true;
+
+ // Handle resource-limited regions.
+ if (Top.IsResourceLimited && Bot.IsResourceLimited
+ && Top.CritResIdx == Bot.CritResIdx) {
+ // If the scheduled critical resource in both zones is no longer the
+ // critical remaining resource, attempt to reduce resource height both ways.
+ if (Top.CritResIdx != Rem.CritResIdx) {
+ TopCand.Policy.ReduceResIdx = Top.CritResIdx;
+ BotCand.Policy.ReduceResIdx = Bot.CritResIdx;
+ DEBUG(dbgs() << "Reduce scheduled "
+ << SchedModel->getProcResource(Top.CritResIdx)->Name << '\n');
+ }
+ return;
+ }
+ // Handle latency-limited regions.
+ if (!Top.IsResourceLimited && !Bot.IsResourceLimited) {
+ // If the total scheduled expected latency exceeds the region's critical
+ // path then reduce latency both ways.
+ //
+    // Just because a zone is not resource limited does not mean it is latency
+    // limited. Unbuffered resources, such as max micro-ops, may cause
+    // CurrCycle to exceed expected latency.
+ if ((Top.ExpectedLatency + Bot.ExpectedLatency >= Rem.CriticalPath)
+ && (Rem.CriticalPath > Top.CurrCycle + Bot.CurrCycle)) {
+ TopCand.Policy.ReduceLatency = true;
+ BotCand.Policy.ReduceLatency = true;
+ DEBUG(dbgs() << "Reduce scheduled latency " << Top.ExpectedLatency
+ << " + " << Bot.ExpectedLatency << '\n');
+ }
+ return;
+ }
+ // The critical resource is different in each zone, so request balancing.
+
+ // Compute the cost of each zone.
+ Rem.MaxRemainingCount = std::max(
+ Rem.RemainingMicroOps * SchedModel->getMicroOpFactor(),
+ Rem.RemainingCounts[Rem.CritResIdx]);
+ Top.ExpectedCount = std::max(Top.ExpectedLatency, Top.CurrCycle);
+ Top.ExpectedCount = std::max(
+ Top.getCriticalCount(),
+ Top.ExpectedCount * SchedModel->getLatencyFactor());
+ Bot.ExpectedCount = std::max(Bot.ExpectedLatency, Bot.CurrCycle);
+ Bot.ExpectedCount = std::max(
+ Bot.getCriticalCount(),
+ Bot.ExpectedCount * SchedModel->getLatencyFactor());
+
+ balanceZones(Top, TopCand, Bot, BotCand);
+ balanceZones(Bot, BotCand, Top, TopCand);
+}
+
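
A note on units for the computation above: getCriticalCount() returns scaled resource units (cycles multiplied by a per-resource factor), so raw cycle counts are multiplied by the latency factor before the two are compared. A standalone numeric sketch of the ExpectedCount steps, using invented values (LatencyFactor = 3 on an imaginary machine):

    #include <algorithm>
    #include <cassert>

    int main() {
      // Hypothetical numbers only; LatencyFactor = 3.
      unsigned ExpectedLatency = 10; // cycles
      unsigned CurrCycle = 8;        // cycles
      unsigned CritCount = 27;       // scaled units, as getCriticalCount() returns
      // Mirror of the two std::max steps in checkResourceLimits above.
      unsigned ExpectedCount = std::max(ExpectedLatency, CurrCycle); // 10 cycles
      ExpectedCount = std::max(CritCount, ExpectedCount * 3);        // 30 units
      assert(ExpectedCount == 30);
      return 0;
    }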
+void ConvergingScheduler::SchedCandidate::
+initResourceDelta(const ScheduleDAGMI *DAG,
+ const TargetSchedModel *SchedModel) {
+ if (!Policy.ReduceResIdx && !Policy.DemandResIdx)
+ return;
+
+ const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
+ for (TargetSchedModel::ProcResIter
+ PI = SchedModel->getWriteProcResBegin(SC),
+ PE = SchedModel->getWriteProcResEnd(SC); PI != PE; ++PI) {
+ if (PI->ProcResourceIdx == Policy.ReduceResIdx)
+ ResDelta.CritResources += PI->Cycles;
+ if (PI->ProcResourceIdx == Policy.DemandResIdx)
+ ResDelta.DemandedResources += PI->Cycles;
+ }
+}
+
+/// Return true if this heuristic determines order.
+static bool tryLess(unsigned TryVal, unsigned CandVal,
+ ConvergingScheduler::SchedCandidate &TryCand,
+ ConvergingScheduler::SchedCandidate &Cand,
+ ConvergingScheduler::CandReason Reason) {
+ if (TryVal < CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal > CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
+static bool tryGreater(unsigned TryVal, unsigned CandVal,
+ ConvergingScheduler::SchedCandidate &TryCand,
+ ConvergingScheduler::SchedCandidate &Cand,
+ ConvergingScheduler::CandReason Reason) {
+ if (TryVal > CandVal) {
+ TryCand.Reason = Reason;
+ return true;
+ }
+ if (TryVal < CandVal) {
+ if (Cand.Reason > Reason)
+ Cand.Reason = Reason;
+ return true;
+ }
+ return false;
+}
+
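
tryLess and tryGreater encode a three-way protocol: a win records the deciding reason on the new candidate, a loss caps the incumbent's reason at this (higher-priority) one so weaker heuristics below cannot override the decision, and a tie returns false so the next heuristic runs. A self-contained sketch of the same contract with a stripped-down candidate type (the real SchedCandidate carries policy, pressure and resource state):

    #include <cassert>

    // Reasons in decreasing priority, mirroring the CandReason ordering.
    enum Reason { NoCand, RegExcess, NodeOrder };

    struct Cand { Reason R; };

    // Same contract as tryLess above; tryGreater just flips the comparisons.
    static bool tryLess(unsigned TryVal, unsigned CandVal,
                        Cand &TryCand, Cand &Best, Reason Why) {
      if (TryVal < CandVal) { TryCand.R = Why; return true; } // TryCand wins
      if (TryVal > CandVal) {                                 // incumbent wins
        if (Best.R > Why) Best.R = Why; // cap at the deciding priority
        return true;
      }
      return false;                     // tie: next heuristic decides
    }

    int main() {
      Cand Best = { NodeOrder }, TryCand = { NoCand };
      assert(tryLess(1, 2, TryCand, Best, RegExcess) && TryCand.R == RegExcess);
      assert(tryLess(3, 2, TryCand, Best, RegExcess) && Best.R == RegExcess);
      assert(!tryLess(2, 2, TryCand, Best, RegExcess));
      return 0;
    }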
+/// Apply a set of heuristics to a new candidate. Heuristics are currently
+/// hierarchical. This may be more efficient than a graduated cost model because
+/// we don't need to evaluate all aspects of the model for each node in the
+/// queue. But it's really done to make the heuristics easier to debug and
+/// statistically analyze.
+///
+/// \param Cand provides the policy and current best candidate.
+/// \param TryCand refers to the next SUnit candidate, otherwise uninitialized.
+/// \param Zone describes the scheduled zone that we are extending.
+/// \param RPTracker describes reg pressure within the scheduled zone.
+/// \param TempTracker is a scratch pressure tracker to reuse in queries.
+void ConvergingScheduler::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand,
+ SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ RegPressureTracker &TempTracker) {
+
+ // Always initialize TryCand's RPDelta.
+ TempTracker.getMaxPressureDelta(TryCand.SU->getInstr(), TryCand.RPDelta,
+ DAG->getRegionCriticalPSets(),
+ DAG->getRegPressure().MaxSetPressure);
+
+ // Initialize the candidate if needed.
+ if (!Cand.isValid()) {
+ TryCand.Reason = NodeOrder;
+ return;
+ }
+ // Avoid exceeding the target's limit.
+ if (tryLess(TryCand.RPDelta.Excess.UnitIncrease,
+ Cand.RPDelta.Excess.UnitIncrease, TryCand, Cand, SingleExcess))
+ return;
+ if (Cand.Reason == SingleExcess)
+ Cand.Reason = MultiPressure;
+
+ // Avoid increasing the max critical pressure in the scheduled region.
+ if (tryLess(TryCand.RPDelta.CriticalMax.UnitIncrease,
+ Cand.RPDelta.CriticalMax.UnitIncrease,
+ TryCand, Cand, SingleCritical))
+ return;
+ if (Cand.Reason == SingleCritical)
+ Cand.Reason = MultiPressure;
+
+ // Avoid critical resource consumption and balance the schedule.
+ TryCand.initResourceDelta(DAG, SchedModel);
+ if (tryLess(TryCand.ResDelta.CritResources, Cand.ResDelta.CritResources,
+ TryCand, Cand, ResourceReduce))
+ return;
+ if (tryGreater(TryCand.ResDelta.DemandedResources,
+ Cand.ResDelta.DemandedResources,
+ TryCand, Cand, ResourceDemand))
+ return;
+
+ // Avoid serializing long latency dependence chains.
+ if (Cand.Policy.ReduceLatency) {
+ if (Zone.isTop()) {
+ if (Cand.SU->getDepth() * SchedModel->getLatencyFactor()
+ > Zone.ExpectedCount) {
+ if (tryLess(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, TopDepthReduce))
+ return;
+ }
+ if (tryGreater(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, TopPathReduce))
+ return;
+ }
+ else {
+ if (Cand.SU->getHeight() * SchedModel->getLatencyFactor()
+ > Zone.ExpectedCount) {
+ if (tryLess(TryCand.SU->getHeight(), Cand.SU->getHeight(),
+ TryCand, Cand, BotHeightReduce))
+ return;
+ }
+ if (tryGreater(TryCand.SU->getDepth(), Cand.SU->getDepth(),
+ TryCand, Cand, BotPathReduce))
+ return;
+ }
+ }
+
+ // Avoid increasing the max pressure of the entire region.
+ if (tryLess(TryCand.RPDelta.CurrentMax.UnitIncrease,
+ Cand.RPDelta.CurrentMax.UnitIncrease, TryCand, Cand, SingleMax))
+ return;
+ if (Cand.Reason == SingleMax)
+ Cand.Reason = MultiPressure;
+
+ // Prefer immediate defs/users of the last scheduled instruction. This is a
+ // nice pressure avoidance strategy that also conserves the processor's
+ // register renaming resources and keeps the machine code readable.
+ if (Zone.NextSUs.count(TryCand.SU) && !Zone.NextSUs.count(Cand.SU)) {
+ TryCand.Reason = NextDefUse;
+ return;
+ }
+ if (!Zone.NextSUs.count(TryCand.SU) && Zone.NextSUs.count(Cand.SU)) {
+ if (Cand.Reason > NextDefUse)
+ Cand.Reason = NextDefUse;
+ return;
+ }
+ // Fall through to original instruction order.
+ if ((Zone.isTop() && TryCand.SU->NodeNum < Cand.SU->NodeNum)
+ || (!Zone.isTop() && TryCand.SU->NodeNum > Cand.SU->NodeNum)) {
+ TryCand.Reason = NodeOrder;
+ }
}
-#endif
/// pickNodeFromQueue helper that returns true if the LHS reg pressure effect is
/// more desirable than RHS from scheduling standpoint.
@@ -966,109 +1543,143 @@ static bool compareRPDelta(const RegPressureDelta &LHS,
// have UnitIncrease==0, so are neutral.
// Avoid increasing the max critical pressure in the scheduled region.
- if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease)
+ if (LHS.Excess.UnitIncrease != RHS.Excess.UnitIncrease) {
+ DEBUG(dbgs() << "RP excess top - bot: "
+ << (LHS.Excess.UnitIncrease - RHS.Excess.UnitIncrease) << '\n');
return LHS.Excess.UnitIncrease < RHS.Excess.UnitIncrease;
-
+ }
// Avoid increasing the max critical pressure in the scheduled region.
- if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease)
+ if (LHS.CriticalMax.UnitIncrease != RHS.CriticalMax.UnitIncrease) {
+ DEBUG(dbgs() << "RP critical top - bot: "
+ << (LHS.CriticalMax.UnitIncrease - RHS.CriticalMax.UnitIncrease)
+ << '\n');
return LHS.CriticalMax.UnitIncrease < RHS.CriticalMax.UnitIncrease;
-
+ }
// Avoid increasing the max pressure of the entire region.
- if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease)
+ if (LHS.CurrentMax.UnitIncrease != RHS.CurrentMax.UnitIncrease) {
+ DEBUG(dbgs() << "RP current top - bot: "
+ << (LHS.CurrentMax.UnitIncrease - RHS.CurrentMax.UnitIncrease)
+ << '\n');
return LHS.CurrentMax.UnitIncrease < RHS.CurrentMax.UnitIncrease;
-
+ }
return false;
}
+#ifndef NDEBUG
+const char *ConvergingScheduler::getReasonStr(
+ ConvergingScheduler::CandReason Reason) {
+ switch (Reason) {
+ case NoCand: return "NOCAND ";
+ case SingleExcess: return "REG-EXCESS";
+ case SingleCritical: return "REG-CRIT ";
+ case SingleMax: return "REG-MAX ";
+ case MultiPressure: return "REG-MULTI ";
+ case ResourceReduce: return "RES-REDUCE";
+ case ResourceDemand: return "RES-DEMAND";
+ case TopDepthReduce: return "TOP-DEPTH ";
+ case TopPathReduce: return "TOP-PATH ";
+ case BotHeightReduce:return "BOT-HEIGHT";
+ case BotPathReduce: return "BOT-PATH ";
+ case NextDefUse: return "DEF-USE ";
+ case NodeOrder: return "ORDER ";
+  }
+  llvm_unreachable("Unknown reason!");
+}
+
+void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand,
+ const SchedBoundary &Zone) {
+ const char *Label = getReasonStr(Cand.Reason);
+ PressureElement P;
+ unsigned ResIdx = 0;
+ unsigned Latency = 0;
+ switch (Cand.Reason) {
+ default:
+ break;
+ case SingleExcess:
+ P = Cand.RPDelta.Excess;
+ break;
+ case SingleCritical:
+ P = Cand.RPDelta.CriticalMax;
+ break;
+ case SingleMax:
+ P = Cand.RPDelta.CurrentMax;
+ break;
+ case ResourceReduce:
+ ResIdx = Cand.Policy.ReduceResIdx;
+ break;
+ case ResourceDemand:
+ ResIdx = Cand.Policy.DemandResIdx;
+ break;
+ case TopDepthReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ case TopPathReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotHeightReduce:
+ Latency = Cand.SU->getHeight();
+ break;
+ case BotPathReduce:
+ Latency = Cand.SU->getDepth();
+ break;
+ }
+ dbgs() << Label << " " << Zone.Available.getName() << " ";
+ if (P.isValid())
+ dbgs() << TRI->getRegPressureSetName(P.PSetID) << ":" << P.UnitIncrease
+ << " ";
+ else
+ dbgs() << " ";
+ if (ResIdx)
+ dbgs() << SchedModel->getProcResource(ResIdx)->Name << " ";
+ else
+ dbgs() << " ";
+ if (Latency)
+ dbgs() << Latency << " cycles ";
+ else
+ dbgs() << " ";
+ Cand.SU->dump(DAG);
+}
+#endif
+
/// Pick the best candidate from the top queue.
///
/// TODO: getMaxPressureDelta results can be mostly cached for each SUnit during
/// DAG building. To adjust for the current scheduling location we need to
/// maintain the number of vreg uses remaining to be top-scheduled.
-ConvergingScheduler::CandResult ConvergingScheduler::
-pickNodeFromQueue(ReadyQueue &Q, const RegPressureTracker &RPTracker,
- SchedCandidate &Candidate) {
+void ConvergingScheduler::pickNodeFromQueue(SchedBoundary &Zone,
+ const RegPressureTracker &RPTracker,
+ SchedCandidate &Cand) {
+ ReadyQueue &Q = Zone.Available;
+
DEBUG(Q.dump());
// getMaxPressureDelta temporarily modifies the tracker.
RegPressureTracker &TempTracker = const_cast<RegPressureTracker&>(RPTracker);
- // BestSU remains NULL if no top candidates beat the best existing candidate.
- CandResult FoundCandidate = NoCand;
for (ReadyQueue::iterator I = Q.begin(), E = Q.end(); I != E; ++I) {
- RegPressureDelta RPDelta;
- TempTracker.getMaxPressureDelta((*I)->getInstr(), RPDelta,
- DAG->getRegionCriticalPSets(),
- DAG->getRegPressure().MaxSetPressure);
-
- // Initialize the candidate if needed.
- if (!Candidate.SU) {
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = NodeOrder;
- continue;
- }
- // Avoid exceeding the target's limit.
- if (RPDelta.Excess.UnitIncrease < Candidate.RPDelta.Excess.UnitIncrease) {
- DEBUG(traceCandidate("ECAND", Q, *I, RPDelta.Excess));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = SingleExcess;
- continue;
- }
- if (RPDelta.Excess.UnitIncrease > Candidate.RPDelta.Excess.UnitIncrease)
- continue;
- if (FoundCandidate == SingleExcess)
- FoundCandidate = MultiPressure;
-
- // Avoid increasing the max critical pressure in the scheduled region.
- if (RPDelta.CriticalMax.UnitIncrease
- < Candidate.RPDelta.CriticalMax.UnitIncrease) {
- DEBUG(traceCandidate("PCAND", Q, *I, RPDelta.CriticalMax));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = SingleCritical;
- continue;
- }
- if (RPDelta.CriticalMax.UnitIncrease
- > Candidate.RPDelta.CriticalMax.UnitIncrease)
- continue;
- if (FoundCandidate == SingleCritical)
- FoundCandidate = MultiPressure;
-
- // Avoid increasing the max pressure of the entire region.
- if (RPDelta.CurrentMax.UnitIncrease
- < Candidate.RPDelta.CurrentMax.UnitIncrease) {
- DEBUG(traceCandidate("MCAND", Q, *I, RPDelta.CurrentMax));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = SingleMax;
- continue;
- }
- if (RPDelta.CurrentMax.UnitIncrease
- > Candidate.RPDelta.CurrentMax.UnitIncrease)
- continue;
- if (FoundCandidate == SingleMax)
- FoundCandidate = MultiPressure;
-
- // Fall through to original instruction order.
- // Only consider node order if Candidate was chosen from this Q.
- if (FoundCandidate == NoCand)
- continue;
- if ((Q.getID() == TopQID && (*I)->NodeNum < Candidate.SU->NodeNum)
- || (Q.getID() == BotQID && (*I)->NodeNum > Candidate.SU->NodeNum)) {
- DEBUG(traceCandidate("NCAND", Q, *I));
- Candidate.SU = *I;
- Candidate.RPDelta = RPDelta;
- FoundCandidate = NodeOrder;
+ SchedCandidate TryCand(Cand.Policy);
+ TryCand.SU = *I;
+ tryCandidate(Cand, TryCand, Zone, RPTracker, TempTracker);
+ if (TryCand.Reason != NoCand) {
+ // Initialize resource delta if needed in case future heuristics query it.
+ if (TryCand.ResDelta == SchedResourceDelta())
+ TryCand.initResourceDelta(DAG, SchedModel);
+ Cand.setBest(TryCand);
+ DEBUG(traceCandidate(Cand, Zone));
}
}
- return FoundCandidate;
+}
+
+static void tracePick(const ConvergingScheduler::SchedCandidate &Cand,
+ bool IsTop) {
+ DEBUG(dbgs() << "Pick " << (IsTop ? "top" : "bot")
+ << " SU(" << Cand.SU->NodeNum << ") "
+ << ConvergingScheduler::getReasonStr(Cand.Reason) << '\n');
}
/// Pick the best candidate node from either the top or bottom queue.
-SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
+SUnit *ConvergingScheduler::pickNodeBidirectional(bool &IsTopNode) {
// Schedule as far as possible in the direction of no choice. This is most
// efficient, but also provides the best heuristics for CriticalPSets.
if (SUnit *SU = Bot.pickOnlyChoice()) {
@@ -1079,11 +1690,14 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
IsTopNode = true;
return SU;
}
- SchedCandidate BotCand;
+ CandPolicy NoPolicy;
+ SchedCandidate BotCand(NoPolicy);
+ SchedCandidate TopCand(NoPolicy);
+ checkResourceLimits(TopCand, BotCand);
+
// Prefer bottom scheduling when heuristics are silent.
- CandResult BotResult = pickNodeFromQueue(Bot.Available,
- DAG->getBotRPTracker(), BotCand);
- assert(BotResult != NoCand && "failed to find the first candidate");
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
// If either Q has a single candidate that provides the least increase in
// Excess pressure, we can immediately schedule from that Q.
@@ -1092,37 +1706,41 @@ SUnit *ConvergingScheduler::pickNodeBidrectional(bool &IsTopNode) {
// affects picking from either Q. If scheduling in one direction must
// increase pressure for one of the excess PSets, then schedule in that
// direction first to provide more freedom in the other direction.
- if (BotResult == SingleExcess || BotResult == SingleCritical) {
+ if (BotCand.Reason == SingleExcess || BotCand.Reason == SingleCritical) {
IsTopNode = false;
+ tracePick(BotCand, IsTopNode);
return BotCand.SU;
}
// Check if the top Q has a better candidate.
- SchedCandidate TopCand;
- CandResult TopResult = pickNodeFromQueue(Top.Available,
- DAG->getTopRPTracker(), TopCand);
- assert(TopResult != NoCand && "failed to find the first candidate");
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
- if (TopResult == SingleExcess || TopResult == SingleCritical) {
- IsTopNode = true;
- return TopCand.SU;
- }
// If either Q has a single candidate that minimizes pressure above the
// original region's pressure pick it.
- if (BotResult == SingleMax) {
+ if (TopCand.Reason <= SingleMax || BotCand.Reason <= SingleMax) {
+ if (TopCand.Reason < BotCand.Reason) {
+ IsTopNode = true;
+ tracePick(TopCand, IsTopNode);
+ return TopCand.SU;
+ }
IsTopNode = false;
+ tracePick(BotCand, IsTopNode);
return BotCand.SU;
}
- if (TopResult == SingleMax) {
+ // Check for a salient pressure difference and pick the best from either side.
+ if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
IsTopNode = true;
+ tracePick(TopCand, IsTopNode);
return TopCand.SU;
}
- // Check for a salient pressure difference and pick the best from either side.
- if (compareRPDelta(TopCand.RPDelta, BotCand.RPDelta)) {
+ // Otherwise prefer the bottom candidate, in node order if all else failed.
+ if (TopCand.Reason < BotCand.Reason) {
IsTopNode = true;
+ tracePick(TopCand, IsTopNode);
return TopCand.SU;
}
- // Otherwise prefer the bottom candidate in node order.
IsTopNode = false;
+ tracePick(BotCand, IsTopNode);
return BotCand.SU;
}
@@ -1138,11 +1756,10 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
if (ForceTopDown) {
SU = Top.pickOnlyChoice();
if (!SU) {
- SchedCandidate TopCand;
- CandResult TopResult =
- pickNodeFromQueue(Top.Available, DAG->getTopRPTracker(), TopCand);
- assert(TopResult != NoCand && "failed to find the first candidate");
- (void)TopResult;
+ CandPolicy NoPolicy;
+ SchedCandidate TopCand(NoPolicy);
+ pickNodeFromQueue(Top, DAG->getTopRPTracker(), TopCand);
+ assert(TopCand.Reason != NoCand && "failed to find the first candidate");
SU = TopCand.SU;
}
IsTopNode = true;
@@ -1150,17 +1767,16 @@ SUnit *ConvergingScheduler::pickNode(bool &IsTopNode) {
else if (ForceBottomUp) {
SU = Bot.pickOnlyChoice();
if (!SU) {
- SchedCandidate BotCand;
- CandResult BotResult =
- pickNodeFromQueue(Bot.Available, DAG->getBotRPTracker(), BotCand);
- assert(BotResult != NoCand && "failed to find the first candidate");
- (void)BotResult;
+ CandPolicy NoPolicy;
+ SchedCandidate BotCand(NoPolicy);
+ pickNodeFromQueue(Bot, DAG->getBotRPTracker(), BotCand);
+ assert(BotCand.Reason != NoCand && "failed to find the first candidate");
SU = BotCand.SU;
}
IsTopNode = false;
}
else {
- SU = pickNodeBidrectional(IsTopNode);
+ SU = pickNodeBidirectional(IsTopNode);
}
} while (SU->isScheduled);
diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp
index e426fe23c0..88922169b3 100644
--- a/lib/CodeGen/RegAllocFast.cpp
+++ b/lib/CodeGen/RegAllocFast.cpp
@@ -956,6 +956,11 @@ void RAFast::AllocateBasicBlock() {
bool hasPhysDefs = false;
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MachineOperand &MO = MI->getOperand(i);
+ // Make sure MRI knows about registers clobbered by regmasks.
+ if (MO.isRegMask()) {
+ MRI->addPhysRegsUsedFromRegMask(MO.getRegMask());
+ continue;
+ }
if (!MO.isReg()) continue;
unsigned Reg = MO.getReg();
if (!Reg) continue;
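
Background for the hunk above: a regmask operand (typically on call instructions) is a bit vector in which a clear bit marks a clobbered physical register, and addPhysRegsUsedFromRegMask records those clobbers with MachineRegisterInfo. A hedged fragment showing the usual query pattern; MI and Reg are assumed to be in scope and this is not part of the patch:

    // Sketch: test whether any regmask on MI clobbers physical register Reg.
    for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
      const MachineOperand &MO = MI->getOperand(i);
      if (MO.isRegMask() && MO.clobbersPhysReg(Reg)) {
        // Reg does not survive this instruction (e.g. across a call).
      }
    }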
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 94779770e0..543c426458 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -64,7 +64,7 @@ void RegisterPressure::decrease(const TargetRegisterClass *RC,
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
+void RegisterPressure::dump(const TargetRegisterInfo *TRI) const {
dbgs() << "Live In: ";
for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
dbgs() << PrintReg(LiveInRegs[i], TRI) << " ";
@@ -322,10 +322,8 @@ struct RegisterOperands {
if (findReg(MO.getReg(), isVReg, DeadDefs, TRI) == DeadDefs.end())
DeadDefs.push_back(MO.getReg());
}
- else {
- if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
- Defs.push_back(MO.getReg());
- }
+ else if (findReg(MO.getReg(), isVReg, Defs, TRI) == Defs.end())
+ Defs.push_back(MO.getReg());
}
}
};
@@ -676,9 +674,16 @@ void RegPressureTracker::bumpUpwardPressure(const MachineInstr *MI) {
decreaseVirtRegPressure(VirtRegOpers.DeadDefs);
// Kill liveness at live defs.
- decreasePhysRegPressure(PhysRegOpers.Defs);
- decreaseVirtRegPressure(VirtRegOpers.Defs);
-
+  for (unsigned i = 0, e = PhysRegOpers.Defs.size(); i < e; ++i) {
+    unsigned Reg = PhysRegOpers.Defs[i];
+    if (findReg(Reg, false, PhysRegOpers.Uses, TRI) == PhysRegOpers.Uses.end())
+      decreasePhysRegPressure(Reg);
+  }
+  for (unsigned i = 0, e = VirtRegOpers.Defs.size(); i < e; ++i) {
+    unsigned Reg = VirtRegOpers.Defs[i];
+    if (findReg(Reg, true, VirtRegOpers.Uses, TRI) == VirtRegOpers.Uses.end())
+      decreaseVirtRegPressure(Reg);
+  }
// Generate liveness for uses.
for (unsigned i = 0, e = PhysRegOpers.Uses.size(); i < e; ++i) {
unsigned Reg = PhysRegOpers.Uses[i];
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 496473d3a4..a4d4a93e6d 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -245,7 +245,7 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
if (UseSU == SU)
continue;
- SDep dep(SU, SDep::Data, 1, *Alias);
+ SDep dep(SU, SDep::Data, *Alias);
// Adjust the dependence latency using operand def/use information,
// then allow the target to perform its own adjustments.
@@ -291,11 +291,14 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
(Kind != SDep::Output || !MO.isDead() ||
!DefSU->getInstr()->registerDefIsDead(*Alias))) {
if (Kind == SDep::Anti)
- DefSU->addPred(SDep(SU, Kind, 0, /*Reg=*/*Alias));
+ DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
else {
- unsigned AOLat =
+ SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ unsigned OutLatency =
SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
- DefSU->addPred(SDep(SU, Kind, AOLat, /*Reg=*/*Alias));
+ Dep.setMinLatency(OutLatency);
+ Dep.setLatency(OutLatency);
+ DefSU->addPred(Dep);
}
}
}
@@ -364,9 +367,12 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
else {
SUnit *DefSU = DefI->SU;
if (DefSU != SU && DefSU != &ExitSU) {
+ SDep Dep(SU, SDep::Output, Reg);
unsigned OutLatency =
SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr());
- DefSU->addPred(SDep(SU, SDep::Output, OutLatency, Reg));
+ Dep.setMinLatency(OutLatency);
+ Dep.setLatency(OutLatency);
+ DefSU->addPred(Dep);
}
DefI->SU = SU;
}
@@ -396,7 +402,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
if (DefSU) {
// The reaching Def lives within this scheduling region.
// Create a data dependence.
- SDep dep(DefSU, SDep::Data, 1, Reg);
+ SDep dep(DefSU, SDep::Data, Reg);
// Adjust the dependence latency using operand def/use information, then
// allow the target to perform its own adjustments.
int DefOp = Def->findRegisterDefOperandIdx(Reg);
@@ -414,7 +420,7 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
// Add antidependence to the following def of the vreg it uses.
VReg2SUnitMap::iterator DefI = VRegDefs.find(Reg);
if (DefI != VRegDefs.end() && DefI->SU != SU)
- DefI->SU->addPred(SDep(SU, SDep::Anti, 0, Reg));
+ DefI->SU->addPred(SDep(SU, SDep::Anti, Reg));
}
/// Return true if MI is an instruction we are unable to reason about
@@ -554,8 +560,7 @@ iterateChainSucc(AliasAnalysis *AA, const MachineFrameInfo *MFI,
// and stop descending.
if (*Depth > 200 ||
MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
- SUb->addPred(SDep(SUa, SDep::Order, /*Latency=*/0, /*Reg=*/0,
- /*isNormalMemory=*/true));
+ SUb->addPred(SDep(SUa, SDep::MayAliasMem));
return *Depth;
}
// Track current depth.
@@ -586,9 +591,9 @@ static void adjustChainDeps(AliasAnalysis *AA, const MachineFrameInfo *MFI,
if (SU == *I)
continue;
if (MIsNeedChainEdge(AA, MFI, SU->getInstr(), (*I)->getInstr())) {
- unsigned Latency = ((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0;
- (*I)->addPred(SDep(SU, SDep::Order, Latency, /*Reg=*/0,
- /*isNormalMemory=*/true));
+ SDep Dep(SU, SDep::MayAliasMem);
+ Dep.setLatency(((*I)->getInstr()->mayLoad()) ? LatencyToLoad : 0);
+ (*I)->addPred(Dep);
}
// Now go through all the chain successors and iterate from them.
// Keep track of visited nodes.
@@ -611,9 +616,11 @@ void addChainDependency (AliasAnalysis *AA, const MachineFrameInfo *MFI,
// If this is a false dependency,
   // do not add the edge, but remember the rejected node.
if (!EnableAASchedMI ||
- MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr()))
- SUb->addPred(SDep(SUa, SDep::Order, TrueMemOrderLatency, /*Reg=*/0,
- isNormalMemory));
+ MIsNeedChainEdge(AA, MFI, SUa->getInstr(), SUb->getInstr())) {
+ SDep Dep(SUa, isNormalMemory ? SDep::MayAliasMem : SDep::Barrier);
+ Dep.setLatency(TrueMemOrderLatency);
+ SUb->addPred(Dep);
+ }
else {
// Duplicate entries should be ignored.
RejectList.insert(SUb);
@@ -755,16 +762,19 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// references, even those that are known to not alias.
for (std::map<const Value *, SUnit *>::iterator I =
NonAliasMemDefs.begin(), E = NonAliasMemDefs.end(); I != E; ++I) {
- I->second->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ I->second->addPred(SDep(SU, SDep::Barrier));
}
for (std::map<const Value *, std::vector<SUnit *> >::iterator I =
NonAliasMemUses.begin(), E = NonAliasMemUses.end(); I != E; ++I) {
- for (unsigned i = 0, e = I->second.size(); i != e; ++i)
- I->second[i]->addPred(SDep(SU, SDep::Order, TrueMemOrderLatency));
+ for (unsigned i = 0, e = I->second.size(); i != e; ++i) {
+ SDep Dep(SU, SDep::Barrier);
+ Dep.setLatency(TrueMemOrderLatency);
+ I->second[i]->addPred(Dep);
+ }
}
// Add SU to the barrier chain.
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ BarrierChain->addPred(SDep(SU, SDep::Barrier));
BarrierChain = SU;
// This is a barrier event that acts as a pivotal node in the DAG,
// so it is safe to clear list of exposed nodes.
@@ -852,7 +862,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
// SU and barrier _could_ be reordered, they should not. In addition,
// we have lost all RejectMemNodes below barrier.
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ BarrierChain->addPred(SDep(SU, SDep::Barrier));
} else {
// Treat all other stores conservatively.
goto new_alias_chain;
@@ -861,10 +871,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
if (!ExitSU.isPred(SU))
      // Push stores up a bit to avoid them getting in between cmp
// and branches.
- ExitSU.addPred(SDep(SU, SDep::Order, 0,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ ExitSU.addPred(SDep(SU, SDep::Artificial));
} else if (MI->mayLoad()) {
bool MayAlias = true;
if (MI->isInvariantLoad(AA)) {
@@ -899,7 +906,7 @@ void ScheduleDAGInstrs::buildSchedGraph(AliasAnalysis *AA,
if (MayAlias && AliasChain)
addChainDependency(AA, MFI, SU, AliasChain, RejectMemNodes);
if (BarrierChain)
- BarrierChain->addPred(SDep(SU, SDep::Order, /*Latency=*/0));
+ BarrierChain->addPred(SDep(SU, SDep::Barrier));
}
}
}
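
The recurring pattern in this file (and in the SelectionDAG schedulers below) is the new SDep construction style: the wide constructor taking kind, latency, register and memory/artificial flags is gone, replaced by kind-specific constructors (SDep::Data/Anti/Output take a register; SDep::Barrier, SDep::MayAliasMem and SDep::Artificial are order kinds) with latency set separately. A sketch of the before/after shape, with SUa and SUb assumed in scope:

    // Old style: everything packed into one constructor call.
    //   SUb->addPred(SDep(SUa, SDep::Order, /*Latency=*/0, /*Reg=*/0,
    //                     /*isNormalMemory=*/true));

    // New style: choose the kind, then set latencies explicitly.
    SDep Dep(SUa, SDep::MayAliasMem); // order edge between maybe-aliasing memops
    Dep.setLatency(0);
    SUb->addPred(Dep);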
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 2ae08692ae..2ecdd89415 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -336,7 +336,9 @@ SUnit *ScheduleDAGFast::CopyAndMoveSuccessors(SUnit *SU) {
}
}
if (isNewLoad) {
- AddPred(NewSU, SDep(LoadSU, SDep::Order, LoadSU->Latency));
+ SDep D(LoadSU, SDep::Barrier);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
}
++NumUnfolds;
@@ -412,9 +414,12 @@ void ScheduleDAGFast::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
for (unsigned i = 0, e = DelDeps.size(); i != e; ++i) {
RemovePred(DelDeps[i].first, DelDeps[i].second);
}
-
- AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
- AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
Copies.push_back(CopyFromSU);
Copies.push_back(CopyToSU);
@@ -591,18 +596,14 @@ void ScheduleDAGFast::ListScheduleBottomUp() {
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
DEBUG(dbgs() << "Adding an edge from SU # " << TrySU->NodeNum
<< " to SU #" << Copies.front()->NodeNum << "\n");
- AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true));
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
NewDef = Copies.back();
}
DEBUG(dbgs() << "Adding an edge from SU # " << NewDef->NodeNum
<< " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
- AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true));
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
TrySU->isAvailable = false;
CurSU = NewDef;
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 62f485153b..c55456902c 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1058,7 +1058,9 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) {
// Add a data dependency to reflect that NewSU reads the value defined
// by LoadSU.
- AddPred(NewSU, SDep(LoadSU, SDep::Data, LoadSU->Latency));
+ SDep D(LoadSU, SDep::Data, 0);
+ D.setLatency(LoadSU->Latency);
+ AddPred(NewSU, D);
if (isNewLoad)
AvailableQueue->addNode(LoadSU);
@@ -1140,17 +1142,18 @@ void ScheduleDAGRRList::InsertCopiesAndMoveSuccs(SUnit *SU, unsigned Reg,
// Avoid scheduling the def-side copy before other successors. Otherwise
// we could introduce another physreg interference on the copy and
// continue inserting copies indefinitely.
- SDep D(CopyFromSU, SDep::Order, /*Latency=*/0,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true);
- AddPred(SuccSU, D);
+ AddPred(SuccSU, SDep(CopyFromSU, SDep::Artificial));
}
}
for (unsigned i = 0, e = DelDeps.size(); i != e; ++i)
RemovePred(DelDeps[i].first, DelDeps[i].second);
- AddPred(CopyFromSU, SDep(SU, SDep::Data, SU->Latency, Reg));
- AddPred(CopyToSU, SDep(CopyFromSU, SDep::Data, CopyFromSU->Latency, 0));
+ SDep FromDep(SU, SDep::Data, Reg);
+ FromDep.setLatency(SU->Latency);
+ AddPred(CopyFromSU, FromDep);
+ SDep ToDep(CopyFromSU, SDep::Data, 0);
+ ToDep.setLatency(CopyFromSU->Latency);
+ AddPred(CopyToSU, ToDep);
AvailableQueue->updateNode(SU);
AvailableQueue->addNode(CopyFromSU);
@@ -1359,9 +1362,7 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
if (!BtSU->isPending)
AvailableQueue->remove(BtSU);
}
- AddPred(TrySU, SDep(BtSU, SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false, /*isArtificial=*/true));
+ AddPred(TrySU, SDep(BtSU, SDep::Artificial));
// If one or more successors has been unscheduled, then the current
  // node is no longer available. Schedule a successor that's now
@@ -1413,20 +1414,14 @@ SUnit *ScheduleDAGRRList::PickNodeToScheduleBottomUp() {
InsertCopiesAndMoveSuccs(LRDef, Reg, DestRC, RC, Copies);
DEBUG(dbgs() << " Adding an edge from SU #" << TrySU->NodeNum
<< " to SU #" << Copies.front()->NodeNum << "\n");
- AddPred(TrySU, SDep(Copies.front(), SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ AddPred(TrySU, SDep(Copies.front(), SDep::Artificial));
NewDef = Copies.back();
}
DEBUG(dbgs() << " Adding an edge from SU #" << NewDef->NodeNum
<< " to SU #" << TrySU->NodeNum << "\n");
LiveRegDefs[Reg] = NewDef;
- AddPred(NewDef, SDep(TrySU, SDep::Order, /*Latency=*/1,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ AddPred(NewDef, SDep(TrySU, SDep::Artificial));
TrySU->isAvailable = false;
CurSU = NewDef;
}
@@ -2936,10 +2931,7 @@ void RegReductionPQBase::AddPseudoTwoAddrDeps() {
!scheduleDAG->IsReachable(SuccSU, SU)) {
DEBUG(dbgs() << " Adding a pseudo-two-addr edge from SU #"
<< SU->NodeNum << " to SU #" << SuccSU->NodeNum << "\n");
- scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Order, /*Latency=*/0,
- /*Reg=*/0, /*isNormalMemory=*/false,
- /*isMustAlias=*/false,
- /*isArtificial=*/true));
+ scheduleDAG->AddPred(SU, SDep(SuccSU, SDep::Artificial));
}
}
}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 714471f559..a197fcbfa5 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -485,14 +485,15 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
if(isChain && OpN->getOpcode() == ISD::TokenFactor)
OpLatency = 0;
- const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data,
- OpLatency, PhysReg);
+ SDep Dep = isChain ? SDep(OpSU, SDep::Barrier)
+ : SDep(OpSU, SDep::Data, PhysReg);
+ Dep.setLatency(OpLatency);
if (!isChain && !UnitLatencies) {
- computeOperandLatency(OpN, N, i, const_cast<SDep &>(dep));
- ST.adjustSchedDependency(OpSU, SU, const_cast<SDep &>(dep));
+ computeOperandLatency(OpN, N, i, Dep);
+ ST.adjustSchedDependency(OpSU, SU, Dep);
}
- if (!SU->addPred(dep) && !dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
+ if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
// Multiple register uses are combined in the same SUnit. For example,
// we could have a set of glued nodes with all their defs consumed by
// another set of glued nodes. Register pressure tracking sees this as
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 6a096a16c4..ca3b0e0b11 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -36,6 +36,21 @@ bool TargetSchedModel::hasInstrItineraries() const {
return EnableSchedItins && !InstrItins.isEmpty();
}
+static unsigned gcd(unsigned Dividend, unsigned Divisor) {
+ // Dividend and Divisor will be naturally swapped as needed.
+  while (Divisor) {
+    unsigned Rem = Dividend % Divisor;
+    Dividend = Divisor;
+    Divisor = Rem;
+  }
+ return Dividend;
+}
+static unsigned lcm(unsigned A, unsigned B) {
+ unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
+ assert((LCM >= A && LCM >= B) && "LCM overflow");
+ return LCM;
+}
+
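
These factors put micro-op counts and per-resource cycle counts on a common scale: with LCM taken over the issue width and all unit counts, executing N cycles on a resource with U units contributes N * (LCM / U) normalized units. A standalone sketch for a hypothetical machine with IssueWidth 4 and two resources of 2 and 3 units:

    #include <cassert>
    #include <stdint.h>

    static unsigned gcd(unsigned A, unsigned B) {
      while (B) { unsigned R = A % B; A = B; B = R; }
      return A;
    }
    static unsigned lcm(unsigned A, unsigned B) {
      return unsigned((uint64_t(A) * B) / gcd(A, B));
    }

    int main() {
      unsigned IssueWidth = 4;
      unsigned Units[2] = { 2, 3 };
      unsigned LCM = IssueWidth;
      for (unsigned i = 0; i != 2; ++i)
        LCM = lcm(LCM, Units[i]);                // LCM = 12
      unsigned MicroOpFactor = LCM / IssueWidth; // 3
      unsigned Factors[2];
      for (unsigned i = 0; i != 2; ++i)
        Factors[i] = LCM / Units[i];             // 6 and 4
      assert(LCM == 12 && MicroOpFactor == 3);
      assert(Factors[0] == 6 && Factors[1] == 4);
      return 0;
    }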
void TargetSchedModel::init(const MCSchedModel &sm,
const TargetSubtargetInfo *sti,
const TargetInstrInfo *tii) {
@@ -43,17 +58,33 @@ void TargetSchedModel::init(const MCSchedModel &sm,
STI = sti;
TII = tii;
STI->initInstrItins(InstrItins);
+
+ unsigned NumRes = SchedModel.getNumProcResourceKinds();
+ ResourceFactors.resize(NumRes);
+ ResourceLCM = SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ if (NumUnits > 0)
+ ResourceLCM = lcm(ResourceLCM, NumUnits);
+ }
+ MicroOpFactor = ResourceLCM / SchedModel.IssueWidth;
+ for (unsigned Idx = 0; Idx < NumRes; ++Idx) {
+ unsigned NumUnits = SchedModel.getProcResource(Idx)->NumUnits;
+ ResourceFactors[Idx] = NumUnits ? (ResourceLCM / NumUnits) : 0;
+ }
}
-unsigned TargetSchedModel::getNumMicroOps(MachineInstr *MI) const {
+unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
+ const MCSchedClassDesc *SC) const {
if (hasInstrItineraries()) {
int UOps = InstrItins.getNumMicroOps(MI->getDesc().getSchedClass());
return (UOps >= 0) ? UOps : TII->getNumMicroOps(&InstrItins, MI);
}
if (hasInstrSchedModel()) {
- const MCSchedClassDesc *SCDesc = resolveSchedClass(MI);
- if (SCDesc->isValid())
- return SCDesc->NumMicroOps;
+ if (!SC)
+ SC = resolveSchedClass(MI);
+ if (SC->isValid())
+ return SC->NumMicroOps;
}
return MI->isTransient() ? 0 : 1;
}
diff --git a/lib/DebugInfo/DIContext.cpp b/lib/DebugInfo/DIContext.cpp
index ead57f9715..691a92c392 100644
--- a/lib/DebugInfo/DIContext.cpp
+++ b/lib/DebugInfo/DIContext.cpp
@@ -19,8 +19,9 @@ DIContext *DIContext::getDWARFContext(bool isLittleEndian,
StringRef aRangeSection,
StringRef lineSection,
StringRef stringSection,
- StringRef rangeSection) {
+ StringRef rangeSection,
+ const RelocAddrMap &Map) {
return new DWARFContextInMemory(isLittleEndian, infoSection, abbrevSection,
aRangeSection, lineSection, stringSection,
- rangeSection);
+ rangeSection, Map);
}
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index d10e850870..4001792b3d 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -26,6 +26,7 @@ namespace llvm {
/// methods that a concrete implementation provides.
class DWARFContext : public DIContext {
bool IsLittleEndian;
+ const RelocAddrMap &RelocMap;
SmallVector<DWARFCompileUnit, 1> CUs;
OwningPtr<DWARFDebugAbbrev> Abbrev;
@@ -38,9 +39,11 @@ class DWARFContext : public DIContext {
/// Read compile units from the debug_info section and store them in CUs.
void parseCompileUnits();
protected:
- DWARFContext(bool isLittleEndian) : IsLittleEndian(isLittleEndian) {}
+ DWARFContext(bool isLittleEndian, const RelocAddrMap &Map) :
+ IsLittleEndian(isLittleEndian), RelocMap(Map) {}
public:
virtual void dump(raw_ostream &OS);
+
/// Get the number of compile units in this context.
unsigned getNumCompileUnits() {
if (CUs.empty())
@@ -70,6 +73,7 @@ public:
DILineInfoSpecifier Specifier = DILineInfoSpecifier());
bool isLittleEndian() const { return IsLittleEndian; }
+ const RelocAddrMap &relocMap() const { return RelocMap; }
virtual StringRef getInfoSection() = 0;
virtual StringRef getAbbrevSection() = 0;
@@ -108,8 +112,9 @@ public:
StringRef aRangeSection,
StringRef lineSection,
StringRef stringSection,
- StringRef rangeSection)
- : DWARFContext(isLittleEndian),
+ StringRef rangeSection,
+ const RelocAddrMap &Map = RelocAddrMap())
+ : DWARFContext(isLittleEndian, Map),
InfoSection(infoSection),
AbbrevSection(abbrevSection),
ARangeSection(aRangeSection),
diff --git a/lib/DebugInfo/DWARFFormValue.cpp b/lib/DebugInfo/DWARFFormValue.cpp
index c9ecbbbbd4..fea9fd7f7d 100644
--- a/lib/DebugInfo/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARFFormValue.cpp
@@ -10,6 +10,7 @@
#include "DWARFFormValue.h"
#include "DWARFCompileUnit.h"
#include "DWARFContext.h"
+#include "llvm/Support/Debug.h"
#include "llvm/Support/Dwarf.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
@@ -98,8 +99,16 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
indirect = false;
switch (Form) {
case DW_FORM_addr:
- case DW_FORM_ref_addr:
- Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
+ case DW_FORM_ref_addr: {
+ RelocAddrMap::const_iterator AI
+ = cu->getContext().relocMap().find(*offset_ptr);
+ if (AI != cu->getContext().relocMap().end()) {
+ const std::pair<uint8_t, int64_t> &R = AI->second;
+ Value.uval = R.second;
+ *offset_ptr += R.first;
+ } else
+ Value.uval = data.getUnsigned(offset_ptr, cu->getAddressByteSize());
+ }
break;
case DW_FORM_exprloc:
case DW_FORM_block:
@@ -138,9 +147,17 @@ DWARFFormValue::extractValue(DataExtractor data, uint32_t *offset_ptr,
case DW_FORM_sdata:
Value.sval = data.getSLEB128(offset_ptr);
break;
- case DW_FORM_strp:
- Value.uval = data.getU32(offset_ptr);
+ case DW_FORM_strp: {
+ RelocAddrMap::const_iterator AI
+ = cu->getContext().relocMap().find(*offset_ptr);
+ if (AI != cu->getContext().relocMap().end()) {
+ const std::pair<uint8_t, int64_t> &R = AI->second;
+ Value.uval = R.second;
+ *offset_ptr += R.first;
+ } else
+ Value.uval = data.getU32(offset_ptr);
break;
+ }
case DW_FORM_udata:
case DW_FORM_ref_udata:
Value.uval = data.getULEB128(offset_ptr);
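
Both hunks above rely on the same map shape: the key is an offset within the debug section being parsed, and the mapped pair is (byte width consumed, relocated value), which is why the hit path advances *offset_ptr by R.first and takes R.second as the value. A hedged sketch of building such a map and handing it to the context; the offsets, values and section variables are invented and assumed to be in scope:

    // Record that the 4 bytes at offset 0x10 of .debug_info relocate to 0x4000.
    RelocAddrMap Relocs;
    Relocs[0x10] = std::pair<uint8_t, int64_t>(4, 0x4000);

    DIContext *Ctx =
      DIContext::getDWARFContext(IsLittleEndian, InfoSec, AbbrevSec, ARangeSec,
                                 LineSec, StrSec, RangeSec, Relocs);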
diff --git a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
index 23f8607322..4cb0270d57 100644
--- a/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
+++ b/lib/ExecutionEngine/IntelJITEvents/IntelJITEventListener.cpp
@@ -54,6 +54,10 @@ public:
const EmittedFunctionDetails &Details);
virtual void NotifyFreeingMachineCode(void *OldPtr);
+
+ virtual void NotifyObjectEmitted(const ObjectImage &Obj);
+
+ virtual void NotifyFreeingObject(const ObjectImage &Obj);
};
static LineNumberInfo LineStartToIntelJITFormat(
@@ -164,6 +168,12 @@ void IntelJITEventListener::NotifyFreeingMachineCode(void *FnStart) {
}
}
+void IntelJITEventListener::NotifyObjectEmitted(const ObjectImage &Obj) {
+}
+
+void IntelJITEventListener::NotifyFreeingObject(const ObjectImage &Obj) {
+}
+
} // anonymous namespace.
namespace llvm {
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index a0ad985145..752c5b73ea 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -11,6 +11,7 @@
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/ExecutionEngine/GenericValue.h"
+#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/JITMemoryManager.h"
#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/ExecutionEngine/ObjectBuffer.h"
@@ -57,6 +58,8 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, RTDyldMemoryManager *MM,
}
MCJIT::~MCJIT() {
+ if (LoadedObject)
+ NotifyFreeingObject(*LoadedObject.get());
delete MemMgr;
delete TM;
}
@@ -107,10 +110,27 @@ void MCJIT::emitObject(Module *m) {
// FIXME: Make this optional, maybe even move it to a JIT event listener
LoadedObject->registerWithDebugger();
+ NotifyObjectEmitted(*LoadedObject);
+
// FIXME: Add support for per-module compilation state
isCompiled = true;
}
+// FIXME: Add a parameter to identify which object is being finalized when
+// MCJIT supports multiple modules.
+void MCJIT::finalizeObject() {
+ // If the module hasn't been compiled, just do that.
+ if (!isCompiled) {
+ // If the call to Dyld.resolveRelocations() is removed from emitObject()
+ // we'll need to do that here.
+ emitObject(M);
+ return;
+ }
+
+ // Resolve any relocations.
+ Dyld.resolveRelocations();
+}
+
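
With reassignSectionAddress no longer resolving eagerly (see the RuntimeDyld hunk below), a remote-execution client is expected to remap every section first and then trigger a single resolution pass. A hedged sketch of that sequence; EE is an MCJIT-backed ExecutionEngine and the address variables are illustrative:

    // Map each loaded section to its address in the target process...
    EE->mapSectionAddress(LocalSectionBase, RemoteSectionAddr);
    // ...then resolve all relocations against the final layout in one pass.
    EE->finalizeObject();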
void *MCJIT::getPointerToBasicBlock(BasicBlock *BB) {
report_fatal_error("not yet implemented");
}
@@ -275,3 +295,33 @@ void *MCJIT::getPointerToNamedFunction(const std::string &Name,
}
return 0;
}
+
+void MCJIT::RegisterJITEventListener(JITEventListener *L) {
+ if (L == NULL)
+ return;
+ MutexGuard locked(lock);
+ EventListeners.push_back(L);
+}
+void MCJIT::UnregisterJITEventListener(JITEventListener *L) {
+ if (L == NULL)
+ return;
+ MutexGuard locked(lock);
+  SmallVector<JITEventListener*, 2>::reverse_iterator I =
+ std::find(EventListeners.rbegin(), EventListeners.rend(), L);
+ if (I != EventListeners.rend()) {
+ std::swap(*I, EventListeners.back());
+ EventListeners.pop_back();
+ }
+}
+void MCJIT::NotifyObjectEmitted(const ObjectImage& Obj) {
+ MutexGuard locked(lock);
+ for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+ EventListeners[I]->NotifyObjectEmitted(Obj);
+ }
+}
+void MCJIT::NotifyFreeingObject(const ObjectImage& Obj) {
+ MutexGuard locked(lock);
+ for (unsigned I = 0, S = EventListeners.size(); I < S; ++I) {
+ EventListeners[I]->NotifyFreeingObject(Obj);
+ }
+}
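
With registration wired up, a listener only needs to override the new object-level hooks; the other JITEventListener callbacks keep their empty defaults. A minimal sketch, assuming the declarations in llvm/ExecutionEngine/JITEventListener.h:

    class CountingListener : public JITEventListener {
      unsigned Emitted, Freed;
    public:
      CountingListener() : Emitted(0), Freed(0) {}
      virtual void NotifyObjectEmitted(const ObjectImage &Obj) { ++Emitted; }
      virtual void NotifyFreeingObject(const ObjectImage &Obj) { ++Freed; }
    };

    // Usage against an MCJIT-backed engine EE:
    //   CountingListener Listener;
    //   EE->RegisterJITEventListener(&Listener);
    //   ... emit and run code ...
    //   EE->UnregisterJITEventListener(&Listener);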
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index b9ff06e701..571080d2bd 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -11,6 +11,7 @@
#define LLVM_LIB_EXECUTIONENGINE_MCJIT_H
#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
@@ -30,6 +31,7 @@ class MCJIT : public ExecutionEngine {
MCContext *Ctx;
RTDyldMemoryManager *MemMgr;
RuntimeDyld Dyld;
+ SmallVector<JITEventListener*, 2> EventListeners;
// FIXME: Add support for multiple modules
bool isCompiled;
@@ -42,6 +44,8 @@ public:
/// @name ExecutionEngine interface implementation
/// @{
+ virtual void finalizeObject();
+
virtual void *getPointerToBasicBlock(BasicBlock *BB);
virtual void *getPointerToFunction(Function *F);
@@ -73,6 +77,9 @@ public:
Dyld.mapSectionAddress(LocalAddress, TargetAddress);
}
+ virtual void RegisterJITEventListener(JITEventListener *L);
+ virtual void UnregisterJITEventListener(JITEventListener *L);
+
/// @}
/// @name (Private) Registration Interfaces
/// @{
@@ -96,6 +103,9 @@ protected:
/// is passed as a parameter here to prepare for multiple module support in
/// the future.
void emitObject(Module *M);
+
+ void NotifyObjectEmitted(const ObjectImage& Obj);
+ void NotifyFreeingObject(const ObjectImage& Obj);
};
} // End llvm namespace
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 950b4208a9..f6dccb106d 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -36,7 +36,11 @@ void RuntimeDyldImpl::resolveRelocations() {
// Just iterate over the sections we have and resolve all the relocations
// in them. Gross overkill, but it gets the job done.
for (int i = 0, e = Sections.size(); i != e; ++i) {
- reassignSectionAddress(i, Sections[i].LoadAddress);
+ uint64_t Addr = Sections[i].LoadAddress;
+ DEBUG(dbgs() << "Resolving relocations Section #" << i
+ << "\t" << format("%p", (uint8_t *)Addr)
+ << "\n");
+ resolveRelocationList(Relocations[i], Addr);
}
}
@@ -387,17 +391,15 @@ void RuntimeDyldImpl::reassignSectionAddress(unsigned SectionID,
uint64_t Addr) {
// The address to use for relocation resolution is not
// the address of the local section buffer. We must be doing
- // a remote execution environment of some sort. Re-apply any
- // relocations referencing this section with the given address.
+ // a remote execution environment of some sort. Relocations can't
+ // be applied until all the sections have been moved. The client must
+  // trigger this with a call to MCJIT::finalizeObject() or
+ // RuntimeDyld::resolveRelocations().
//
// Addr is a uint64_t because we can't assume the pointer width
// of the target is the same as that of the host. Just use a generic
// "big enough" type.
Sections[SectionID].LoadAddress = Addr;
- DEBUG(dbgs() << "Resolving relocations Section #" << SectionID
- << "\t" << format("%p", (uint8_t *)Addr)
- << "\n");
- resolveRelocationList(Relocations[SectionID], Addr);
}
void RuntimeDyldImpl::resolveRelocationEntry(const RelocationEntry &RE,
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 051c7dd153..cf86a44d1b 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -154,6 +154,7 @@ private:
StringRef CppHashFilename;
int64_t CppHashLineNumber;
SMLoc CppHashLoc;
+ int CppHashBuf;
   /// AssemblerDialect. ~0U means unset value; use the value provided by MAI.
unsigned AssemblerDialect;
@@ -1421,7 +1422,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) {
getStreamer().EmitDwarfFileDirective(
getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename);
- unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc, CurBuffer);
+ unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc,CppHashBuf);
Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo);
}
@@ -1483,6 +1484,7 @@ bool AsmParser::ParseCppHashLineFilenameComment(const SMLoc &L) {
CppHashLoc = L;
CppHashFilename = Filename;
CppHashLineNumber = LineNumber;
+ CppHashBuf = CurBuffer;
// Ignore any trailing characters, they're just comment.
EatToEndOfLine();
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 2501f2ea98..0423c7acb3 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -33,6 +33,9 @@
#include <unistd.h>
#else
#include <io.h>
+#ifndef S_ISFIFO
+#define S_ISFIFO(x) (0)
+#endif
#endif
#include <fcntl.h>
using namespace llvm;
@@ -201,6 +204,27 @@ public:
};
}
+static error_code getMemoryBufferForStream(int FD,
+ StringRef BufferName,
+ OwningPtr<MemoryBuffer> &result) {
+ const ssize_t ChunkSize = 4096*4;
+ SmallString<ChunkSize> Buffer;
+ ssize_t ReadBytes;
+ // Read into Buffer until we hit EOF.
+ do {
+ Buffer.reserve(Buffer.size() + ChunkSize);
+ ReadBytes = read(FD, Buffer.end(), ChunkSize);
+ if (ReadBytes == -1) {
+ if (errno == EINTR) continue;
+ return error_code(errno, posix_category());
+ }
+ Buffer.set_size(Buffer.size() + ReadBytes);
+ } while (ReadBytes != 0);
+
+ result.reset(MemoryBuffer::getMemBufferCopy(Buffer, BufferName));
+ return error_code::success();
+}
+
error_code MemoryBuffer::getFile(StringRef Filename,
OwningPtr<MemoryBuffer> &result,
int64_t FileSize,
@@ -296,6 +320,13 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename,
if (fstat(FD, &FileInfo) == -1) {
return error_code(errno, posix_category());
}
+
+ // If this is a named pipe, we can't trust the size. Create the memory
+ // buffer by copying off the stream.
+ if (S_ISFIFO(FileInfo.st_mode)) {
+ return getMemoryBufferForStream(FD, Filename, result);
+ }
+
FileSize = FileInfo.st_size;
}
MapSize = FileSize;
@@ -369,20 +400,5 @@ error_code MemoryBuffer::getSTDIN(OwningPtr<MemoryBuffer> &result) {
// fallback if it fails.
sys::Program::ChangeStdinToBinary();
- const ssize_t ChunkSize = 4096*4;
- SmallString<ChunkSize> Buffer;
- ssize_t ReadBytes;
- // Read into Buffer until we hit EOF.
- do {
- Buffer.reserve(Buffer.size() + ChunkSize);
- ReadBytes = read(0, Buffer.end(), ChunkSize);
- if (ReadBytes == -1) {
- if (errno == EINTR) continue;
- return error_code(errno, posix_category());
- }
- Buffer.set_size(Buffer.size() + ReadBytes);
- } while (ReadBytes != 0);
-
- result.reset(getMemBufferCopy(Buffer, "<stdin>"));
- return error_code::success();
+ return getMemoryBufferForStream(0, "<stdin>", result);
}
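
Client code is unchanged by this refactoring; stdin and named pipes both funnel through the chunked reader internally. A short usage fragment, assuming the usual Support headers are included:

    OwningPtr<MemoryBuffer> Buf;
    if (error_code ec = MemoryBuffer::getSTDIN(Buf))
      report_fatal_error("reading stdin: " + ec.message());
    // Buf now owns a copy of everything read up to EOF.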
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index d28f7ff73f..8f5be6a120 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -88,6 +88,11 @@ ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const {
? CSR_iOS_RegMask : CSR_AAPCS_RegMask;
}
+const uint32_t*
+ARMBaseRegisterInfo::getNoPreservedMask() const {
+ return CSR_NoRegs_RegMask;
+}
+
BitVector ARMBaseRegisterInfo::
getReservedRegs(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering();
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.h b/lib/Target/ARM/ARMBaseRegisterInfo.h
index dbbb566900..e2bdd046db 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -96,6 +96,7 @@ public:
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction *MF = 0) const;
const uint32_t *getCallPreservedMask(CallingConv::ID) const;
+ const uint32_t *getNoPreservedMask() const;
BitVector getReservedRegs(const MachineFunction &MF) const;
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index e0f3a871ba..a7544cd4cd 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -194,6 +194,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Callee-saved register lists.
//===----------------------------------------------------------------------===//
+def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
def CSR_AAPCS : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6, R5, R4,
(sequence "D%u", 15, 8))>;
@@ -211,4 +213,4 @@ def CSR_GHC : CalleeSavedRegs<(add)>;
// NaCl does not save R9, but otherwise uses the same order as AAPCS
def CSR_NaCl : CalleeSavedRegs<(add LR, R11, R10, R8, R7, R6, R5, R4,
(sequence "D%u", 15, 8))>;
-// @LOCALMOD-END
\ No newline at end of file
+// @LOCALMOD-END
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 4ef226a2a6..4956c59606 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -860,9 +860,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
- case ARM::Int_eh_sjlj_dispatchsetup:
- case ARM::Int_eh_sjlj_dispatchsetup_nofp:
- case ARM::tInt_eh_sjlj_dispatchsetup: {
+ case ARM::Int_eh_sjlj_dispatchsetup: {
MachineFunction &MF = *MI.getParent()->getParent();
const ARMBaseInstrInfo *AII =
static_cast<const ARMBaseInstrInfo*>(TII);
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 848e1ab761..033540ae7d 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -619,7 +619,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) {
Reloc::Model RelocM = TM.getRelocationModel();
bool IsIndirect = Subtarget->GVIsIndirectSymbol(GV, RelocM);
- unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
+ const TargetRegisterClass *RC = isThumb2 ?
+ (const TargetRegisterClass*)&ARM::rGPRRegClass :
+ (const TargetRegisterClass*)&ARM::GPRRegClass;
+ unsigned DestReg = createResultReg(RC);
// Use movw+movt when possible, it avoids constant pool entries.
// Darwin targets don't support movt with Reloc::Static, see
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 6ab48f6244..dec498a4f7 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -6357,12 +6357,15 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
MachineMemOperand::MOLoad |
MachineMemOperand::MOVolatile, 4, 4);
- if (AFI->isThumb1OnlyFunction())
- BuildMI(DispatchBB, dl, TII->get(ARM::tInt_eh_sjlj_dispatchsetup));
- else if (!Subtarget->hasVFP2())
- BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup_nofp));
- else
- BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+ MachineInstrBuilder MIB;
+ MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
+
+ const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
+ const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
+
+ // Add a register mask with no preserved registers. This results in all
+ // registers being marked as clobbered.
+ MIB.addRegMask(RI.getNoPreservedMask());
unsigned NumLPads = LPadList.size();
if (Subtarget->isThumb2()) {
@@ -6581,8 +6584,6 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const {
}
// N.B. the order the invoke BBs are processed in doesn't matter here.
- const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII);
- const ARMBaseRegisterInfo &RI = AII->getRegisterInfo();
const uint16_t *SavedRegs = RI.getCalleeSavedRegs(MF);
SmallVector<MachineBasicBlock*, 64> MBBLPads;
for (SmallPtrSet<MachineBasicBlock*, 64>::iterator
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 6d10383c73..9f7e50cd27 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -4888,21 +4888,13 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch),
Requires<[IsARM, IsIOS]>;
}
-// eh.sjlj.dispatchsetup pseudo-instructions.
-// These pseudos are used for both ARM and Thumb2. Any differences are
-// handled when the pseudo is expanded (which happens before any passes
-// that need the instruction size).
-let Defs =
- [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
- Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, Q8, Q9, Q10, Q11, Q12, Q13, Q14, Q15 ],
- isBarrier = 1 in
+// eh.sjlj.dispatchsetup pseudo-instruction.
+// This pseudo is used for both ARM and Thumb. Any differences are handled when
+// the pseudo is expanded (which happens before any passes that need the
+// instruction size).
+let isBarrier = 1 in
def Int_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
-let Defs =
- [ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR ],
- isBarrier = 1 in
-def Int_eh_sjlj_dispatchsetup_nofp : PseudoInst<(outs), (ins), NoItinerary, []>;
-
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index a5c0bc6740..ae7a5c00bd 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -1247,10 +1247,6 @@ def tInt_eh_sjlj_longjmp : XI<(outs), (ins GPR:$src, GPR:$scratch),
[(ARMeh_sjlj_longjmp GPR:$src, GPR:$scratch)]>,
Requires<[IsThumb, IsIOS]>;
-let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
- isBarrier = 1 in
-def tInt_eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>;
-
//===----------------------------------------------------------------------===//
// Non-Instruction Patterns
//
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index ca52570358..0e9ef4838d 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -31,8 +31,7 @@ void VLIWMachineScheduler::postprocessDAG() {
LastSequentialCall = &(SUnits[su]);
// Look for a compare that defines a predicate.
else if (SUnits[su].getInstr()->isCompare() && LastSequentialCall)
- SUnits[su].addPred(SDep(LastSequentialCall, SDep::Order, 0, /*Reg=*/0,
- false));
+ SUnits[su].addPred(SDep(LastSequentialCall, SDep::Barrier));
}
}
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index a8810d238c..32cf6c8be7 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -247,6 +247,10 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::SRL_PARTS, MVT::i32, Custom);
}
+ setOperationAction(ISD::ADD, MVT::i32, Custom);
+ if (HasMips64)
+ setOperationAction(ISD::ADD, MVT::i64, Custom);
+
setOperationAction(ISD::SDIV, MVT::i32, Expand);
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
@@ -921,7 +925,7 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const
case ISD::STORE: return LowerSTORE(Op, DAG);
case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG);
-
+ case ISD::ADD: return LowerADD(Op, DAG);
// @LOCALMOD-BEGIN
case ISD::NACL_TP_TLS_OFFSET: return LowerNaClTpTlsOffset(Op, DAG);
case ISD::NACL_TP_TDB_OFFSET: return LowerNaClTpTdbOffset(Op, DAG);
@@ -2603,6 +2607,27 @@ SDValue MipsTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
}
}
+SDValue MipsTargetLowering::LowerADD(SDValue Op, SelectionDAG &DAG) const {
+ if (Op->getOperand(0).getOpcode() != ISD::FRAMEADDR
+ || cast<ConstantSDNode>
+ (Op->getOperand(0).getOperand(0))->getZExtValue() != 0
+ || Op->getOperand(1).getOpcode() != ISD::FRAME_TO_ARGS_OFFSET)
+ return SDValue();
+
+ // The pattern
+ // (add (frameaddr 0), (frame_to_args_offset))
+  // results from lowering the llvm.eh.dwarf.cfa intrinsic. Transform it to
+  // (add FrameObject, 0)
+  // where FrameObject is a fixed StackObject with offset 0 that points to
+  // the old stack pointer.
+ MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo();
+ EVT ValTy = Op->getValueType(0);
+ int FI = MFI->CreateFixedObject(Op.getValueSizeInBits() / 8, 0, false);
+ SDValue InArgsAddr = DAG.getFrameIndex(FI, ValTy);
+ return DAG.getNode(ISD::ADD, Op->getDebugLoc(), ValTy, InArgsAddr,
+ DAG.getConstant(0, ValTy));
+}
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
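In source terms, llvm.eh.dwarf.cfa is what front ends emit for __builtin_dwarf_cfa(); a hedged illustration of the contract the lowering above implements (builtin name from GCC/Clang):

    /* Returns the canonical frame address: the value of $sp on function
       entry, i.e. on Mips the current $sp plus the frame's allocated size
       (exercised by the new test/CodeGen/Mips/eh-dwarf-cfa.ll below). */
    char *current_cfa(void) {
      return (char *)__builtin_dwarf_cfa();
    }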
@@ -2736,9 +2761,9 @@ IsEligibleForTailCallOptimization(const MipsCC &MipsCCInfo,
if (MipsCCInfo.hasByValArg() || FI.hasByvalArg())
return false;
- // Return true if the callee's next stack offset is no larger than the
+ // Return true if the callee's argument area is no larger than the
// caller's.
- return NextStackOffset <= FI.nextStackOffset();
+ return NextStackOffset <= FI.getIncomingArgSize();
}
SDValue
@@ -3089,7 +3114,6 @@ MipsTargetLowering::LowerFormalArguments(SDValue Chain,
MipsCCInfo.analyzeFormalArguments(Ins);
MipsFI->setFormalArgInfo(CCInfo.getNextStackOffset(),
MipsCCInfo.hasByValArg());
- MipsFI->setIncomingArgSize(CCInfo.getNextStackOffset());
Function::const_arg_iterator FuncArg =
DAG.getMachineFunction().getFunction()->arg_begin();
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index ac82347a1f..77045c3162 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -272,6 +272,7 @@ namespace llvm {
SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerADD(SDValue Op, SelectionDAG &DAG) const;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization.
diff --git a/lib/Target/Mips/MipsMachineFunction.h b/lib/Target/Mips/MipsMachineFunction.h
index d2eba58114..bb45f92f18 100644
--- a/lib/Target/Mips/MipsMachineFunction.h
+++ b/lib/Target/Mips/MipsMachineFunction.h
@@ -47,11 +47,10 @@ class MipsFunctionInfo : public MachineFunctionInfo {
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
- // Formal argument information obtained during call to LowerFormalArguments.
- unsigned NextStackOffset;
+ /// True if function has a byval argument.
bool HasByvalArg;
- // Size of incoming argument area.
+ /// Size of incoming argument area.
unsigned IncomingArgSize;
public:
@@ -72,15 +71,13 @@ public:
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
- unsigned nextStackOffset() const { return NextStackOffset; }
bool hasByvalArg() const { return HasByvalArg; }
- void setFormalArgInfo(unsigned Offset, bool HasByval) {
- NextStackOffset = Offset;
+ void setFormalArgInfo(unsigned Size, bool HasByval) {
+ IncomingArgSize = Size;
HasByvalArg = HasByval;
}
unsigned getIncomingArgSize() const { return IncomingArgSize; }
- void setIncomingArgSize(unsigned S) { IncomingArgSize = S; }
};
} // end of namespace llvm
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 72395bb283..4387730fcc 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -284,8 +284,22 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
unsigned AsmVariant,
const char *ExtraCode,
raw_ostream &O) {
- if (ExtraCode && ExtraCode[0])
- return true; // Unknown modifier.
+ if (ExtraCode && ExtraCode[0]) {
+ if (ExtraCode[1] != 0) return true; // Unknown modifier.
+
+ switch (ExtraCode[0]) {
+ default: return true; // Unknown modifier.
+ case 'y': // A memory reference for an X-form instruction
+ {
+ const char *RegName = "r0";
+ if (!Subtarget.isDarwin()) RegName = stripRegisterPrefix(RegName);
+ O << RegName << ", ";
+ printOperand(MI, OpNo, O);
+ return false;
+ }
+ }
+ }
+
assert(MI->getOperand(OpNo).isReg());
O << "0(";
printOperand(MI, OpNo, O);
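A hypothetical user of this 'y' modifier together with the 'Z' constraint that PPCISelLowering.cpp wires up below; with the base register forced to r0 (read as zero by the ISA), "%y1" prints as "0, <reg>":

    unsigned load_byte_reversed(const unsigned *p) {
      unsigned v;
      /* X-form load: "lwbrx v, 0, reg" once %y1 is expanded */
      __asm__("lwbrx %0, %y1" : "=r"(v) : "Z"(*p));
      return v;
    }

The new test/CodeGen/PowerPC/asm-Zy.ll near the end of this patch checks the same pattern at the IR level.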
diff --git a/lib/Target/PowerPC/PPCCallingConv.td b/lib/Target/PowerPC/PPCCallingConv.td
index b2b5364856..3f87e883b1 100644
--- a/lib/Target/PowerPC/PPCCallingConv.td
+++ b/lib/Target/PowerPC/PPCCallingConv.td
@@ -12,12 +12,19 @@
//
//===----------------------------------------------------------------------===//
+/// CCIfSubtarget - Match if the current subtarget has a feature F.
+class CCIfSubtarget<string F, CCAction A>
+ : CCIf<!strconcat("State.getTarget().getSubtarget<PPCSubtarget>().", F), A>;
+
//===----------------------------------------------------------------------===//
// Return Value Calling Convention
//===----------------------------------------------------------------------===//
// Return-value convention for PowerPC
def RetCC_PPC : CallingConv<[
+ // On PPC64, integer return values are always promoted to i64
+ CCIfType<[i32], CCIfSubtarget<"isPPC64()", CCPromoteToType<i64>>>,
+
CCIfType<[i32], CCAssignToReg<[R3, R4, R5, R6, R7, R8, R9, R10]>>,
CCIfType<[i64], CCAssignToReg<[X3, X4, X5, X6]>>,
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 0922011d97..adf78d5233 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -3177,12 +3177,32 @@ PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
// Copy all of the result registers out of their specified physreg.
for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
- EVT VT = VA.getValVT();
assert(VA.isRegLoc() && "Can only return in registers!");
- Chain = DAG.getCopyFromReg(Chain, dl,
- VA.getLocReg(), VT, InFlag).getValue(1);
- InVals.push_back(Chain.getValue(0));
- InFlag = Chain.getValue(2);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl,
+ VA.getLocReg(), VA.getLocVT(), InFlag);
+ Chain = Val.getValue(1);
+ InFlag = Val.getValue(2);
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::AExt:
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ break;
+ case CCValAssign::ZExt:
+ Val = DAG.getNode(ISD::AssertZext, dl, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ break;
+ case CCValAssign::SExt:
+ Val = DAG.getNode(ISD::AssertSext, dl, VA.getLocVT(), Val,
+ DAG.getValueType(VA.getValVT()));
+ Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
+ break;
+ }
+
+ InVals.push_back(Val);
}
return Chain;
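A worked example of the new extension handling (declaration assumed): for a callee declared "signext i32 @f()" on PPC64, the value arrives in X3 as an i64 and the caller now builds:

    // t1 = CopyFromReg X3       : i64   (VA.getLocVT())
    // t2 = AssertSext t1, i32           (upper 32 bits are known sign bits)
    // t3 = truncate t2           : i32   (VA.getValVT(), pushed to InVals)

AssertSext and AssertZext generate no code; they only record the guarantee so that a later sign or zero extension of t3 back to i64 can fold away, which the new ppc64-abi-extend.ll test below verifies.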
@@ -4316,8 +4336,24 @@ PPCTargetLowering::LowerReturn(SDValue Chain,
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- OutVals[i], Flag);
+
+ SDValue Arg = OutVals[i];
+
+ switch (VA.getLocInfo()) {
+ default: llvm_unreachable("Unknown loc info!");
+ case CCValAssign::Full: break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
Flag = Chain.getValue(1);
}
@@ -6443,6 +6479,14 @@ PPCTargetLowering::getConstraintType(const std::string &Constraint) const {
case 'v':
case 'y':
return C_RegisterClass;
+ case 'Z':
+ // FIXME: While Z does indicate a memory constraint, it specifically
+ // indicates an r+r address (used in conjunction with the 'y' modifier
+ // in the replacement string). Currently, we're forcing the base
+ // register to be r0 in the asm printer (which is interpreted as zero)
+ // and forming the complete address in the second register. This is
+ // suboptimal.
+ return C_Memory;
}
}
return TargetLowering::getConstraintType(Constraint);
@@ -6485,6 +6529,9 @@ PPCTargetLowering::getSingleConstraintMatchWeight(
case 'y':
weight = CW_Register;
break;
+ case 'Z':
+ weight = CW_Memory;
+ break;
}
return weight;
}
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
index 4cd07cd6c3..c07332de32 100644
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -60,7 +60,7 @@ bool ScalarTargetTransformImpl::shouldBuildLookupTables() const {
// Calls used by the vectorizers.
//
//===----------------------------------------------------------------------===//
-static int InstructionOpcodeToISD(unsigned Opcode) {
+int VectorTargetTransformImpl::InstructionOpcodeToISD(unsigned Opcode) const {
enum InstructionOpcodes {
#define HANDLE_INST(NUM, OPCODE, CLASS) OPCODE = NUM,
#define LAST_OTHER_INST(NUM) InstructionOpcodesCount = NUM
@@ -101,7 +101,7 @@ static int InstructionOpcodeToISD(unsigned Opcode) {
case AtomicRMW: return 0;
case Trunc: return ISD::TRUNCATE;
case ZExt: return ISD::ZERO_EXTEND;
- case SExt: return ISD::SEXTLOAD;
+ case SExt: return ISD::SIGN_EXTEND;
case FPToUI: return ISD::FP_TO_UINT;
case FPToSI: return ISD::FP_TO_SINT;
case UIToFP: return ISD::UINT_TO_FP;
@@ -130,24 +130,28 @@ static int InstructionOpcodeToISD(unsigned Opcode) {
llvm_unreachable("Unknown instruction type encountered!");
}
-std::pair<unsigned, EVT>
-VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C,
- EVT Ty) const {
+std::pair<unsigned, MVT>
+VectorTargetTransformImpl::getTypeLegalizationCost(Type *Ty) const {
+
+ LLVMContext &C = Ty->getContext();
+ EVT MTy = TLI->getValueType(Ty);
+
unsigned Cost = 1;
// We keep legalizing the type until we find a legal kind. We assume that
// the only operation that costs anything is the split. After splitting
// we need to handle two types.
while (true) {
- TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, Ty);
+ TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, MTy);
if (LK.first == TargetLowering::TypeLegal)
- return std::make_pair(Cost, Ty);
+ return std::make_pair(Cost, MTy.getSimpleVT());
- if (LK.first == TargetLowering::TypeSplitVector)
+ if (LK.first == TargetLowering::TypeSplitVector ||
+ LK.first == TargetLowering::TypeExpandInteger)
Cost *= 2;
// Keep legalizing the type.
- Ty = LK.second;
+ MTy = LK.second;
}
}
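Two worked examples of this loop (subtargets assumed):

    // i386: i64 is illegal -> TypeExpandInteger, Cost *= 2, continue with
    //       i32 (legal)      => { 2, MVT::i32 }
    //       (the "cost of 2 ... add" checked by the new i32.ll test below)
    // AVX:  v8i64 is illegal -> TypeSplitVector, Cost *= 2, continue with
    //       v4i64 (legal)     => { 2, MVT::v4i64 }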
@@ -174,8 +178,7 @@ unsigned VectorTargetTransformImpl::getArithmeticInstrCost(unsigned Opcode,
int ISD = InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, EVT> LT =
- getTypeLegalizationCost(Ty->getContext(), TLI->getValueType(Ty));
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
if (!TLI->isOperationExpand(ISD, LT.second)) {
// The operation is legal. Assume it costs 1. Multiply
@@ -205,11 +208,8 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
int ISD = InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- std::pair<unsigned, EVT> SrcLT =
- getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src));
-
- std::pair<unsigned, EVT> DstLT =
- getTypeLegalizationCost(Dst->getContext(), TLI->getValueType(Dst));
+ std::pair<unsigned, MVT> SrcLT = getTypeLegalizationCost(Src);
+ std::pair<unsigned, MVT> DstLT = getTypeLegalizationCost(Dst);
// Handle scalar conversions.
if (!Src->isVectorTy() && !Dst->isVectorTy()) {
@@ -234,9 +234,17 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
SrcLT.second.getSizeInBits() == DstLT.second.getSizeInBits()) {
// Bitcast between types that are legalized to the same type are free.
- if (Opcode == Instruction::BitCast)
+ if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
return 0;
+ // Assume that Zext is done using AND.
+ if (Opcode == Instruction::ZExt)
+ return 1;
+
+ // Assume that sext is done using SHL and SRA.
+ if (Opcode == Instruction::SExt)
+ return 2;
+
// Just check the op cost. If the operation is legal then assume it costs
// 1 and multiply by the type-legalization overhead.
if (!TLI->isOperationExpand(ISD, DstLT.second))
@@ -283,8 +291,7 @@ unsigned VectorTargetTransformImpl::getCmpSelInstrCost(unsigned Opcode,
ISD = ISD::VSELECT;
}
- std::pair<unsigned, EVT> LT =
- getTypeLegalizationCost(ValTy->getContext(), TLI->getValueType(ValTy));
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
if (!TLI->isOperationExpand(ISD, LT.second)) {
// The operation is legal. Assume it costs 1. Multiply
@@ -300,7 +307,7 @@ unsigned VectorTargetTransformImpl::getCmpSelInstrCost(unsigned Opcode,
unsigned Cost = getCmpSelInstrCost(Opcode, ValTy->getScalarType(),
CondTy);
- // return the cost of multiple scalar invocation plus the cost of inserting
+ // Return the cost of multiple scalar invocation plus the cost of inserting
// and extracting the values.
return getScalarizationOverhead(ValTy, true, false) + Num * Cost;
}
@@ -309,7 +316,6 @@ unsigned VectorTargetTransformImpl::getCmpSelInstrCost(unsigned Opcode,
return 1;
}
-/// Returns the expected cost of Vector Insert and Extract.
unsigned VectorTargetTransformImpl::getVectorInstrCost(unsigned Opcode,
Type *Val,
unsigned Index) const {
@@ -326,8 +332,7 @@ unsigned
VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Alignment,
unsigned AddressSpace) const {
- std::pair<unsigned, EVT> LT =
- getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src));
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Src);
// Assume that all loads of legal types cost 1.
return LT.first;
@@ -335,7 +340,6 @@ VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned
VectorTargetTransformImpl::getNumberOfParts(Type *Tp) const {
- std::pair<unsigned, EVT> LT =
- getTypeLegalizationCost(Tp->getContext(), TLI->getValueType(Tp));
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Tp);
return LT.first;
}
diff --git a/lib/Target/X86/AsmParser/X86AsmLexer.cpp b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
index 2794e60df2..66ad353709 100644
--- a/lib/Target/X86/AsmParser/X86AsmLexer.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmLexer.cpp
@@ -18,19 +18,19 @@
using namespace llvm;
namespace {
-
+
class X86AsmLexer : public MCTargetAsmLexer {
const MCAsmInfo &AsmInfo;
-
+
bool tentativeIsValid;
AsmToken tentativeToken;
-
+
const AsmToken &lexTentative() {
tentativeToken = getLexer()->Lex();
tentativeIsValid = true;
return tentativeToken;
}
-
+
const AsmToken &lexDefinite() {
if (tentativeIsValid) {
tentativeIsValid = false;
@@ -38,7 +38,7 @@ class X86AsmLexer : public MCTargetAsmLexer {
}
return getLexer()->Lex();
}
-
+
AsmToken LexTokenATT();
AsmToken LexTokenIntel();
protected:
@@ -47,7 +47,7 @@ protected:
SetError(SMLoc(), "No MCAsmLexer installed");
return AsmToken(AsmToken::Error, "", 0);
}
-
+
switch (AsmInfo.getAssemblerDialect()) {
default:
SetError(SMLoc(), "Unhandled dialect");
@@ -71,33 +71,32 @@ public:
AsmToken X86AsmLexer::LexTokenATT() {
AsmToken lexedToken = lexDefinite();
-
+
switch (lexedToken.getKind()) {
default:
return lexedToken;
case AsmToken::Error:
SetError(Lexer->getErrLoc(), Lexer->getErr());
return lexedToken;
-
+
case AsmToken::Percent: {
const AsmToken &nextToken = lexTentative();
if (nextToken.getKind() != AsmToken::Identifier)
return lexedToken;
-
if (unsigned regID = MatchRegisterName(nextToken.getString())) {
lexDefinite();
-
+
// FIXME: This is completely wrong when there is a space or other
// punctuation between the % and the register name.
StringRef regStr(lexedToken.getString().data(),
- lexedToken.getString().size() +
+ lexedToken.getString().size() +
nextToken.getString().size());
-
- return AsmToken(AsmToken::Register, regStr,
+
+ return AsmToken(AsmToken::Register, regStr,
static_cast<int64_t>(regID));
}
-
+
// Match register name failed. If this is "db[0-7]", match it as an alias
// for dr[0-7].
if (nextToken.getString().size() == 3 &&
@@ -113,29 +112,29 @@ AsmToken X86AsmLexer::LexTokenATT() {
case '6': RegNo = X86::DR6; break;
case '7': RegNo = X86::DR7; break;
}
-
+
if (RegNo != -1) {
lexDefinite();
// FIXME: This is completely wrong when there is a space or other
// punctuation between the % and the register name.
StringRef regStr(lexedToken.getString().data(),
- lexedToken.getString().size() +
+ lexedToken.getString().size() +
nextToken.getString().size());
- return AsmToken(AsmToken::Register, regStr,
+ return AsmToken(AsmToken::Register, regStr,
static_cast<int64_t>(RegNo));
}
}
-
-
+
+
return lexedToken;
- }
+ }
}
}
AsmToken X86AsmLexer::LexTokenIntel() {
const AsmToken &lexedToken = lexDefinite();
-
+
switch(lexedToken.getKind()) {
default:
return lexedToken;
@@ -144,7 +143,7 @@ AsmToken X86AsmLexer::LexTokenIntel() {
return lexedToken;
case AsmToken::Identifier: {
unsigned regID = MatchRegisterName(lexedToken.getString().lower());
-
+
if (regID)
return AsmToken(AsmToken::Register,
lexedToken.getString(),
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index b0e5be3162..0ce4c126c2 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -276,9 +276,9 @@ namespace X86II {
MRM_C1 = 33, MRM_C2 = 34, MRM_C3 = 35, MRM_C4 = 36,
MRM_C8 = 37, MRM_C9 = 38, MRM_E8 = 39, MRM_F0 = 40,
MRM_F8 = 41, MRM_F9 = 42, MRM_D0 = 45, MRM_D1 = 46,
- MRM_D4 = 47, MRM_D8 = 48, MRM_D9 = 49, MRM_DA = 50,
- MRM_DB = 51, MRM_DC = 52, MRM_DD = 53, MRM_DE = 54,
- MRM_DF = 55,
+ MRM_D4 = 47, MRM_D5 = 48, MRM_D8 = 49, MRM_D9 = 50,
+ MRM_DA = 51, MRM_DB = 52, MRM_DC = 53, MRM_DD = 54,
+ MRM_DE = 55, MRM_DF = 56,
/// RawFrmImm8 - This is used for the ENTER instruction, which has two
/// immediates, the first of which is a 16-bit immediate (specified by
@@ -583,11 +583,11 @@ namespace X86II {
case X86II::MRM_E8: case X86II::MRM_F0:
case X86II::MRM_F8: case X86II::MRM_F9:
case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D8:
- case X86II::MRM_D9: case X86II::MRM_DA:
- case X86II::MRM_DB: case X86II::MRM_DC:
- case X86II::MRM_DD: case X86II::MRM_DE:
- case X86II::MRM_DF:
+ case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF:
return -1;
}
}
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 400263c3b5..4c6036761a 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -1124,13 +1124,13 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_C3: case X86II::MRM_C4:
case X86II::MRM_C8: case X86II::MRM_C9:
case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D4: case X86II::MRM_D8:
- case X86II::MRM_D9: case X86II::MRM_DA:
- case X86II::MRM_DB: case X86II::MRM_DC:
- case X86II::MRM_DD: case X86II::MRM_DE:
- case X86II::MRM_DF: case X86II::MRM_E8:
- case X86II::MRM_F0: case X86II::MRM_F8:
- case X86II::MRM_F9:
+ case X86II::MRM_D4: case X86II::MRM_D5:
+ case X86II::MRM_D8: case X86II::MRM_D9:
+ case X86II::MRM_DA: case X86II::MRM_DB:
+ case X86II::MRM_DC: case X86II::MRM_DD:
+ case X86II::MRM_DE: case X86II::MRM_DF:
+ case X86II::MRM_E8: case X86II::MRM_F0:
+ case X86II::MRM_F8: case X86II::MRM_F9:
EmitByte(BaseOpcode, CurByte, OS);
unsigned char MRM;
@@ -1145,6 +1145,7 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_D0: MRM = 0xD0; break;
case X86II::MRM_D1: MRM = 0xD1; break;
case X86II::MRM_D4: MRM = 0xD4; break;
+ case X86II::MRM_D5: MRM = 0xD5; break;
case X86II::MRM_D8: MRM = 0xD8; break;
case X86II::MRM_D9: MRM = 0xD9; break;
case X86II::MRM_DA: MRM = 0xDA; break;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 1b5c4d9753..8ad0bc08ac 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -118,6 +118,8 @@ def FeatureBMI : SubtargetFeature<"bmi", "HasBMI", "true",
"Support BMI instructions">;
def FeatureBMI2 : SubtargetFeature<"bmi2", "HasBMI2", "true",
"Support BMI2 instructions">;
+def FeatureRTM : SubtargetFeature<"rtm", "HasRTM", "true",
+ "Support RTM instructions">;
def FeatureLeaForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide : SubtargetFeature<"idiv-to-divb",
@@ -192,7 +194,8 @@ def : Proc<"core-avx2", [FeatureAVX2, FeatureCMPXCHG16B, FeaturePOPCNT,
FeatureAES, FeaturePCLMUL, FeatureRDRAND,
FeatureF16C, FeatureFSGSBase,
FeatureMOVBE, FeatureLZCNT, FeatureBMI,
- FeatureBMI2, FeatureFMA]>;
+ FeatureBMI2, FeatureFMA,
+ FeatureRTM]>;
def : Proc<"k6", [FeatureMMX]>;
def : Proc<"k6-2", [Feature3DNow]>;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 972913ea1f..c359d37885 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12396,6 +12396,63 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// private utility function
+/// Utility function to emit xbegin specifying the start of an RTM region.
+MachineBasicBlock *
+X86TargetLowering::EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB) const {
+ DebugLoc DL = MI->getDebugLoc();
+ const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+  // For v = xbegin(), we generate
+ //
+ // thisMBB:
+ // xbegin sinkMBB
+ //
+ // mainMBB:
+ // eax = -1
+ //
+ // sinkMBB:
+ // v = eax
+
+ MachineBasicBlock *thisMBB = MBB;
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // xbegin sinkMBB
+ // # fallthrough to mainMBB
+  //  # abort to sinkMBB
+ BuildMI(thisMBB, DL, TII->get(X86::XBEGIN_4)).addMBB(sinkMBB);
+ thisMBB->addSuccessor(mainMBB);
+ thisMBB->addSuccessor(sinkMBB);
+
+ // mainMBB:
+ // EAX = -1
+ BuildMI(mainMBB, DL, TII->get(X86::MOV32ri), X86::EAX).addImm(-1);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ // EAX is live into the sinkMBB
+ sinkMBB->addLiveIn(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::EAX);
+
+ MI->eraseFromParent();
+ return sinkMBB;
+}
+
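For orientation, this block structure corresponds to the usual RTM usage pattern (intrinsic spellings as in <immintrin.h>; a sketch, not part of the patch):

    #include <immintrin.h>
    int counter;
    void increment_transactionally(void) {
      if (_xbegin() == _XBEGIN_STARTED) { /* fallthrough: EAX = -1, mainMBB */
        ++counter;                        /* transactional region */
        _xend();                          /* commit */
      } else {
        /* abort: control re-enters at the xbegin branch target (sinkMBB)
           with the abort status in EAX */
      }
    }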
// Get CMPXCHG opcode for the specified data type.
static unsigned getCmpXChgOpcode(EVT VT) {
switch (VT.getSimpleVT().SimpleTy) {
@@ -14076,6 +14133,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::MONITOR:
return EmitMonitor(MI, BB);
+ // xbegin
+ case X86::XBEGIN:
+ return EmitXBegin(MI, BB);
+
// Atomic Lowering.
case X86::ATOMAND8:
case X86::ATOMAND16:
@@ -17404,7 +17465,7 @@ void X86TargetLowering::LowerAsmOperandForConstraint(SDValue Op,
return;
case 'K':
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
- if ((int8_t)C->getSExtValue() == C->getSExtValue()) {
+ if (isInt<8>(C->getSExtValue())) {
Result = DAG.getTargetConstant(C->getZExtValue(), Op.getValueType());
break;
}
@@ -17730,71 +17791,205 @@ X86TargetLowering::getRegForInlineAsmConstraint(const std::string &Constraint,
return Res;
}
+//===----------------------------------------------------------------------===//
+//
+// X86 cost model.
+//
+//===----------------------------------------------------------------------===//
+
+struct X86CostTblEntry {
+ int ISD;
+ MVT Type;
+ unsigned Cost;
+};
+
+static int
+FindInTable(const X86CostTblEntry *Tbl, unsigned len, int ISD, MVT Ty) {
+ for (unsigned int i = 0; i < len; ++i)
+ if (Tbl[i].ISD == ISD && Tbl[i].Type == Ty)
+ return i;
+
+ // Could not find an entry.
+ return -1;
+}
+
+struct X86TypeConversionCostTblEntry {
+ int ISD;
+ MVT Dst;
+ MVT Src;
+ unsigned Cost;
+};
+
+static int
+FindInConvertTable(const X86TypeConversionCostTblEntry *Tbl, unsigned len,
+ int ISD, MVT Dst, MVT Src) {
+ for (unsigned int i = 0; i < len; ++i)
+ if (Tbl[i].ISD == ISD && Tbl[i].Src == Src && Tbl[i].Dst == Dst)
+ return i;
+
+ // Could not find an entry.
+ return -1;
+}
+
unsigned
X86VectorTargetTransformInfo::getArithmeticInstrCost(unsigned Opcode,
Type *Ty) const {
- const X86Subtarget &ST =
- TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Ty);
+
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ static const X86CostTblEntry AVX1CostTable[] = {
+ // We don't have to scalarize unsupported ops. We can issue two half-sized
+ // operations and we only need to extract the upper YMM half.
+ // Two ops + 1 extract + 1 insert = 4.
+ { ISD::MUL, MVT::v8i32, 4 },
+ { ISD::SUB, MVT::v8i32, 4 },
+ { ISD::ADD, MVT::v8i32, 4 },
+ { ISD::MUL, MVT::v4i64, 4 },
+ { ISD::SUB, MVT::v4i64, 4 },
+ { ISD::ADD, MVT::v4i64, 4 },
+ };
- // Fix some of the inaccuracies of the target independent estimation.
- if (Ty->isVectorTy() && ST.hasSSE41()) {
- unsigned NumElem = Ty->getVectorNumElements();
- unsigned SizeInBits = Ty->getScalarType()->getScalarSizeInBits();
-
- bool Is2 = (NumElem == 2);
- bool Is4 = (NumElem == 4);
- bool Is8 = (NumElem == 8);
- bool Is32bits = (SizeInBits == 32);
- bool Is64bits = (SizeInBits == 64);
- bool HasAvx = ST.hasAVX();
- bool HasAvx2 = ST.hasAVX2();
-
- switch (Opcode) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Mul: {
- // Only AVX2 has support for 8-wide integer operations.
- if (Is32bits && (Is4 || (Is8 && HasAvx2))) return 1;
- if (Is64bits && (Is2 || (Is4 && HasAvx2))) return 1;
-
- // We don't have to completly scalarize unsupported ops. We can
- // issue two half-sized operations (with some overhead).
- // We don't need to extract the lower part of the YMM to the XMM.
- // Extract the upper, two ops, insert the upper = 4.
- if (Is32bits && Is8 && HasAvx) return 4;
- if (Is64bits && Is4 && HasAvx) return 4;
- break;
- }
- case Instruction::FAdd:
- case Instruction::FSub:
- case Instruction::FMul: {
- // AVX has support for 8-wide float operations.
- if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
- if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
- break;
- }
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor: {
- // AVX has support for 8-wide integer bitwise operations.
- if (Is32bits && (Is4 || (Is8 && HasAvx))) return 1;
- if (Is64bits && (Is2 || (Is4 && HasAvx))) return 1;
- break;
- }
- }
+ // Look for AVX1 lowering tricks.
+ if (ST.hasAVX()) {
+ int Idx = FindInTable(AVX1CostTable, array_lengthof(AVX1CostTable), ISD,
+ LT.second);
+ if (Idx != -1)
+ return LT.first * AVX1CostTable[Idx].Cost;
}
-
+  // Fall back to the default implementation.
return VectorTargetTransformImpl::getArithmeticInstrCost(Opcode, Ty);
}
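Worked example (AVX assumed): an add on <8 x i64> legalizes to LT = { 2, MVT::v4i64 }, and the table prices ISD::ADD on v4i64 at 4, so:

    cost(<8 x i64> add) = LT.first * AVX1CostTable[ADD, v4i64] = 2 * 4 = 8

which is the new CHECK line for %E in test/Analysis/CostModel/X86/arith.ll below.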
unsigned
X86VectorTargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
- unsigned Index) const {
- // Floating point scalars are already located in index #0.
- if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
- return 0;
+ unsigned Index) const {
+ assert(Val->isVectorTy() && "This must be a vector type");
+
+ if (Index != -1U) {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(Val);
+
+ // This type is legalized to a scalar type.
+ if (!LT.second.isVector())
+ return 0;
+
+ // The type may be split. Normalize the index to the new type.
+ unsigned Width = LT.second.getVectorNumElements();
+ Index = Index % Width;
+
+ // Floating point scalars are already located in index #0.
+ if (Val->getScalarType()->isFloatingPointTy() && Index == 0)
+ return 0;
+ }
+
return VectorTargetTransformImpl::getVectorInstrCost(Opcode, Val, Index);
}
+
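Worked example of the index normalization (AVX assumed, widest legal FP vector v4f64):

    // <16 x double> legalizes by splitting down to v4f64, so Width == 4:
    //   index 8 -> 8 % 4 == 0 -> free for FP (cost 0)
    //   index 9 -> 9 % 4 == 1 -> cost 1

The additions to insert-extract-at-zero.ll below check exactly these cases.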
+unsigned X86VectorTargetTransformInfo::getCmpSelInstrCost(unsigned Opcode,
+ Type *ValTy,
+ Type *CondTy) const {
+ // Legalize the type.
+ std::pair<unsigned, MVT> LT = getTypeLegalizationCost(ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ const X86Subtarget &ST =
+ TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ static const X86CostTblEntry SSE42CostTbl[] = {
+ { ISD::SETCC, MVT::v2f64, 1 },
+ { ISD::SETCC, MVT::v4f32, 1 },
+ { ISD::SETCC, MVT::v2i64, 1 },
+ { ISD::SETCC, MVT::v4i32, 1 },
+ { ISD::SETCC, MVT::v8i16, 1 },
+ { ISD::SETCC, MVT::v16i8, 1 },
+ };
+
+ static const X86CostTblEntry AVX1CostTbl[] = {
+ { ISD::SETCC, MVT::v4f64, 1 },
+ { ISD::SETCC, MVT::v8f32, 1 },
+ // AVX1 does not support 8-wide integer compare.
+ { ISD::SETCC, MVT::v4i64, 4 },
+ { ISD::SETCC, MVT::v8i32, 4 },
+ { ISD::SETCC, MVT::v16i16, 4 },
+ { ISD::SETCC, MVT::v32i8, 4 },
+ };
+
+ static const X86CostTblEntry AVX2CostTbl[] = {
+ { ISD::SETCC, MVT::v4i64, 1 },
+ { ISD::SETCC, MVT::v8i32, 1 },
+ { ISD::SETCC, MVT::v16i16, 1 },
+ { ISD::SETCC, MVT::v32i8, 1 },
+ };
+
+ if (ST.hasSSE42()) {
+ int Idx = FindInTable(SSE42CostTbl, array_lengthof(SSE42CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * SSE42CostTbl[Idx].Cost;
+ }
+
+ if (ST.hasAVX()) {
+ int Idx = FindInTable(AVX1CostTbl, array_lengthof(AVX1CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX1CostTbl[Idx].Cost;
+ }
+
+ if (ST.hasAVX2()) {
+ int Idx = FindInTable(AVX2CostTbl, array_lengthof(AVX2CostTbl), ISD, MTy);
+ if (Idx != -1)
+ return LT.first * AVX2CostTbl[Idx].Cost;
+ }
+
+ return VectorTargetTransformImpl::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+}
+
+unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
+ Type *Dst,
+ Type *Src) const {
+ int ISD = InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Invalid opcode");
+
+ EVT SrcTy = TLI->getValueType(Src);
+ EVT DstTy = TLI->getValueType(Dst);
+
+ if (!SrcTy.isSimple() || !DstTy.isSimple())
+ return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+
+ const X86Subtarget &ST = TLI->getTargetMachine().getSubtarget<X86Subtarget>();
+
+ static const X86TypeConversionCostTblEntry AVXConversionTbl[] = {
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 1 },
+ { ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
+ };
+
+ if (ST.hasAVX()) {
+ int Idx = FindInConvertTable(AVXConversionTbl,
+ array_lengthof(AVXConversionTbl),
+ ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT());
+ if (Idx != -1)
+ return AVXConversionTbl[Idx].Cost;
+ }
+
+ return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src);
+}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 39d24c35c7..872ef887c5 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -917,6 +917,10 @@ namespace llvm {
MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
MachineBasicBlock *MBB) const;
+ /// Utility function to emit xbegin specifying the start of an RTM region.
+ MachineBasicBlock *EmitXBegin(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *EmitVAARG64WithCustomInserter(
MachineInstr *MI,
@@ -976,8 +980,13 @@ namespace llvm {
virtual unsigned getVectorInstrCost(unsigned Opcode, Type *Val,
unsigned Index) const;
- };
+ unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
+ Type *CondTy) const;
+
+ virtual unsigned getCastInstrCost(unsigned Opcode, Type *Dst,
+ Type *Src) const;
+ };
}
#endif // X86ISELLOWERING_H
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 247c42ce8a..7309942880 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -44,14 +44,15 @@ def RawFrmImm16 : Format<44>;
def MRM_D0 : Format<45>;
def MRM_D1 : Format<46>;
def MRM_D4 : Format<47>;
-def MRM_D8 : Format<48>;
-def MRM_D9 : Format<49>;
-def MRM_DA : Format<50>;
-def MRM_DB : Format<51>;
-def MRM_DC : Format<52>;
-def MRM_DD : Format<53>;
-def MRM_DE : Format<54>;
-def MRM_DF : Format<55>;
+def MRM_D5 : Format<48>;
+def MRM_D8 : Format<49>;
+def MRM_D9 : Format<50>;
+def MRM_DA : Format<51>;
+def MRM_DB : Format<52>;
+def MRM_DC : Format<53>;
+def MRM_DD : Format<54>;
+def MRM_DE : Format<55>;
+def MRM_DF : Format<56>;
def CustomFrm : Format<62>; // @LOCALMOD
// ImmType - This specifies the immediate type used by an instruction. This is
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index dd13c6c551..aff5defedb 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -605,6 +605,7 @@ def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def HasRTM : Predicate<"Subtarget->hasRTM()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
@@ -1728,6 +1729,8 @@ include "X86Instr3DNow.td"
include "X86InstrVMX.td"
include "X86InstrSVM.td"
+include "X86InstrTSX.td"
+
// System instructions.
include "X86InstrSystem.td"
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index dff2d4ea1c..28dfbe7a1f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -1302,7 +1302,7 @@ let Predicates = [HasAVX] in {
(VMOVHPSrm VR128:$src1, addr:$src2)>;
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
- // is during lowering, where it's not possible to recognize the load fold
+ // is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
// and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1322,7 +1322,7 @@ let Predicates = [UseSSE1] in {
let Predicates = [UseSSE2] in {
// FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
- // is during lowering, where it's not possible to recognize the load fold
+ // is during lowering, where it's not possible to recognize the load fold
// cause it has two uses through a bitcast. One use disappears at isel time
// and the fold opportunity reappears.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -2159,7 +2159,7 @@ let Predicates = [UseSSE2] in {
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, SDNode OpNode, ValueType VT,
+ Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
OpndItins itins> {
def rr : SIi8<0xC2, MRMSrcReg,
@@ -2305,7 +2305,7 @@ let Defs = [EFLAGS] in {
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
- Operand CC, Intrinsic Int, string asm,
+ Operand CC, Intrinsic Int, string asm,
string asm_alt, Domain d> {
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
new file mode 100644
index 0000000000..ad55058ede
--- /dev/null
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -0,0 +1,32 @@
+//===-- X86InstrTSX.td - TSX Instruction Set Extension -----*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel TSX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// TSX instructions
+
+let usesCustomInserter = 1 in
+def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
+ "# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
+ Requires<[HasRTM]>;
+
+let isBranch = 1, isTerminator = 1, Defs = [EAX] in
+def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget:$dst),
+ "xbegin\t$dst", []>;
+
+def XEND : I<0x01, MRM_D5, (outs), (ins),
+ "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+
+def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
+ "xabort\t$imm",
+ [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
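Encoding note for context (from the Intel SDM):

    // xbegin rel32 : C7 F8 cd   (MRM_F8, shared with xabort)
    // xabort imm8  : C6 F8 ib   (MRM_F8)
    // xend         : 0F 01 D5   (MRM_D5: the new format this patch threads
    //                            through X86InstrFormats.td, X86BaseInfo.h,
    //                            and the MC code emitter above)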
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 2653f958a5..31be6b7a7b 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -310,6 +310,10 @@ void X86Subtarget::AutoDetectSubtargetFeatures() {
HasBMI2 = true;
ToggleFeature(X86::FeatureBMI2);
}
+ if (IsIntel && ((EBX >> 11) & 0x1)) {
+ HasRTM = true;
+ ToggleFeature(X86::FeatureRTM);
+ }
}
}
}
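The bit tested above is CPUID.(EAX=07H, ECX=0H):EBX[11]. The same detection in user code might look like this (GCC-style <cpuid.h> assumed; a sketch, not part of the patch):

    #include <cpuid.h>
    int cpu_has_rtm(void) {
      unsigned eax, ebx, ecx, edx;
      if (__get_cpuid_max(0, 0) < 7)     /* need structured feature leaf 7 */
        return 0;
      __cpuid_count(7, 0, eax, ebx, ecx, edx);
      return (ebx >> 11) & 1;            /* EBX bit 11 = RTM */
    }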
@@ -338,6 +342,7 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
, HasLZCNT(false)
, HasBMI(false)
, HasBMI2(false)
+ , HasRTM(false)
, IsBTMemSlow(false)
, IsUAMemFast(false)
, HasVectorUAMem(false)
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index a9ce62640e..0f8cab52f2 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -118,6 +118,9 @@ protected:
/// HasBMI2 - Processor has BMI2 instructions.
bool HasBMI2;
+ /// HasRTM - Processor has RTM instructions.
+ bool HasRTM;
+
/// IsBTMemSlow - True if BT (bit test) of memory instructions are slow.
bool IsBTMemSlow;
@@ -222,6 +225,7 @@ public:
bool hasLZCNT() const { return HasLZCNT; }
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
+ bool hasRTM() const { return HasRTM; }
bool isBTMemSlow() const { return IsBTMemSlow; }
bool isUnalignedMemAccessFast() const { return IsUAMemFast; }
bool hasVectorUAMem() const { return HasVectorUAMem; }
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 93f785ca5b..b7be4625ca 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -731,8 +731,9 @@ bool AddressSanitizer::doInitialization(Module &M) {
std::string FunctionName = std::string(kAsanReportErrorTemplate) +
(AccessIsWrite ? "store" : "load") + itostr(1 << AccessSizeIndex);
// If we are merging crash callbacks, they have two parameters.
- AsanErrorCallback[AccessIsWrite][AccessSizeIndex] = cast<Function>(
- M.getOrInsertFunction(FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
+ AsanErrorCallback[AccessIsWrite][AccessSizeIndex] =
+ checkInterfaceFunction(M.getOrInsertFunction(
+ FunctionName, IRB.getVoidTy(), IntptrTy, NULL));
}
}
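Rationale sketch for the change above (failure mode inferred; the conflicting prototype is hypothetical): if the module already declares an interface function with a mismatched type, getOrInsertFunction returns a bitcast rather than a Function:

    // e.g. a stray "void __asan_report_load4(int)" in the module yields
    //   bitcast (void (i32)* @__asan_report_load4 to void (i64)*)
    // cast<Function> would assert on that; checkInterfaceFunction turns it
    // into a diagnosable error instead.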
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index bacada58c1..7d652dea48 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -119,38 +119,6 @@ static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) {
namespace {
//===---------------------------------------===//
-// 'strspn' Optimizations
-
-struct StrSpnOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strspn(s, "") -> 0
- // strspn("", s) -> 0
- if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_not_of(S2);
- if (Pos == StringRef::npos) Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
// 'strcspn' Optimizations
struct StrCSpnOpt : public LibCallOptimization {
@@ -1037,7 +1005,7 @@ namespace {
StringMap<LibCallOptimization*> Optimizations;
// String and Memory LibCall Optimizations
- StrSpnOpt StrSpn; StrCSpnOpt StrCSpn; StrStrOpt StrStr;
+ StrCSpnOpt StrCSpn; StrStrOpt StrStr;
MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
// Math Library Optimizations
CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
@@ -1105,7 +1073,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
/// we know.
void SimplifyLibCalls::InitOptimizations() {
// String and Memory LibCall Optimizations
- Optimizations["strspn"] = &StrSpn;
Optimizations["strcspn"] = &StrCSpn;
Optimizations["strstr"] = &StrStr;
Optimizations["memcmp"] = &MemCmp;
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 9823433e86..b33d0d473b 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -3538,7 +3538,8 @@ static bool SwitchToLookupTable(SwitchInst *SI,
const TargetTransformInfo *TTI) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
- if (TTI && !TTI->getScalarTargetTransformInfo()->shouldBuildLookupTables())
+  // Only build a lookup table when we have a target that supports it.
+ if (!TTI || !TTI->getScalarTargetTransformInfo()->shouldBuildLookupTables())
return false;
// FIXME: If the switch is too sparse for a lookup table, perhaps we could
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 581b8d3ea2..64c7011660 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -772,6 +772,35 @@ struct StrToOpt : public LibCallOptimization {
}
};
+struct StrSpnOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ FT->getParamType(1) != FT->getParamType(0) ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strspn(s, "") -> 0
+ // strspn("", s) -> 0
+ if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_not_of(S2);
+ if (Pos == StringRef::npos) Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ return 0;
+ }
+};
+
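Hypothetical inputs showing what this constant folding produces:

    /* strspn("abcba", "abc") -> 5  (find_first_not_of == npos => S1.size())
       strspn("abxba", "ab")  -> 2  (first char outside the set at index 2)
       strspn(s, "") / strspn("", s) -> 0 (folded to a null value above) */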
} // End anonymous namespace.
namespace llvm {
@@ -802,6 +831,7 @@ class LibCallSimplifierImpl {
StrLenOpt StrLen;
StrPBrkOpt StrPBrk;
StrToOpt StrTo;
+ StrSpnOpt StrSpn;
void initOptimizations();
public:
@@ -842,6 +872,7 @@ void LibCallSimplifierImpl::initOptimizations() {
Optimizations["strtoll"] = &StrTo;
Optimizations["strtold"] = &StrTo;
Optimizations["strtoull"] = &StrTo;
+ Optimizations["strspn"] = &StrSpn;
}
Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
diff --git a/test/Analysis/CostModel/X86/arith.ll b/test/Analysis/CostModel/X86/arith.ll
index 58b4a7c426..37cca8d540 100644
--- a/test/Analysis/CostModel/X86/arith.ll
+++ b/test/Analysis/CostModel/X86/arith.ll
@@ -12,6 +12,8 @@ define i32 @add(i32 %arg) {
%C = add <2 x i64> undef, undef
;CHECK: cost of 4 {{.*}} add
%D = add <4 x i64> undef, undef
+ ;CHECK: cost of 8 {{.*}} add
+ %E = add <8 x i64> undef, undef
;CHECK: cost of 1 {{.*}} ret
ret i32 undef
}
diff --git a/test/Analysis/CostModel/X86/cast.ll b/test/Analysis/CostModel/X86/cast.ll
new file mode 100644
index 0000000000..75c97a781e
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cast.ll
@@ -0,0 +1,69 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @add(i32 %arg) {
+
+  ; -- Same size registers --
+ ;CHECK: cost of 1 {{.*}} zext
+ %A = zext <4 x i1> undef to <4 x i32>
+ ;CHECK: cost of 2 {{.*}} sext
+ %B = sext <4 x i1> undef to <4 x i32>
+ ;CHECK: cost of 0 {{.*}} trunc
+ %C = trunc <4 x i32> undef to <4 x i1>
+
+ ; -- Different size registers --
+ ;CHECK-NOT: cost of 1 {{.*}} zext
+ %D = zext <8 x i1> undef to <8 x i32>
+ ;CHECK-NOT: cost of 2 {{.*}} sext
+ %E = sext <8 x i1> undef to <8 x i32>
+ ;CHECK-NOT: cost of 2 {{.*}} trunc
+ %F = trunc <8 x i32> undef to <8 x i1>
+
+ ; -- scalars --
+
+ ;CHECK: cost of 1 {{.*}} zext
+ %G = zext i1 undef to i32
+ ;CHECK: cost of 0 {{.*}} trunc
+ %H = trunc i32 undef to i1
+
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+define i32 @zext_sext(<8 x i1> %in) {
+ ;CHECK: cost of 6 {{.*}} zext
+ %Z = zext <8 x i1> %in to <8 x i32>
+ ;CHECK: cost of 9 {{.*}} sext
+ %S = sext <8 x i1> %in to <8 x i32>
+
+ ;CHECK: cost of 1 {{.*}} sext
+ %A = sext <8 x i16> undef to <8 x i32>
+ ;CHECK: cost of 1 {{.*}} zext
+ %B = zext <8 x i16> undef to <8 x i32>
+ ;CHECK: cost of 1 {{.*}} sext
+ %C = sext <4 x i32> undef to <4 x i64>
+
+ ;CHECK: cost of 1 {{.*}} zext
+ %D = zext <4 x i32> undef to <4 x i64>
+ ;CHECK: cost of 1 {{.*}} trunc
+
+ %E = trunc <4 x i64> undef to <4 x i32>
+ ;CHECK: cost of 1 {{.*}} trunc
+ %F = trunc <8 x i32> undef to <8 x i16>
+
+ ;CHECK: cost of 3 {{.*}} trunc
+ %G = trunc <8 x i64> undef to <8 x i32>
+
+ ret i32 undef
+}
+
+define i32 @masks(<8 x i1> %in) {
+ ;CHECK: cost of 6 {{.*}} zext
+ %Z = zext <8 x i1> %in to <8 x i32>
+ ;CHECK: cost of 9 {{.*}} sext
+ %S = sext <8 x i1> %in to <8 x i32>
+ ret i32 undef
+}
+
diff --git a/test/Analysis/CostModel/X86/cmp.ll b/test/Analysis/CostModel/X86/cmp.ll
new file mode 100644
index 0000000000..f868bd18b5
--- /dev/null
+++ b/test/Analysis/CostModel/X86/cmp.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+define i32 @cmp(i32 %arg) {
+ ; -- floats --
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %A = fcmp olt <2 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %B = fcmp olt <4 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %C = fcmp olt <8 x float> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %D = fcmp olt <2 x double> undef, undef
+ ;CHECK: cost of 1 {{.*}} fcmp
+ %E = fcmp olt <4 x double> undef, undef
+
+ ; -- integers --
+
+ ;CHECK: cost of 1 {{.*}} icmp
+ %F = icmp eq <16 x i8> undef, undef
+ ;CHECK: cost of 1 {{.*}} icmp
+ %G = icmp eq <8 x i16> undef, undef
+ ;CHECK: cost of 1 {{.*}} icmp
+ %H = icmp eq <4 x i32> undef, undef
+ ;CHECK: cost of 1 {{.*}} icmp
+ %I = icmp eq <2 x i64> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %J = icmp eq <4 x i64> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %K = icmp eq <8 x i32> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %L = icmp eq <16 x i16> undef, undef
+ ;CHECK: cost of 4 {{.*}} icmp
+ %M = icmp eq <32 x i8> undef, undef
+
+ ;CHECK: cost of 1 {{.*}} ret
+ ret i32 undef
+}
+
+
diff --git a/test/Analysis/CostModel/X86/i32.ll b/test/Analysis/CostModel/X86/i32.ll
new file mode 100644
index 0000000000..4015e0b1ee
--- /dev/null
+++ b/test/Analysis/CostModel/X86/i32.ll
@@ -0,0 +1,9 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=i386 -mcpu=corei7-avx | FileCheck %s
+
+
+;CHECK: cost of 2 {{.*}} add
+;CHECK: cost of 1 {{.*}} ret
+define i32 @no_info(i32 %arg) {
+ %e = add i64 undef, undef
+ ret i32 undef
+}
diff --git a/test/Analysis/CostModel/X86/insert-extract-at-zero.ll b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
index eea5b601d0..87bf7c488b 100644
--- a/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
+++ b/test/Analysis/CostModel/X86/insert-extract-at-zero.ll
@@ -29,5 +29,12 @@ define i32 @insert-extract-at-zero-idx(i32 %arg, float %fl) {
;CHECK: cost of 0 {{.*}} insert
%J = insertelement <4 x double> undef, double undef, i32 0
+ ;CHECK: cost of 0 {{.*}} insert
+ %K = insertelement <8 x double> undef, double undef, i32 4
+ ;CHECK: cost of 0 {{.*}} insert
+ %L = insertelement <16 x double> undef, double undef, i32 8
+ ;CHECK: cost of 1 {{.*}} insert
+ %M = insertelement <16 x double> undef, double undef, i32 9
ret i32 0
}
+
diff --git a/test/CodeGen/Mips/eh-dwarf-cfa.ll b/test/CodeGen/Mips/eh-dwarf-cfa.ll
new file mode 100644
index 0000000000..3a21332b5c
--- /dev/null
+++ b/test/CodeGen/Mips/eh-dwarf-cfa.ll
@@ -0,0 +1,63 @@
+; RUN: llc -march=mipsel -mcpu=mips32 < %s | FileCheck %s
+; RUN: llc -march=mips64el -mcpu=mips64 < %s | \
+; RUN: FileCheck %s -check-prefix=CHECK-MIPS64
+
+declare i8* @llvm.eh.dwarf.cfa(i32) nounwind
+declare i8* @llvm.frameaddress(i32) nounwind readnone
+
+define i8* @f1() nounwind {
+entry:
+ %x = alloca [32 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+
+; CHECK: addiu $sp, $sp, -32
+; CHECK: addiu $2, $sp, 32
+}
+
+
+define i8* @f2() nounwind {
+entry:
+ %x = alloca [65536 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+
+; check stack size (65536 + 8)
+; CHECK: lui $[[R0:[a-z0-9]+]], 65535
+; CHECK: addiu $[[R0]], $[[R0]], -8
+; CHECK: addu $sp, $sp, $[[R0]]
+
+; check return value ($sp + stack size)
+; CHECK: lui $[[R1:[a-z0-9]+]], 1
+; CHECK: addu $[[R1]], $sp, $[[R1]]
+; CHECK: addiu $2, $[[R1]], 8
+}
+
+
+define i32 @f3() nounwind {
+entry:
+ %x = alloca [32 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ %1 = ptrtoint i8* %0 to i32
+ %2 = call i8* @llvm.frameaddress(i32 0)
+ %3 = ptrtoint i8* %2 to i32
+ %add = add i32 %1, %3
+ ret i32 %add
+
+; CHECK: addiu $sp, $sp, -40
+
+; check return value ($fp + stack size + $fp)
+; CHECK: addiu $[[R0:[a-z0-9]+]], $fp, 40
+; CHECK: addu $2, $[[R0]], $fp
+}
+
+
+define i8* @f4() nounwind {
+entry:
+ %x = alloca [32 x i8], align 1
+ %0 = call i8* @llvm.eh.dwarf.cfa(i32 0)
+ ret i8* %0
+
+; CHECK-MIPS64: daddiu $sp, $sp, -32
+; CHECK-MIPS64: daddiu $2, $sp, 32
+}
diff --git a/test/CodeGen/PowerPC/asm-Zy.ll b/test/CodeGen/PowerPC/asm-Zy.ll
new file mode 100644
index 0000000000..691165f237
--- /dev/null
+++ b/test/CodeGen/PowerPC/asm-Zy.ll
@@ -0,0 +1,14 @@
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+; RUN: llc < %s -march=ppc64 -mcpu=a2 | FileCheck %s
+
+define i32 @zytest(i32 %a) nounwind {
+entry:
+; CHECK: @zytest
+ %r = call i32 asm "lwbrx $0, ${1:y}", "=r,Z"(i32 %a) nounwind, !srcloc !0
+ ret i32 %r
+; CHECK: lwbrx 3, 0,
+}
+
+!0 = metadata !{i32 101688}
+
diff --git a/test/CodeGen/PowerPC/coalesce-ext.ll b/test/CodeGen/PowerPC/coalesce-ext.ll
index cc80f83307..f19175c9be 100644
--- a/test/CodeGen/PowerPC/coalesce-ext.ll
+++ b/test/CodeGen/PowerPC/coalesce-ext.ll
@@ -13,5 +13,6 @@ define i32 @test1sext(i64 %A, i64 %B, i32* %P, i64 *%P2) nounwind {
store volatile i32 %D, i32* %P
; Reuse low bits of extended register, don't extend live range of SUM.
; CHECK: stw [[EXT]]
- ret i32 %D
+ %R = add i32 %D, %D
+ ret i32 %R
}
diff --git a/test/CodeGen/PowerPC/ppc64-abi-extend.ll b/test/CodeGen/PowerPC/ppc64-abi-extend.ll
new file mode 100644
index 0000000000..8baf1c613e
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-abi-extend.ll
@@ -0,0 +1,97 @@
+; Verify that i32 argument/return values are extended to i64
+
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@si = common global i32 0, align 4
+@ui = common global i32 0, align 4
+
+declare void @arg_si(i32 signext)
+declare void @arg_ui(i32 zeroext)
+
+declare signext i32 @ret_si()
+declare zeroext i32 @ret_ui()
+
+define void @pass_arg_si() nounwind {
+entry:
+ %0 = load i32* @si, align 4
+ tail call void @arg_si(i32 signext %0) nounwind
+ ret void
+}
+; CHECK: @pass_arg_si
+; CHECK: lwa 3,
+; CHECK: bl arg_si
+
+define void @pass_arg_ui() nounwind {
+entry:
+ %0 = load i32* @ui, align 4
+ tail call void @arg_ui(i32 zeroext %0) nounwind
+ ret void
+}
+; CHECK: @pass_arg_ui
+; CHECK: lwz 3,
+; CHECK: bl arg_ui
+
+define i64 @use_arg_si(i32 signext %x) nounwind readnone {
+entry:
+ %conv = sext i32 %x to i64
+ ret i64 %conv
+}
+; CHECK: @use_arg_si
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define i64 @use_arg_ui(i32 zeroext %x) nounwind readnone {
+entry:
+ %conv = zext i32 %x to i64
+ ret i64 %conv
+}
+; CHECK: @use_arg_ui
+; CHECK: %entry
+; CHECK-NEXT: blr
+
+define signext i32 @pass_ret_si() nounwind readonly {
+entry:
+ %0 = load i32* @si, align 4
+ ret i32 %0
+}
+; CHECK: @pass_ret_si
+; CHECK: lwa 3,
+; CHECK: blr
+
+define zeroext i32 @pass_ret_ui() nounwind readonly {
+entry:
+ %0 = load i32* @ui, align 4
+ ret i32 %0
+}
+; CHECK: @pass_ret_ui
+; CHECK: lwz 3,
+; CHECK: blr
+
+define i64 @use_ret_si() nounwind {
+entry:
+ %call = tail call signext i32 @ret_si() nounwind
+ %conv = sext i32 %call to i64
+ ret i64 %conv
+}
+; CHECK: @use_ret_si
+; CHECK: bl ret_si
+; This verifies that the return register (3) set up by the ret_si
+; call is passed through unmodified as the return value of use_ret_si.
+; CHECK-NOT: 3
+; CHECK: blr
+
+define i64 @use_ret_ui() nounwind {
+entry:
+ %call = tail call zeroext i32 @ret_ui() nounwind
+ %conv = zext i32 %call to i64
+ ret i64 %conv
+}
+; CHECK: @use_ret_ui
+; CHECK: bl ret_ui
+; This verifies that the return register (3) set up by the ret_ui
+; call is passed through unmodified as the return value of use_ret_ui.
+; CHECK-NOT: 3
+; CHECK: blr
+
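For orientation: the ppc64 ELF ABI keeps i32 argument and return values extended to 64 bits in registers, so the caller sign-extends signext values (lwa) and zero-extends zeroext values (lwz), and an already-extended value can be forwarded with no further extension. The test corresponds roughly to C such as (a sketch, not the original source):

    extern void arg_si(int);
    extern void arg_ui(unsigned int);
    int si;
    unsigned int ui;

    void pass_arg_si(void) { arg_si(si); }   /* caller sign-extends: lwa */
    void pass_arg_ui(void) { arg_ui(ui); }   /* caller zero-extends: lwz */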
diff --git a/test/CodeGen/X86/misched-balance.ll b/test/CodeGen/X86/misched-balance.ll
new file mode 100644
index 0000000000..2184d9e960
--- /dev/null
+++ b/test/CodeGen/X86/misched-balance.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -pre-RA-sched=source -enable-misched \
+; RUN: -verify-machineinstrs | FileCheck %s
+;
+; Verify that the misched resource/latency balancing heuristics are sane.
+
+define void @unrolled_mmult1(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+ i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+ nounwind uwtable ssp {
+entry:
+ br label %for.body
+
+; The imulls with folded loads should be in order and interleaved with
+; addl, never adjacent. Also check that there is no spilling.
+;
+; Since the mmult1 IR is already in good order, this effectively ensures
+; that the scheduler maintains source order.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+for.body:
+ %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+ %tmp57 = load i32* %tmp56, align 4
+ %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+ %tmp58 = load i32* %arrayidx12.us.i61, align 4
+ %mul.us.i = mul nsw i32 %tmp58, %tmp57
+ %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+ %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+ %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+ %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+ %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+ %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+ %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+ %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+ %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+ %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+ %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+ %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+ %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+ %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+ %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+ %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+ %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+ %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+ %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+ %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+ %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+ %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+ %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+ %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+ %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+ %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+ %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+ %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+ %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+ %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+ %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+ %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+ %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+ %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+ %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+ %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+ %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+ %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+ %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+ %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+ %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+ %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+ %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+ %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+ %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+ %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+ %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+ %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+ %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+ %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+ %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+ %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+ %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+ %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+ %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+ store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+ %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %end, label %for.body
+
+end:
+ ret void
+}
+
+; Unlike the loop above, this IR starts out badly ordered and must be
+; rescheduled.
+;
+; CHECK: %for.body
+; CHECK-NOT: %rsp
+; CHECK: imull 4
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 8
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 12
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 16
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 20
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 24
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 28
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 32
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK: imull 36
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: addl
+; CHECK-NOT: {{imull|rsp}}
+; CHECK: %end
+define void @unrolled_mmult2(i32* %tmp55, i32* %tmp56, i32* %pre, i32* %pre94,
+ i32* %pre95, i32* %pre96, i32* %pre97, i32* %pre98, i32* %pre99,
+ i32* %pre100, i32* %pre101, i32* %pre102, i32* %pre103, i32* %pre104)
+ nounwind uwtable ssp {
+entry:
+ br label %for.body
+for.body:
+ %indvars.iv42.i = phi i64 [ %indvars.iv.next43.i, %for.body ], [ 0, %entry ]
+ %tmp57 = load i32* %tmp56, align 4
+ %arrayidx12.us.i61 = getelementptr inbounds i32* %pre, i64 %indvars.iv42.i
+ %tmp58 = load i32* %arrayidx12.us.i61, align 4
+ %arrayidx8.us.i.1 = getelementptr inbounds i32* %tmp56, i64 1
+ %tmp59 = load i32* %arrayidx8.us.i.1, align 4
+ %arrayidx12.us.i61.1 = getelementptr inbounds i32* %pre94, i64 %indvars.iv42.i
+ %tmp60 = load i32* %arrayidx12.us.i61.1, align 4
+ %arrayidx8.us.i.2 = getelementptr inbounds i32* %tmp56, i64 2
+ %tmp61 = load i32* %arrayidx8.us.i.2, align 4
+ %arrayidx12.us.i61.2 = getelementptr inbounds i32* %pre95, i64 %indvars.iv42.i
+ %tmp62 = load i32* %arrayidx12.us.i61.2, align 4
+ %arrayidx8.us.i.3 = getelementptr inbounds i32* %tmp56, i64 3
+ %tmp63 = load i32* %arrayidx8.us.i.3, align 4
+ %arrayidx12.us.i61.3 = getelementptr inbounds i32* %pre96, i64 %indvars.iv42.i
+ %tmp64 = load i32* %arrayidx12.us.i61.3, align 4
+ %arrayidx8.us.i.4 = getelementptr inbounds i32* %tmp56, i64 4
+ %tmp65 = load i32* %arrayidx8.us.i.4, align 4
+ %arrayidx12.us.i61.4 = getelementptr inbounds i32* %pre97, i64 %indvars.iv42.i
+ %tmp66 = load i32* %arrayidx12.us.i61.4, align 4
+ %arrayidx8.us.i.5 = getelementptr inbounds i32* %tmp56, i64 5
+ %tmp67 = load i32* %arrayidx8.us.i.5, align 4
+ %arrayidx12.us.i61.5 = getelementptr inbounds i32* %pre98, i64 %indvars.iv42.i
+ %tmp68 = load i32* %arrayidx12.us.i61.5, align 4
+ %arrayidx8.us.i.6 = getelementptr inbounds i32* %tmp56, i64 6
+ %tmp69 = load i32* %arrayidx8.us.i.6, align 4
+ %arrayidx12.us.i61.6 = getelementptr inbounds i32* %pre99, i64 %indvars.iv42.i
+ %tmp70 = load i32* %arrayidx12.us.i61.6, align 4
+ %mul.us.i = mul nsw i32 %tmp58, %tmp57
+ %arrayidx8.us.i.7 = getelementptr inbounds i32* %tmp56, i64 7
+ %tmp71 = load i32* %arrayidx8.us.i.7, align 4
+ %arrayidx12.us.i61.7 = getelementptr inbounds i32* %pre100, i64 %indvars.iv42.i
+ %tmp72 = load i32* %arrayidx12.us.i61.7, align 4
+ %arrayidx8.us.i.8 = getelementptr inbounds i32* %tmp56, i64 8
+ %tmp73 = load i32* %arrayidx8.us.i.8, align 4
+ %arrayidx12.us.i61.8 = getelementptr inbounds i32* %pre101, i64 %indvars.iv42.i
+ %tmp74 = load i32* %arrayidx12.us.i61.8, align 4
+ %arrayidx8.us.i.9 = getelementptr inbounds i32* %tmp56, i64 9
+ %tmp75 = load i32* %arrayidx8.us.i.9, align 4
+ %arrayidx12.us.i61.9 = getelementptr inbounds i32* %pre102, i64 %indvars.iv42.i
+ %tmp76 = load i32* %arrayidx12.us.i61.9, align 4
+ %mul.us.i.1 = mul nsw i32 %tmp60, %tmp59
+ %add.us.i.1 = add nsw i32 %mul.us.i.1, %mul.us.i
+ %mul.us.i.2 = mul nsw i32 %tmp62, %tmp61
+ %add.us.i.2 = add nsw i32 %mul.us.i.2, %add.us.i.1
+ %mul.us.i.3 = mul nsw i32 %tmp64, %tmp63
+ %add.us.i.3 = add nsw i32 %mul.us.i.3, %add.us.i.2
+ %mul.us.i.4 = mul nsw i32 %tmp66, %tmp65
+ %add.us.i.4 = add nsw i32 %mul.us.i.4, %add.us.i.3
+ %mul.us.i.5 = mul nsw i32 %tmp68, %tmp67
+ %add.us.i.5 = add nsw i32 %mul.us.i.5, %add.us.i.4
+ %mul.us.i.6 = mul nsw i32 %tmp70, %tmp69
+ %add.us.i.6 = add nsw i32 %mul.us.i.6, %add.us.i.5
+ %mul.us.i.7 = mul nsw i32 %tmp72, %tmp71
+ %add.us.i.7 = add nsw i32 %mul.us.i.7, %add.us.i.6
+ %mul.us.i.8 = mul nsw i32 %tmp74, %tmp73
+ %add.us.i.8 = add nsw i32 %mul.us.i.8, %add.us.i.7
+ %mul.us.i.9 = mul nsw i32 %tmp76, %tmp75
+ %add.us.i.9 = add nsw i32 %mul.us.i.9, %add.us.i.8
+ %arrayidx16.us.i = getelementptr inbounds i32* %tmp55, i64 %indvars.iv42.i
+ store i32 %add.us.i.9, i32* %arrayidx16.us.i, align 4
+ %indvars.iv.next43.i = add i64 %indvars.iv42.i, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next43.i to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, 10
+ br i1 %exitcond, label %end, label %for.body
+
+end:
+ ret void
+}
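Both functions are hand-unrolled copies of the same 10x10 integer matrix-multiply inner loop, differing only in instruction order; a reconstructed C sketch (assumed from the IR, names invented):

    void mmult_row(int *out, const int *row, int *const cols[10]) {
      for (int i = 0; i < 10; ++i) {
        int sum = 0;
        for (int j = 0; j < 10; ++j)     /* fully unrolled in the IR above */
          sum += cols[j][i] * row[j];
        out[i] = sum;
      }
    }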
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
new file mode 100644
index 0000000000..76eb9514f0
--- /dev/null
+++ b/test/CodeGen/X86/rtm.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+declare i32 @llvm.x86.xbegin() nounwind
+declare void @llvm.x86.xend() nounwind
+declare void @llvm.x86.xabort(i8) noreturn nounwind
+
+define i32 @test_xbegin() nounwind uwtable {
+entry:
+ %0 = tail call i32 @llvm.x86.xbegin() nounwind
+ ret i32 %0
+; CHECK: test_xbegin
+; CHECK: xbegin [[LABEL:.*BB.*]]
+; CHECK: [[LABEL]]:
+}
+
+define void @test_xend() nounwind uwtable {
+entry:
+ tail call void @llvm.x86.xend() nounwind
+ ret void
+; CHECK: test_xend
+; CHECK: xend
+}
+
+define void @test_xabort() nounwind uwtable {
+entry:
+ tail call void @llvm.x86.xabort(i8 2)
+ unreachable
+; CHECK: test_xabort
+; CHECK: xabort $2
+}
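These intrinsics correspond to the RTM intrinsics in <immintrin.h>; a minimal C usage sketch (assuming an RTM-capable CPU at run time):

    #include <immintrin.h>

    int try_transaction(void) {
      unsigned status = _xbegin();    /* llvm.x86.xbegin -> xbegin */
      if (status == _XBEGIN_STARTED) {
        /* transactional region */
        _xend();                      /* llvm.x86.xend -> xend */
        return 0;
      }
      return (int)status;             /* aborted: status holds the cause */
    }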
diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll
new file mode 100644
index 0000000000..b908bcefe4
--- /dev/null
+++ b/test/DebugInfo/X86/elf-names.ll
@@ -0,0 +1,109 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: 0x0000000b: DW_TAG_compile_unit
+; CHECK: 0x00000012: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000035] = "foo.cpp")
+; CHECK: 0x0000003c: DW_TAG_class_type
+; CHECK: 0x0000003d: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000006d] = "D")
+; CHECK: 0x00000044: DW_TAG_member
+; CHECK: 0x00000045: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000005d] = "c1")
+
+%class.D = type { i32, i32, i32, i32 }
+
+@_ZN1DC1Ev = alias void (%class.D*)* @_ZN1DC2Ev
+@_ZN1DC1ERKS_ = alias void (%class.D*, %class.D*)* @_ZN1DC2ERKS_
+
+define void @_ZN1DC2Ev(%class.D* nocapture %this) unnamed_addr nounwind uwtable align 2 {
+entry:
+ tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !29), !dbg !36
+ %c1 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !37
+ store i32 1, i32* %c1, align 4, !dbg !37, !tbaa !39
+ %c2 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !42
+ store i32 2, i32* %c2, align 4, !dbg !42, !tbaa !39
+ %c3 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !43
+ store i32 3, i32* %c3, align 4, !dbg !43, !tbaa !39
+ %c4 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !44
+ store i32 4, i32* %c4, align 4, !dbg !44, !tbaa !39
+ ret void, !dbg !45
+}
+
+define void @_ZN1DC2ERKS_(%class.D* nocapture %this, %class.D* nocapture %d) unnamed_addr nounwind uwtable align 2 {
+entry:
+ tail call void @llvm.dbg.value(metadata !{%class.D* %this}, i64 0, metadata !34), !dbg !46
+ tail call void @llvm.dbg.value(metadata !{%class.D* %d}, i64 0, metadata !35), !dbg !46
+ %c1 = getelementptr inbounds %class.D* %d, i64 0, i32 0, !dbg !47
+ %0 = load i32* %c1, align 4, !dbg !47, !tbaa !39
+ %c12 = getelementptr inbounds %class.D* %this, i64 0, i32 0, !dbg !47
+ store i32 %0, i32* %c12, align 4, !dbg !47, !tbaa !39
+ %c2 = getelementptr inbounds %class.D* %d, i64 0, i32 1, !dbg !49
+ %1 = load i32* %c2, align 4, !dbg !49, !tbaa !39
+ %c23 = getelementptr inbounds %class.D* %this, i64 0, i32 1, !dbg !49
+ store i32 %1, i32* %c23, align 4, !dbg !49, !tbaa !39
+ %c3 = getelementptr inbounds %class.D* %d, i64 0, i32 2, !dbg !50
+ %2 = load i32* %c3, align 4, !dbg !50, !tbaa !39
+ %c34 = getelementptr inbounds %class.D* %this, i64 0, i32 2, !dbg !50
+ store i32 %2, i32* %c34, align 4, !dbg !50, !tbaa !39
+ %c4 = getelementptr inbounds %class.D* %d, i64 0, i32 3, !dbg !51
+ %3 = load i32* %c4, align 4, !dbg !51, !tbaa !39
+ %c45 = getelementptr inbounds %class.D* %this, i64 0, i32 3, !dbg !51
+ store i32 %3, i32* %c45, align 4, !dbg !51, !tbaa !39
+ ret void, !dbg !52
+}
+
+declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", metadata !"clang version 3.2 (trunk 167506) (llvm/trunk 167505)", i1 true, i1 true, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/home/echristo/foo.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !31}
+!5 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2Ev", metadata !6, i32 12, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*)* @_ZN1DC2Ev, null, metadata !17, metadata !27, i32 12} ; [ DW_TAG_subprogram ] [line 12] [def] [D]
+!6 = metadata !{i32 786473, metadata !"foo.cpp", metadata !"/usr/local/google/home/echristo", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{null, metadata !9}
+!9 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!10 = metadata !{i32 786434, null, metadata !"D", metadata !6, i32 1, i64 128, i64 32, i32 0, i32 0, null, metadata !11, i32 0, null, null} ; [ DW_TAG_class_type ] [D] [line 1, size 128, align 32, offset 0] [from ]
+!11 = metadata !{metadata !12, metadata !14, metadata !15, metadata !16, metadata !17, metadata !20}
+!12 = metadata !{i32 786445, metadata !10, metadata !"c1", metadata !6, i32 6, i64 32, i64 32, i64 0, i32 1, metadata !13} ; [ DW_TAG_member ] [c1] [line 6, size 32, align 32, offset 0] [private] [from int]
+!13 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!14 = metadata !{i32 786445, metadata !10, metadata !"c2", metadata !6, i32 7, i64 32, i64 32, i64 32, i32 1, metadata !13} ; [ DW_TAG_member ] [c2] [line 7, size 32, align 32, offset 32] [private] [from int]
+!15 = metadata !{i32 786445, metadata !10, metadata !"c3", metadata !6, i32 8, i64 32, i64 32, i64 64, i32 1, metadata !13} ; [ DW_TAG_member ] [c3] [line 8, size 32, align 32, offset 64] [private] [from int]
+!16 = metadata !{i32 786445, metadata !10, metadata !"c4", metadata !6, i32 9, i64 32, i64 32, i64 96, i32 1, metadata !13} ; [ DW_TAG_member ] [c4] [line 9, size 32, align 32, offset 96] [private] [from int]
+!17 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 3, metadata !7, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [D]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{i32 786478, i32 0, metadata !10, metadata !"D", metadata !"D", metadata !"", metadata !6, i32 4, metadata !21, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 true, null, null, i32 0, metadata !25, i32 4} ; [ DW_TAG_subprogram ] [line 4] [D]
+!21 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !22, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!22 = metadata !{null, metadata !9, metadata !23}
+!23 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !24} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from ]
+!24 = metadata !{i32 786470, null, metadata !"", null, i32 0, i64 0, i64 0, i64 0, i32 0, metadata !10} ; [ DW_TAG_const_type ] [line 0, size 0, align 0, offset 0] [from D]
+!25 = metadata !{metadata !26}
+!26 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!27 = metadata !{metadata !28}
+!28 = metadata !{metadata !29}
+!29 = metadata !{i32 786689, metadata !5, metadata !"this", metadata !6, i32 16777228, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 12]
+!30 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !10} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from D]
+!31 = metadata !{i32 786478, i32 0, null, metadata !"D", metadata !"D", metadata !"_ZN1DC2ERKS_", metadata !6, i32 19, metadata !21, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, void (%class.D*, %class.D*)* @_ZN1DC2ERKS_, null, metadata !20, metadata !32, i32 19} ; [ DW_TAG_subprogram ] [line 19] [def] [D]
+!32 = metadata !{metadata !33}
+!33 = metadata !{metadata !34, metadata !35}
+!34 = metadata !{i32 786689, metadata !31, metadata !"this", metadata !6, i32 16777235, metadata !30, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 19]
+!35 = metadata !{i32 786689, metadata !31, metadata !"d", metadata !6, i32 33554451, metadata !23, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [d] [line 19]
+!36 = metadata !{i32 12, i32 0, metadata !5, null}
+!37 = metadata !{i32 13, i32 0, metadata !38, null}
+!38 = metadata !{i32 786443, metadata !5, i32 12, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!39 = metadata !{metadata !"int", metadata !40}
+!40 = metadata !{metadata !"omnipotent char", metadata !41}
+!41 = metadata !{metadata !"Simple C/C++ TBAA"}
+!42 = metadata !{i32 14, i32 0, metadata !38, null}
+!43 = metadata !{i32 15, i32 0, metadata !38, null}
+!44 = metadata !{i32 16, i32 0, metadata !38, null}
+!45 = metadata !{i32 17, i32 0, metadata !38, null}
+!46 = metadata !{i32 19, i32 0, metadata !31, null}
+!47 = metadata !{i32 20, i32 0, metadata !48, null}
+!48 = metadata !{i32 786443, metadata !31, i32 19, i32 0, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/usr/local/google/home/echristo/foo.cpp]
+!49 = metadata !{i32 21, i32 0, metadata !48, null}
+!50 = metadata !{i32 22, i32 0, metadata !48, null}
+!51 = metadata !{i32 23, i32 0, metadata !48, null}
+!52 = metadata !{i32 24, i32 0, metadata !48, null}
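Reconstructed from the stores and the DW_TAG_member/DW_TAG_subprogram metadata above, the original C++ is approximately (a sketch; member names and line numbers per the metadata):

    class D {
      int c1, c2, c3, c4;
    public:
      D();
      D(const D &d);
    };
    D::D()           { c1 = 1;    c2 = 2;    c3 = 3;    c4 = 4;    }
    D::D(const D &d) { c1 = d.c1; c2 = d.c2; c3 = d.c3; c4 = d.c4; }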
diff --git a/test/MC/MachO/gen-dwarf-macro-cpp.s b/test/MC/MachO/gen-dwarf-macro-cpp.s
new file mode 100644
index 0000000000..05a449b402
--- /dev/null
+++ b/test/MC/MachO/gen-dwarf-macro-cpp.s
@@ -0,0 +1,17 @@
+// RUN: llvm-mc -g -triple i386-apple-darwin10 %s -filetype=obj -o %t
+// RUN: llvm-dwarfdump %t | FileCheck %s
+
+# 1 "foo.S" 2
+.macro switcher
+ ljmp *0x38(%ecx)
+.endmacro
+ switcher NaClSwitchNoSSE, 0
+
+// PR14264 was a crash caused by the .macro directive not being handled correctly
+// rdar://12637628
+
+// We check that the source file name "foo.S" is picked up.
+// CHECK: Dir Mod Time File Len File Name
+// CHECK: ---- ---------- ---------- ---------------------------
+// CHECK: file_names[ 1] 1 0x00000000 0x00000000 gen-dwarf-macro-cpp.s
+// CHECK: file_names[ 2] 0 0x00000000 0x00000000 foo.S
diff --git a/test/MC/X86/x86_64-rtm-encoding.s b/test/MC/X86/x86_64-rtm-encoding.s
new file mode 100644
index 0000000000..44d6bacb7f
--- /dev/null
+++ b/test/MC/X86/x86_64-rtm-encoding.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown --show-encoding %s | FileCheck %s
+
+// CHECK: xbegin .L0
+// CHECK: encoding: [0xc7,0xf8,A,A,A,A]
+ xbegin .L0
+
+// CHECK: xend
+// CHECK: encoding: [0x0f,0x01,0xd5]
+ xend
+
+// CHECK: xabort
+// CHECK: encoding: [0xc6,0xf8,0x0d]
+ xabort $13
diff --git a/test/Transforms/InstCombine/strspn-1.ll b/test/Transforms/InstCombine/strspn-1.ll
new file mode 100644
index 0000000000..393f88735b
--- /dev/null
+++ b/test/Transforms/InstCombine/strspn-1.ll
@@ -0,0 +1,56 @@
+; Test that the strspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@abcba = constant [6 x i8] c"abcba\00"
+@abc = constant [4 x i8] c"abc\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i64 @strspn(i8*, i8*)
+
+; Check strspn(s, "") -> 0.
+
+define i64 @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strspn("", s) -> 0.
+
+define i64 @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strspn(s1, s2), where s1 and s2 are constants.
+
+define i64 @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0
+ %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 5
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+ %ret = call i64 @strspn(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i64 @strspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 %ret
+}
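As a reminder of the semantics being constant-folded: strspn returns the length of the longest initial segment of its first argument made up only of characters from its second, so for the constants above:

    #include <string.h>
    size_t n = strspn("abcba", "abc");  /* 5: all of "abcba" is drawn from "abc" */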
diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
index 8582613617..8f1bb545fa 100644
--- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll
+++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll
@@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
target triple = "x86_64-apple-macosx10.8.0"
;CHECK: @conversion_cost1
-;CHECK: store <8 x i8>
+;CHECK: store <2 x i8>
;CHECK: ret
define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 3
@@ -25,7 +25,7 @@ define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) noun
}
;CHECK: @conversion_cost2
-;CHECK: store <8 x float>
+;CHECK: <2 x float>
;CHECK: ret
define i32 @conversion_cost2(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp {
%1 = icmp sgt i32 %n, 9
diff --git a/test/Transforms/LoopVectorize/X86/cost-model.ll b/test/Transforms/LoopVectorize/X86/cost-model.ll
index 40e660855b..628f9912c8 100644
--- a/test/Transforms/LoopVectorize/X86/cost-model.ll
+++ b/test/Transforms/LoopVectorize/X86/cost-model.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-macosx10.8.0"
@a = common global [2048 x i32] zeroinitializer, align 16
;CHECK: cost_model_1
-;CHECK-NOT: <4 x i32>
+;CHECK: <4 x i32>
;CHECK: ret void
define void @cost_model_1() nounwind uwtable noinline ssp {
entry:
diff --git a/test/Transforms/SimplifyCFG/X86/lit.local.cfg b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
new file mode 100644
index 0000000000..a8ad0f1a28
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/X86/lit.local.cfg
@@ -0,0 +1,6 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+targets = set(config.root.targets_to_build.split())
+if not 'X86' in targets:
+ config.unsupported = True
+
diff --git a/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index d358350554..8a59992f5e 100644
--- a/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -simplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
@@ -770,7 +770,7 @@ sw.bb3: br label %return
sw.default: unreachable
return:
- %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 1, %sw.bb1 ], [ 0, %sw.bb ]
+ %retval.0 = phi i32 [ 1, %sw.bb3 ], [ -1, %sw.bb2 ], [ 0, %sw.bb ]
ret i32 %retval.0
; CHECK: @unreachable
diff --git a/test/Transforms/SimplifyCFG/switch_create.ll b/test/Transforms/SimplifyCFG/switch_create.ll
index b28e4a4550..546cc75f29 100644
--- a/test/Transforms/SimplifyCFG/switch_create.ll
+++ b/test/Transforms/SimplifyCFG/switch_create.ll
@@ -141,9 +141,8 @@ UnifiedReturnBlock: ; preds = %shortcirc_done.4, %shortcirc_next.4
ret i1 %UnifiedRetVal
; CHECK: @test6
-; CHECK: %switch.tableidx = sub i32 %tmp.2.i, 14
-; CHECK: %0 = icmp ult i32 %switch.tableidx, 6
-; CHECK: select i1 %0, i1 true, i1 false
+; CHECK: %tmp.2.i.off = add i32 %tmp.2.i, -14
+; CHECK: %switch = icmp ult i32 %tmp.2.i.off, 6
}
define void @test7(i8 zeroext %c, i32 %x) nounwind ssp noredzone {
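The new CHECK lines reflect the canonical range-test idiom: a chain of equality tests over consecutive values is now collapsed into a single unsigned comparison instead of a lookup table. The equivalent C transformation, for illustration:

    /* (c == 14 || c == 15 || ... || c == 19) becomes: */
    int in_range(int c) {
      return (unsigned)(c - 14) < 6u;
    }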
diff --git a/test/Transforms/SimplifyLibCalls/StrSpn.ll b/test/Transforms/SimplifyLibCalls/StrSpn.ll
index 800c190883..2660ee9800 100644
--- a/test/Transforms/SimplifyLibCalls/StrSpn.ll
+++ b/test/Transforms/SimplifyLibCalls/StrSpn.ll
@@ -6,22 +6,6 @@ target datalayout = "-p:64:64:64"
@abc = constant [4 x i8] c"abc\00"
@null = constant [1 x i8] zeroinitializer
-declare i64 @strspn(i8*, i8*)
-
-define i64 @testspn(i8* %s1, i8* %s2) {
- %abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0
- %abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %test1 = call i64 @strspn(i8* %s1, i8* %null_p)
- %test2 = call i64 @strspn(i8* %null_p, i8* %s2)
- %test3 = call i64 @strspn(i8* %abcba_p, i8* %abc_p)
-; CHECK-NOT: call i64 @strspn
- %test4 = call i64 @strspn(i8* %s1, i8* %s2)
-; CHECK: call i64 @strspn(i8* %s1, i8* %s2)
- ret i64 %test3
-; CHECK: ret i64 5
-}
-
declare i64 @strcspn(i8*, i8*)
define i64 @testcspn(i8* %s1, i8* %s2) {
diff --git a/tools/lli/CMakeLists.txt b/tools/lli/CMakeLists.txt
index a9c7adf978..ed479f5323 100644
--- a/tools/lli/CMakeLists.txt
+++ b/tools/lli/CMakeLists.txt
@@ -1,5 +1,5 @@
-set(LLVM_LINK_COMPONENTS mcjit jit interpreter nativecodegen bitreader asmparser selectiondag)
+set(LLVM_LINK_COMPONENTS mcjit jit interpreter nativecodegen bitreader asmparser selectiondag native)
if( LLVM_USE_OPROFILE )
set(LLVM_LINK_COMPONENTS
diff --git a/tools/lli/LLVMBuild.txt b/tools/lli/LLVMBuild.txt
index 4eb82bd9e1..36ceb39b12 100644
--- a/tools/lli/LLVMBuild.txt
+++ b/tools/lli/LLVMBuild.txt
@@ -19,4 +19,4 @@
type = Tool
name = lli
parent = Tools
-required_libraries = AsmParser BitReader Interpreter JIT MCJIT NativeCodeGen SelectionDAG
+required_libraries = AsmParser BitReader Interpreter JIT MCJIT NativeCodeGen SelectionDAG Native
diff --git a/tools/lli/Makefile b/tools/lli/Makefile
index 100fc2e415..31f3ab8a1e 100644
--- a/tools/lli/Makefile
+++ b/tools/lli/Makefile
@@ -12,7 +12,7 @@ TOOLNAME := lli
include $(LEVEL)/Makefile.config
-LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser selectiondag
+LINK_COMPONENTS := mcjit jit interpreter nativecodegen bitreader asmparser selectiondag native
# If Intel JIT Events support is configured, link against the LLVM Intel JIT
# Events interface library
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 22b48cf63f..d41a595de8 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -475,6 +475,10 @@ void layoutRemoteTargetMemory(RemoteTarget *T, RecordingMemoryManager *JMM) {
<< " to remote: " << format("%p", Addr) << "\n");
}
+
+ // Trigger application of relocations
+ EE->finalizeObject();
+
// Now load it all to the target.
for (unsigned i = 0, e = Offsets.size(); i != e; ++i) {
uint64_t Addr = RemoteAddr + Offsets[i].second;
@@ -508,6 +512,7 @@ int main(int argc, char **argv, char * const *envp) {
// usable by the JIT.
InitializeNativeTarget();
InitializeNativeTargetAsmPrinter();
+ InitializeNativeTargetAsmParser();
cl::ParseCommandLineOptions(argc, argv,
"llvm interpreter & dynamic compiler\n");
diff --git a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
index 309bc4ecd4..e73300a0cd 100644
--- a/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
+++ b/tools/llvm-dwarfdump/llvm-dwarfdump.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/RelocVisitor.h"
#include "llvm/DebugInfo/DIContext.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -28,6 +29,9 @@
#include "llvm/Support/system_error.h"
#include <algorithm>
#include <cstring>
+#include <list>
+#include <string>
+
using namespace llvm;
using namespace object;
@@ -67,6 +71,7 @@ static void DumpInput(const StringRef &Filename) {
OwningPtr<ObjectFile> Obj(ObjectFile::createObjectFile(Buff.take()));
StringRef DebugInfoSection;
+ RelocAddrMap RelocMap;
StringRef DebugAbbrevSection;
StringRef DebugLineSection;
StringRef DebugArangesSection;
@@ -97,6 +102,57 @@ static void DumpInput(const StringRef &Filename) {
DebugStringSection = data;
else if (name == "debug_ranges")
DebugRangesSection = data;
+ // Any more debug info sections go here.
+ else
+ continue;
+
+ // TODO: For now only handle relocations for the debug_info section.
+ if (name != "debug_info")
+ continue;
+
+ if (i->begin_relocations() != i->end_relocations()) {
+ uint64_t SectionSize;
+ i->getSize(SectionSize);
+ for (relocation_iterator reloc_i = i->begin_relocations(),
+ reloc_e = i->end_relocations();
+ reloc_i != reloc_e; reloc_i.increment(ec)) {
+ uint64_t Address;
+ reloc_i->getAddress(Address);
+ uint64_t Type;
+ reloc_i->getType(Type);
+
+ RelocVisitor V(Obj->getFileFormatName());
+ // The section address is always 0 for debug sections.
+ RelocToApply R(V.visit(Type, *reloc_i));
+ if (V.error()) {
+ SmallString<32> Name;
+ error_code ec(reloc_i->getTypeName(Name));
+ if (ec) {
+ errs() << "Aaaaaa! Nameless relocation! Aaaaaa!\n";
+ }
+ errs() << "error: failed to compute relocation: "
+ << Name << "\n";
+ continue;
+ }
+
+ if (Address + R.Width > SectionSize) {
+ errs() << "error: " << R.Width << "-byte relocation starting "
+ << Address << " bytes into section " << name << " which is "
+ << SectionSize << " bytes long.\n";
+ continue;
+ }
+ if (R.Width > 8) {
+ errs() << "error: can't handle a relocation of more than 8 bytes at "
+ "a time.\n";
+ continue;
+ }
+ DEBUG(dbgs() << "Writing " << format("%p", R.Value)
+ << " at " << format("%p", Address)
+ << " with width " << format("%d", R.Width)
+ << "\n");
+ RelocMap[Address] = std::make_pair(R.Width, R.Value);
+ }
+ }
}
OwningPtr<DIContext> dictx(DIContext::getDWARFContext(/*FIXME*/true,
@@ -105,7 +161,8 @@ static void DumpInput(const StringRef &Filename) {
DebugArangesSection,
DebugLineSection,
DebugStringSection,
- DebugRangesSection));
+ DebugRangesSection,
+ RelocMap));
if (Address == -1ULL) {
outs() << Filename
<< ":\tfile format " << Obj->getFileFormatName() << "\n\n";
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 24155c0a5a..d6ed2fe2c6 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -38,14 +38,15 @@ using namespace llvm;
MAP(D0, 45) \
MAP(D1, 46) \
MAP(D4, 47) \
- MAP(D8, 48) \
- MAP(D9, 49) \
- MAP(DA, 50) \
- MAP(DB, 51) \
- MAP(DC, 52) \
- MAP(DD, 53) \
- MAP(DE, 54) \
- MAP(DF, 55)
+ MAP(D5, 48) \
+ MAP(D8, 49) \
+ MAP(D9, 50) \
+ MAP(DA, 51) \
+ MAP(DB, 52) \
+ MAP(DC, 53) \
+ MAP(DD, 54) \
+ MAP(DE, 55) \
+ MAP(DF, 56)
// A clone of X86 since we can't depend on something that is generated.
namespace X86Local {