author     Derek Schuff <dschuff@chromium.org>  2012-09-25 17:30:25 -0700
committer  Derek Schuff <dschuff@chromium.org>  2012-09-25 18:01:23 -0700
commit     a27c28b1427dc2082ab2b31efdbb25f9fde31b61 (patch)
tree       6f3ff025f542ca3f66a1a01cbf239aeef7784511
parent     0e15ffd8cb1ec642eddb96380660914ff2b007e1 (diff)
parent     bc4021f31eaa97ee52655828da3e3de14a39e4a6 (diff)
Merge commit 'bc4021f31eaa97ee52655828da3e3de14a39e4a6'
Conflicts:
	lib/MC/MCAssembler.cpp
	lib/Target/ARM/ARMISelDAGToDAG.cpp
	lib/Target/Mips/MipsInstrFPU.td
	lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
	lib/Target/X86/X86ISelLowering.h
-rw-r--r--  autoconf/configure.ac | 15
-rwxr-xr-x  configure | 29
-rw-r--r--  docs/CodingStandards.rst | 66
-rw-r--r--  docs/DebuggingJITedCode.html | 184
-rw-r--r--  docs/DebuggingJITedCode.rst | 147
-rw-r--r--  docs/DeveloperPolicy.rst | 2
-rw-r--r--  docs/HowToAddABuilder.html | 142
-rw-r--r--  docs/HowToAddABuilder.rst | 90
-rw-r--r--  docs/LangRef.html | 39
-rw-r--r--  docs/programming.rst | 5
-rw-r--r--  docs/subsystems.rst | 3
-rw-r--r--  docs/userguides.rst | 4
-rw-r--r--  docs/yaml2obj.rst | 4
-rw-r--r--  include/llvm-c/Core.h | 21
-rw-r--r--  include/llvm/ADT/APInt.h | 4
-rw-r--r--  include/llvm/ADT/DAGDeltaAlgorithm.h | 15
-rw-r--r--  include/llvm/ADT/DeltaAlgorithm.h | 14
-rw-r--r--  include/llvm/ADT/DenseMap.h | 1
-rw-r--r--  include/llvm/ADT/EquivalenceClasses.h | 2
-rw-r--r--  include/llvm/ADT/Hashing.h | 2
-rw-r--r--  include/llvm/ADT/ImmutableList.h | 5
-rw-r--r--  include/llvm/ADT/ImmutableMap.h | 4
-rw-r--r--  include/llvm/ADT/ImmutableSet.h | 79
-rw-r--r--  include/llvm/ADT/MapVector.h | 80
-rw-r--r--  include/llvm/ADT/OwningPtr.h | 9
-rw-r--r--  include/llvm/ADT/ScopedHashTable.h | 4
-rw-r--r--  include/llvm/ADT/SmallPtrSet.h | 7
-rw-r--r--  include/llvm/ADT/SmallString.h | 100
-rw-r--r--  include/llvm/ADT/SmallVector.h | 4
-rw-r--r--  include/llvm/ADT/SparseSet.h | 4
-rw-r--r--  include/llvm/ADT/StringExtras.h | 21
-rw-r--r--  include/llvm/ADT/StringRef.h | 163
-rw-r--r--  include/llvm/ADT/Triple.h | 15
-rw-r--r--  include/llvm/ADT/Twine.h | 28
-rw-r--r--  include/llvm/ADT/ValueMap.h | 4
-rw-r--r--  include/llvm/ADT/ilist.h | 5
-rw-r--r--  include/llvm/Analysis/AliasSetTracker.h | 5
-rw-r--r--  include/llvm/Analysis/CallGraph.h | 6
-rw-r--r--  include/llvm/Analysis/InlineCost.h | 3
-rw-r--r--  include/llvm/Analysis/IntervalPartition.h | 4
-rw-r--r--  include/llvm/Analysis/LazyValueInfo.h | 4
-rw-r--r--  include/llvm/Analysis/LoopInfo.h | 15
-rw-r--r--  include/llvm/Analysis/RegionInfo.h | 18
-rw-r--r--  include/llvm/Analysis/ScalarEvolution.h | 4
-rw-r--r--  include/llvm/Analysis/SparsePropagation.h | 6
-rw-r--r--  include/llvm/Attributes.h | 305
-rw-r--r--  include/llvm/BasicBlock.h | 4
-rw-r--r--  include/llvm/Bitcode/Archive.h | 6
-rw-r--r--  include/llvm/Bitcode/BitstreamReader.h | 6
-rw-r--r--  include/llvm/CodeGen/GCMetadataPrinter.h | 7
-rw-r--r--  include/llvm/CodeGen/LiveInterval.h | 26
-rw-r--r--  include/llvm/CodeGen/LiveIntervalAnalysis.h | 20
-rw-r--r--  include/llvm/CodeGen/MachineBasicBlock.h | 22
-rw-r--r--  include/llvm/CodeGen/MachineFunction.h | 4
-rw-r--r--  include/llvm/CodeGen/MachineInstr.h | 4
-rw-r--r--  include/llvm/CodeGen/MachineInstrBundle.h | 40
-rw-r--r--  include/llvm/CodeGen/MachineLoopInfo.h | 4
-rw-r--r--  include/llvm/CodeGen/MachineOperand.h | 4
-rw-r--r--  include/llvm/CodeGen/MachinePostDominators.h | 87
-rw-r--r--  include/llvm/CodeGen/MachineRegisterInfo.h | 4
-rw-r--r--  include/llvm/CodeGen/MachineSSAUpdater.h | 6
-rw-r--r--  include/llvm/CodeGen/MachineScheduler.h | 22
-rw-r--r--  include/llvm/CodeGen/RegAllocPBQP.h | 4
-rw-r--r--  include/llvm/CodeGen/ScheduleDAGInstrs.h | 4
-rw-r--r--  include/llvm/CodeGen/SelectionDAG.h | 16
-rw-r--r--  include/llvm/CodeGen/SelectionDAGNodes.h | 12
-rw-r--r--  include/llvm/CodeGen/TargetSchedule.h | 79
-rw-r--r--  include/llvm/CodeGen/ValueTypes.h | 123
-rw-r--r--  include/llvm/CodeGen/ValueTypes.td | 62
-rw-r--r--  include/llvm/Config/config.h.in | 3
-rw-r--r--  include/llvm/Constant.h | 4
-rw-r--r--  include/llvm/Constants.h | 40
-rw-r--r--  include/llvm/DIBuilder.h | 8
-rw-r--r--  include/llvm/DebugInfo.h | 10
-rw-r--r--  include/llvm/DefaultPasses.h | 2
-rw-r--r--  include/llvm/DerivedTypes.h | 25
-rw-r--r--  include/llvm/ExecutionEngine/ExecutionEngine.h | 4
-rw-r--r--  include/llvm/ExecutionEngine/RuntimeDyld.h | 10
-rw-r--r--  include/llvm/Function.h | 6
-rw-r--r--  include/llvm/GlobalAlias.h | 4
-rw-r--r--  include/llvm/GlobalValue.h | 2
-rw-r--r--  include/llvm/GlobalVariable.h | 6
-rw-r--r--  include/llvm/IRBuilder.h | 6
-rw-r--r--  include/llvm/InitializePasses.h | 2
-rw-r--r--  include/llvm/InlineAsm.h | 4
-rw-r--r--  include/llvm/InstrTypes.h | 14
-rw-r--r--  include/llvm/Instruction.h | 4
-rw-r--r--  include/llvm/Instructions.h | 26
-rw-r--r--  include/llvm/IntrinsicInst.h | 6
-rw-r--r--  include/llvm/Intrinsics.td | 6
-rw-r--r--  include/llvm/IntrinsicsARM.td | 437
-rw-r--r--  include/llvm/LLVMContext.h | 10
-rw-r--r--  include/llvm/MC/MCAsmBackend.h | 6
-rw-r--r--  include/llvm/MC/MCAssembler.h | 20
-rw-r--r--  include/llvm/MC/MCCodeEmitter.h | 4
-rw-r--r--  include/llvm/MC/MCContext.h | 1
-rw-r--r--  include/llvm/MC/MCDwarf.h | 6
-rw-r--r--  include/llvm/MC/MCInst.h | 2
-rw-r--r--  include/llvm/MC/MCLabel.h | 2
-rw-r--r--  include/llvm/MC/MCMachObjectWriter.h | 6
-rw-r--r--  include/llvm/MC/MCParser/MCAsmLexer.h | 9
-rw-r--r--  include/llvm/MC/MCParser/MCAsmParser.h | 8
-rw-r--r--  include/llvm/MC/MCParser/MCAsmParserExtension.h | 4
-rw-r--r--  include/llvm/MC/MCParser/MCParsedAsmOperand.h | 9
-rw-r--r--  include/llvm/MC/MCSchedule.h | 128
-rw-r--r--  include/llvm/MC/MCStreamer.h | 12
-rw-r--r--  include/llvm/MC/MCSubtargetInfo.h | 60
-rw-r--r--  include/llvm/MC/MCSymbol.h | 6
-rw-r--r--  include/llvm/MC/MCTargetAsmLexer.h | 6
-rw-r--r--  include/llvm/MC/MCTargetAsmParser.h | 4
-rw-r--r--  include/llvm/MC/MCValue.h | 2
-rw-r--r--  include/llvm/MDBuilder.h | 21
-rw-r--r--  include/llvm/Metadata.h | 8
-rw-r--r--  include/llvm/Object/Binary.h | 4
-rw-r--r--  include/llvm/Object/ELF.h | 98
-rw-r--r--  include/llvm/Object/ObjectFile.h | 6
-rw-r--r--  include/llvm/Operator.h | 34
-rw-r--r--  include/llvm/Pass.h | 5
-rw-r--r--  include/llvm/PassAnalysisSupport.h | 2
-rw-r--r--  include/llvm/PassSupport.h | 4
-rw-r--r--  include/llvm/Support/AlignOf.h | 37
-rw-r--r--  include/llvm/Support/Allocator.h | 8
-rw-r--r--  include/llvm/Support/Casting.h | 3
-rw-r--r--  include/llvm/Support/CommandLine.h | 9
-rw-r--r--  include/llvm/Support/Compiler.h | 18
-rw-r--r--  include/llvm/Support/ELF.h | 15
-rw-r--r--  include/llvm/Support/FileOutputBuffer.h | 7
-rw-r--r--  include/llvm/Support/FileSystem.h | 60
-rw-r--r--  include/llvm/Support/Format.h | 40
-rw-r--r--  include/llvm/Support/FormattedStream.h | 11
-rw-r--r--  include/llvm/Support/LEB128.h | 2
-rw-r--r--  include/llvm/Support/LockFileManager.h | 4
-rw-r--r--  include/llvm/Support/MathExtras.h | 19
-rw-r--r--  include/llvm/Support/Memory.h | 65
-rw-r--r--  include/llvm/Support/MemoryBuffer.h | 5
-rw-r--r--  include/llvm/Support/Mutex.h | 5
-rw-r--r--  include/llvm/Support/MutexGuard.h | 4
-rw-r--r--  include/llvm/Support/PathV2.h | 109
-rw-r--r--  include/llvm/Support/PrettyStackTrace.h | 10
-rw-r--r--  include/llvm/Support/Program.h | 4
-rw-r--r--  include/llvm/Support/RWMutex.h | 5
-rw-r--r--  include/llvm/Support/Regex.h | 8
-rw-r--r--  include/llvm/Support/Registry.h | 7
-rw-r--r--  include/llvm/Support/SourceMgr.h | 6
-rw-r--r--  include/llvm/Support/StreamableMemoryObject.h | 20
-rw-r--r--  include/llvm/Support/TargetRegistry.h | 44
-rw-r--r--  include/llvm/Support/Threading.h | 4
-rw-r--r--  include/llvm/Support/Timer.h | 7
-rw-r--r--  include/llvm/Support/ValueHandle.h | 4
-rw-r--r--  include/llvm/Support/YAMLParser.h | 6
-rw-r--r--  include/llvm/Support/circular_raw_ostream.h | 4
-rw-r--r--  include/llvm/Support/raw_os_ostream.h | 10
-rw-r--r--  include/llvm/Support/raw_ostream.h | 51
-rw-r--r--  include/llvm/Support/system_error.h | 8
-rw-r--r--  include/llvm/Support/type_traits.h | 10
-rw-r--r--  include/llvm/TableGen/Record.h | 141
-rw-r--r--  include/llvm/Target/Target.td | 6
-rw-r--r--  include/llvm/Target/TargetInstrInfo.h | 7
-rw-r--r--  include/llvm/Target/TargetIntrinsicInfo.h | 5
-rw-r--r--  include/llvm/Target/TargetLowering.h | 25
-rw-r--r--  include/llvm/Target/TargetLoweringObjectFile.h | 9
-rw-r--r--  include/llvm/Target/TargetMachine.h | 4
-rw-r--r--  include/llvm/Target/TargetSchedule.td | 331
-rw-r--r--  include/llvm/Target/TargetSelectionDAGInfo.h | 4
-rw-r--r--  include/llvm/Target/TargetSubtargetInfo.h | 19
-rw-r--r--  include/llvm/Transforms/Scalar.h | 6
-rw-r--r--  include/llvm/Transforms/Utils/BypassSlowDivision.h | 2
-rw-r--r--  include/llvm/Transforms/Utils/Cloning.h | 7
-rw-r--r--  include/llvm/Transforms/Utils/IntegerDivision.h | 38
-rw-r--r--  include/llvm/Transforms/Utils/SSAUpdater.h | 4
-rw-r--r--  include/llvm/Use.h | 3
-rw-r--r--  include/llvm/User.h | 4
-rw-r--r--  include/llvm/Value.h | 7
-rw-r--r--  lib/Analysis/AliasSetTracker.cpp | 2
-rw-r--r--  lib/Analysis/BasicAliasAnalysis.cpp | 2
-rw-r--r--  lib/Analysis/DominanceFrontier.cpp | 2
-rw-r--r--  lib/Analysis/IPA/CallGraph.cpp | 4
-rw-r--r--  lib/Analysis/IVUsers.cpp | 2
-rw-r--r--  lib/Analysis/InlineCost.cpp | 74
-rw-r--r--  lib/Analysis/LoopInfo.cpp | 2
-rw-r--r--  lib/Analysis/PHITransAddr.cpp | 2
-rw-r--r--  lib/Analysis/RegionInfo.cpp | 2
-rw-r--r--  lib/Analysis/ScalarEvolution.cpp | 2
-rw-r--r--  lib/Analysis/Trace.cpp | 2
-rw-r--r--  lib/AsmParser/LLParser.cpp | 24
-rw-r--r--  lib/Bitcode/Reader/BitcodeReader.cpp | 30
-rw-r--r--  lib/Bitcode/Writer/ValueEnumerator.h | 6
-rw-r--r--  lib/CodeGen/Analysis.cpp | 4
-rw-r--r--  lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp | 225
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfAccelTable.h | 4
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 16
-rw-r--r--  lib/CodeGen/AsmPrinter/DwarfDebug.h | 12
-rw-r--r--  lib/CodeGen/CMakeLists.txt | 2
-rw-r--r--  lib/CodeGen/CodeGen.cpp | 1
-rw-r--r--  lib/CodeGen/InlineSpiller.cpp | 4
-rw-r--r--  lib/CodeGen/LLVMTargetMachine.cpp | 6
-rw-r--r--  lib/CodeGen/LiveInterval.cpp | 40
-rw-r--r--  lib/CodeGen/LiveIntervalAnalysis.cpp | 103
-rw-r--r--  lib/CodeGen/LiveIntervalUnion.h | 4
-rw-r--r--  lib/CodeGen/LiveRegMatrix.h | 2
-rw-r--r--  lib/CodeGen/LiveStackAnalysis.cpp | 5
-rw-r--r--  lib/CodeGen/LiveVariables.cpp | 2
-rw-r--r--  lib/CodeGen/MachineBasicBlock.cpp | 76
-rw-r--r--  lib/CodeGen/MachineBlockPlacement.cpp | 3
-rw-r--r--  lib/CodeGen/MachineFunction.cpp | 12
-rw-r--r--  lib/CodeGen/MachineInstr.cpp | 8
-rw-r--r--  lib/CodeGen/MachineInstrBundle.cpp | 62
-rw-r--r--  lib/CodeGen/MachineLoopInfo.cpp | 2
-rw-r--r--  lib/CodeGen/MachinePostDominators.cpp | 55
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp | 11
-rw-r--r--  lib/CodeGen/PostRASchedulerList.cpp | 2
-rw-r--r--  lib/CodeGen/RegAllocGreedy.cpp | 4
-rw-r--r--  lib/CodeGen/RegisterCoalescer.cpp | 737
-rw-r--r--  lib/CodeGen/RegisterPressure.cpp | 2
-rw-r--r--  lib/CodeGen/ScheduleDAG.cpp | 2
-rw-r--r--  lib/CodeGen/ScheduleDAGInstrs.cpp | 32
-rw-r--r--  lib/CodeGen/ScoreboardHazardRecognizer.cpp | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 35
-rw-r--r--  lib/CodeGen/SelectionDAG/InstrEmitter.cpp | 3
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 35
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp | 21
-rw-r--r--  lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp | 40
-rw-r--r--  lib/CodeGen/SelectionDAG/SDNodeOrdering.h | 4
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 4
-rw-r--r--  lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 4
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 15
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 5
-rw-r--r--  lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 2
-rw-r--r--  lib/CodeGen/SelectionDAG/TargetLowering.cpp | 13
-rw-r--r--  lib/CodeGen/SlotIndexes.cpp | 4
-rw-r--r--  lib/CodeGen/SplitKit.cpp | 2
-rw-r--r--  lib/CodeGen/StackColoring.cpp | 97
-rw-r--r--  lib/CodeGen/StackSlotColoring.cpp | 2
-rw-r--r--  lib/CodeGen/TargetInstrInfoImpl.cpp | 14
-rw-r--r--  lib/CodeGen/TargetSchedule.cpp | 182
-rw-r--r--  lib/CodeGen/TwoAddressInstructionPass.cpp | 4
-rw-r--r--  lib/CodeGen/VirtRegMap.cpp | 6
-rw-r--r--  lib/CodeGen/VirtRegMap.h | 4
-rw-r--r--  lib/DebugInfo/DWARFContext.h | 4
-rw-r--r--  lib/ExecutionEngine/MCJIT/MCJIT.h | 3
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/ObjectImage.h | 4
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2
-rw-r--r--  lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp | 11
-rw-r--r--  lib/MC/MCAssembler.cpp | 9
-rw-r--r--  lib/MC/MCContext.cpp | 6
-rw-r--r--  lib/MC/MCDwarf.cpp | 2
-rw-r--r--  lib/MC/MCELFStreamer.cpp | 2
-rw-r--r--  lib/MC/MCExpr.cpp | 2
-rw-r--r--  lib/MC/MCInst.cpp | 4
-rw-r--r--  lib/MC/MCLabel.cpp | 2
-rw-r--r--  lib/MC/MCObjectFileInfo.cpp | 20
-rw-r--r--  lib/MC/MCParser/AsmLexer.cpp | 13
-rw-r--r--  lib/MC/MCParser/AsmParser.cpp | 199
-rw-r--r--  lib/MC/MCParser/MCAsmLexer.cpp | 3
-rw-r--r--  lib/MC/MCParser/MCAsmParser.cpp | 2
-rw-r--r--  lib/MC/MCSubtargetInfo.cpp | 51
-rw-r--r--  lib/MC/MCSymbol.cpp | 4
-rw-r--r--  lib/MC/MCValue.cpp | 2
-rw-r--r--  lib/MC/MachObjectWriter.cpp | 47
-rw-r--r--  lib/MC/SubtargetFeature.cpp | 2
-rw-r--r--  lib/Support/DAGDeltaAlgorithm.cpp | 10
-rw-r--r--  lib/Support/DataStream.cpp | 2
-rw-r--r--  lib/Support/Host.cpp | 1
-rw-r--r--  lib/Support/Memory.cpp | 52
-rw-r--r--  lib/Support/MemoryBuffer.cpp | 10
-rw-r--r--  lib/Support/StreamableMemoryObject.cpp | 24
-rw-r--r--  lib/Support/Unix/Memory.inc | 202
-rw-r--r--  lib/Support/Unix/Signals.inc | 2
-rw-r--r--  lib/Support/Windows/Memory.inc | 165
-rw-r--r--  lib/Support/system_error.cpp | 10
-rw-r--r--  lib/TableGen/Record.cpp | 78
-rw-r--r--  lib/Target/ARM/ARM.td | 9
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.cpp | 126
-rw-r--r--  lib/Target/ARM/ARMBaseInstrInfo.h | 3
-rw-r--r--  lib/Target/ARM/ARMBaseRegisterInfo.cpp | 2
-rw-r--r--  lib/Target/ARM/ARMExpandPseudoInsts.cpp | 12
-rw-r--r--  lib/Target/ARM/ARMFastISel.cpp | 12
-rw-r--r--  lib/Target/ARM/ARMHazardRecognizer.cpp | 2
-rw-r--r--  lib/Target/ARM/ARMISelDAGToDAG.cpp | 17
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 16
-rw-r--r--  lib/Target/ARM/ARMInstrInfo.td | 6
-rw-r--r--  lib/Target/ARM/ARMInstrNEON.td | 64
-rw-r--r--  lib/Target/ARM/ARMScheduleA9.td | 597
-rw-r--r--  lib/Target/ARM/ARMSubtarget.cpp | 3
-rw-r--r--  lib/Target/ARM/ARMSubtarget.h | 4
-rw-r--r--  lib/Target/ARM/ARMTargetMachine.cpp | 2
-rw-r--r--  lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 59
-rw-r--r--  lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 5
-rw-r--r--  lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp | 78
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp | 2
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 8
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCExpr.h | 4
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h | 2
-rw-r--r--  lib/Target/ARM/MLxExpansionPass.cpp | 8
-rw-r--r--  lib/Target/CellSPU/SPUISelDAGToDAG.cpp | 30
-rw-r--r--  lib/Target/CppBackend/CPPBackend.cpp | 2
-rw-r--r--  lib/Target/Hexagon/HexagonMachineScheduler.cpp | 29
-rw-r--r--  lib/Target/Hexagon/HexagonMachineScheduler.h | 10
-rw-r--r--  lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp | 4
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp | 3
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp | 4
-rw-r--r--  lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h | 3
-rw-r--r--  lib/Target/MSP430/MSP430ISelDAGToDAG.cpp | 4
-rw-r--r--  lib/Target/MSP430/MSP430ISelLowering.cpp | 2
-rw-r--r--  lib/Target/Mangler.cpp | 2
-rw-r--r--  lib/Target/Mips/AsmParser/MipsAsmParser.cpp | 4
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp | 16
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp | 15
-rw-r--r--  lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h | 12
-rw-r--r--  lib/Target/Mips/Mips.td | 4
-rw-r--r--  lib/Target/Mips/Mips16FrameLowering.cpp | 39
-rw-r--r--  lib/Target/Mips/Mips16FrameLowering.h | 5
-rw-r--r--  lib/Target/Mips/Mips16InstrInfo.cpp | 2
-rw-r--r--  lib/Target/Mips/Mips16InstrInfo.td | 39
-rw-r--r--  lib/Target/Mips/Mips64InstrInfo.td | 40
-rw-r--r--  lib/Target/Mips/MipsCodeEmitter.cpp | 37
-rw-r--r--  lib/Target/Mips/MipsDSPInstrFormats.td | 25
-rw-r--r--  lib/Target/Mips/MipsDSPInstrInfo.td | 20
-rw-r--r--  lib/Target/Mips/MipsISelDAGToDAG.cpp | 9
-rw-r--r--  lib/Target/Mips/MipsISelLowering.cpp | 79
-rw-r--r--  lib/Target/Mips/MipsISelLowering.h | 51
-rw-r--r--  lib/Target/Mips/MipsInstrFPU.td | 24
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.cpp | 12
-rw-r--r--  lib/Target/Mips/MipsInstrInfo.td | 78
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.cpp | 3
-rw-r--r--  lib/Target/Mips/MipsRegisterInfo.td | 43
-rw-r--r--  lib/Target/Mips/MipsSubtarget.h | 5
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp | 2
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 3
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp | 6
-rw-r--r--  lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h | 2
-rw-r--r--  lib/Target/PowerPC/PPCAsmPrinter.cpp | 13
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.cpp | 245
-rw-r--r--  lib/Target/PowerPC/PPCFrameLowering.h | 71
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.cpp | 192
-rw-r--r--  lib/Target/PowerPC/PPCISelLowering.h | 29
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.cpp | 34
-rw-r--r--  lib/Target/PowerPC/PPCRegisterInfo.h | 5
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 27
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.c | 4
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 5
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 60
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 13
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 4
-rw-r--r--  lib/Target/X86/X86.td | 1
-rw-r--r--  lib/Target/X86/X86CodeEmitter.cpp | 11
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 6
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 354
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 1255
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 43
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 399
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 15
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 22
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 3
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 57
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 490
-rw-r--r--  lib/Target/X86/X86InstrXOP.td | 17
-rw-r--r--  lib/Target/XCore/XCoreISelLowering.cpp | 2
-rw-r--r--  lib/Transforms/IPO/Inliner.cpp | 3
-rw-r--r--  lib/Transforms/IPO/PassManagerBuilder.cpp | 19
-rw-r--r--  lib/Transforms/InstCombine/InstCombineCalls.cpp | 53
-rw-r--r--  lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 10
-rw-r--r--  lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 3
-rw-r--r--  lib/Transforms/InstCombine/InstCombineWorklist.h | 4
-rw-r--r--  lib/Transforms/Instrumentation/AddressSanitizer.cpp | 2
-rw-r--r--  lib/Transforms/Instrumentation/GCOVProfiling.cpp | 58
-rw-r--r--  lib/Transforms/Scalar/CMakeLists.txt | 1
-rw-r--r--  lib/Transforms/Scalar/CodeGenPrepare.cpp | 133
-rw-r--r--  lib/Transforms/Scalar/EarlyCSE.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/GVN.cpp | 2
-rw-r--r--  lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 5
-rw-r--r--  lib/Transforms/Scalar/LoopStrengthReduce.cpp | 40
-rw-r--r--  lib/Transforms/Scalar/MemCpyOptimizer.cpp | 6
-rw-r--r--  lib/Transforms/Scalar/ObjCARC.cpp | 26
-rw-r--r--  lib/Transforms/Scalar/SROA.cpp | 3213
-rw-r--r--  lib/Transforms/Scalar/Scalar.cpp | 1
-rw-r--r--  lib/Transforms/Utils/AddrModeMatcher.cpp | 2
-rw-r--r--  lib/Transforms/Utils/BypassSlowDivision.cpp | 2
-rw-r--r--  lib/Transforms/Utils/CMakeLists.txt | 1
-rw-r--r--  lib/Transforms/Utils/IntegerDivision.cpp | 312
-rw-r--r--  lib/Transforms/Utils/Local.cpp | 38
-rw-r--r--  lib/Transforms/Utils/MetaRenamer.cpp | 6
-rw-r--r--  lib/Transforms/Utils/SimplifyCFG.cpp | 509
-rw-r--r--  lib/VMCore/AsmWriter.cpp | 101
-rw-r--r--  lib/VMCore/Attributes.cpp | 79
-rw-r--r--  lib/VMCore/ConstantsContext.h | 18
-rw-r--r--  lib/VMCore/Core.cpp | 17
-rw-r--r--  lib/VMCore/DIBuilder.cpp | 24
-rw-r--r--  lib/VMCore/GCOV.cpp | 10
-rw-r--r--  lib/VMCore/Instructions.cpp | 2
-rw-r--r--  lib/VMCore/LLVMContext.cpp | 5
-rw-r--r--  lib/VMCore/Type.cpp | 9
-rw-r--r--  lib/VMCore/ValueTypes.cpp | 20
-rw-r--r--  lib/VMCore/Verifier.cpp | 22
-rw-r--r--  projects/CMakeLists.txt | 1
-rw-r--r--  test/Bitcode/blockaddress.ll | 15
-rw-r--r--  test/CodeGen/ARM/2011-10-26-memset-with-neon.ll | 4
-rw-r--r--  test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll | 44
-rw-r--r--  test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll | 11
-rw-r--r--  test/CodeGen/ARM/a15-mla.ll | 12
-rw-r--r--  test/CodeGen/ARM/a15.ll | 6
-rw-r--r--  test/CodeGen/ARM/coalesce-subregs.ll | 125
-rw-r--r--  test/CodeGen/ARM/constants.ll | 10
-rw-r--r--  test/CodeGen/ARM/domain-conv-vmovs.ll | 20
-rw-r--r--  test/CodeGen/ARM/fast-isel.ll | 66
-rw-r--r--  test/CodeGen/ARM/neon_ld2.ll | 10
-rw-r--r--  test/CodeGen/ARM/reg_sequence.ll | 10
-rw-r--r--  test/CodeGen/ARM/sub.ll | 2
-rw-r--r--  test/CodeGen/ARM/twoaddrinstr.ll | 16
-rw-r--r--  test/CodeGen/ARM/unaligned_load_store.ll | 18
-rw-r--r--  test/CodeGen/ARM/unaligned_load_store_vector.ll | 487
-rw-r--r--  test/CodeGen/ARM/vbsl-constant.ll | 20
-rw-r--r--  test/CodeGen/ARM/vbsl.ll | 49
-rw-r--r--  test/CodeGen/Mips/ul1.ll | 15
-rw-r--r--  test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll | 27
-rw-r--r--  test/CodeGen/PowerPC/bl8_elf_nop.ll | 16
-rw-r--r--  test/CodeGen/PowerPC/crsave.ll | 49
-rw-r--r--  test/CodeGen/PowerPC/ppc64-calls.ll | 63
-rw-r--r--  test/CodeGen/PowerPC/ppc64-ind-call.ll | 16
-rw-r--r--  test/CodeGen/PowerPC/ppc64-linux-func-size.ll | 1
-rw-r--r--  test/CodeGen/PowerPC/ppc64-toc.ll | 1
-rw-r--r--  test/CodeGen/PowerPC/structsinregs.ll | 205
-rw-r--r--  test/CodeGen/SPARC/load_to_switch.ll | 84
-rw-r--r--  test/CodeGen/Thumb2/buildvector-crash.ll | 4
-rw-r--r--  test/CodeGen/X86/2010-01-08-Atomic64Bug.ll | 15
-rw-r--r--  test/CodeGen/X86/2012-09-13-dagco-fneg.ll | 21
-rw-r--r--  test/CodeGen/X86/StackColoring.ll | 54
-rw-r--r--  test/CodeGen/X86/atomic16.ll | 250
-rw-r--r--  test/CodeGen/X86/atomic32.ll | 250
-rw-r--r--  test/CodeGen/X86/atomic64.ll | 216
-rw-r--r--  test/CodeGen/X86/atomic6432.ll | 209
-rw-r--r--  test/CodeGen/X86/atomic8.ll | 250
-rw-r--r--  test/CodeGen/X86/atomic_add.ll | 3
-rw-r--r--  test/CodeGen/X86/atomic_op.ll | 11
-rw-r--r--  test/CodeGen/X86/avx2-shuffle.ll | 6
-rw-r--r--  test/CodeGen/X86/bitcast-i256.ll | 11
-rw-r--r--  test/CodeGen/X86/jump_sign.ll | 27
-rw-r--r--  test/CodeGen/X86/ms-inline-asm.ll | 12
-rw-r--r--  test/CodeGen/X86/pmovext.ll | 22
-rw-r--r--  test/CodeGen/X86/pr11985.ll | 19
-rw-r--r--  test/CodeGen/X86/pr12312.ll | 133
-rw-r--r--  test/CodeGen/X86/pr13458.ll | 14
-rw-r--r--  test/CodeGen/X86/pr13859.ll | 28
-rw-r--r--  test/CodeGen/X86/pr5145.ll | 35
-rw-r--r--  test/CodeGen/X86/sincos.ll | 13
-rw-r--r--  test/CodeGen/X86/tailcall-64.ll | 38
-rw-r--r--  test/CodeGen/X86/xmulo.ll | 50
-rw-r--r--  test/DebugInfo/X86/2010-04-13-PubType.ll (renamed from test/DebugInfo/2010-04-13-PubType.ll) | 0
-rw-r--r--  test/DebugInfo/X86/DW_AT_object_pointer.ll | 79
-rw-r--r--  test/DebugInfo/X86/linkage-name.ll (renamed from test/DebugInfo/linkage-name.ll) | 0
-rw-r--r--  test/MC/ARM/arm-shift-encoding.s | 119
-rw-r--r--  test/MC/ARM/diagnostics.s | 40
-rw-r--r--  test/MC/ARM/thumb-shift-encoding.s | 45
-rw-r--r--  test/MC/AsmParser/macro-args.s | 12
-rw-r--r--  test/MC/AsmParser/macro-rept-err1.s | 2
-rw-r--r--  test/MC/AsmParser/macros-darwin.s | 9
-rw-r--r--  test/MC/AsmParser/macros.s | 49
-rw-r--r--  test/MC/COFF/global_ctors_dtors.ll (renamed from test/MC/COFF/global_ctors.ll) | 23
-rw-r--r--  test/MC/MachO/absolute.s | 158
-rw-r--r--  test/MC/X86/x86_nop.s | 7
-rw-r--r--  test/Object/nm-shared-object.test | 26
-rw-r--r--  test/Object/objdump-symbol-table.test | 8
-rw-r--r--  test/Other/FileCheck-space.txt | 9
-rw-r--r--  test/Other/Inputs/llvm-cov.gcda | bin 0 -> 296 bytes
-rw-r--r--  test/Other/Inputs/llvm-cov.gcno | bin 0 -> 984 bytes
-rw-r--r--  test/Other/lit.local.cfg | 2
-rw-r--r--  test/Other/llvm-cov.test | 3
-rw-r--r--  test/Other/llvm-nm-without-aliases.ll | 25
-rw-r--r--  test/Transforms/Inline/recursive.ll | 38
-rw-r--r--  test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll | 24
-rw-r--r--  test/Transforms/InstCombine/div-shift.ll | 14
-rw-r--r--  test/Transforms/InstCombine/struct-assign-tbaa.ll | 26
-rw-r--r--  test/Transforms/LoopIdiom/non-canonical-loop.ll | 34
-rw-r--r--  test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll | 44
-rw-r--r--  test/Transforms/MemCpyOpt/form-memset.ll | 24
-rw-r--r--  test/Transforms/MetaRenamer/lit.local.cfg | 2
-rw-r--r--  test/Transforms/ObjCARC/path-overflow.ll | 329
-rw-r--r--  test/Transforms/SROA/basictest.ll | 857
-rw-r--r--  test/Transforms/SROA/fca.ll | 49
-rw-r--r--  test/Transforms/SROA/lit.local.cfg | 1
-rw-r--r--  test/Transforms/SROA/phi-and-select.ll | 329
-rw-r--r--  test/Transforms/SROA/vector-promotion.ll | 191
-rw-r--r--  test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll | 48
-rw-r--r--  test/Transforms/SimplifyCFG/preserve-branchweights.ll | 230
-rw-r--r--  test/Transforms/SimplifyCFG/sink-common-code.ll | 53
-rw-r--r--  test/Transforms/SimplifyCFG/switch_to_lookup_table.ll | 34
-rw-r--r--  tools/lli/RemoteTarget.cpp | 6
-rw-r--r--  tools/lli/RemoteTarget.h | 8
-rw-r--r--  tools/lli/lli.cpp | 3
-rw-r--r--  tools/llvm-mc/llvm-mc.cpp | 4
-rw-r--r--  tools/llvm-nm/llvm-nm.cpp | 8
-rw-r--r--  tools/llvm-stress/llvm-stress.cpp | 4
-rw-r--r--  unittests/ADT/DenseSetTest.cpp | 2
-rw-r--r--  unittests/Analysis/ScalarEvolutionTest.cpp | 16
-rw-r--r--  unittests/Support/AlignOfTest.cpp | 118
-rw-r--r--  unittests/Support/CMakeLists.txt | 4
-rw-r--r--  unittests/Support/MemoryTest.cpp | 356
-rw-r--r--  unittests/Support/formatted_raw_ostream_test.cpp | 33
-rw-r--r--  utils/FileCheck/FileCheck.cpp | 6
-rw-r--r--  utils/TableGen/AsmMatcherEmitter.cpp | 143
-rw-r--r--  utils/TableGen/AsmWriterEmitter.cpp | 65
-rw-r--r--  utils/TableGen/CodeGenDAGPatterns.cpp | 17
-rw-r--r--  utils/TableGen/CodeGenDAGPatterns.h | 21
-rw-r--r--  utils/TableGen/CodeGenInstruction.cpp | 2
-rw-r--r--  utils/TableGen/CodeGenRegisters.cpp | 6
-rw-r--r--  utils/TableGen/CodeGenRegisters.h | 4
-rw-r--r--  utils/TableGen/CodeGenSchedule.cpp | 1502
-rw-r--r--  utils/TableGen/CodeGenSchedule.h | 355
-rw-r--r--  utils/TableGen/CodeGenTarget.cpp | 19
-rw-r--r--  utils/TableGen/DAGISelMatcherEmitter.cpp | 11
-rw-r--r--  utils/TableGen/EDEmitter.cpp | 41
-rw-r--r--  utils/TableGen/FixedLenDecoderEmitter.cpp | 2
-rw-r--r--  utils/TableGen/InstrInfoEmitter.cpp | 3
-rw-r--r--  utils/TableGen/PseudoLoweringEmitter.cpp | 2
-rw-r--r--  utils/TableGen/RegisterInfoEmitter.cpp | 14
-rw-r--r--  utils/TableGen/SequenceToOffsetTable.h | 2
-rw-r--r--  utils/TableGen/SubtargetEmitter.cpp | 678
-rw-r--r--  utils/TableGen/X86DisassemblerTables.cpp | 17
-rw-r--r--  utils/TableGen/X86ModRMFilters.h | 25
-rw-r--r--  utils/TableGen/X86RecognizableInstr.cpp | 16
-rw-r--r--  utils/TableGen/X86RecognizableInstr.h | 21
-rw-r--r--  utils/yaml2obj/yaml2obj.cpp | 2
523 files changed, 22111 insertions, 4980 deletions
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index 43d23e6fac..354d3d7322 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -683,6 +683,21 @@ esac
AC_DEFINE_UNQUOTED([ENABLE_TIMESTAMPS],$ENABLE_TIMESTAMPS,
[Define if timestamp information (e.g., __DATE__) is allowed])
+dnl Enable embedding backtraces on crash.
+
+AC_ARG_ENABLE(backtraces,
+ AS_HELP_STRING([--enable-backtraces],
+ [Enable embedding backtraces on crash (default is YES)]),,
+ enableval=default)
+case "$enableval" in
+ yes) AC_SUBST(ENABLE_BACKTRACES,[1]) ;;
+ no) AC_SUBST(ENABLE_BACKTRACES,[0]) ;;
+ default) AC_SUBST(ENABLE_BACKTRACES,[1]) ;;
+ *) AC_MSG_ERROR([Invalid setting for --enable-backtraces. Use "yes" or "no"]) ;;
+esac
+AC_DEFINE_UNQUOTED([ENABLE_BACKTRACES],$ENABLE_BACKTRACES,
+ [Define if you want backtraces on crash])
+
dnl Allow specific targets to be specified for building (or not)
TARGETS_TO_BUILD=""
AC_ARG_ENABLE([targets],AS_HELP_STRING([--enable-targets],
diff --git a/configure b/configure
index da74ac9175..e844511d5a 100755
--- a/configure
+++ b/configure
@@ -1423,6 +1423,8 @@ Optional Features:
Win32 DLL (default is NO)
--enable-timestamps Enable embedding timestamp information in build
(default is YES)
+ --enable-backtraces Enable embedding backtraces on crash (default is
+ YES)
--enable-targets Build specific host targets: all or
target1,target2,... Valid targets are: host, x86,
x86_64, sparc, powerpc, arm, mips, spu, hexagon,
@@ -5387,6 +5389,31 @@ cat >>confdefs.h <<_ACEOF
_ACEOF
+
+# Check whether --enable-backtraces was given.
+if test "${enable_backtraces+set}" = set; then
+ enableval=$enable_backtraces;
+else
+ enableval=default
+fi
+
+case "$enableval" in
+ yes) ENABLE_BACKTRACES=1
+ ;;
+ no) ENABLE_BACKTRACES=0
+ ;;
+ default) ENABLE_BACKTRACES=1
+ ;;
+ *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-backtraces. Use \"yes\" or \"no\"" >&2;}
+ { (exit 1); exit 1; }; } ;;
+esac
+
+cat >>confdefs.h <<_ACEOF
+#define ENABLE_BACKTRACES $ENABLE_BACKTRACES
+_ACEOF
+
+
TARGETS_TO_BUILD=""
# Check whether --enable-targets was given.
if test "${enable_targets+set}" = set; then
@@ -10294,7 +10321,7 @@ else
lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
lt_status=$lt_dlunknown
cat > conftest.$ac_ext <<EOF
-#line 10292 "configure"
+#line 10319 "configure"
#include "confdefs.h"
#if HAVE_DLFCN_H
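The new ENABLE_BACKTRACES value is written into config.h (the template
include/llvm/Config/config.h.in is updated in this patch) so it can be
consulted from the C++ sources. A minimal sketch of how such a configure-time
guard is typically consumed; the function below is hypothetical and not part
of this patch:

    #include "llvm/Config/config.h"

    // Print a stack trace only when the build was configured with
    // --enable-backtraces; ENABLE_BACKTRACES is always defined to 1 or 0
    // by the configure logic above, so a value test is used rather than
    // an #ifdef test.
    void printCrashBacktrace() {
    #if ENABLE_BACKTRACES
      // ... walk and symbolize the stack here ...
    #endif
    }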
diff --git a/docs/CodingStandards.rst b/docs/CodingStandards.rst
index a416a1e856..455ffd5292 100644
--- a/docs/CodingStandards.rst
+++ b/docs/CodingStandards.rst
@@ -421,9 +421,9 @@ exit from a function, consider this "bad" code:
.. code-block:: c++
- Value *DoSomething(Instruction *I) {
+ Value *doSomething(Instruction *I) {
if (!isa<TerminatorInst>(I) &&
- I->hasOneUse() && SomeOtherThing(I)) {
+ I->hasOneUse() && doOtherThing(I)) {
... some long code ....
}
@@ -445,7 +445,7 @@ It is much preferred to format the code like this:
.. code-block:: c++
- Value *DoSomething(Instruction *I) {
+ Value *doSomething(Instruction *I) {
// Terminators never need 'something' done to them because ...
if (isa<TerminatorInst>(I))
return 0;
@@ -456,7 +456,7 @@ It is much preferred to format the code like this:
return 0;
// This is really just here for example.
- if (!SomeOtherThing(I))
+ if (!doOtherThing(I))
return 0;
... some long code ....
@@ -601,9 +601,9 @@ code to be structured like this:
.. code-block:: c++
- /// ListContainsFoo - Return true if the specified list has an element that is
+ /// containsFoo - Return true if the specified list has an element that is
/// a foo.
- static bool ListContainsFoo(const std::vector<Bar*> &List) {
+ static bool containsFoo(const std::vector<Bar*> &List) {
for (unsigned i = 0, e = List.size(); i != e; ++i)
if (List[i]->isFoo())
return true;
@@ -611,7 +611,7 @@ code to be structured like this:
}
...
- if (ListContainsFoo(BarList)) {
+ if (containsFoo(BarList)) {
...
}
@@ -818,6 +818,52 @@ least one out-of-line virtual method in the class. Without this, the compiler
will copy the vtable and RTTI into every ``.o`` file that ``#include``\s the
header, bloating ``.o`` file sizes and increasing link times.
+Don't use default labels in fully covered switches over enumerations
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``-Wswitch`` warns if a switch over an enumeration lacks a default label and
+does not cover every enumeration value. If you write a default label on a fully
+covered switch over an enumeration, then the ``-Wswitch`` warning won't fire
+when new elements are added to that enumeration. To help avoid adding these
+kinds of defaults, Clang has the warning ``-Wcovered-switch-default`` which is
+off by default but turned on when building LLVM with a version of Clang that
+supports the warning.
+
+A knock-on effect of this stylistic requirement is that when building LLVM with
+GCC you may get warnings related to "control may reach end of non-void function"
+if you return from each case of a covered switch-over-enum because GCC assumes
+that the enum expression may take any representable value, not just those of
+individual enumerators. To suppress this warning, use ``llvm_unreachable`` after
+the switch.
+
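For reference, the covered-switch rule described above looks like this in
practice. This is a minimal sketch, not code from this patch; the enum and
the function are invented for illustration:

    #include "llvm/Support/ErrorHandling.h"

    enum Color { Red, Green, Blue };

    static const char *colorName(Color C) {
      switch (C) { // no default: -Wswitch will flag newly added enumerators
      case Red:   return "red";
      case Green: return "green";
      case Blue:  return "blue";
      }
      // Placed after the switch to silence GCC's "control may reach end of
      // non-void function" warning without adding a default label.
      llvm_unreachable("fully covered switch above");
    }
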
+Use ``LLVM_DELETED_FUNCTION`` to mark uncallable methods
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Prior to C++11, a common pattern to make a class uncopyable was to declare an
+unimplemented copy constructor and copy assignment operator and make them
+private. Attempting to copy the class would then produce a compiler error
+(the method is private) or a linker error (the method was never implemented).
+
+With C++11, we can mark methods that won't be implemented with ``= delete``.
+This will trigger a much better error message and tell the compiler that the
+method will never be implemented. This enables other checks like
+``-Wunused-private-field`` to run correctly on classes that contain these
+methods.
+
+To maintain compatibility with C++03, use ``LLVM_DELETED_FUNCTION``, which
+expands to ``= delete`` if the compiler supports it. These methods should
+still be declared private. Example of the uncopyable pattern:
+
+.. code-block:: c++
+
+ class DontCopy {
+ private:
+ DontCopy(const DontCopy&) LLVM_DELETED_FUNCTION;
+ DontCopy &operator =(const DontCopy&) LLVM_DELETED_FUNCTION;
+ public:
+ ...
+ };
+
Don't evaluate ``end()`` every time through a loop
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -1092,7 +1138,7 @@ good:
};
} // end anonymous namespace
- static void Helper() {
+ static void runHelper() {
...
}
@@ -1112,7 +1158,7 @@ This is bad:
bool operator<(const char *RHS) const;
};
- void Helper() {
+ void runHelper() {
...
}
@@ -1122,7 +1168,7 @@ This is bad:
} // end anonymous namespace
-This is bad specifically because if you're looking at "``Helper``" in the middle
+This is bad specifically because if you're looking at "``runHelper``" in the middle
of a large C++ file, that you have no immediate way to tell if it is local to
the file. When it is marked static explicitly, this is immediately obvious.
Also, there is no reason to enclose the definition of "``operator<``" in the
diff --git a/docs/DebuggingJITedCode.html b/docs/DebuggingJITedCode.html
deleted file mode 100644
index 7d52fa7635..0000000000
--- a/docs/DebuggingJITedCode.html
+++ /dev/null
@@ -1,184 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>Debugging JITed Code With GDB</title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>Debugging JIT-ed Code With GDB</h1>
-<ol>
- <li><a href="#background">Background</a></li>
- <li><a href="#gdbversion">GDB Version</a></li>
- <li><a href="#mcjitdebug">Debugging MCJIT-ed code</a></li>
- <ul>
- <li><a href="#mcjitdebug_example">Example</a></li>
- </ul>
-</ol>
-<div class="doc_author">Written by Reid Kleckner and Eli Bendersky</div>
-
-<!--=========================================================================-->
-<h2><a name="background">Background</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>Without special runtime support, debugging dynamically generated code with
-GDB (as well as most debuggers) can be quite painful. Debuggers generally read
-debug information from the object file of the code, but for JITed code, there is
-no such file to look for.
-</p>
-
-<p>In order to communicate the necessary debug info to GDB, an interface for
-registering JITed code with debuggers has been designed and implemented for
-GDB and LLVM MCJIT. At a high level, whenever MCJIT generates new machine code,
-it does so in an in-memory object file that contains the debug information in
-DWARF format. MCJIT then adds this in-memory object file to a global list of
-dynamically generated object files and calls a special function
-(<tt>__jit_debug_register_code</tt>) marked noinline that GDB knows about. When
-GDB attaches to a process, it puts a breakpoint in this function and loads all
-of the object files in the global list. When MCJIT calls the registration
-function, GDB catches the breakpoint signal, loads the new object file from
-the inferior's memory, and resumes the execution. In this way, GDB can get the
-necessary debug information.
-</p>
-</div>
-
-<!--=========================================================================-->
-<h2><a name="gdbversion">GDB Version</a></h2>
-<!--=========================================================================-->
-
-<p>In order to debug code JIT-ed by LLVM, you need GDB 7.0 or newer, which is
-available on most modern distributions of Linux. The version of GDB that Apple
-ships with Xcode has been frozen at 6.3 for a while. LLDB may be a better
-option for debugging JIT-ed code on Mac OS X.
-</p>
-
-
-<!--=========================================================================-->
-<h2><a name="mcjitdebug">Debugging MCJIT-ed code</a></h2>
-<!--=========================================================================-->
-<div>
-
-<p>The emerging MCJIT component of LLVM allows full debugging of JIT-ed code with
-GDB. This is due to MCJIT's ability to use the MC emitter to provide full
-DWARF debugging information to GDB.</p>
-
-<p>Note that lli has to be passed the <tt>-use-mcjit</tt> flag to JIT the code
-with MCJIT instead of the old JIT.</p>
-
-<h3><a name="mcjitdebug_example">Example</a></h3>
-
-<div>
-
-<p>Consider the following C code (with line numbers added to make the example
-easier to follow):</p>
-
-<pre class="doc_code">
-1 int compute_factorial(int n)
-2 {
-3 if (n <= 1)
-4 return 1;
-5
-6 int f = n;
-7 while (--n > 1)
-8 f *= n;
-9 return f;
-10 }
-11
-12
-13 int main(int argc, char** argv)
-14 {
-15 if (argc < 2)
-16 return -1;
-17 char firstletter = argv[1][0];
-18 int result = compute_factorial(firstletter - '0');
-19
-20 // Returned result is clipped at 255...
-21 return result;
-22 }
-</pre>
-
-<p>Here is a sample command line session that shows how to build and run this
-code via lli inside GDB:
-</p>
-
-<pre class="doc_code">
-$ $BINPATH/clang -cc1 -O0 -g -emit-llvm showdebug.c
-$ gdb --quiet --args $BINPATH/lli -use-mcjit showdebug.ll 5
-Reading symbols from $BINPATH/lli...done.
-(gdb) b showdebug.c:6
-No source file named showdebug.c.
-Make breakpoint pending on future shared library load? (y or [n]) y
-Breakpoint 1 (showdebug.c:6) pending.
-(gdb) r
-Starting program: $BINPATH/lli -use-mcjit showdebug.ll 5
-[Thread debugging using libthread_db enabled]
-
-Breakpoint 1, compute_factorial (n=5) at showdebug.c:6
-6 int f = n;
-(gdb) p n
-$1 = 5
-(gdb) p f
-$2 = 0
-(gdb) n
-7 while (--n > 1)
-(gdb) p f
-$3 = 5
-(gdb) b showdebug.c:9
-Breakpoint 2 at 0x7ffff7ed404c: file showdebug.c, line 9.
-(gdb) c
-Continuing.
-
-Breakpoint 2, compute_factorial (n=1) at showdebug.c:9
-9 return f;
-(gdb) p f
-$4 = 120
-(gdb) bt
-#0 compute_factorial (n=1) at showdebug.c:9
-#1 0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
-#2 0x3500000001652748 in ?? ()
-#3 0x00000000016677e0 in ?? ()
-#4 0x0000000000000002 in ?? ()
-#5 0x0000000000d953b3 in llvm::MCJIT::runFunction (this=0x16151f0, F=0x1603020, ArgValues=...) at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/MCJIT/MCJIT.cpp:161
-#6 0x0000000000dc8872 in llvm::ExecutionEngine::runFunctionAsMain (this=0x16151f0, Fn=0x1603020, argv=..., envp=0x7fffffffe040)
- at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/ExecutionEngine.cpp:397
-#7 0x000000000059c583 in main (argc=4, argv=0x7fffffffe018, envp=0x7fffffffe040) at /home/ebenders_test/llvm_svn_rw/tools/lli/lli.cpp:324
-(gdb) finish
-Run till exit from #0 compute_factorial (n=1) at showdebug.c:9
-0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
-18 int result = compute_factorial(firstletter - '0');
-Value returned is $5 = 120
-(gdb) p result
-$6 = 23406408
-(gdb) n
-21 return result;
-(gdb) p result
-$7 = 120
-(gdb) c
-Continuing.
-
-Program exited with code 0170.
-(gdb)
-
-</pre>
-
-</div>
-</div>
-
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="mailto:reid.kleckner@gmail.com">Reid Kleckner</a>,
- <a href="mailto:eliben@gmail.com">Eli Bendersky</a><br>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br>
- Last modified: $Date$
-</address>
-</body>
-</html>
diff --git a/docs/DebuggingJITedCode.rst b/docs/DebuggingJITedCode.rst
new file mode 100644
index 0000000000..eeb2f7787d
--- /dev/null
+++ b/docs/DebuggingJITedCode.rst
@@ -0,0 +1,147 @@
+.. _debugging-jited-code:
+
+==============================
+Debugging JIT-ed Code With GDB
+==============================
+
+.. sectionauthor:: Reid Kleckner and Eli Bendersky
+
+Background
+==========
+
+Without special runtime support, debugging dynamically generated code with
+GDB (as well as most debuggers) can be quite painful. Debuggers generally
+read debug information from the object file of the code, but for JITed
+code, there is no such file to look for.
+
+In order to communicate the necessary debug info to GDB, an interface for
+registering JITed code with debuggers has been designed and implemented for
+GDB and LLVM MCJIT. At a high level, whenever MCJIT generates new machine code,
+it does so in an in-memory object file that contains the debug information in
+DWARF format. MCJIT then adds this in-memory object file to a global list of
+dynamically generated object files and calls a special function
+(``__jit_debug_register_code``) marked noinline that GDB knows about. When
+GDB attaches to a process, it puts a breakpoint in this function and loads all
+of the object files in the global list. When MCJIT calls the registration
+function, GDB catches the breakpoint signal, loads the new object file from
+the inferior's memory, and resumes the execution. In this way, GDB can get the
+necessary debug information.
+
+GDB Version
+===========
+
+In order to debug code JIT-ed by LLVM, you need GDB 7.0 or newer, which is
+available on most modern distributions of Linux. The version of GDB that
+Apple ships with Xcode has been frozen at 6.3 for a while. LLDB may be a
+better option for debugging JIT-ed code on Mac OS X.
+
+
+Debugging MCJIT-ed code
+=======================
+
+The emerging MCJIT component of LLVM allows full debugging of JIT-ed code with
+GDB. This is due to MCJIT's ability to use the MC emitter to provide full
+DWARF debugging information to GDB.
+
+Note that lli has to be passed the ``-use-mcjit`` flag to JIT the code with
+MCJIT instead of the old JIT.
+
+Example
+-------
+
+Consider the following C code (with line numbers added to make the example
+easier to follow):
+
+..
+ FIXME:
+ Sphinx has the ability to automatically number these lines by adding
+ :linenos: on the line immediately following the `.. code-block:: c`, but
+ it looks like garbage; the line numbers don't even line up with the
+ lines. Is this a Sphinx bug, or is it a CSS problem?
+
+.. code-block:: c
+
+ 1 int compute_factorial(int n)
+ 2 {
+ 3 if (n <= 1)
+ 4 return 1;
+ 5
+ 6 int f = n;
+ 7 while (--n > 1)
+ 8 f *= n;
+ 9 return f;
+ 10 }
+ 11
+ 12
+ 13 int main(int argc, char** argv)
+ 14 {
+ 15 if (argc < 2)
+ 16 return -1;
+ 17 char firstletter = argv[1][0];
+ 18 int result = compute_factorial(firstletter - '0');
+ 19
+ 20 // Returned result is clipped at 255...
+ 21 return result;
+ 22 }
+
+Here is a sample command line session that shows how to build and run this
+code via ``lli`` inside GDB:
+
+.. code-block:: bash
+
+ $ $BINPATH/clang -cc1 -O0 -g -emit-llvm showdebug.c
+ $ gdb --quiet --args $BINPATH/lli -use-mcjit showdebug.ll 5
+ Reading symbols from $BINPATH/lli...done.
+ (gdb) b showdebug.c:6
+ No source file named showdebug.c.
+ Make breakpoint pending on future shared library load? (y or [n]) y
+ Breakpoint 1 (showdebug.c:6) pending.
+ (gdb) r
+ Starting program: $BINPATH/lli -use-mcjit showdebug.ll 5
+ [Thread debugging using libthread_db enabled]
+
+ Breakpoint 1, compute_factorial (n=5) at showdebug.c:6
+ 6 int f = n;
+ (gdb) p n
+ $1 = 5
+ (gdb) p f
+ $2 = 0
+ (gdb) n
+ 7 while (--n > 1)
+ (gdb) p f
+ $3 = 5
+ (gdb) b showdebug.c:9
+ Breakpoint 2 at 0x7ffff7ed404c: file showdebug.c, line 9.
+ (gdb) c
+ Continuing.
+
+ Breakpoint 2, compute_factorial (n=1) at showdebug.c:9
+ 9 return f;
+ (gdb) p f
+ $4 = 120
+ (gdb) bt
+ #0 compute_factorial (n=1) at showdebug.c:9
+ #1 0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
+ #2 0x3500000001652748 in ?? ()
+ #3 0x00000000016677e0 in ?? ()
+ #4 0x0000000000000002 in ?? ()
+ #5 0x0000000000d953b3 in llvm::MCJIT::runFunction (this=0x16151f0, F=0x1603020, ArgValues=...) at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/MCJIT/MCJIT.cpp:161
+ #6 0x0000000000dc8872 in llvm::ExecutionEngine::runFunctionAsMain (this=0x16151f0, Fn=0x1603020, argv=..., envp=0x7fffffffe040)
+ at /home/ebenders_test/llvm_svn_rw/lib/ExecutionEngine/ExecutionEngine.cpp:397
+ #7 0x000000000059c583 in main (argc=4, argv=0x7fffffffe018, envp=0x7fffffffe040) at /home/ebenders_test/llvm_svn_rw/tools/lli/lli.cpp:324
+ (gdb) finish
+ Run till exit from #0 compute_factorial (n=1) at showdebug.c:9
+ 0x00007ffff7ed40a9 in main (argc=2, argv=0x16677e0) at showdebug.c:18
+ 18 int result = compute_factorial(firstletter - '0');
+ Value returned is $5 = 120
+ (gdb) p result
+ $6 = 23406408
+ (gdb) n
+ 21 return result;
+ (gdb) p result
+ $7 = 120
+ (gdb) c
+ Continuing.
+
+ Program exited with code 0170.
+ (gdb)
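The registration mechanism described in the Background section above follows
GDB's documented JIT compilation interface. The declarations below restate
that interface as a sketch for orientation; they are taken from GDB's
documentation, not from this patch:

    #include <stdint.h>

    extern "C" {

    enum jit_actions_t { JIT_NOACTION = 0, JIT_REGISTER_FN, JIT_UNREGISTER_FN };

    struct jit_code_entry {
      jit_code_entry *next_entry;
      jit_code_entry *prev_entry;
      const char *symfile_addr;   // in-memory object file holding the DWARF
      uint64_t symfile_size;
    };

    struct jit_descriptor {
      uint32_t version;           // GDB expects 1
      uint32_t action_flag;       // a jit_actions_t value
      jit_code_entry *relevant_entry;
      jit_code_entry *first_entry;
    };

    // GDB sets a breakpoint here; the JIT calls it after updating the
    // descriptor, so it must not be inlined or optimized away.
    void __attribute__((noinline)) __jit_debug_register_code() {}

    // Global list of in-memory object files, appended to by the JIT.
    jit_descriptor __jit_debug_descriptor = { 1, 0, 0, 0 };

    } // extern "C"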
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index cda281a25c..f940dbc579 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -279,7 +279,7 @@ If you have recently been granted commit access, these policies apply:
#. You are granted *commit-after-approval* to all parts of LLVM. To get
approval, submit a `patch`_ to `llvm-commits
<http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits>`_. When approved
- you may commit it yourself.</li>
+ you may commit it yourself.
#. You are allowed to commit patches without approval which you think are
obvious. This is clearly a subjective decision --- we simply expect you to
diff --git a/docs/HowToAddABuilder.html b/docs/HowToAddABuilder.html
deleted file mode 100644
index 985b30e4f7..0000000000
--- a/docs/HowToAddABuilder.html
+++ /dev/null
@@ -1,142 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
- "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
- <meta http-equiv="Content-Type" content="text/html; charset=utf-8">
- <title>
- How To Add Your Build Configuration To LLVM Buildbot Infrastructure
- </title>
- <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-</head>
-<body>
-
-<h1>How To Add Your Build Configuration To LLVM Buildbot Infrastructure</h1>
-<ol>
- <li><a href="#introduction">Introduction</a></li>
- <li><a href="#steps">Steps To Add Builder To LLVM Buildbot</a></li>
-</ol>
-<div class="doc_author">
- <p>Written by <a href="mailto:gkistanova@gmail.com">Galina Kistanova</a></p>
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="introduction">Introduction</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>This document contains information about adding a build configuration and
- buildslave to private slave builder to LLVM Buildbot Infrastructure
- <a href="http://lab.llvm.org:8011">http://lab.llvm.org:8011</a></p>
-
-</div>
-
-<!-- *********************************************************************** -->
-<h2><a name="steps">Steps To Add Builder To LLVM Buildbot</a></h2>
-<!-- *********************************************************************** -->
-
-<div>
-
-<p>Volunteers can provide their build machines to work as build slaves to
- public LLVM Buildbot.</p>
-
-<p>Here are the steps you can follow to do so:</p>
-
-<ol>
- <li><p>Check the existing build configurations to make sure the one you are
- interested in is not covered yet or gets built on your computer much
- faster than on the existing one. We prefer faster builds so developers
- will get feedback sooner after changes get committed.</p></li>
-
- <li><p>The computer you will be registering with the LLVM buildbot
- infrastructure should have all dependencies installed and you can
- actually build your configuration successfully. Please check what degree
- of parallelism (-j param) would give the fastest build.
- You can build multiple configurations on one computer.</p></li>
-
- <li><p>Install buildslave (currently we are using buildbot version 0.8.5).
- Depending on the platform, buildslave could be available to download and
- install with your packet manager, or you can download it directly from
- <a href="http://trac.buildbot.net">http://trac.buildbot.net</a> and
- install it manually.</p></li>
-
- <li><p>Create a designated user account, your buildslave will be running
- under, and set appropriate permissions.</p></li>
-
- <li><p>Choose the buildslave root directory (all builds will be placed under
- it), buildslave access name and password the build master will be using
- to authenticate your buildslave.</p></li>
-
- <li><p>Create a buildslave in context of that buildslave account.
- Point it to the <b>lab.llvm.org</b> port <b>9990</b> (see
- <a href="http://buildbot.net/buildbot/docs/current/full.html#creating-a-slave">
- Buildbot documentation, Creating a slave</a>
- for more details) by running the following command:</p>
-
-<div class="doc_code">
-<pre>
-$ buildslave create-slave <i>buildslave-root-directory</i> \
- lab.llvm.org:9990 \
- <i>buildslave-access-name buildslave-access-password</i>
-</pre>
-</div></li>
-
- <li><p>Fill the buildslave description and admin name/e-mail.
- Here is an example of the buildslave description:</p>
-
-<div class="doc_code">
-<pre>
-Windows 7 x64
-Core i7 (2.66GHz), 16GB of RAM
-
-g++.exe (TDM-1 mingw32) 4.4.0
-GNU Binutils 2.19.1
-cmake version 2.8.4
-Microsoft(R) 32-bit C/C++ Optimizing Compiler Version 16.00.40219.01 for 80x86
-</pre>
-</div></li>
-
- <li><p>Make sure you can actually start the buildslave successfully. Then set
- up your buildslave to start automatically at the start up time.
- See the buildbot documentation for help.
- You may want to restart your computer to see if it works.</p></li>
-
- <li><p>Send a patch which adds your build slave and your builder to zorg.</p>
- <ul>
- <li>slaves are added to
- <tt>buildbot/osuosl/master/config/slaves.py</tt></li>
- <li>builders are added to
- <tt>buildbot/osuosl/master/config/builders.py</tt></li>
- </ul></li>
-
- <li><p>Send the buildslave access name and the access password directly
- to <a href="mailto:gkistanova@gmail.com">Galina Kistanova</a>, and wait
- till she will let you know that your changes are applied and buildmaster
- is reconfigured.</p>
-
- <li><p>Check the status of your buildslave on the
- <a href="http://lab.llvm.org:8011/waterfall">Waterfall Display</a>
- to make sure it is connected, and
- <a href="http://lab.llvm.org:8011/buildslaves/your-buildslave-name">
- http://lab.llvm.org:8011/buildslaves/&lt;your-buildslave-name&gt;</a>
- to see if administrator contact and slave information are correct.</p>
- </li>
-
- <li><p>Wait for the first build to succeed and enjoy.</p></li>
-</ol>
-
-</div>
-
-<!-- *********************************************************************** -->
-<hr>
-<address>
- <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
- src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
- <a href="http://validator.w3.org/check/referer"><img
- src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
- <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a>
- <br>
- Last modified: $Date: 2011-10-31 12:50:0 -0700 (Mon, 31 Oct 2011) $
-</address>
-</body>
-</html>
diff --git a/docs/HowToAddABuilder.rst b/docs/HowToAddABuilder.rst
new file mode 100644
index 0000000000..b0cd2907f9
--- /dev/null
+++ b/docs/HowToAddABuilder.rst
@@ -0,0 +1,90 @@
+.. _how_to_add_a_builder:
+
+===================================================================
+How To Add Your Build Configuration To LLVM Buildbot Infrastructure
+===================================================================
+
+.. sectionauthor:: Galina Kistanova <gkistanova@gmail.com>
+
+Introduction
+============
+
+This document describes how to add a build configuration and buildslave to
+the LLVM Buildbot Infrastructure at `<http://lab.llvm.org:8011>`_.
+
+
+Steps To Add Builder To LLVM Buildbot
+=====================================
+
+Volunteers can provide their build machines to work as build slaves to
+the public LLVM Buildbot.
+
+Here are the steps you can follow to do so:
+
+#. Check the existing build configurations to make sure the one you are
+ interested in is not covered yet or gets built on your computer much
+ faster than on the existing one. We prefer faster builds so developers
+ will get feedback sooner after changes get committed.
+
+#. The computer you will be registering with the LLVM buildbot
+ infrastructure should have all dependencies installed, and you should be
+ able to actually build your configuration successfully. Please check what
+ degree of parallelism (-j param) gives the fastest build. You can build
+ multiple configurations on one computer.
+
+#. Install buildslave (currently we are using buildbot version 0.8.5).
+ Depending on the platform, buildslave could be available to download and
+ install with your package manager, or you can download it directly from
+ `<http://trac.buildbot.net>`_ and install it manually.
+
+#. Create a designated user account, your buildslave will be running under,
+ and set appropriate permissions.
+
+#. Choose the buildslave root directory (all builds will be placed under
+ it), buildslave access name and password the build master will be using
+ to authenticate your buildslave.
+
+#. Create a buildslave in the context of that buildslave account. Point it to
+ the **lab.llvm.org** port **9990** (see `Buildbot documentation,
+ Creating a slave
+ <http://buildbot.net/buildbot/docs/current/full.html#creating-a-slave>`_
+ for more details) by running the following command:
+
+ .. code-block:: bash
+
+ $ buildslave create-slave <buildslave-root-directory> \
+ lab.llvm.org:9990 \
+ <buildslave-access-name> <buildslave-access-password>
+
+#. Fill in the buildslave description and admin name/e-mail. Here is an
+ example of the buildslave description::
+
+ Windows 7 x64
+ Core i7 (2.66GHz), 16GB of RAM
+
+ g++.exe (TDM-1 mingw32) 4.4.0
+ GNU Binutils 2.19.1
+ cmake version 2.8.4
+ Microsoft(R) 32-bit C/C++ Optimizing Compiler Version 16.00.40219.01 for 80x86
+
+#. Make sure you can actually start the buildslave successfully. Then set
+ up your buildslave to start automatically at startup. See the
+ buildbot documentation for help. You may want to restart your computer
+ to see if it works.
+
+#. Send a patch which adds your build slave and your builder to zorg.
+
+ * slaves are added to ``buildbot/osuosl/master/config/slaves.py``
+ * builders are added to ``buildbot/osuosl/master/config/builders.py``
+
+#. Send the buildslave access name and the access password directly to
+ `Galina Kistanova <mailto:gkistanova@gmail.com>`_, and wait until she
+ lets you know that your changes have been applied and the buildmaster is
+ reconfigured.
+
+#. Check the status of your buildslave on the `Waterfall Display
+ <http://lab.llvm.org:8011/waterfall>`_ to make sure it is connected, and
+ ``http://lab.llvm.org:8011/buildslaves/<your-buildslave-name>`` to see
+ if administrator contact and slave information are correct.
+
+#. Wait for the first build to succeed and enjoy.
diff --git a/docs/LangRef.html b/docs/LangRef.html
index 4daab592e9..89b9227b72 100644
--- a/docs/LangRef.html
+++ b/docs/LangRef.html
@@ -103,6 +103,7 @@
<li><a href="#metadata">Metadata Nodes and Metadata Strings</a>
<ol>
<li><a href="#tbaa">'<tt>tbaa</tt>' Metadata</a></li>
+ <li><a href="#tbaa.struct">'<tt>tbaa.struct</tt>' Metadata</a></li>
<li><a href="#fpmath">'<tt>fpmath</tt>' Metadata</a></li>
<li><a href="#range">'<tt>range</tt>' Metadata</a></li>
</ol>
@@ -3052,6 +3053,44 @@ call void @llvm.dbg.value(metadata !24, i64 0, metadata !25)
<!-- _______________________________________________________________________ -->
<h4>
+ <a name="tbaa.struct">'<tt>tbaa.struct</tt>' Metadata</a>
+</h4>
+
+<div>
+
+<p>The <a href="#int_memcpy"><tt>llvm.memcpy</tt></a> intrinsic is often used
+to implement aggregate assignment operations in C and similar languages;
+however, it is defined to copy a contiguous region of memory, which is more
+than strictly necessary for aggregate types that contain holes due to padding.
+Also, it doesn't contain any TBAA information about the fields of the aggregate.</p>
+
+<p><tt>!tbaa.struct</tt> metadata can describe which memory subregions in a memcpy
+are padding and what the TBAA tags of the struct are.</p>
+
+<p>The current metadata format is very simple. <tt>!tbaa.struct</tt> metadata nodes
+ are a list of operands in conceptual groups of three. For each group of
+ three, the first operand gives a field's offset in bytes, the second gives
+ its size in bytes, and the third gives its tbaa tag. For example:</p>
+
+<div class="doc_code">
+<pre>
+!4 = metadata !{ i64 0, i64 4, metadata !1, i64 8, i64 4, metadata !2 }
+</pre>
+</div>
+
+<p>This describes a struct with two fields. The first is at offset 0 bytes
+ with size 4 bytes, and has tbaa tag !1. The second is at offset 8 bytes
+ and has size 4 bytes and has tbaa tag !2.</p>
+
+<p>Note that the fields need not be contiguous. In this example, there is a
+ 4 byte gap between the two fields. This gap represents padding which
+ does not carry useful data and need not be preserved.</p>
+
+</div>
+
+<!-- _______________________________________________________________________ -->
+<h4>
<a name="fpmath">'<tt>fpmath</tt>' Metadata</a>
</h4>
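As a companion to the new LangRef text above, here is a hedged C++ sketch of
how a front end might build that example !tbaa.struct node and attach it to a
memcpy call using the 2012-era Value-based metadata API. The helper function
is hypothetical; Tag1 and Tag2 stand in for the !1 and !2 tbaa tags:

    #include "llvm/Constants.h"
    #include "llvm/Instructions.h"
    #include "llvm/LLVMContext.h"
    #include "llvm/Metadata.h"
    #include "llvm/Type.h"

    static void addTBAAStruct(llvm::CallInst *MemCpy,
                              llvm::MDNode *Tag1, llvm::MDNode *Tag2) {
      using namespace llvm;
      LLVMContext &Ctx = MemCpy->getContext();
      Type *I64 = Type::getInt64Ty(Ctx);
      // Operands come in (offset, size, tag) triples, as described above:
      // a 4-byte field at offset 0 and a 4-byte field at offset 8.
      Value *Ops[] = {
        ConstantInt::get(I64, 0), ConstantInt::get(I64, 4), Tag1,
        ConstantInt::get(I64, 8), ConstantInt::get(I64, 4), Tag2,
      };
      MemCpy->setMetadata("tbaa.struct", MDNode::get(Ctx, Ops));
    }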
diff --git a/docs/programming.rst b/docs/programming.rst
index 27e43014ee..5d000cb3c2 100644
--- a/docs/programming.rst
+++ b/docs/programming.rst
@@ -8,12 +8,17 @@ Programming Documentation
CodingStandards
CommandLine
+ Atomics
* `LLVM Language Reference Manual <LangRef.html>`_
Defines the LLVM intermediate representation and the assembly form of the
different nodes.
+* :ref:`atomics`
+
+ Information about LLVM's concurrency model.
+
* `The LLVM Programmers Manual <ProgrammersManual.html>`_
Introduction to the general layout of the LLVM sourcebase, important classes
diff --git a/docs/subsystems.rst b/docs/subsystems.rst
index be33295a15..6c20a61c60 100644
--- a/docs/subsystems.rst
+++ b/docs/subsystems.rst
@@ -15,6 +15,7 @@ Subsystem Documentation
LinkTimeOptimization
SegmentedStacks
TableGenFundamentals
+ DebuggingJITedCode
* `Writing an LLVM Pass <WritingAnLLVMPass.html>`_
@@ -78,7 +79,7 @@ Subsystem Documentation
How to build your programs with link-time optimization on Linux.
-* `The GDB JIT interface <DebuggingJITedCode.html>`_
+* :ref:`debugging-jited-code`
How to debug JITed code with GDB.
diff --git a/docs/userguides.rst b/docs/userguides.rst
index 26a5a8ccc2..c7197ef628 100644
--- a/docs/userguides.rst
+++ b/docs/userguides.rst
@@ -13,6 +13,8 @@ User Guides
FAQ
Lexicon
Packaging
+ HowToAddABuilder
+ yaml2obj
* `The LLVM Getting Started Guide <GettingStarted.html>`_
@@ -78,7 +80,7 @@ User Guides
Definition of acronyms, terms and concepts used in LLVM.
-* `How To Add Your Build Configuration To LLVM Buildbot Infrastructure <HowToAddABuilder.html>`_
+* :ref:`how_to_add_a_builder`
   Instructions for adding a new builder to the LLVM buildbot master.
diff --git a/docs/yaml2obj.rst b/docs/yaml2obj.rst
index cb59162e5a..d051e7e22c 100644
--- a/docs/yaml2obj.rst
+++ b/docs/yaml2obj.rst
@@ -6,9 +6,9 @@ yaml2obj
yaml2obj takes a YAML description of an object file and converts it to a binary
file.
- $ yaml2py input-file
+ $ yaml2obj input-file
-.. program:: yaml2py
+.. program:: yaml2obj
Outputs the binary to stdout.
diff --git a/include/llvm-c/Core.h b/include/llvm-c/Core.h
index 6587e77080..4d3374958b 100644
--- a/include/llvm-c/Core.h
+++ b/include/llvm-c/Core.h
@@ -1869,6 +1869,27 @@ LLVMValueRef LLVMMDNode(LLVMValueRef *Vals, unsigned Count);
const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len);
/**
+ * Obtain the number of operands from an MDNode value.
+ *
+ * @param V MDNode to get number of operands from.
+ * @return Number of operands of the MDNode.
+ */
+unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V);
+
+/**
+ * Obtain the given MDNode's operands.
+ *
+ * The passed LLVMValueRef pointer should point to enough memory to hold all of
+ * the operands of the given MDNode (see LLVMGetMDNodeNumOperands) as
+ * LLVMValueRefs. This memory will be populated with the LLVMValueRefs of the
+ * MDNode's operands.
+ *
+ * @param V MDNode to get the operands from.
+ * @param Dest Destination array for operands.
+ */
+void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest);
+
+/**
* @}
*/
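A minimal usage sketch for the two new C API functions above, assuming MD is an LLVMValueRef wrapping an MDNode (the helper name is hypothetical):

    #include "llvm-c/Core.h"
    #include <vector>

    void collectMDOperands(LLVMValueRef MD) {
      // Size the buffer first, as the doc comment above requires.
      unsigned N = LLVMGetMDNodeNumOperands(MD);
      std::vector<LLVMValueRef> Ops(N);
      if (N)
        LLVMGetMDNodeOperands(MD, &Ops[0]);
      // Ops[0..N-1] now hold the MDNode's operands.
    }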
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 7f20e3356a..4470534e04 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -251,7 +251,7 @@ public:
/// constructor.
APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[]);
- /// This constructor interprets the string \arg str in the given radix. The
+ /// This constructor interprets the string \p str in the given radix. The
/// interpretation stops when the first character that is not suitable for the
/// radix is encountered, or the end of the string. Acceptable radix values
/// are 2, 8, 10, 16, and 36. It is an error for the value implied by the
@@ -1231,7 +1231,7 @@ public:
}
/// This method determines how many bits are required to hold the APInt
- /// equivalent of the string given by \arg str.
+ /// equivalent of the string given by \p str.
/// @brief Get bits required for string value.
static unsigned getBitsNeeded(StringRef str, uint8_t radix);
diff --git a/include/llvm/ADT/DAGDeltaAlgorithm.h b/include/llvm/ADT/DAGDeltaAlgorithm.h
index e502ac4348..2dfed075de 100644
--- a/include/llvm/ADT/DAGDeltaAlgorithm.h
+++ b/include/llvm/ADT/DAGDeltaAlgorithm.h
@@ -48,17 +48,18 @@ public:
public:
virtual ~DAGDeltaAlgorithm() {}
- /// Run - Minimize the DAG formed by the \arg Changes vertices and the \arg
- /// Dependencies edges by executing \see ExecuteOneTest() on subsets of
+ /// Run - Minimize the DAG formed by the \p Changes vertices and the
+ /// \p Dependencies edges by executing \see ExecuteOneTest() on subsets of
/// changes and returning the smallest set which still satisfies the test
- /// predicate and the input \arg Dependencies.
+ /// predicate and the input \p Dependencies.
///
/// \param Changes The list of changes.
///
/// \param Dependencies The list of dependencies amongst changes. For each
- /// (x,y) in \arg Dependencies, both x and y must be in \arg Changes. The
- /// minimization algorithm guarantees that for each tested changed set S, x
- /// \in S implies y \in S. It is an error to have cyclic dependencies.
+ /// (x,y) in \p Dependencies, both x and y must be in \p Changes. The
+ /// minimization algorithm guarantees that for each tested changed set S,
+ /// \f$ x \in S \f$ implies \f$ y \in S \f$. It is an error to have cyclic
+ /// dependencies.
changeset_ty Run(const changeset_ty &Changes,
const std::vector<edge_ty> &Dependencies);
@@ -67,7 +68,7 @@ public:
const changesetlist_ty &Sets,
const changeset_ty &Required) {}
- /// ExecuteOneTest - Execute a single test predicate on the change set \arg S.
+ /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
virtual bool ExecuteOneTest(const changeset_ty &S) = 0;
};
diff --git a/include/llvm/ADT/DeltaAlgorithm.h b/include/llvm/ADT/DeltaAlgorithm.h
index 45ba19891d..7bf7960c63 100644
--- a/include/llvm/ADT/DeltaAlgorithm.h
+++ b/include/llvm/ADT/DeltaAlgorithm.h
@@ -45,23 +45,23 @@ private:
/// since we always reduce following a success.
std::set<changeset_ty> FailedTestsCache;
- /// GetTestResult - Get the test result for the \arg Changes from the
+ /// GetTestResult - Get the test result for the \p Changes from the
/// cache, executing the test if necessary.
///
/// \param Changes - The change set to test.
/// \return - The test result.
bool GetTestResult(const changeset_ty &Changes);
- /// Split - Partition a set of changes \arg S into one or two subsets.
+ /// Split - Partition a set of changes \p S into one or two subsets.
void Split(const changeset_ty &S, changesetlist_ty &Res);
- /// Delta - Minimize a set of \arg Changes which has been partioned into
+ /// Delta - Minimize a set of \p Changes which has been partioned into
/// smaller sets, by attempting to remove individual subsets.
changeset_ty Delta(const changeset_ty &Changes,
const changesetlist_ty &Sets);
- /// Search - Search for a subset (or subsets) in \arg Sets which can be
- /// removed from \arg Changes while still satisfying the predicate.
+ /// Search - Search for a subset (or subsets) in \p Sets which can be
+ /// removed from \p Changes while still satisfying the predicate.
///
/// \param Res - On success, a subset of Changes which satisfies the
/// predicate.
@@ -74,13 +74,13 @@ protected:
virtual void UpdatedSearchState(const changeset_ty &Changes,
const changesetlist_ty &Sets) {}
- /// ExecuteOneTest - Execute a single test predicate on the change set \arg S.
+ /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
virtual bool ExecuteOneTest(const changeset_ty &S) = 0;
public:
virtual ~DeltaAlgorithm();
- /// Run - Minimize the set \arg Changes by executing \see ExecuteOneTest() on
+ /// Run - Minimize the set \p Changes by executing \see ExecuteOneTest() on
/// subsets of changes and returning the smallest set which still satisfies
/// the test predicate.
changeset_ty Run(const changeset_ty &Changes);
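A hedged sketch of the intended subclass-and-Run usage; the test predicate here ("the set still contains both change 3 and change 7") is invented for illustration:

    #include "llvm/ADT/DeltaAlgorithm.h"

    namespace {
    // A change set is "interesting" iff it still contains both 3 and 7.
    class PairDelta : public llvm::DeltaAlgorithm {
    protected:
      virtual bool ExecuteOneTest(const changeset_ty &S) {
        return S.count(3) && S.count(7);
      }
    };
    }

    llvm::DeltaAlgorithm::changeset_ty minimize() {
      PairDelta D;
      llvm::DeltaAlgorithm::changeset_ty Changes;
      for (unsigned i = 0; i != 10; ++i)
        Changes.insert(i);
      return D.Run(Changes); // the minimal interesting set: {3, 7}
    }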
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index f60d688c0d..cbcf7892c9 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -423,6 +423,7 @@ private:
this->grow(NumBuckets);
LookupBucketFor(Key, TheBucket);
}
+ assert(TheBucket);
// Only update the state after we've grown our bucket space appropriately
// so that when growing buckets we have self-consistent entry count.
diff --git a/include/llvm/ADT/EquivalenceClasses.h b/include/llvm/ADT/EquivalenceClasses.h
index 771476c303..1d81772ee8 100644
--- a/include/llvm/ADT/EquivalenceClasses.h
+++ b/include/llvm/ADT/EquivalenceClasses.h
@@ -33,6 +33,7 @@ namespace llvm {
///
/// Here is a simple example using integers:
///
+/// \code
/// EquivalenceClasses<int> EC;
/// EC.unionSets(1, 2); // insert 1, 2 into the same set
/// EC.insert(4); EC.insert(5); // insert 4, 5 into own sets
@@ -46,6 +47,7 @@ namespace llvm {
/// cerr << *MI << " "; // Print member.
/// cerr << "\n"; // Finish set.
/// }
+/// \endcode
///
/// This example prints:
/// 4
diff --git a/include/llvm/ADT/Hashing.h b/include/llvm/ADT/Hashing.h
index 23633045ff..cda31a261d 100644
--- a/include/llvm/ADT/Hashing.h
+++ b/include/llvm/ADT/Hashing.h
@@ -710,7 +710,7 @@ hash_code hash_combine(const T1 &arg1) {
#endif
-// Implementation details for implementatinos of hash_value overloads provided
+// Implementation details for implementations of hash_value overloads provided
// here.
namespace hashing {
namespace detail {
diff --git a/include/llvm/ADT/ImmutableList.h b/include/llvm/ADT/ImmutableList.h
index d7c0074a9f..20bdd903f7 100644
--- a/include/llvm/ADT/ImmutableList.h
+++ b/include/llvm/ADT/ImmutableList.h
@@ -33,9 +33,8 @@ class ImmutableListImpl : public FoldingSetNode {
friend class ImmutableListFactory<T>;
- // Do not implement.
- void operator=(const ImmutableListImpl&);
- ImmutableListImpl(const ImmutableListImpl&);
+ void operator=(const ImmutableListImpl&) LLVM_DELETED_FUNCTION;
+ ImmutableListImpl(const ImmutableListImpl&) LLVM_DELETED_FUNCTION;
public:
const T& getHead() const { return Head; }
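This LLVM_DELETED_FUNCTION change recurs throughout the patch. The macro (from llvm/Support/Compiler.h) expands to the C++11 "= delete" when the compiler supports it and to nothing otherwise, in which case the still-missing definition turns accidental copies into link errors instead of compile errors. A minimal sketch of the idiom:

    #include "llvm/Support/Compiler.h"

    class NonCopyable {
      NonCopyable(const NonCopyable &) LLVM_DELETED_FUNCTION;
      void operator=(const NonCopyable &) LLVM_DELETED_FUNCTION;
    public:
      NonCopyable() {}
    };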
diff --git a/include/llvm/ADT/ImmutableMap.h b/include/llvm/ADT/ImmutableMap.h
index 8346ffabff..4883c5ba0a 100644
--- a/include/llvm/ADT/ImmutableMap.h
+++ b/include/llvm/ADT/ImmutableMap.h
@@ -122,8 +122,8 @@ public:
}
private:
- Factory(const Factory& RHS); // DO NOT IMPLEMENT
- void operator=(const Factory& RHS); // DO NOT IMPLEMENT
+ Factory(const Factory& RHS) LLVM_DELETED_FUNCTION;
+ void operator=(const Factory& RHS) LLVM_DELETED_FUNCTION;
};
bool contains(key_type_ref K) const {
diff --git a/include/llvm/ADT/ImmutableSet.h b/include/llvm/ADT/ImmutableSet.h
index 949dc44dab..261d0494e2 100644
--- a/include/llvm/ADT/ImmutableSet.h
+++ b/include/llvm/ADT/ImmutableSet.h
@@ -22,7 +22,6 @@
#include <cassert>
#include <functional>
#include <vector>
-#include <stdio.h>
namespace llvm {
@@ -84,12 +83,12 @@ public:
}
return NULL;
}
-
+
/// getMaxElement - Find the subtree associated with the highest ranged
/// key value.
ImutAVLTree* getMaxElement() {
ImutAVLTree *T = this;
- ImutAVLTree *Right = T->getRight();
+ ImutAVLTree *Right = T->getRight();
     while (Right) { T = Right; Right = T->getRight(); }
return T;
}
@@ -258,7 +257,7 @@ private:
/// method returns false for an instance of ImutAVLTree, all subtrees
/// will also have this method return false. The converse is not true.
bool isMutable() const { return IsMutable; }
-
+
/// hasCachedDigest - Returns true if the digest for this tree is cached.
/// This can only be true if the tree is immutable.
bool hasCachedDigest() const { return IsDigestCached; }
@@ -280,7 +279,7 @@ private:
assert(isMutable() && "Mutable flag already removed.");
IsMutable = false;
}
-
+
/// markedCachedDigest - Clears the NoCachedDigest flag for a tree.
void markedCachedDigest() {
assert(!hasCachedDigest() && "NoCachedDigest flag already removed.");
@@ -349,7 +348,7 @@ public:
else
factory->Cache[factory->maskCacheIndex(computeDigest())] = next;
}
-
+
// We need to clear the mutability bit in case we are
// destroying the node as part of a sweep in ImutAVLFactory::recoverNodes().
IsMutable = false;
@@ -415,7 +414,7 @@ public:
TreeTy* getEmptyTree() const { return NULL; }
protected:
-
+
//===--------------------------------------------------===//
// A bunch of quick helper functions used for reasoning
// about the properties of trees and their children.
@@ -461,7 +460,7 @@ protected:
// returned to the caller.
//===--------------------------------------------------===//
- TreeTy* createNode(TreeTy* L, value_type_ref V, TreeTy* R) {
+ TreeTy* createNode(TreeTy* L, value_type_ref V, TreeTy* R) {
BumpPtrAllocator& A = getAllocator();
TreeTy* T;
if (!freeNodes.empty()) {
@@ -469,8 +468,7 @@ protected:
freeNodes.pop_back();
assert(T != L);
assert(T != R);
- }
- else {
+ } else {
T = (TreeTy*) A.Allocate<TreeTy>();
}
new (T) TreeTy(this, L, R, V, incrementHeight(L,R));
@@ -513,7 +511,8 @@ protected:
return createNode(createNode(LL,L,LRL), LR, createNode(LRR,V,R));
}
- else if (hr > hl + 2) {
+
+ if (hr > hl + 2) {
assert(!isEmpty(R) && "Right tree cannot be empty to have a height >= 2");
TreeTy *RL = getLeft(R);
@@ -529,8 +528,8 @@ protected:
return createNode(createNode(L,V,RLL), RL, createNode(RLR,R,RR));
}
- else
- return createNode(L,V,R);
+
+ return createNode(L,V,R);
}
/// add_internal - Creates a new tree that includes the specified
@@ -604,7 +603,7 @@ protected:
markImmutable(getLeft(T));
markImmutable(getRight(T));
}
-
+
public:
TreeTy *getCanonicalTree(TreeTy *TNew) {
if (!TNew)
@@ -937,7 +936,7 @@ public:
private:
TreeTy *Root;
-
+
public:
/// Constructs a set from a pointer to a tree root. In general one
/// should use a Factory object to create sets instead of directly
@@ -1006,10 +1005,10 @@ public:
typename TreeTy::Factory *getTreeFactory() const {
return const_cast<typename TreeTy::Factory *>(&F);
}
-
+
private:
- Factory(const Factory& RHS); // DO NOT IMPLEMENT
- void operator=(const Factory& RHS); // DO NOT IMPLEMENT
+ Factory(const Factory& RHS) LLVM_DELETED_FUNCTION;
+ void operator=(const Factory& RHS) LLVM_DELETED_FUNCTION;
};
friend class Factory;
@@ -1027,11 +1026,11 @@ public:
return Root && RHS.Root ? Root->isNotEqual(*RHS.Root) : Root != RHS.Root;
}
- TreeTy *getRoot() {
+ TreeTy *getRoot() {
if (Root) { Root->retain(); }
return Root;
}
-
+
TreeTy *getRootWithoutRetain() const {
return Root;
}
@@ -1092,7 +1091,7 @@ public:
void validateTree() const { if (Root) Root->validateTree(); }
};
-
+
// NOTE: This may some day replace the current ImmutableSet.
template <typename ValT, typename ValInfo = ImutContainerInfo<ValT> >
class ImmutableSetRef {
@@ -1101,11 +1100,11 @@ public:
typedef typename ValInfo::value_type_ref value_type_ref;
typedef ImutAVLTree<ValInfo> TreeTy;
typedef typename TreeTy::Factory FactoryTy;
-
+
private:
TreeTy *Root;
FactoryTy *Factory;
-
+
public:
/// Constructs a set from a pointer to a tree root. In general one
/// should use a Factory object to create sets instead of directly
@@ -1133,44 +1132,44 @@ public:
~ImmutableSetRef() {
if (Root) { Root->release(); }
}
-
+
static inline ImmutableSetRef getEmptySet(FactoryTy *F) {
return ImmutableSetRef(0, F);
}
-
+
ImmutableSetRef add(value_type_ref V) {
return ImmutableSetRef(Factory->add(Root, V), Factory);
}
-
+
ImmutableSetRef remove(value_type_ref V) {
return ImmutableSetRef(Factory->remove(Root, V), Factory);
}
-
+
/// Returns true if the set contains the specified value.
bool contains(value_type_ref V) const {
return Root ? Root->contains(V) : false;
}
-
+
ImmutableSet<ValT> asImmutableSet(bool canonicalize = true) const {
return ImmutableSet<ValT>(canonicalize ?
Factory->getCanonicalTree(Root) : Root);
}
-
+
TreeTy *getRootWithoutRetain() const {
return Root;
}
-
+
bool operator==(const ImmutableSetRef &RHS) const {
return Root && RHS.Root ? Root->isEqual(*RHS.Root) : Root == RHS.Root;
}
-
+
bool operator!=(const ImmutableSetRef &RHS) const {
return Root && RHS.Root ? Root->isNotEqual(*RHS.Root) : Root != RHS.Root;
}
/// isEmpty - Return true if the set contains no elements.
bool isEmpty() const { return !Root; }
-
+
/// isSingleton - Return true if the set contains exactly one element.
/// This method runs in constant time.
bool isSingleton() const { return getHeight() == 1; }
@@ -1178,7 +1177,7 @@ public:
//===--------------------------------------------------===//
// Iterators.
//===--------------------------------------------------===//
-
+
class iterator {
typename TreeTy::iterator itr;
iterator(TreeTy* t) : itr(t) {}
@@ -1194,28 +1193,28 @@ public:
inline bool operator!=(const iterator& RHS) const { return RHS.itr != itr; }
inline value_type *operator->() const { return &(operator*()); }
};
-
+
iterator begin() const { return iterator(Root); }
iterator end() const { return iterator(); }
-
+
//===--------------------------------------------------===//
// Utility methods.
//===--------------------------------------------------===//
-
+
unsigned getHeight() const { return Root ? Root->getHeight() : 0; }
-
+
static inline void Profile(FoldingSetNodeID& ID, const ImmutableSetRef& S) {
ID.AddPointer(S.Root);
}
-
+
inline void Profile(FoldingSetNodeID& ID) const {
return Profile(ID,*this);
}
-
+
//===--------------------------------------------------===//
// For testing.
//===--------------------------------------------------===//
-
+
void validateTree() const { if (Root) Root->validateTree(); }
};
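As a minimal usage sketch of the persistent-set semantics behind the cleanups above (the function name is hypothetical):

    #include "llvm/ADT/ImmutableSet.h"

    void demoImmutableSet() {
      llvm::ImmutableSet<int>::Factory F;
      llvm::ImmutableSet<int> Empty = F.getEmptySet();
      llvm::ImmutableSet<int> S1 = F.add(Empty, 1); // Empty is unchanged
      llvm::ImmutableSet<int> S2 = F.add(S1, 2);    // S1 is unchanged
      bool Has1 = S2.contains(1);                   // true
      llvm::ImmutableSet<int> S3 = F.remove(S2, 1); // S2 is unchanged
      (void)Has1; (void)S3;
    }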
diff --git a/include/llvm/ADT/MapVector.h b/include/llvm/ADT/MapVector.h
new file mode 100644
index 0000000000..bad207baa9
--- /dev/null
+++ b/include/llvm/ADT/MapVector.h
@@ -0,0 +1,80 @@
+//===- llvm/ADT/MapVector.h - Map with deterministic value order -*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a map that provides insertion order iteration. The
+// interface is purposefully minimal. The key is assumed to be cheap to copy
+// and 2 copies are kept, one for indexing in a DenseMap, one for iteration in
+// and two copies are kept: one for indexing in a DenseMap, one for iteration
+// in a std::vector.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_MAPVECTOR_H
+#define LLVM_ADT_MAPVECTOR_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include <vector>
+
+namespace llvm {
+
+/// This class implements a map that also provides access to all stored values
+/// in a deterministic order. The values are kept in a std::vector and the
+/// mapping is done with a DenseMap from keys to indexes in that vector.
+template<typename KeyT, typename ValueT>
+class MapVector {
+ typedef llvm::DenseMap<KeyT, unsigned> MapType;
+ typedef std::vector<std::pair<KeyT, ValueT> > VectorType;
+ typedef typename VectorType::size_type SizeType;
+
+ MapType Map;
+ VectorType Vector;
+
+public:
+ typedef typename VectorType::iterator iterator;
+ typedef typename VectorType::const_iterator const_iterator;
+
+ SizeType size() const {
+ return Vector.size();
+ }
+
+ iterator begin() {
+ return Vector.begin();
+ }
+
+ const_iterator begin() const {
+ return Vector.begin();
+ }
+
+ iterator end() {
+ return Vector.end();
+ }
+
+ const_iterator end() const {
+ return Vector.end();
+ }
+
+ bool empty() const {
+ return Vector.empty();
+ }
+
+ ValueT &operator[](const KeyT &Key) {
+ std::pair<KeyT, unsigned> Pair = std::make_pair(Key, 0);
+ std::pair<typename MapType::iterator, bool> Result = Map.insert(Pair);
+ unsigned &I = Result.first->second;
+ if (Result.second) {
+ Vector.push_back(std::make_pair(Key, ValueT()));
+ I = Vector.size() - 1;
+ }
+ return Vector[I].second;
+ }
+};
+
+}
+
+#endif
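A usage sketch for the new container; unlike iterating a plain DenseMap, iteration here follows insertion order (the function name is hypothetical):

    #include "llvm/ADT/MapVector.h"
    #include <cstdio>

    void demoMapVector() {
      llvm::MapVector<unsigned, const char *> MV;
      MV[30] = "thirty";
      MV[10] = "ten";
      MV[20] = "twenty";
      MV[10] = "TEN"; // updates in place; does not change the order
      for (llvm::MapVector<unsigned, const char *>::iterator
             I = MV.begin(), E = MV.end(); I != E; ++I)
        printf("%u -> %s\n", I->first, I->second);
      // Prints keys 30, 10, 20 -- the insertion order.
    }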
diff --git a/include/llvm/ADT/OwningPtr.h b/include/llvm/ADT/OwningPtr.h
index 6d9c305977..ea9495d386 100644
--- a/include/llvm/ADT/OwningPtr.h
+++ b/include/llvm/ADT/OwningPtr.h
@@ -14,6 +14,7 @@
#ifndef LLVM_ADT_OWNING_PTR_H
#define LLVM_ADT_OWNING_PTR_H
+#include "llvm/Support/Compiler.h"
#include <cassert>
#include <cstddef>
@@ -25,8 +26,8 @@ namespace llvm {
/// pointee object can be taken away from OwningPtr by using the take method.
template<class T>
class OwningPtr {
- OwningPtr(OwningPtr const &); // DO NOT IMPLEMENT
- OwningPtr &operator=(OwningPtr const &); // DO NOT IMPLEMENT
+ OwningPtr(OwningPtr const &) LLVM_DELETED_FUNCTION;
+ OwningPtr &operator=(OwningPtr const &) LLVM_DELETED_FUNCTION;
T *Ptr;
public:
explicit OwningPtr(T *P = 0) : Ptr(P) {}
@@ -79,8 +80,8 @@ inline void swap(OwningPtr<T> &a, OwningPtr<T> &b) {
/// functionality as OwningPtr, except that it works for array types.
template<class T>
class OwningArrayPtr {
- OwningArrayPtr(OwningArrayPtr const &); // DO NOT IMPLEMENT
- OwningArrayPtr &operator=(OwningArrayPtr const &); // DO NOT IMPLEMENT
+ OwningArrayPtr(OwningArrayPtr const &) LLVM_DELETED_FUNCTION;
+ OwningArrayPtr &operator=(OwningArrayPtr const &) LLVM_DELETED_FUNCTION;
T *Ptr;
public:
explicit OwningArrayPtr(T *P = 0) : Ptr(P) {}
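A minimal sketch of the ownership-transfer idiom the class documents: take() hands the raw pointer back and nulls the holder (the function name is hypothetical):

    #include "llvm/ADT/OwningPtr.h"

    void demoOwningPtr() {
      llvm::OwningPtr<int> P(new int(42)); // P owns the allocation
      int *Raw = P.take();                 // ownership moves out; P is now null
      // ... use Raw ...
      delete Raw;                          // the caller is now responsible
    }                                      // P destructs without deleting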
diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index a6803ee0ed..efddd9f9b8 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h
@@ -90,8 +90,8 @@ class ScopedHashTableScope {
/// LastValInScope - This is the last value that was inserted for this scope
/// or null if none have been inserted yet.
ScopedHashTableVal<K, V> *LastValInScope;
- void operator=(ScopedHashTableScope&); // DO NOT IMPLEMENT
- ScopedHashTableScope(ScopedHashTableScope&); // DO NOT IMPLEMENT
+ void operator=(ScopedHashTableScope&) LLVM_DELETED_FUNCTION;
+ ScopedHashTableScope(ScopedHashTableScope&) LLVM_DELETED_FUNCTION;
public:
ScopedHashTableScope(ScopedHashTable<K, V, KInfo, AllocatorTy> &HT);
~ScopedHashTableScope();
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 498a0345d8..3bb883088c 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -15,12 +15,13 @@
#ifndef LLVM_ADT_SMALLPTRSET_H
#define LLVM_ADT_SMALLPTRSET_H
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
#include <cassert>
#include <cstddef>
#include <cstring>
#include <iterator>
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/PointerLikeTypeTraits.h"
namespace llvm {
@@ -132,7 +133,7 @@ private:
/// Grow - Allocate a larger backing store for the buckets and move it over.
void Grow(unsigned NewSize);
- void operator=(const SmallPtrSetImpl &RHS); // DO NOT IMPLEMENT.
+ void operator=(const SmallPtrSetImpl &RHS) LLVM_DELETED_FUNCTION;
protected:
/// swap - Swaps the elements of two sets.
/// Note: This method assumes that both sets have the same small size.
diff --git a/include/llvm/ADT/SmallString.h b/include/llvm/ADT/SmallString.h
index c6f0a5bf15..8da99d1c12 100644
--- a/include/llvm/ADT/SmallString.h
+++ b/include/llvm/ADT/SmallString.h
@@ -44,25 +44,25 @@ public:
/// @name String Assignment
/// @{
- /// Assign from a repeated element
+ /// Assign from a repeated element.
void assign(size_t NumElts, char Elt) {
this->SmallVectorImpl<char>::assign(NumElts, Elt);
}
- /// Assign from an iterator pair
+ /// Assign from an iterator pair.
template<typename in_iter>
void assign(in_iter S, in_iter E) {
this->clear();
SmallVectorImpl<char>::append(S, E);
}
- /// Assign from a StringRef
+ /// Assign from a StringRef.
void assign(StringRef RHS) {
this->clear();
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
}
- /// Assign from a SmallVector
+ /// Assign from a SmallVector.
void assign(const SmallVectorImpl<char> &RHS) {
this->clear();
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
@@ -72,7 +72,7 @@ public:
/// @name String Concatenation
/// @{
- /// Append from an iterator pair
+ /// Append from an iterator pair.
template<typename in_iter>
void append(in_iter S, in_iter E) {
SmallVectorImpl<char>::append(S, E);
@@ -83,12 +83,12 @@ public:
}
- /// Append from a StringRef
+ /// Append from a StringRef.
void append(StringRef RHS) {
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
}
- /// Append from a SmallVector
+ /// Append from a SmallVector.
void append(const SmallVectorImpl<char> &RHS) {
SmallVectorImpl<char>::append(RHS.begin(), RHS.end());
}
@@ -97,19 +97,19 @@ public:
/// @name String Comparison
/// @{
- /// equals - Check for string equality, this is more efficient than
- /// compare() when the relative ordering of inequal strings isn't needed.
+ /// Check for string equality. This is more efficient than compare() when
+  /// the relative ordering of unequal strings isn't needed.
bool equals(StringRef RHS) const {
return str().equals(RHS);
}
- /// equals_lower - Check for string equality, ignoring case.
+ /// Check for string equality, ignoring case.
bool equals_lower(StringRef RHS) const {
return str().equals_lower(RHS);
}
- /// compare - Compare two strings; the result is -1, 0, or 1 if this string
- /// is lexicographically less than, equal to, or greater than the \arg RHS.
+ /// Compare two strings; the result is -1, 0, or 1 if this string is
+ /// lexicographically less than, equal to, or greater than the \p RHS.
int compare(StringRef RHS) const {
return str().compare(RHS);
}
@@ -129,12 +129,12 @@ public:
/// @name String Predicates
/// @{
- /// startswith - Check if this string starts with the given \arg Prefix.
+ /// startswith - Check if this string starts with the given \p Prefix.
bool startswith(StringRef Prefix) const {
return str().startswith(Prefix);
}
- /// endswith - Check if this string ends with the given \arg Suffix.
+ /// endswith - Check if this string ends with the given \p Suffix.
bool endswith(StringRef Suffix) const {
return str().endswith(Suffix);
}
@@ -143,76 +143,76 @@ public:
/// @name String Searching
/// @{
- /// find - Search for the first character \arg C in the string.
+  /// Search for the first character \p C in the string.
///
- /// \return - The index of the first occurrence of \arg C, or npos if not
+  /// \returns The index of the first occurrence of \p C, or npos if not
/// found.
size_t find(char C, size_t From = 0) const {
return str().find(C, From);
}
- /// find - Search for the first string \arg Str in the string.
+ /// Search for the first string \p Str in the string.
///
- /// \return - The index of the first occurrence of \arg Str, or npos if not
+ /// \returns The index of the first occurrence of \p Str, or npos if not
/// found.
size_t find(StringRef Str, size_t From = 0) const {
return str().find(Str, From);
}
- /// rfind - Search for the last character \arg C in the string.
+ /// Search for the last character \p C in the string.
///
- /// \return - The index of the last occurrence of \arg C, or npos if not
+ /// \returns The index of the last occurrence of \p C, or npos if not
/// found.
size_t rfind(char C, size_t From = StringRef::npos) const {
return str().rfind(C, From);
}
- /// rfind - Search for the last string \arg Str in the string.
+ /// Search for the last string \p Str in the string.
///
- /// \return - The index of the last occurrence of \arg Str, or npos if not
+ /// \returns The index of the last occurrence of \p Str, or npos if not
/// found.
size_t rfind(StringRef Str) const {
return str().rfind(Str);
}
- /// find_first_of - Find the first character in the string that is \arg C,
- /// or npos if not found. Same as find.
+ /// Find the first character in the string that is \p C, or npos if not
+ /// found. Same as find.
size_t find_first_of(char C, size_t From = 0) const {
return str().find_first_of(C, From);
}
- /// find_first_of - Find the first character in the string that is in \arg
- /// Chars, or npos if not found.
+ /// Find the first character in the string that is in \p Chars, or npos if
+ /// not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_t find_first_of(StringRef Chars, size_t From = 0) const {
return str().find_first_of(Chars, From);
}
- /// find_first_not_of - Find the first character in the string that is not
- /// \arg C or npos if not found.
+ /// Find the first character in the string that is not \p C or npos if not
+ /// found.
size_t find_first_not_of(char C, size_t From = 0) const {
return str().find_first_not_of(C, From);
}
- /// find_first_not_of - Find the first character in the string that is not
- /// in the string \arg Chars, or npos if not found.
+ /// Find the first character in the string that is not in the string
+ /// \p Chars, or npos if not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_t find_first_not_of(StringRef Chars, size_t From = 0) const {
return str().find_first_not_of(Chars, From);
}
- /// find_last_of - Find the last character in the string that is \arg C, or
- /// npos if not found.
+ /// Find the last character in the string that is \p C, or npos if not
+ /// found.
size_t find_last_of(char C, size_t From = StringRef::npos) const {
return str().find_last_of(C, From);
}
- /// find_last_of - Find the last character in the string that is in \arg C,
- /// or npos if not found.
+ /// Find the last character in the string that is in \p C, or npos if not
+ /// found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_t find_last_of(
StringRef Chars, size_t From = StringRef::npos) const {
return str().find_last_of(Chars, From);
@@ -222,13 +222,13 @@ public:
/// @name Helpful Algorithms
/// @{
- /// count - Return the number of occurrences of \arg C in the string.
+ /// Return the number of occurrences of \p C in the string.
size_t count(char C) const {
return str().count(C);
}
- /// count - Return the number of non-overlapped occurrences of \arg Str in
- /// the string.
+ /// Return the number of non-overlapped occurrences of \p Str in the
+ /// string.
size_t count(StringRef Str) const {
return str().count(Str);
}
@@ -237,36 +237,36 @@ public:
/// @name Substring Operations
/// @{
- /// substr - Return a reference to the substring from [Start, Start + N).
+ /// Return a reference to the substring from [Start, Start + N).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param N - The number of characters to included in the substring. If N
+  /// \param N The number of characters to include in the substring. If \p N
/// exceeds the number of characters remaining in the string, the string
- /// suffix (starting with \arg Start) will be returned.
+ /// suffix (starting with \p Start) will be returned.
StringRef substr(size_t Start, size_t N = StringRef::npos) const {
return str().substr(Start, N);
}
- /// slice - Return a reference to the substring from [Start, End).
+ /// Return a reference to the substring from [Start, End).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param End - The index following the last character to include in the
- /// substring. If this is npos, or less than \arg Start, or exceeds the
+ /// \param End The index following the last character to include in the
+ /// substring. If this is npos, or less than \p Start, or exceeds the
/// number of characters remaining in the string, the string suffix
- /// (starting with \arg Start) will be returned.
+ /// (starting with \p Start) will be returned.
StringRef slice(size_t Start, size_t End) const {
return str().slice(Start, End);
}
// Extra methods.
- /// Explicit conversion to StringRef
+ /// Explicit conversion to StringRef.
StringRef str() const { return StringRef(this->begin(), this->size()); }
// TODO: Make this const, if it's safe...
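A usage sketch tying together several of the forwarded operations above; SmallString<N> keeps up to N bytes inline before heap-allocating (the function name is hypothetical):

    #include "llvm/ADT/SmallString.h"

    void demoSmallString() {
      llvm::SmallString<64> Path;           // 64 bytes of inline storage
      Path.append("/tmp");
      Path.append("/out.ll");
      bool InTmp = Path.startswith("/tmp"); // true
      size_t Slash = Path.rfind('/');       // index of the last '/'
      llvm::StringRef Ext = Path.substr(Slash + 1); // "out.ll"
      (void)InTmp; (void)Ext;
    }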
diff --git a/include/llvm/ADT/SmallVector.h b/include/llvm/ADT/SmallVector.h
index 769b7dc9f5..6e0fd94dfe 100644
--- a/include/llvm/ADT/SmallVector.h
+++ b/include/llvm/ADT/SmallVector.h
@@ -684,8 +684,8 @@ public:
RHS.begin(), RHS.end());
}
- /// set_size - Set the array size to \arg N, which the current array must have
- /// enough capacity for.
+ /// Set the array size to \p N, which the current array must have enough
+ /// capacity for.
///
/// This does not construct or destroy any elements in the vector.
///
diff --git a/include/llvm/ADT/SparseSet.h b/include/llvm/ADT/SparseSet.h
index dc3db4ce1f..063c6755c6 100644
--- a/include/llvm/ADT/SparseSet.h
+++ b/include/llvm/ADT/SparseSet.h
@@ -128,8 +128,8 @@ class SparseSet {
// Disable copy construction and assignment.
// This data structure is not meant to be used that way.
- SparseSet(const SparseSet&); // DO NOT IMPLEMENT.
- SparseSet &operator=(const SparseSet&); // DO NOT IMPLEMENT.
+ SparseSet(const SparseSet&) LLVM_DELETED_FUNCTION;
+ SparseSet &operator=(const SparseSet&) LLVM_DELETED_FUNCTION;
public:
typedef ValueT value_type;
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 36df5acadb..bf27c4313f 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -21,7 +21,7 @@ namespace llvm {
template<typename T> class SmallVectorImpl;
/// hexdigit - Return the hexadecimal character for the
-/// given number \arg X (which should be less than 16).
+/// given number \p X (which should be less than 16).
static inline char hexdigit(unsigned X, bool LowerCase = false) {
const char HexChar = LowerCase ? 'a' : 'A';
return X < 10 ? '0' + X : HexChar + X - 10;
@@ -129,6 +129,25 @@ static inline unsigned HashString(StringRef Str, unsigned Result = 0) {
return Result;
}
+/// Returns the English suffix for an ordinal integer (-st, -nd, -rd, -th).
+static inline StringRef getOrdinalSuffix(unsigned Val) {
+ // It is critically important that we do this perfectly for
+ // user-written sequences with over 100 elements.
+ switch (Val % 100) {
+ case 11:
+ case 12:
+ case 13:
+ return "th";
+ default:
+ switch (Val % 10) {
+ case 1: return "st";
+ case 2: return "nd";
+ case 3: return "rd";
+ default: return "th";
+ }
+ }
+}
+
} // End llvm namespace
#endif
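A usage sketch for the new helper; pairing it with Twine is one natural way to format a full ordinal (the function name is hypothetical):

    #include "llvm/ADT/StringExtras.h"
    #include "llvm/ADT/Twine.h"
    #include <string>

    std::string ordinal(unsigned Val) {
      // 1 -> "1st", 2 -> "2nd", 11 -> "11th", 103 -> "103rd", 111 -> "111th"
      return (llvm::Twine(Val) + llvm::getOrdinalSuffix(Val)).str();
    }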
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index cd846031c5..292bde0cd9 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -138,7 +138,7 @@ namespace llvm {
}
/// compare - Compare two strings; the result is -1, 0, or 1 if this string
- /// is lexicographically less than, equal to, or greater than the \arg RHS.
+ /// is lexicographically less than, equal to, or greater than the \p RHS.
int compare(StringRef RHS) const {
// Check the prefix for a mismatch.
if (int Res = compareMemory(Data, RHS.Data, min(Length, RHS.Length)))
@@ -205,13 +205,13 @@ namespace llvm {
/// @name String Predicates
/// @{
- /// startswith - Check if this string starts with the given \arg Prefix.
+ /// Check if this string starts with the given \p Prefix.
bool startswith(StringRef Prefix) const {
return Length >= Prefix.Length &&
compareMemory(Data, Prefix.Data, Prefix.Length) == 0;
}
- /// endswith - Check if this string ends with the given \arg Suffix.
+ /// Check if this string ends with the given \p Suffix.
bool endswith(StringRef Suffix) const {
return Length >= Suffix.Length &&
compareMemory(end() - Suffix.Length, Suffix.Data, Suffix.Length) == 0;
@@ -221,9 +221,9 @@ namespace llvm {
/// @name String Searching
/// @{
- /// find - Search for the first character \arg C in the string.
+ /// Search for the first character \p C in the string.
///
- /// \return - The index of the first occurrence of \arg C, or npos if not
+ /// \returns The index of the first occurrence of \p C, or npos if not
/// found.
size_t find(char C, size_t From = 0) const {
for (size_t i = min(From, Length), e = Length; i != e; ++i)
@@ -232,15 +232,15 @@ namespace llvm {
return npos;
}
- /// find - Search for the first string \arg Str in the string.
+ /// Search for the first string \p Str in the string.
///
- /// \return - The index of the first occurrence of \arg Str, or npos if not
+ /// \returns The index of the first occurrence of \p Str, or npos if not
/// found.
size_t find(StringRef Str, size_t From = 0) const;
- /// rfind - Search for the last character \arg C in the string.
+ /// Search for the last character \p C in the string.
///
- /// \return - The index of the last occurrence of \arg C, or npos if not
+ /// \returns The index of the last occurrence of \p C, or npos if not
/// found.
size_t rfind(char C, size_t From = npos) const {
From = min(From, Length);
@@ -253,61 +253,61 @@ namespace llvm {
return npos;
}
- /// rfind - Search for the last string \arg Str in the string.
+ /// Search for the last string \p Str in the string.
///
- /// \return - The index of the last occurrence of \arg Str, or npos if not
+ /// \returns The index of the last occurrence of \p Str, or npos if not
/// found.
size_t rfind(StringRef Str) const;
- /// find_first_of - Find the first character in the string that is \arg C,
- /// or npos if not found. Same as find.
+ /// Find the first character in the string that is \p C, or npos if not
+ /// found. Same as find.
size_type find_first_of(char C, size_t From = 0) const {
return find(C, From);
}
- /// find_first_of - Find the first character in the string that is in \arg
- /// Chars, or npos if not found.
+ /// Find the first character in the string that is in \p Chars, or npos if
+ /// not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_first_of(StringRef Chars, size_t From = 0) const;
- /// find_first_not_of - Find the first character in the string that is not
- /// \arg C or npos if not found.
+ /// Find the first character in the string that is not \p C or npos if not
+ /// found.
size_type find_first_not_of(char C, size_t From = 0) const;
- /// find_first_not_of - Find the first character in the string that is not
- /// in the string \arg Chars, or npos if not found.
+ /// Find the first character in the string that is not in the string
+ /// \p Chars, or npos if not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_first_not_of(StringRef Chars, size_t From = 0) const;
- /// find_last_of - Find the last character in the string that is \arg C, or
- /// npos if not found.
+ /// Find the last character in the string that is \p C, or npos if not
+ /// found.
size_type find_last_of(char C, size_t From = npos) const {
return rfind(C, From);
}
- /// find_last_of - Find the last character in the string that is in \arg C,
- /// or npos if not found.
+ /// Find the last character in the string that is in \p C, or npos if not
+ /// found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_last_of(StringRef Chars, size_t From = npos) const;
- /// find_last_not_of - Find the last character in the string that is not
- /// \arg C, or npos if not found.
+ /// Find the last character in the string that is not \p C, or npos if not
+ /// found.
size_type find_last_not_of(char C, size_t From = npos) const;
- /// find_last_not_of - Find the last character in the string that is not in
- /// \arg Chars, or npos if not found.
+ /// Find the last character in the string that is not in \p Chars, or
+ /// npos if not found.
///
- /// Note: O(size() + Chars.size())
+ /// Complexity: O(size() + Chars.size())
size_type find_last_not_of(StringRef Chars, size_t From = npos) const;
/// @}
/// @name Helpful Algorithms
/// @{
- /// count - Return the number of occurrences of \arg C in the string.
+ /// Return the number of occurrences of \p C in the string.
size_t count(char C) const {
size_t Count = 0;
for (size_t i = 0, e = Length; i != e; ++i)
@@ -316,18 +316,17 @@ namespace llvm {
return Count;
}
- /// count - Return the number of non-overlapped occurrences of \arg Str in
+ /// Return the number of non-overlapped occurrences of \p Str in
/// the string.
size_t count(StringRef Str) const;
- /// getAsInteger - Parse the current string as an integer of the specified
- /// radix. If Radix is specified as zero, this does radix autosensing using
+ /// Parse the current string as an integer of the specified radix. If
+ /// \p Radix is specified as zero, this does radix autosensing using
/// extended C rules: 0 is octal, 0x is hex, 0b is binary.
///
/// If the string is invalid or if only a subset of the string is valid,
/// this returns true to signify the error. The string is considered
/// erroneous if empty or if it overflows T.
- ///
template <typename T>
typename enable_if_c<std::numeric_limits<T>::is_signed, bool>::type
getAsInteger(unsigned Radix, T &Result) const {
@@ -350,13 +349,12 @@ namespace llvm {
return false;
}
- /// getAsInteger - Parse the current string as an integer of the
- /// specified radix, or of an autosensed radix if the radix given
- /// is 0. The current value in Result is discarded, and the
- /// storage is changed to be wide enough to store the parsed
- /// integer.
+ /// Parse the current string as an integer of the specified \p Radix, or of
+ /// an autosensed radix if the \p Radix given is 0. The current value in
+ /// \p Result is discarded, and the storage is changed to be wide enough to
+ /// store the parsed integer.
///
- /// Returns true if the string does not solely consist of a valid
+ /// \returns true if the string does not solely consist of a valid
/// non-empty number in the appropriate base.
///
/// APInt::fromString is superficially similar but assumes the
@@ -367,70 +365,70 @@ namespace llvm {
/// @name String Operations
/// @{
- // lower - Convert the given ASCII string to lowercase.
+    /// Convert the given ASCII string to lowercase.
std::string lower() const;
- /// upper - Convert the given ASCII string to uppercase.
+ /// Convert the given ASCII string to uppercase.
std::string upper() const;
/// @}
/// @name Substring Operations
/// @{
- /// substr - Return a reference to the substring from [Start, Start + N).
+ /// Return a reference to the substring from [Start, Start + N).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param N - The number of characters to included in the substring. If N
+    /// \param N The number of characters to include in the substring. If N
/// exceeds the number of characters remaining in the string, the string
- /// suffix (starting with \arg Start) will be returned.
+ /// suffix (starting with \p Start) will be returned.
StringRef substr(size_t Start, size_t N = npos) const {
Start = min(Start, Length);
return StringRef(Data + Start, min(N, Length - Start));
}
- /// drop_front - Return a StringRef equal to 'this' but with the first
- /// elements dropped.
+ /// Return a StringRef equal to 'this' but with the first \p N elements
+ /// dropped.
StringRef drop_front(unsigned N = 1) const {
assert(size() >= N && "Dropping more elements than exist");
return substr(N);
}
- /// drop_back - Return a StringRef equal to 'this' but with the last
- /// elements dropped.
+ /// Return a StringRef equal to 'this' but with the last \p N elements
+ /// dropped.
StringRef drop_back(unsigned N = 1) const {
assert(size() >= N && "Dropping more elements than exist");
return substr(0, size()-N);
}
- /// slice - Return a reference to the substring from [Start, End).
+ /// Return a reference to the substring from [Start, End).
///
- /// \param Start - The index of the starting character in the substring; if
+ /// \param Start The index of the starting character in the substring; if
/// the index is npos or greater than the length of the string then the
/// empty substring will be returned.
///
- /// \param End - The index following the last character to include in the
- /// substring. If this is npos, or less than \arg Start, or exceeds the
+ /// \param End The index following the last character to include in the
+ /// substring. If this is npos, or less than \p Start, or exceeds the
/// number of characters remaining in the string, the string suffix
- /// (starting with \arg Start) will be returned.
+ /// (starting with \p Start) will be returned.
StringRef slice(size_t Start, size_t End) const {
Start = min(Start, Length);
End = min(max(Start, End), Length);
return StringRef(Data + Start, End - Start);
}
- /// split - Split into two substrings around the first occurrence of a
- /// separator character.
+ /// Split into two substrings around the first occurrence of a separator
+ /// character.
///
- /// If \arg Separator is in the string, then the result is a pair (LHS, RHS)
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
/// such that (*this == LHS + Separator + RHS) is true and RHS is
- /// maximal. If \arg Separator is not in the string, then the result is a
+ /// maximal. If \p Separator is not in the string, then the result is a
/// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
///
- /// \param Separator - The character to split on.
- /// \return - The split substrings.
+ /// \param Separator The character to split on.
+ /// \returns The split substrings.
std::pair<StringRef, StringRef> split(char Separator) const {
size_t Idx = find(Separator);
if (Idx == npos)
@@ -438,12 +436,12 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
}
- /// split - Split into two substrings around the first occurrence of a
- /// separator string.
+ /// Split into two substrings around the first occurrence of a separator
+ /// string.
///
- /// If \arg Separator is in the string, then the result is a pair (LHS, RHS)
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
/// such that (*this == LHS + Separator + RHS) is true and RHS is
- /// maximal. If \arg Separator is not in the string, then the result is a
+ /// maximal. If \p Separator is not in the string, then the result is a
/// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
///
/// \param Separator - The string to split on.
@@ -455,14 +453,13 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx + Separator.size(), npos));
}
- /// split - Split into substrings around the occurrences of a separator
- /// string.
+ /// Split into substrings around the occurrences of a separator string.
///
- /// Each substring is stored in \arg A. If \arg MaxSplit is >= 0, at most
- /// \arg MaxSplit splits are done and consequently <= \arg MaxSplit
+ /// Each substring is stored in \p A. If \p MaxSplit is >= 0, at most
+ /// \p MaxSplit splits are done and consequently <= \p MaxSplit
/// elements are added to A.
- /// If \arg KeepEmpty is false, empty strings are not added to \arg A. They
- /// still count when considering \arg MaxSplit
+ /// If \p KeepEmpty is false, empty strings are not added to \p A. They
+    /// still count when considering \p MaxSplit.
     /// A useful invariant is that
/// Separator.join(A) == *this if MaxSplit == -1 and KeepEmpty == true
///
@@ -474,12 +471,12 @@ namespace llvm {
StringRef Separator, int MaxSplit = -1,
bool KeepEmpty = true) const;
- /// rsplit - Split into two substrings around the last occurrence of a
- /// separator character.
+ /// Split into two substrings around the last occurrence of a separator
+ /// character.
///
- /// If \arg Separator is in the string, then the result is a pair (LHS, RHS)
+ /// If \p Separator is in the string, then the result is a pair (LHS, RHS)
/// such that (*this == LHS + Separator + RHS) is true and RHS is
- /// minimal. If \arg Separator is not in the string, then the result is a
+ /// minimal. If \p Separator is not in the string, then the result is a
/// pair (LHS, RHS) where (*this == LHS) and (RHS == "").
///
/// \param Separator - The character to split on.
@@ -491,20 +488,20 @@ namespace llvm {
return std::make_pair(slice(0, Idx), slice(Idx+1, npos));
}
- /// ltrim - Return string with consecutive characters in \arg Chars starting
- /// from the left removed.
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the left removed.
StringRef ltrim(StringRef Chars = " \t\n\v\f\r") const {
return drop_front(std::min(Length, find_first_not_of(Chars)));
}
- /// rtrim - Return string with consecutive characters in \arg Chars starting
- /// from the right removed.
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the right removed.
StringRef rtrim(StringRef Chars = " \t\n\v\f\r") const {
return drop_back(Length - std::min(Length, find_last_not_of(Chars) + 1));
}
- /// trim - Return string with consecutive characters in \arg Chars starting
- /// from the left and right removed.
+ /// Return string with consecutive characters in \p Chars starting from
+ /// the left and right removed.
StringRef trim(StringRef Chars = " \t\n\v\f\r") const {
return ltrim(Chars).rtrim(Chars);
}
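A sketch combining the operations documented above: trim, split on a separator, and radix-autosensing getAsInteger (the helper name and strings are hypothetical):

    #include "llvm/ADT/StringRef.h"
    #include <utility>

    bool parseSetting(llvm::StringRef Line, llvm::StringRef &Key,
                      unsigned &Val) {
      std::pair<llvm::StringRef, llvm::StringRef> KV = Line.trim().split('=');
      Key = KV.first;
      // Radix 0 autosenses: "0x1f" is hex, "017" octal, "0b101" binary.
      return !KV.second.getAsInteger(0, Val); // true on success
    }

    // parseSetting("  threshold=0x1f \n", K, V) yields K == "threshold"
    // and V == 31.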
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index ab1f0da51e..f701a793af 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -342,7 +342,7 @@ public:
/// to a known type.
void setEnvironment(EnvironmentType Kind);
- /// setTriple - Set all components to the new triple \arg Str.
+ /// setTriple - Set all components to the new triple \p Str.
void setTriple(const Twine &Str);
/// setArchName - Set the architecture (first) component of the
@@ -393,11 +393,10 @@ public:
/// @name Static helpers for IDs.
/// @{
- /// getArchTypeName - Get the canonical name for the \arg Kind
- /// architecture.
+ /// getArchTypeName - Get the canonical name for the \p Kind architecture.
static const char *getArchTypeName(ArchType Kind);
- /// getArchTypePrefix - Get the "prefix" canonical name for the \arg Kind
+ /// getArchTypePrefix - Get the "prefix" canonical name for the \p Kind
/// architecture. This is the prefix used by the architecture specific
/// builtins, and is suitable for passing to \see
/// Intrinsic::getIntrinsicForGCCBuiltin().
@@ -405,15 +404,13 @@ public:
/// \return - The architecture prefix, or 0 if none is defined.
static const char *getArchTypePrefix(ArchType Kind);
- /// getVendorTypeName - Get the canonical name for the \arg Kind
- /// vendor.
+ /// getVendorTypeName - Get the canonical name for the \p Kind vendor.
static const char *getVendorTypeName(VendorType Kind);
- /// getOSTypeName - Get the canonical name for the \arg Kind operating
- /// system.
+ /// getOSTypeName - Get the canonical name for the \p Kind operating system.
static const char *getOSTypeName(OSType Kind);
- /// getEnvironmentTypeName - Get the canonical name for the \arg Kind
+ /// getEnvironmentTypeName - Get the canonical name for the \p Kind
/// environment.
static const char *getEnvironmentTypeName(EnvironmentType Kind);
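A brief usage sketch for the accessors whose comments were touched above (the function name is hypothetical, and the commented results assume the usual parsing of these triples):

    #include "llvm/ADT/Triple.h"

    void demoTriple() {
      llvm::Triple T("x86_64-apple-darwin11");
      const char *Arch = llvm::Triple::getArchTypeName(T.getArch()); // "x86_64"
      const char *OS = llvm::Triple::getOSTypeName(T.getOS());       // "darwin"
      T.setTriple("armv7-none-linux-gnueabi"); // replace all components at once
      (void)Arch; (void)OS;
    }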
diff --git a/include/llvm/ADT/Twine.h b/include/llvm/ADT/Twine.h
index 9101df8cee..cc290d51d2 100644
--- a/include/llvm/ADT/Twine.h
+++ b/include/llvm/ADT/Twine.h
@@ -44,7 +44,7 @@ namespace llvm {
/// itself, and renders as an empty string. This can be returned from APIs to
/// effectively nullify any concatenations performed on the result.
///
- /// \b Implementation \n
+ /// \b Implementation
///
/// Given the nature of a Twine, it is not possible for the Twine's
/// concatenation method to construct interior nodes; the result must be
@@ -67,7 +67,7 @@ namespace llvm {
///
   /// These invariants are checked by \see isValid().
///
- /// \b Efficiency Considerations \n
+ /// \b Efficiency Considerations
///
/// The Twine is designed to yield efficient and small code for common
/// situations. For this reason, the concat() method is inlined so that
@@ -303,37 +303,37 @@ namespace llvm {
LHS.character = static_cast<char>(Val);
}
- /// Construct a twine to print \arg Val as an unsigned decimal integer.
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
explicit Twine(unsigned Val)
: LHSKind(DecUIKind), RHSKind(EmptyKind) {
LHS.decUI = Val;
}
- /// Construct a twine to print \arg Val as a signed decimal integer.
+ /// Construct a twine to print \p Val as a signed decimal integer.
explicit Twine(int Val)
: LHSKind(DecIKind), RHSKind(EmptyKind) {
LHS.decI = Val;
}
- /// Construct a twine to print \arg Val as an unsigned decimal integer.
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
explicit Twine(const unsigned long &Val)
: LHSKind(DecULKind), RHSKind(EmptyKind) {
LHS.decUL = &Val;
}
- /// Construct a twine to print \arg Val as a signed decimal integer.
+ /// Construct a twine to print \p Val as a signed decimal integer.
explicit Twine(const long &Val)
: LHSKind(DecLKind), RHSKind(EmptyKind) {
LHS.decL = &Val;
}
- /// Construct a twine to print \arg Val as an unsigned decimal integer.
+ /// Construct a twine to print \p Val as an unsigned decimal integer.
explicit Twine(const unsigned long long &Val)
: LHSKind(DecULLKind), RHSKind(EmptyKind) {
LHS.decULL = &Val;
}
- /// Construct a twine to print \arg Val as a signed decimal integer.
+ /// Construct a twine to print \p Val as a signed decimal integer.
explicit Twine(const long long &Val)
: LHSKind(DecLLKind), RHSKind(EmptyKind) {
LHS.decLL = &Val;
@@ -370,7 +370,7 @@ namespace llvm {
/// @name Numeric Conversions
/// @{
- // Construct a twine to print \arg Val as an unsigned hexadecimal integer.
+    /// Construct a twine to print \p Val as an unsigned hexadecimal integer.
static Twine utohexstr(const uint64_t &Val) {
Child LHS, RHS;
LHS.uHex = &Val;
@@ -447,17 +447,17 @@ namespace llvm {
/// The returned StringRef's size does not include the null terminator.
StringRef toNullTerminatedStringRef(SmallVectorImpl<char> &Out) const;
- /// print - Write the concatenated string represented by this twine to the
- /// stream \arg OS.
+ /// Write the concatenated string represented by this twine to the
+ /// stream \p OS.
void print(raw_ostream &OS) const;
- /// dump - Dump the concatenated string represented by this twine to stderr.
+ /// Dump the concatenated string represented by this twine to stderr.
void dump() const;
- /// print - Write the representation of this twine to the stream \arg OS.
+ /// Write the representation of this twine to the stream \p OS.
void printRepr(raw_ostream &OS) const;
- /// dumpRepr - Dump the representation of this twine to stderr.
+ /// Dump the representation of this twine to stderr.
void dumpRepr() const;
/// @}
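A usage sketch for the printing helpers above; a Twine defers concatenation, so render it once at the end into a buffer or stream (the function name is hypothetical):

    #include "llvm/ADT/SmallString.h"
    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/DataTypes.h"
    #include "llvm/Support/raw_ostream.h"

    void demoTwine(llvm::raw_ostream &OS) {
      uint64_t Addr = 0xdeadbeefULL; // utohexstr takes the value by reference
      llvm::SmallString<32> Buf;
      llvm::StringRef S =
          (llvm::Twine("addr=0x") + llvm::Twine::utohexstr(Addr))
              .toStringRef(Buf);
      OS << S;                       // "addr=0xDEADBEEF"
      (llvm::Twine("count=") + llvm::Twine(42)).print(OS);
    }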
diff --git a/include/llvm/ADT/ValueMap.h b/include/llvm/ADT/ValueMap.h
index f7e255181e..d23fccf3e8 100644
--- a/include/llvm/ADT/ValueMap.h
+++ b/include/llvm/ADT/ValueMap.h
@@ -80,8 +80,8 @@ class ValueMap {
typedef typename Config::ExtraData ExtraData;
MapT Map;
ExtraData Data;
- ValueMap(const ValueMap&); // DO NOT IMPLEMENT
- ValueMap& operator=(const ValueMap&); // DO NOT IMPLEMENT
+ ValueMap(const ValueMap&) LLVM_DELETED_FUNCTION;
+ ValueMap& operator=(const ValueMap&) LLVM_DELETED_FUNCTION;
public:
typedef KeyT key_type;
typedef ValueT mapped_type;
diff --git a/include/llvm/ADT/ilist.h b/include/llvm/ADT/ilist.h
index ba9864a98a..7f5cd17181 100644
--- a/include/llvm/ADT/ilist.h
+++ b/include/llvm/ADT/ilist.h
@@ -38,6 +38,7 @@
#ifndef LLVM_ADT_ILIST_H
#define LLVM_ADT_ILIST_H
+#include "llvm/Support/Compiler.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -331,8 +332,8 @@ class iplist : public Traits {
// No fundamental reason why iplist can't be copyable, but the default
// copy/copy-assign won't do.
- iplist(const iplist &); // do not implement
- void operator=(const iplist &); // do not implement
+ iplist(const iplist &) LLVM_DELETED_FUNCTION;
+ void operator=(const iplist &) LLVM_DELETED_FUNCTION;
public:
typedef NodeTy *pointer;
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index 95626d624a..1e606c81d9 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -109,7 +109,6 @@ class AliasSet : public ilist_node<AliasSet> {
PointerRec *PtrList, **PtrListEnd; // Doubly linked list of nodes.
AliasSet *Forward; // Forwarding pointer.
- AliasSet *Next, *Prev; // Doubly linked list of AliasSets.
// All instructions without a specific address in this alias set.
std::vector<AssertingVH<Instruction> > UnknownInsts;
@@ -226,8 +225,8 @@ private:
AccessTy(NoModRef), AliasTy(MustAlias), Volatile(false) {
}
- AliasSet(const AliasSet &AS); // do not implement
- void operator=(const AliasSet &AS); // do not implement
+ AliasSet(const AliasSet &AS) LLVM_DELETED_FUNCTION;
+ void operator=(const AliasSet &AS) LLVM_DELETED_FUNCTION;
PointerRec *getSomePointer() const {
return PtrList;
diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h
index fb77da7b69..6a9ed31037 100644
--- a/include/llvm/Analysis/CallGraph.h
+++ b/include/llvm/Analysis/CallGraph.h
@@ -185,9 +185,9 @@ private:
/// in the CalledFunctions array of this or other CallGraphNodes.
unsigned NumReferences;
- CallGraphNode(const CallGraphNode &); // DO NOT IMPLEMENT
- void operator=(const CallGraphNode &); // DO NOT IMPLEMENT
-
+ CallGraphNode(const CallGraphNode &) LLVM_DELETED_FUNCTION;
+ void operator=(const CallGraphNode &) LLVM_DELETED_FUNCTION;
+
void DropRef() { --NumReferences; }
void AddRef() { ++NumReferences; }
public:
diff --git a/include/llvm/Analysis/InlineCost.h b/include/llvm/Analysis/InlineCost.h
index 0cba135222..d58671f873 100644
--- a/include/llvm/Analysis/InlineCost.h
+++ b/include/llvm/Analysis/InlineCost.h
@@ -36,6 +36,9 @@ namespace llvm {
const int LastCallToStaticBonus = -15000;
const int ColdccPenalty = 2000;
const int NoreturnPenalty = 10000;
+ /// Do not inline functions which allocate this many bytes on the stack
+ /// when the caller is recursive.
+ const unsigned TotalAllocaSizeRecursiveCaller = 1024;
}
/// \brief Represents the cost of inlining a function.
diff --git a/include/llvm/Analysis/IntervalPartition.h b/include/llvm/Analysis/IntervalPartition.h
index df7313f18f..bce84be2f4 100644
--- a/include/llvm/Analysis/IntervalPartition.h
+++ b/include/llvm/Analysis/IntervalPartition.h
@@ -33,8 +33,8 @@ namespace llvm {
//
// IntervalPartition - This class builds and holds an "interval partition" for
// a function. This partition divides the control flow graph into a set of
-// maximal intervals, as defined with the properties above. Intuitively, a
-// BasicBlock is a (possibly nonexistent) loop with a "tail" of non looping
+// maximal intervals, as defined with the properties above. Intuitively, an
+// interval is a (possibly nonexistent) loop with a "tail" of non looping
// nodes following it.
//
class IntervalPartition : public FunctionPass {
diff --git a/include/llvm/Analysis/LazyValueInfo.h b/include/llvm/Analysis/LazyValueInfo.h
index 065c230fb2..7b887d5a9a 100644
--- a/include/llvm/Analysis/LazyValueInfo.h
+++ b/include/llvm/Analysis/LazyValueInfo.h
@@ -29,8 +29,8 @@ class LazyValueInfo : public FunctionPass {
class TargetData *TD;
class TargetLibraryInfo *TLI;
void *PImpl;
- LazyValueInfo(const LazyValueInfo&); // DO NOT IMPLEMENT.
- void operator=(const LazyValueInfo&); // DO NOT IMPLEMENT.
+ LazyValueInfo(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
+ void operator=(const LazyValueInfo&) LLVM_DELETED_FUNCTION;
public:
static char ID;
LazyValueInfo() : FunctionPass(ID), PImpl(0) {
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index eeb482d82a..c5d7b0128e 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -72,10 +72,9 @@ class LoopBase {
// Blocks - The list of blocks in this loop. First entry is the header node.
std::vector<BlockT*> Blocks;
- // DO NOT IMPLEMENT
- LoopBase(const LoopBase<BlockT, LoopT> &);
- // DO NOT IMPLEMENT
- const LoopBase<BlockT, LoopT>&operator=(const LoopBase<BlockT, LoopT> &);
+ LoopBase(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
+ const LoopBase<BlockT, LoopT>&
+ operator=(const LoopBase<BlockT, LoopT> &) LLVM_DELETED_FUNCTION;
public:
/// Loop ctor - This creates an empty loop.
LoopBase() : ParentLoop(0) {}
@@ -416,8 +415,8 @@ class LoopInfoBase {
friend class LoopBase<BlockT, LoopT>;
friend class LoopInfo;
- void operator=(const LoopInfoBase &); // do not implement
- LoopInfoBase(const LoopInfo &); // do not implement
+ void operator=(const LoopInfoBase &) LLVM_DELETED_FUNCTION;
+ LoopInfoBase(const LoopInfo &) LLVM_DELETED_FUNCTION;
public:
LoopInfoBase() { }
~LoopInfoBase() { releaseMemory(); }
@@ -550,8 +549,8 @@ class LoopInfo : public FunctionPass {
LoopInfoBase<BasicBlock, Loop> LI;
friend class LoopBase<BasicBlock, Loop>;
- void operator=(const LoopInfo &); // do not implement
- LoopInfo(const LoopInfo &); // do not implement
+ void operator=(const LoopInfo &) LLVM_DELETED_FUNCTION;
+ LoopInfo(const LoopInfo &) LLVM_DELETED_FUNCTION;
public:
static char ID; // Pass identification, replacement for typeid
diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h
index e62040e2ee..48d7ee6b54 100644
--- a/include/llvm/Analysis/RegionInfo.h
+++ b/include/llvm/Analysis/RegionInfo.h
@@ -54,10 +54,8 @@ class FlatIt {};
/// @brief A RegionNode represents a subregion or a BasicBlock that is part of a
/// Region.
class RegionNode {
- // DO NOT IMPLEMENT
- RegionNode(const RegionNode &);
- // DO NOT IMPLEMENT
- const RegionNode &operator=(const RegionNode &);
+ RegionNode(const RegionNode &) LLVM_DELETED_FUNCTION;
+ const RegionNode &operator=(const RegionNode &) LLVM_DELETED_FUNCTION;
protected:
/// This is the entry basic block that starts this region node. If this is a
@@ -203,10 +201,8 @@ inline Region* RegionNode::getNodeAs<Region>() const {
/// tree, the second one creates a graphical representation using graphviz.
class Region : public RegionNode {
friend class RegionInfo;
- // DO NOT IMPLEMENT
- Region(const Region &);
- // DO NOT IMPLEMENT
- const Region &operator=(const Region &);
+ Region(const Region &) LLVM_DELETED_FUNCTION;
+ const Region &operator=(const Region &) LLVM_DELETED_FUNCTION;
// Information necessary to manage this Region.
RegionInfo* RI;
@@ -565,10 +561,8 @@ class RegionInfo : public FunctionPass {
typedef DenseMap<BasicBlock*, Region*> BBtoRegionMap;
typedef SmallPtrSet<Region*, 4> RegionSet;
- // DO NOT IMPLEMENT
- RegionInfo(const RegionInfo &);
- // DO NOT IMPLEMENT
- const RegionInfo &operator=(const RegionInfo &);
+ RegionInfo(const RegionInfo &) LLVM_DELETED_FUNCTION;
+ const RegionInfo &operator=(const RegionInfo &) LLVM_DELETED_FUNCTION;
DominatorTree *DT;
PostDominatorTree *PDT;
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index c213ade5e8..c8c249a1b1 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -70,8 +70,8 @@ namespace llvm {
unsigned short SubclassData;
private:
- SCEV(const SCEV &); // DO NOT IMPLEMENT
- void operator=(const SCEV &); // DO NOT IMPLEMENT
+ SCEV(const SCEV &) LLVM_DELETED_FUNCTION;
+ void operator=(const SCEV &) LLVM_DELETED_FUNCTION;
public:
/// NoWrapFlags are bitfield indices into SubclassData.
diff --git a/include/llvm/Analysis/SparsePropagation.h b/include/llvm/Analysis/SparsePropagation.h
index c3c2f4b066..b758eca42e 100644
--- a/include/llvm/Analysis/SparsePropagation.h
+++ b/include/llvm/Analysis/SparsePropagation.h
@@ -130,9 +130,9 @@ class SparseSolver {
/// PHI nodes retriggered.
typedef std::pair<BasicBlock*,BasicBlock*> Edge;
std::set<Edge> KnownFeasibleEdges;
-
- SparseSolver(const SparseSolver&); // DO NOT IMPLEMENT
- void operator=(const SparseSolver&); // DO NOT IMPLEMENT
+
+ SparseSolver(const SparseSolver&) LLVM_DELETED_FUNCTION;
+ void operator=(const SparseSolver&) LLVM_DELETED_FUNCTION;
public:
explicit SparseSolver(AbstractLatticeFunction *Lattice)
: LatticeFunc(Lattice) {}
diff --git a/include/llvm/Attributes.h b/include/llvm/Attributes.h
index 0228d8691d..9dc2c1aa57 100644
--- a/include/llvm/Attributes.h
+++ b/include/llvm/Attributes.h
@@ -21,11 +21,13 @@
#include <string>
namespace llvm {
+
class Type;
namespace Attribute {
-/// We use this proxy POD type to allow constructing Attributes constants
-/// using initializer lists. Do not use this class directly.
+
+/// We use this proxy POD type to allow constructing Attributes constants using
+/// initializer lists. Do not use this class directly.
struct AttrConst {
uint64_t v;
AttrConst operator | (const AttrConst Attrs) const {
@@ -37,49 +39,6 @@ struct AttrConst {
return Res;
}
};
-} // namespace Attribute
-
-
-/// Attributes - A bitset of attributes.
-class Attributes {
- public:
- Attributes() : Bits(0) { }
- explicit Attributes(uint64_t Val) : Bits(Val) { }
- /*implicit*/ Attributes(Attribute::AttrConst Val) : Bits(Val.v) { }
- // This is a "safe bool() operator".
- operator const void *() const { return Bits ? this : 0; }
- bool isEmptyOrSingleton() const { return (Bits & (Bits - 1)) == 0; }
- bool operator == (const Attributes &Attrs) const {
- return Bits == Attrs.Bits;
- }
- bool operator != (const Attributes &Attrs) const {
- return Bits != Attrs.Bits;
- }
- Attributes operator | (const Attributes &Attrs) const {
- return Attributes(Bits | Attrs.Bits);
- }
- Attributes operator & (const Attributes &Attrs) const {
- return Attributes(Bits & Attrs.Bits);
- }
- Attributes operator ^ (const Attributes &Attrs) const {
- return Attributes(Bits ^ Attrs.Bits);
- }
- Attributes &operator |= (const Attributes &Attrs) {
- Bits |= Attrs.Bits;
- return *this;
- }
- Attributes &operator &= (const Attributes &Attrs) {
- Bits &= Attrs.Bits;
- return *this;
- }
- Attributes operator ~ () const { return Attributes(~Bits); }
- uint64_t Raw() const { return Bits; }
- private:
- // Currently, we need less than 64 bits.
- uint64_t Bits;
-};
-
-namespace Attribute {
/// Function parameters and results can have attributes to indicate how they
/// should be treated by optimizations and code generation. This enumeration
@@ -87,10 +46,10 @@ namespace Attribute {
/// results or the function itself.
/// @brief Function attributes.
-// We declare AttrConst objects that will be used throughout the code
-// and also raw uint64_t objects with _i suffix to be used below for other
-// constant declarations. This is done to avoid static CTORs and at the same
-// time to keep type-safety of Attributes.
+/// We declare AttrConst objects that will be used throughout the code and also
+/// raw uint64_t objects with _i suffix to be used below for other constant
+/// declarations. This is done to avoid static CTORs and at the same time to
+/// keep type-safety of Attributes.
#define DECLARE_LLVM_ATTRIBUTE(name, value) \
const uint64_t name##_i = value; \
const AttrConst name = {value};
DECLARE_LLVM_ATTRIBUTE(AddressSafety,1ULL<<32) ///< Address safety checking is on.
#undef DECLARE_LLVM_ATTRIBUTE
+} // namespace Attribute
+
+/// Attributes - A bitset of attributes.
+class Attributes {
+ // Currently, we need less than 64 bits.
+ uint64_t Bits;
+public:
+ Attributes() : Bits(0) { }
+ explicit Attributes(uint64_t Val) : Bits(Val) { }
+ /*implicit*/ Attributes(Attribute::AttrConst Val) : Bits(Val.v) { }
+
+ // Attribute query methods.
+ // FIXME: StackAlignment & Alignment attributes have no predicate methods.
+ bool hasAttributes() const {
+ return Bits != 0;
+ }
+ bool hasAttributes(const Attributes &A) const {
+ return Bits & A.Bits;
+ }
+
+ bool hasZExtAttr() const {
+ return Bits & Attribute::ZExt_i;
+ }
+ bool hasSExtAttr() const {
+ return Bits & Attribute::SExt_i;
+ }
+ bool hasNoReturnAttr() const {
+ return Bits & Attribute::NoReturn_i;
+ }
+ bool hasInRegAttr() const {
+ return Bits & Attribute::InReg_i;
+ }
+ bool hasStructRetAttr() const {
+ return Bits & Attribute::StructRet_i;
+ }
+ bool hasNoUnwindAttr() const {
+ return Bits & Attribute::NoUnwind_i;
+ }
+ bool hasNoAliasAttr() const {
+ return Bits & Attribute::NoAlias_i;
+ }
+ bool hasByValAttr() const {
+ return Bits & Attribute::ByVal_i;
+ }
+ bool hasNestAttr() const {
+ return Bits & Attribute::Nest_i;
+ }
+ bool hasReadNoneAttr() const {
+ return Bits & Attribute::ReadNone_i;
+ }
+ bool hasReadOnlyAttr() const {
+ return Bits & Attribute::ReadOnly_i;
+ }
+ bool hasNoInlineAttr() const {
+ return Bits & Attribute::NoInline_i;
+ }
+ bool hasAlwaysInlineAttr() const {
+ return Bits & Attribute::AlwaysInline_i;
+ }
+ bool hasOptimizeForSizeAttr() const {
+ return Bits & Attribute::OptimizeForSize_i;
+ }
+ bool hasStackProtectAttr() const {
+ return Bits & Attribute::StackProtect_i;
+ }
+ bool hasStackProtectReqAttr() const {
+ return Bits & Attribute::StackProtectReq_i;
+ }
+ bool hasAlignmentAttr() const {
+ return Bits & Attribute::Alignment_i;
+ }
+ bool hasNoCaptureAttr() const {
+ return Bits & Attribute::NoCapture_i;
+ }
+ bool hasNoRedZoneAttr() const {
+ return Bits & Attribute::NoRedZone_i;
+ }
+ bool hasNoImplicitFloatAttr() const {
+ return Bits & Attribute::NoImplicitFloat_i;
+ }
+ bool hasNakedAttr() const {
+ return Bits & Attribute::Naked_i;
+ }
+ bool hasInlineHintAttr() const {
+ return Bits & Attribute::InlineHint_i;
+ }
+ bool hasReturnsTwiceAttr() const {
+ return Bits & Attribute::ReturnsTwice_i;
+ }
+ bool hasStackAlignmentAttr() const {
+ return Bits & Attribute::StackAlignment_i;
+ }
+ bool hasUWTableAttr() const {
+ return Bits & Attribute::UWTable_i;
+ }
+ bool hasNonLazyBindAttr() const {
+ return Bits & Attribute::NonLazyBind_i;
+ }
+ bool hasAddressSafetyAttr() const {
+ return Bits & Attribute::AddressSafety_i;
+ }
+
+ uint64_t getRawAlignment() const {
+ return Bits & Attribute::Alignment_i;
+ }
+ uint64_t getRawStackAlignment() const {
+ return Bits & Attribute::StackAlignment_i;
+ }
+
+ /// This returns the alignment field of an attribute as a byte alignment
+ /// value.
+ unsigned getAlignment() const {
+ if (!hasAlignmentAttr())
+ return 0;
+
+ return 1U << ((getRawAlignment() >> 16) - 1);
+ }
+
+ /// This returns the stack alignment field of an attribute as a byte alignment
+ /// value.
+ unsigned getStackAlignment() const {
+ if (!hasStackAlignmentAttr())
+ return 0;
+
+ return 1U << ((getRawStackAlignment() >> 26) - 1);
+ }
+
+ /// This turns an int alignment (a power of 2, normally) into the form used
+ /// internally in Attributes.
+ static Attributes constructAlignmentFromInt(unsigned i) {
+ // Default alignment, allow the target to define how to align it.
+ if (i == 0)
+ return Attribute::None;
+
+ assert(isPowerOf2_32(i) && "Alignment must be a power of two.");
+ assert(i <= 0x40000000 && "Alignment too large.");
+ return Attributes((Log2_32(i)+1) << 16);
+ }
+
+ /// This turns an int stack alignment (which must be a power of 2) into the
+ /// form used internally in Attributes.
+ static Attributes constructStackAlignmentFromInt(unsigned i) {
+ // Default alignment, allow the target to define how to align it.
+ if (i == 0)
+ return Attribute::None;
+
+ assert(isPowerOf2_32(i) && "Alignment must be a power of two.");
+ assert(i <= 0x100 && "Alignment too large.");
+ return Attributes((Log2_32(i)+1) << 26);
+ }
+
+ // This is a "safe bool() operator".
+ operator const void *() const { return Bits ? this : 0; }
+ bool isEmptyOrSingleton() const { return (Bits & (Bits - 1)) == 0; }
+ bool operator == (const Attributes &Attrs) const {
+ return Bits == Attrs.Bits;
+ }
+ bool operator != (const Attributes &Attrs) const {
+ return Bits != Attrs.Bits;
+ }
+ Attributes operator | (const Attributes &Attrs) const {
+ return Attributes(Bits | Attrs.Bits);
+ }
+ Attributes operator & (const Attributes &Attrs) const {
+ return Attributes(Bits & Attrs.Bits);
+ }
+ Attributes operator ^ (const Attributes &Attrs) const {
+ return Attributes(Bits ^ Attrs.Bits);
+ }
+ Attributes &operator |= (const Attributes &Attrs) {
+ Bits |= Attrs.Bits;
+ return *this;
+ }
+ Attributes &operator &= (const Attributes &Attrs) {
+ Bits &= Attrs.Bits;
+ return *this;
+ }
+ Attributes operator ~ () const { return Attributes(~Bits); }
+ uint64_t Raw() const { return Bits; }
+
+ /// The set of Attributes set in Attributes is converted to a string of
+ /// equivalent mnemonics. This is, presumably, for writing out the mnemonics
+ /// for the assembly writer.
+ /// @brief Convert attribute bits to text
+ std::string getAsString() const;
+};
+
+namespace Attribute {
+
/// Note that uwtable is about the ABI or the user mandating an entry in the
/// unwind table. The nounwind attribute is about an exception passing by the
/// function.
@@ -176,49 +324,6 @@ const AttrConst MutuallyIncompatible[5] = {
/// @brief Which attributes cannot be applied to a type.
Attributes typeIncompatible(Type *Ty);
-/// This turns an int alignment (a power of 2, normally) into the
-/// form used internally in Attributes.
-inline Attributes constructAlignmentFromInt(unsigned i) {
- // Default alignment, allow the target to define how to align it.
- if (i == 0)
- return None;
-
- assert(isPowerOf2_32(i) && "Alignment must be a power of two.");
- assert(i <= 0x40000000 && "Alignment too large.");
- return Attributes((Log2_32(i)+1) << 16);
-}
-
-/// This returns the alignment field of an attribute as a byte alignment value.
-inline unsigned getAlignmentFromAttrs(Attributes A) {
- Attributes Align = A & Attribute::Alignment;
- if (!Align)
- return 0;
-
- return 1U << ((Align.Raw() >> 16) - 1);
-}
-
-/// This turns an int stack alignment (which must be a power of 2) into
-/// the form used internally in Attributes.
-inline Attributes constructStackAlignmentFromInt(unsigned i) {
- // Default alignment, allow the target to define how to align it.
- if (i == 0)
- return None;
-
- assert(isPowerOf2_32(i) && "Alignment must be a power of two.");
- assert(i <= 0x100 && "Alignment too large.");
- return Attributes((Log2_32(i)+1) << 26);
-}
-
-/// This returns the stack alignment field of an attribute as a byte alignment
-/// value.
-inline unsigned getStackAlignmentFromAttrs(Attributes A) {
- Attributes StackAlign = A & Attribute::StackAlignment;
- if (!StackAlign)
- return 0;
-
- return 1U << ((StackAlign.Raw() >> 26) - 1);
-}
-
/// This returns an integer containing an encoding of all the
/// LLVM attributes found in the given attribute bitset. Any
/// change to this encoding is a breaking change to bitcode
@@ -236,9 +341,9 @@ inline uint64_t encodeLLVMAttributesForBitcode(Attributes Attrs) {
// 11 bits.
uint64_t EncodedAttrs = Attrs.Raw() & 0xffff;
- if (Attrs & Attribute::Alignment)
+ if (Attrs.hasAlignmentAttr())
EncodedAttrs |= (1ull << 16) <<
- (((Attrs & Attribute::Alignment).Raw()-1) >> 16);
+ ((Attrs.getRawAlignment() - 1) >> 16);
EncodedAttrs |= (Attrs.Raw() & (0xfffull << 21)) << 11;
return EncodedAttrs;
@@ -257,18 +362,12 @@ inline Attributes decodeLLVMAttributesForBitcode(uint64_t EncodedAttrs) {
Attributes Attrs(EncodedAttrs & 0xffff);
if (Alignment)
- Attrs |= Attribute::constructAlignmentFromInt(Alignment);
+ Attrs |= Attributes::constructAlignmentFromInt(Alignment);
Attrs |= Attributes((EncodedAttrs & (0xfffull << 32)) >> 11);
return Attrs;
}
-
-/// The set of Attributes set in Attributes is converted to a
-/// string of equivalent mnemonics. This is, presumably, for writing out
-/// the mnemonics for the assembly writer.
-/// @brief Convert attribute bits to text
-std::string getAsString(Attributes Attrs);
} // end namespace Attribute
/// This is just a pair of values to associate a set of attributes
@@ -346,13 +445,13 @@ public:
/// paramHasAttr - Return true if the specified parameter index has the
/// specified attribute set.
bool paramHasAttr(unsigned Idx, Attributes Attr) const {
- return getAttributes(Idx) & Attr;
+ return getAttributes(Idx).hasAttributes(Attr);
}
/// getParamAlignment - Return the alignment for the specified function
/// parameter.
unsigned getParamAlignment(unsigned Idx) const {
- return Attribute::getAlignmentFromAttrs(getAttributes(Idx));
+ return getAttributes(Idx).getAlignment();
}
/// hasAttrSomewhere - Return true if the specified attribute is set for at
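The alignment helpers moved onto Attributes above keep the old bit layout:
the alignment attribute stores Log2(align)+1 in a dedicated bit field, which
is why construction shifts by 16 and decoding computes 1U << (raw - 1). A
small round-trip sketch under those definitions (the value 16 is chosen for
illustration):

    // 16-byte alignment is stored as (Log2_32(16) + 1) << 16, i.e. 5 << 16.
    Attributes A = Attributes::constructAlignmentFromInt(16);
    assert(A.hasAlignmentAttr());
    assert(A.getAlignment() == 16); // 1U << ((A.getRawAlignment() >> 16) - 1)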
diff --git a/include/llvm/BasicBlock.h b/include/llvm/BasicBlock.h
index d2aa1673d9..9ab4a74e5a 100644
--- a/include/llvm/BasicBlock.h
+++ b/include/llvm/BasicBlock.h
@@ -79,8 +79,8 @@ private:
void setParent(Function *parent);
friend class SymbolTableListTraits<BasicBlock, Function>;
- BasicBlock(const BasicBlock &); // Do not implement
- void operator=(const BasicBlock &); // Do not implement
+ BasicBlock(const BasicBlock &) LLVM_DELETED_FUNCTION;
+ void operator=(const BasicBlock &) LLVM_DELETED_FUNCTION;
/// BasicBlock ctor - If the function parameter is specified, the basic block
/// is automatically inserted at either the end of the function (if
diff --git a/include/llvm/Bitcode/Archive.h b/include/llvm/Bitcode/Archive.h
index c1c3ec044b..4fd4b5d90a 100644
--- a/include/llvm/Bitcode/Archive.h
+++ b/include/llvm/Bitcode/Archive.h
@@ -527,9 +527,9 @@ class Archive {
/// @name Hidden
/// @{
private:
- Archive(); ///< Do not implement
- Archive(const Archive&); ///< Do not implement
- Archive& operator=(const Archive&); ///< Do not implement
+ Archive() LLVM_DELETED_FUNCTION;
+ Archive(const Archive&) LLVM_DELETED_FUNCTION;
+ Archive& operator=(const Archive&) LLVM_DELETED_FUNCTION;
/// @}
};
diff --git a/include/llvm/Bitcode/BitstreamReader.h b/include/llvm/Bitcode/BitstreamReader.h
index 6586829440..3daef789d6 100644
--- a/include/llvm/Bitcode/BitstreamReader.h
+++ b/include/llvm/Bitcode/BitstreamReader.h
@@ -47,9 +47,9 @@ private:
/// block/record name information in the BlockInfo block. Only llvm-bcanalyzer
/// uses this.
bool IgnoreBlockInfoNames;
-
- BitstreamReader(const BitstreamReader&); // DO NOT IMPLEMENT
- void operator=(const BitstreamReader&); // DO NOT IMPLEMENT
+
+ BitstreamReader(const BitstreamReader&) LLVM_DELETED_FUNCTION;
+ void operator=(const BitstreamReader&) LLVM_DELETED_FUNCTION;
public:
BitstreamReader() : IgnoreBlockInfoNames(true) {
}
diff --git a/include/llvm/CodeGen/GCMetadataPrinter.h b/include/llvm/CodeGen/GCMetadataPrinter.h
index 17a2653000..4a6b5ac19c 100644
--- a/include/llvm/CodeGen/GCMetadataPrinter.h
+++ b/include/llvm/CodeGen/GCMetadataPrinter.h
@@ -48,9 +48,10 @@ namespace llvm {
// May only be subclassed.
GCMetadataPrinter();
- // Do not implement.
- GCMetadataPrinter(const GCMetadataPrinter &);
- GCMetadataPrinter &operator=(const GCMetadataPrinter &);
+ private:
+ GCMetadataPrinter(const GCMetadataPrinter &) LLVM_DELETED_FUNCTION;
+ GCMetadataPrinter &
+ operator=(const GCMetadataPrinter &) LLVM_DELETED_FUNCTION;
public:
GCStrategy &getStrategy() { return *S; }
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index 5aeb1a8c31..185e414ae2 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -114,9 +114,6 @@ namespace llvm {
void dump() const;
void print(raw_ostream &os) const;
-
- private:
- LiveRange(); // DO NOT IMPLEMENT
};
template <> struct isPodLike<LiveRange> { static const bool value = true; };
@@ -276,11 +273,6 @@ namespace llvm {
void MergeValueInAsValue(const LiveInterval &RHS,
const VNInfo *RHSValNo, VNInfo *LHSValNo);
- /// Copy - Copy the specified live interval. This copies all the fields
- /// except for the register of the interval.
- void Copy(const LiveInterval &RHS, MachineRegisterInfo *MRI,
- VNInfo::Allocator &VNInfoAllocator);
-
bool empty() const { return ranges.empty(); }
/// beginIndex - Return the lowest numbered slot covered by interval.
@@ -313,12 +305,6 @@ namespace llvm {
return r != end() && r->end == index;
}
- /// killedInRange - Return true if the interval has kills in [Start,End).
- /// Note that the kill point is considered the end of a live range, so it is
- /// not contained in the live range. If a live range ends at End, it won't
- /// be counted as a kill by this method.
- bool killedInRange(SlotIndex Start, SlotIndex End) const;
-
/// getLiveRangeContaining - Return the live range that contains the
/// specified index, or null if there is none.
const LiveRange *getLiveRangeContaining(SlotIndex Idx) const {
@@ -478,7 +464,7 @@ namespace llvm {
VNInfo *LHSValNo = 0,
const VNInfo *RHSValNo = 0);
- LiveInterval& operator=(const LiveInterval& rhs); // DO NOT IMPLEMENT
+ LiveInterval& operator=(const LiveInterval& rhs) LLVM_DELETED_FUNCTION;
};
@@ -510,7 +496,9 @@ namespace llvm {
if (I == E)
return;
// Is this an instruction live-in segment?
- if (SlotIndex::isEarlierInstr(I->start, Idx)) {
+ // If Idx is the start index of a basic block, include live-in segments
+ // that start at Idx.getBaseIndex().
+ if (I->start <= Idx.getBaseIndex()) {
EarlyVal = I->valno;
EndPoint = I->end;
// Move to the potentially live-out segment.
@@ -519,6 +507,12 @@ namespace llvm {
if (++I == E)
return;
}
+ // Special case: A PHIDef value can have its def in the middle of a
+ // segment if the value happens to be live out of the layout
+ // predecessor.
+ // Such a value is not live-in.
+ if (EarlyVal->def == Idx.getBaseIndex())
+ EarlyVal = 0;
}
// I now points to the segment that may be live-through, or defined by
// this instr. Ignore segments starting after the current instr.
diff --git a/include/llvm/CodeGen/LiveIntervalAnalysis.h b/include/llvm/CodeGen/LiveIntervalAnalysis.h
index bf7469093a..1e8dde1255 100644
--- a/include/llvm/CodeGen/LiveIntervalAnalysis.h
+++ b/include/llvm/CodeGen/LiveIntervalAnalysis.h
@@ -165,6 +165,26 @@ namespace llvm {
bool shrinkToUses(LiveInterval *li,
SmallVectorImpl<MachineInstr*> *dead = 0);
+ /// extendToIndices - Extend the live range of LI to reach all points in
+ /// Indices. The points in the Indices array must be jointly dominated by
+ /// existing defs in LI. PHI-defs are added as needed to maintain SSA form.
+ ///
+ /// If a SlotIndex in Indices is the end index of a basic block, LI will be
+ /// extended to be live out of the basic block.
+ ///
+ /// See also LiveRangeCalc::extend().
+ void extendToIndices(LiveInterval *LI, ArrayRef<SlotIndex> Indices);
+
+ /// pruneValue - If an LI value is live at Kill, prune its live range by
+ /// removing any liveness reachable from Kill. Add live range end points to
+ /// EndPoints such that extendToIndices(LI, EndPoints) will reconstruct the
+ /// value's live range.
+ ///
+ /// Calling pruneValue() and extendToIndices() can be used to reconstruct
+ /// SSA form after adding defs to a virtual register.
+ void pruneValue(LiveInterval *LI, SlotIndex Kill,
+ SmallVectorImpl<SlotIndex> *EndPoints);
+
SlotIndexes *getSlotIndexes() const {
return Indexes;
}
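The pruneValue()/extendToIndices() pair added above is meant to be used
together when a pass introduces new definitions of a virtual register. A
hypothetical repair sequence (LIS, LI, and NewDefIdx are assumed names, not
part of this patch):

    // 1. Cut away liveness below the insertion point, remembering where
    //    the old live range used to end.
    SmallVector<SlotIndex, 8> EndPoints;
    LIS->pruneValue(LI, NewDefIdx, &EndPoints);
    // 2. ... insert the new def instruction and its VNInfo here ...
    // 3. Regrow the live range to the recorded end points; PHI-defs are
    //    added as needed to restore SSA form.
    LIS->extendToIndices(LI, EndPoints);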
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index 77ea4d03ea..97c39458d9 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -547,6 +547,28 @@ public:
return findDebugLoc(MBBI.getInstrIterator());
}
+ /// Possible outcome of a register liveness query to computeRegisterLiveness()
+ enum LivenessQueryResult {
+ LQR_Live, ///< Register is known to be live.
+ LQR_OverlappingLive, ///< Register itself is not live, but some overlapping
+ ///< register is.
+ LQR_Dead, ///< Register is known to be dead.
+ LQR_Unknown ///< Register liveness not decidable from local
+ ///< neighborhood.
+ };
+
+ /// computeRegisterLiveness - Return whether (physical) register \c Reg
+ /// has been <def>ined and not <kill>ed as of just before \c MI.
+ ///
+ /// Search is localized to a neighborhood of \c Neighborhood instructions
+ /// before (searching for defs or kills) and \c Neighborhood instructions
+ /// after (searching just for defs) \c MI.
+ ///
+ /// \c Reg must be a physical register.
+ LivenessQueryResult computeRegisterLiveness(const TargetRegisterInfo *TRI,
+ unsigned Reg, MachineInstr *MI,
+ unsigned Neighborhood=10);
+
// Debugging methods.
void dump() const;
void print(raw_ostream &OS, SlotIndexes* = 0) const;
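A sketch of how a peephole-style client might consume the new query (Reg,
MI, TRI, and MBB are assumed to come from the surrounding pass):

    MachineBasicBlock::LivenessQueryResult LQR =
        MBB->computeRegisterLiveness(TRI, Reg, MI);
    if (LQR == MachineBasicBlock::LQR_Dead) {
      // Reg can safely be clobbered just before MI.
    } else if (LQR == MachineBasicBlock::LQR_Unknown) {
      // The default 10-instruction neighborhood was inconclusive; a
      // conservative caller must treat Reg as live.
    }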
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 0eb9d0e509..025e18a9dd 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -127,8 +127,8 @@ class MachineFunction {
/// about the control flow of such functions.
bool ExposesReturnsTwice;
- MachineFunction(const MachineFunction &); // DO NOT IMPLEMENT
- void operator=(const MachineFunction&); // DO NOT IMPLEMENT
+ MachineFunction(const MachineFunction &) LLVM_DELETED_FUNCTION;
+ void operator=(const MachineFunction&) LLVM_DELETED_FUNCTION;
public:
MachineFunction(const Function *Fn, const TargetMachine &TM,
unsigned FunctionNum, MachineModuleInfo &MMI,
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 4e1533a8e7..32c79510d8 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -82,8 +82,8 @@ private:
MachineBasicBlock *Parent; // Pointer to the owning basic block.
DebugLoc debugLoc; // Source line information.
- MachineInstr(const MachineInstr&); // DO NOT IMPLEMENT
- void operator=(const MachineInstr&); // DO NOT IMPLEMENT
+ MachineInstr(const MachineInstr&) LLVM_DELETED_FUNCTION;
+ void operator=(const MachineInstr&) LLVM_DELETED_FUNCTION;
// Intrusive list support
friend struct ilist_traits<MachineInstr>;
diff --git a/include/llvm/CodeGen/MachineInstrBundle.h b/include/llvm/CodeGen/MachineInstrBundle.h
index dc5f9a6ec8..854ba06209 100644
--- a/include/llvm/CodeGen/MachineInstrBundle.h
+++ b/include/llvm/CodeGen/MachineInstrBundle.h
@@ -130,9 +130,9 @@ public:
return OpI - InstrI->operands_begin();
}
- /// RegInfo - Information about a virtual register used by a set of operands.
+ /// VirtRegInfo - Information about a virtual register used by a set of operands.
///
- struct RegInfo {
+ struct VirtRegInfo {
/// Reads - One of the operands read the virtual register. This does not
/// include <undef> or <internal> use operands, see MO::readsReg().
bool Reads;
@@ -146,6 +146,32 @@ public:
bool Tied;
};
+ /// PhysRegInfo - Information about a physical register used by a set of
+ /// operands.
+ struct PhysRegInfo {
+ /// Clobbers - Reg or an overlapping register is defined, or a regmask
+ /// clobbers Reg.
+ bool Clobbers;
+
+ /// Defines - Reg or a super-register is defined.
+ bool Defines;
+
+ /// DefinesOverlap - Reg or an overlapping register is defined.
+ bool DefinesOverlap;
+
+ /// Reads - Reg or a super-register is read.
+ bool Reads;
+
+ /// ReadsOverlap - Reg or an overlapping register is read.
+ bool ReadsOverlap;
+
+ /// DefinesDead - All defs of Reg or a super-register are dead.
+ bool DefinesDead;
+
+ /// There is a kill of Reg or a super-register.
+ bool Kills;
+ };
+
/// analyzeVirtReg - Analyze how the current instruction or bundle uses a
/// virtual register. This function should not be called after operator++(),
/// it expects a fresh iterator.
@@ -154,8 +180,16 @@ public:
/// @param Ops When set, this vector will receive an (MI, OpNum) entry for
/// each operand referring to Reg.
/// @returns A filled-in VirtRegInfo struct.
- RegInfo analyzeVirtReg(unsigned Reg,
+ VirtRegInfo analyzeVirtReg(unsigned Reg,
SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops = 0);
+
+ /// analyzePhysReg - Analyze how the current instruction or bundle uses a
+ /// physical register. This function should not be called after operator++(),
+ /// it expects a fresh iterator.
+ ///
+ /// @param Reg The physical register to analyze.
+ /// @returns A filled-in PhysRegInfo struct.
+ PhysRegInfo analyzePhysReg(unsigned Reg, const TargetRegisterInfo *TRI);
};
/// MIOperands - Iterate over operands of a single instruction.
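A hypothetical use of the new physical-register analysis (MI, Reg, and TRI
are assumed from context; MIBundleOperands is the concrete bundle iterator
built on this base class):

    MIBundleOperands::PhysRegInfo PRI =
        MIBundleOperands(MI).analyzePhysReg(Reg, TRI);
    if (PRI.Clobbers && !PRI.ReadsOverlap) {
      // Reg is overwritten without first being read, so values live into
      // the bundle in Reg are dead afterwards.
    }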
diff --git a/include/llvm/CodeGen/MachineLoopInfo.h b/include/llvm/CodeGen/MachineLoopInfo.h
index 3e204bed15..d53f041128 100644
--- a/include/llvm/CodeGen/MachineLoopInfo.h
+++ b/include/llvm/CodeGen/MachineLoopInfo.h
@@ -73,8 +73,8 @@ class MachineLoopInfo : public MachineFunctionPass {
LoopInfoBase<MachineBasicBlock, MachineLoop> LI;
friend class LoopBase<MachineBasicBlock, MachineLoop>;
- void operator=(const MachineLoopInfo &); // do not implement
- MachineLoopInfo(const MachineLoopInfo &); // do not implement
+ void operator=(const MachineLoopInfo &) LLVM_DELETED_FUNCTION;
+ MachineLoopInfo(const MachineLoopInfo &) LLVM_DELETED_FUNCTION;
public:
static char ID; // Pass identification, replacement for typeid
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 0b9d67f37a..5a182101c1 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -628,11 +628,11 @@ public:
Op.setTargetFlags(TargetFlags);
return Op;
}
- static MachineOperand CreateBA(const BlockAddress *BA,
+ static MachineOperand CreateBA(const BlockAddress *BA, int64_t Offset,
unsigned char TargetFlags = 0) {
MachineOperand Op(MachineOperand::MO_BlockAddress);
Op.Contents.OffsetedInfo.Val.BA = BA;
- Op.setOffset(0); // Offset is always 0.
+ Op.setOffset(Offset);
Op.setTargetFlags(TargetFlags);
return Op;
}
diff --git a/include/llvm/CodeGen/MachinePostDominators.h b/include/llvm/CodeGen/MachinePostDominators.h
new file mode 100644
index 0000000000..a9fc8434ab
--- /dev/null
+++ b/include/llvm/CodeGen/MachinePostDominators.h
@@ -0,0 +1,87 @@
+//=- llvm/CodeGen/MachinePostDominators.h -----------------------*- C++ -*-==//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file exposes interfaces to post dominance information for
+// target-specific code.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_MACHINEPOSTDOMINATORS_H
+#define LLVM_CODEGEN_MACHINEPOSTDOMINATORS_H
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/DominatorInternals.h"
+
+namespace llvm {
+
+///
+/// MachinePostDominatorTree - an analysis pass that wraps a DominatorTreeBase
+/// and is used to compute the post-dominator tree for MachineFunctions.
+///
+struct MachinePostDominatorTree : public MachineFunctionPass {
+private:
+ DominatorTreeBase<MachineBasicBlock> *DT;
+
+public:
+ static char ID;
+
+ MachinePostDominatorTree();
+
+ ~MachinePostDominatorTree();
+
+ FunctionPass *createMachinePostDominatorTreePass();
+
+ const std::vector<MachineBasicBlock *> &getRoots() const {
+ return DT->getRoots();
+ }
+
+ MachineDomTreeNode *getRootNode() const {
+ return DT->getRootNode();
+ }
+
+ MachineDomTreeNode *operator[](MachineBasicBlock *BB) const {
+ return DT->getNode(BB);
+ }
+
+ MachineDomTreeNode *getNode(MachineBasicBlock *BB) const {
+ return DT->getNode(BB);
+ }
+
+ bool dominates(MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+ return DT->dominates(A, B);
+ }
+
+ bool dominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+ return DT->dominates(A, B);
+ }
+
+ bool
+ properlyDominates(const MachineDomTreeNode *A, MachineDomTreeNode *B) const {
+ return DT->properlyDominates(A, B);
+ }
+
+ bool
+ properlyDominates(MachineBasicBlock *A, MachineBasicBlock *B) const {
+ return DT->properlyDominates(A, B);
+ }
+
+ MachineBasicBlock *findNearestCommonDominator(MachineBasicBlock *A,
+ MachineBasicBlock *B) {
+ return DT->findNearestCommonDominator(A, B);
+ }
+
+ virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+ virtual void print(llvm::raw_ostream &OS, const Module *M = 0) const;
+};
+} //end of namespace llvm
+
+#endif
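A sketch of how a machine pass would request and query the new analysis
(MyPass, ExitMBB, and SomeMBB are hypothetical names; pass registration is
omitted):

    void MyPass::getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequired<MachinePostDominatorTree>();
      MachineFunctionPass::getAnalysisUsage(AU);
    }

    bool MyPass::runOnMachineFunction(MachineFunction &MF) {
      MachinePostDominatorTree &PDT = getAnalysis<MachinePostDominatorTree>();
      if (PDT.dominates(ExitMBB, SomeMBB)) {
        // Every path from SomeMBB eventually passes through ExitMBB.
      }
      return false; // analysis only; nothing modified
    }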
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index 42a8aa43d9..91d24dd0fc 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -106,8 +106,8 @@ class MachineRegisterInfo {
std::vector<std::pair<unsigned, unsigned> > LiveIns;
std::vector<unsigned> LiveOuts;
- MachineRegisterInfo(const MachineRegisterInfo&); // DO NOT IMPLEMENT
- void operator=(const MachineRegisterInfo&); // DO NOT IMPLEMENT
+ MachineRegisterInfo(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
+ void operator=(const MachineRegisterInfo&) LLVM_DELETED_FUNCTION;
public:
explicit MachineRegisterInfo(const TargetRegisterInfo &TRI);
~MachineRegisterInfo();
diff --git a/include/llvm/CodeGen/MachineSSAUpdater.h b/include/llvm/CodeGen/MachineSSAUpdater.h
index cbb45a7127..edf93d13bd 100644
--- a/include/llvm/CodeGen/MachineSSAUpdater.h
+++ b/include/llvm/CodeGen/MachineSSAUpdater.h
@@ -14,6 +14,8 @@
#ifndef LLVM_CODEGEN_MACHINESSAUPDATER_H
#define LLVM_CODEGEN_MACHINESSAUPDATER_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class MachineBasicBlock;
class MachineFunction;
@@ -106,8 +108,8 @@ private:
void ReplaceRegWith(unsigned OldReg, unsigned NewReg);
unsigned GetValueAtEndOfBlockInternal(MachineBasicBlock *BB);
- void operator=(const MachineSSAUpdater&); // DO NOT IMPLEMENT
- MachineSSAUpdater(const MachineSSAUpdater&); // DO NOT IMPLEMENT
+ void operator=(const MachineSSAUpdater&) LLVM_DELETED_FUNCTION;
+ MachineSSAUpdater(const MachineSSAUpdater&) LLVM_DELETED_FUNCTION;
};
} // End llvm namespace
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index f87c2a5c40..d88f3fc57d 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -179,6 +179,14 @@ public:
#endif
};
+/// Mutate the DAG as a postpass after normal DAG building.
+class ScheduleDAGMutation {
+public:
+ virtual ~ScheduleDAGMutation() {}
+
+ virtual void apply(ScheduleDAGMI *DAG) = 0;
+};
+
/// ScheduleDAGMI is an implementation of ScheduleDAGInstrs that schedules
/// machine instructions while updating LiveIntervals and tracking regpressure.
class ScheduleDAGMI : public ScheduleDAGInstrs {
@@ -187,6 +195,9 @@ protected:
RegisterClassInfo *RegClassInfo;
MachineSchedStrategy *SchedImpl;
+ /// Ordered list of DAG postprocessing steps.
+ std::vector<ScheduleDAGMutation*> Mutations;
+
MachineBasicBlock::iterator LiveRegionEnd;
/// Register pressure in this region computed by buildSchedGraph.
@@ -229,6 +240,13 @@ public:
delete SchedImpl;
}
+ /// Add a postprocessing step to the DAG builder.
+ /// Mutations are applied in the order that they are added after normal DAG
+ /// building and before MachineSchedStrategy initialization.
+ void addMutation(ScheduleDAGMutation *Mutation) {
+ Mutations.push_back(Mutation);
+ }
+
MachineBasicBlock::iterator top() const { return CurrentTop; }
MachineBasicBlock::iterator bottom() const { return CurrentBottom; }
@@ -282,6 +300,10 @@ protected:
/// bottom of the DAG region without covering any unscheduled instruction.
void buildDAGWithRegPressure();
+ /// Apply each ScheduleDAGMutation step in order. This allows different
+ /// instances of ScheduleDAGMI to perform custom DAG postprocessing.
+ void postprocessDAG();
+
/// Identify DAG roots and setup scheduler queues.
void initQueues();
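A minimal sketch of the new mutation hook: subclasses see the fully built
DAG after buildDAGWithRegPressure() and before the strategy initializes
(MyClusterMutation is a hypothetical example, not part of this patch):

    class MyClusterMutation : public ScheduleDAGMutation {
    public:
      virtual void apply(ScheduleDAGMI *DAG) {
        // Walk DAG->SUnits and add artificial edges, e.g. to cluster
        // memory operations, before any scheduling decisions are made.
      }
    };

    // Typically registered when the scheduler instance is constructed:
    DAG->addMutation(new MyClusterMutation());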
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index bce3ec739b..acfc07dd31 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -109,8 +109,8 @@ namespace llvm {
/// class to support additional constraints for your architecture.
class PBQPBuilder {
private:
- PBQPBuilder(const PBQPBuilder&) {}
- void operator=(const PBQPBuilder&) {}
+ PBQPBuilder(const PBQPBuilder&) LLVM_DELETED_FUNCTION;
+ void operator=(const PBQPBuilder&) LLVM_DELETED_FUNCTION;
public:
typedef std::set<unsigned> RegSet;
diff --git a/include/llvm/CodeGen/ScheduleDAGInstrs.h b/include/llvm/CodeGen/ScheduleDAGInstrs.h
index 8b52b5a9c7..d13ee84257 100644
--- a/include/llvm/CodeGen/ScheduleDAGInstrs.h
+++ b/include/llvm/CodeGen/ScheduleDAGInstrs.h
@@ -18,6 +18,7 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetRegisterInfo.h"
#include "llvm/ADT/SmallSet.h"
@@ -181,6 +182,9 @@ namespace llvm {
/// Live Intervals provides reaching defs in preRA scheduling.
LiveIntervals *LIS;
+ /// TargetSchedModel provides an interface to the machine model.
+ TargetSchedModel SchedModel;
+
/// isPostRA flag indicates vregs cannot be present.
bool IsPostRA;
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 1ccfe54d21..619ee69943 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -73,8 +73,8 @@ class SDDbgInfo {
SmallVector<SDDbgValue*, 32> ByvalParmDbgValues;
DenseMap<const SDNode*, SmallVector<SDDbgValue*, 2> > DbgValMap;
- void operator=(const SDDbgInfo&); // Do not implement.
- SDDbgInfo(const SDDbgInfo&); // Do not implement.
+ void operator=(const SDDbgInfo&) LLVM_DELETED_FUNCTION;
+ SDDbgInfo(const SDDbgInfo&) LLVM_DELETED_FUNCTION;
public:
SDDbgInfo() {}
@@ -222,8 +222,8 @@ private:
DenseSet<SDNode *> &visited,
int level, bool &printed);
- void operator=(const SelectionDAG&); // Do not implement.
- SelectionDAG(const SelectionDAG&); // Do not implement.
+ void operator=(const SelectionDAG&) LLVM_DELETED_FUNCTION;
+ SelectionDAG(const SelectionDAG&) LLVM_DELETED_FUNCTION;
public:
explicit SelectionDAG(const TargetMachine &TM, llvm::CodeGenOpt::Level);
@@ -437,7 +437,13 @@ public:
SDValue getRegisterMask(const uint32_t *RegMask);
SDValue getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label);
SDValue getBlockAddress(const BlockAddress *BA, EVT VT,
- bool isTarget = false, unsigned char TargetFlags = 0);
+ int64_t Offset = 0, bool isTarget = false,
+ unsigned char TargetFlags = 0);
+ SDValue getTargetBlockAddress(const BlockAddress *BA, EVT VT,
+ int64_t Offset = 0,
+ unsigned char TargetFlags = 0) {
+ return getBlockAddress(BA, VT, Offset, true, TargetFlags);
+ }
SDValue getCopyToReg(SDValue Chain, DebugLoc dl, unsigned Reg, SDValue N) {
return getNode(ISD::CopyToReg, dl, MVT::Other, Chain,
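With the Offset parameter threaded through, a lowering can now fold a byte
offset into a single target block-address node instead of emitting a
separate ISD::ADD. A hypothetical snippet (BlockAddr and PtrVT are assumed
from the surrounding lowering code):

    SDValue N = DAG.getTargetBlockAddress(BlockAddr, PtrVT, /*Offset=*/8);
    // N carries both the BlockAddress and the +8 displacement; targets
    // read the latter back via BlockAddressSDNode::getOffset().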
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index 3bea2ded68..fd89973263 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -216,8 +216,8 @@ class SDUse {
/// this operand.
SDUse **Prev, *Next;
- SDUse(const SDUse &U); // Do not implement
- void operator=(const SDUse &U); // Do not implement
+ SDUse(const SDUse &U) LLVM_DELETED_FUNCTION;
+ void operator=(const SDUse &U) LLVM_DELETED_FUNCTION;
public:
SDUse() : Val(), User(NULL), Prev(NULL), Next(NULL) {}
@@ -1390,7 +1390,7 @@ public:
/// BUILD_VECTORs.
class BuildVectorSDNode : public SDNode {
// These are constructed as SDNodes and then cast to BuildVectorSDNodes.
- explicit BuildVectorSDNode(); // Do not implement
+ explicit BuildVectorSDNode() LLVM_DELETED_FUNCTION;
public:
/// isConstantSplat - Check if this is a constant splat, and if so, find the
/// smallest element size that splats the vector. If MinSplatBits is
@@ -1483,15 +1483,17 @@ public:
class BlockAddressSDNode : public SDNode {
const BlockAddress *BA;
+ int64_t Offset;
unsigned char TargetFlags;
friend class SelectionDAG;
BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
- unsigned char Flags)
+ int64_t o, unsigned char Flags)
: SDNode(NodeTy, DebugLoc(), getSDVTList(VT)),
- BA(ba), TargetFlags(Flags) {
+ BA(ba), Offset(o), TargetFlags(Flags) {
}
public:
const BlockAddress *getBlockAddress() const { return BA; }
+ int64_t getOffset() const { return Offset; }
unsigned char getTargetFlags() const { return TargetFlags; }
static bool classof(const BlockAddressSDNode *) { return true; }
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
new file mode 100644
index 0000000000..d2a26afe99
--- /dev/null
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -0,0 +1,79 @@
+//===-- llvm/CodeGen/TargetSchedule.h - Sched Machine Model -----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines a wrapper around MCSchedModel that allows the interface to
+// benefit from information currently only available in TargetInstrInfo.
+// Ideally, the scheduling interface would be fully defined in the MC layer.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TARGET_TARGETSCHEDMODEL_H
+#define LLVM_TARGET_TARGETSCHEDMODEL_H
+
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCInstrItineraries.h"
+
+namespace llvm {
+
+class TargetRegisterInfo;
+class TargetSubtargetInfo;
+class TargetInstrInfo;
+class MachineInstr;
+
+/// Provide an instruction scheduling machine model to CodeGen passes.
+class TargetSchedModel {
+ // For efficiency, hold a copy of the statically defined MCSchedModel for this
+ // processor.
+ MCSchedModel SchedModel;
+ InstrItineraryData InstrItins;
+ const TargetSubtargetInfo *STI;
+ const TargetInstrInfo *TII;
+public:
+ TargetSchedModel(): STI(0), TII(0) {}
+
+ void init(const MCSchedModel &sm, const TargetSubtargetInfo *sti,
+ const TargetInstrInfo *tii);
+
+ const TargetInstrInfo *getInstrInfo() const { return TII; }
+
+ /// Return true if this machine model includes an instruction-level scheduling
+ /// model. This is more detailed than the coarse-grained IssueWidth and default
+ /// latency properties, but separate from the per-cycle itinerary data.
+ bool hasInstrSchedModel() const { return SchedModel.hasInstrSchedModel(); }
+
+ /// Return true if this machine model includes cycle-to-cycle itinerary
+ /// data. This models scheduling at each stage in the processor pipeline.
+ bool hasInstrItineraries() const { return !InstrItins.isEmpty(); }
+
+ /// computeOperandLatency - Compute and return the latency of the given data
+ /// dependent def and use when the operand indices are already known. UseMI
+ /// may be NULL for an unknown user.
+ ///
+ /// FindMin may be set to get the minimum vs. expected latency. Minimum
+ /// latency is used for scheduling groups, while expected latency is for
+ /// instruction cost and critical path.
+ unsigned computeOperandLatency(const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *UseMI, unsigned UseOperIdx,
+ bool FindMin) const;
+
+ unsigned getProcessorID() const { return SchedModel.getProcessorID(); }
+
+private:
+ /// getDefLatency is a helper for computeOperandLatency. Return the
+ /// instruction's latency if operand lookup is not required.
+ /// Otherwise return -1.
+ int getDefLatency(const MachineInstr *DefMI, bool FindMin) const;
+
+ /// Return the MCSchedClassDesc for this instruction.
+ const MCSchedClassDesc *resolveSchedClass(const MachineInstr *MI) const;
+};
+
+} // namespace llvm
+
+#endif
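A sketch of the intended setup and query pattern (SchedModel, STI, and TII
are assumed to be available from the subtarget; the ScheduleDAGInstrs hunk
below embeds a TargetSchedModel the same way):

    TargetSchedModel Model;
    Model.init(SchedModel, STI, TII);

    // Expected latency of the data edge DefMI[DefOperIdx] -> UseMI[UseOperIdx];
    // pass FindMin=true to get the minimum latency for scheduling groups.
    unsigned Lat =
        Model.computeOperandLatency(DefMI, DefOperIdx, UseMI, UseOperIdx,
                                    /*FindMin=*/false);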
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index eb38cd33d1..2d92d025b4 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -56,50 +56,56 @@ namespace llvm {
FIRST_FP_VALUETYPE = f16,
LAST_FP_VALUETYPE = ppcf128,
- v2i8 = 13, // 2 x i8
- v4i8 = 14, // 4 x i8
- v8i8 = 15, // 8 x i8
- v16i8 = 16, // 16 x i8
- v32i8 = 17, // 32 x i8
- v2i16 = 18, // 2 x i16
- v4i16 = 19, // 4 x i16
- v8i16 = 20, // 8 x i16
- v16i16 = 21, // 16 x i16
- v2i32 = 22, // 2 x i32
- v4i32 = 23, // 4 x i32
- v8i32 = 24, // 8 x i32
- v16i32 = 25, // 16 x i32
- v1i64 = 26, // 1 x i64
- v2i64 = 27, // 2 x i64
- v4i64 = 28, // 4 x i64
- v8i64 = 29, // 8 x i64
- v16i64 = 30, // 16 x i64
-
- v2f16 = 31, // 2 x f16
- v2f32 = 32, // 2 x f32
- v4f32 = 33, // 4 x f32
- v8f32 = 34, // 8 x f32
- v2f64 = 35, // 2 x f64
- v4f64 = 36, // 4 x f64
-
- FIRST_VECTOR_VALUETYPE = v2i8,
+ v2i1 = 13, // 2 x i1
+ v4i1 = 14, // 4 x i1
+ v8i1 = 15, // 8 x i1
+ v16i1 = 16, // 16 x i1
+ v2i8 = 17, // 2 x i8
+ v4i8 = 18, // 4 x i8
+ v8i8 = 19, // 8 x i8
+ v16i8 = 20, // 16 x i8
+ v32i8 = 21, // 32 x i8
+ v1i16 = 22, // 1 x i16
+ v2i16 = 23, // 2 x i16
+ v4i16 = 24, // 4 x i16
+ v8i16 = 25, // 8 x i16
+ v16i16 = 26, // 16 x i16
+ v1i32 = 27, // 1 x i32
+ v2i32 = 28, // 2 x i32
+ v4i32 = 29, // 4 x i32
+ v8i32 = 30, // 8 x i32
+ v16i32 = 31, // 16 x i32
+ v1i64 = 32, // 1 x i64
+ v2i64 = 33, // 2 x i64
+ v4i64 = 34, // 4 x i64
+ v8i64 = 35, // 8 x i64
+ v16i64 = 36, // 16 x i64
+
+ v2f16 = 37, // 2 x f16
+ v2f32 = 38, // 2 x f32
+ v4f32 = 39, // 4 x f32
+ v8f32 = 40, // 8 x f32
+ v2f64 = 41, // 2 x f64
+ v4f64 = 42, // 4 x f64
+
+ FIRST_VECTOR_VALUETYPE = v2i1,
LAST_VECTOR_VALUETYPE = v4f64,
- FIRST_INTEGER_VECTOR_VALUETYPE = v2i8,
+ FIRST_INTEGER_VECTOR_VALUETYPE = v2i1,
LAST_INTEGER_VECTOR_VALUETYPE = v16i64,
FIRST_FP_VECTOR_VALUETYPE = v2f16,
LAST_FP_VECTOR_VALUETYPE = v4f64,
- x86mmx = 37, // This is an X86 MMX value
+ x86mmx = 43, // This is an X86 MMX value
- Glue = 38, // This glues nodes together during pre-RA sched
+ Glue = 44, // This glues nodes together during pre-RA sched
- isVoid = 39, // This has no value
+ isVoid = 45, // This has no value
- Untyped = 40, // This value takes a register, but has
+ Untyped = 46, // This value takes a register, but has
// unspecified type. The register class
// will be determined by the opcode.
- LAST_VALUETYPE = 41, // This always remains at the end of the list.
+ LAST_VALUETYPE = 47, // This always remains at the end of the list.
// This is the current maximum for LAST_VALUETYPE.
// MVT::MAX_ALLOWED_VALUETYPE is used for asserts and to size bit vectors
@@ -233,15 +239,21 @@ namespace llvm {
switch (SimpleTy) {
default:
llvm_unreachable("Not a vector MVT!");
+ case v2i1 :
+ case v4i1 :
+ case v8i1 :
+ case v16i1: return i1;
case v2i8 :
case v4i8 :
case v8i8 :
case v16i8:
case v32i8: return i8;
+ case v1i16:
case v2i16:
case v4i16:
case v8i16:
case v16i16: return i16;
+ case v1i32:
case v2i32:
case v4i32:
case v8i32:
@@ -265,21 +277,25 @@ namespace llvm {
default:
llvm_unreachable("Not a vector MVT!");
case v32i8: return 32;
+ case v16i1:
case v16i8:
case v16i16:
case v16i32:
case v16i64:return 16;
+ case v8i1:
case v8i8 :
case v8i16:
case v8i32:
case v8i64:
case v8f32: return 8;
+ case v4i1:
case v4i8:
case v4i16:
case v4i32:
case v4i64:
case v4f32:
case v4f64: return 4;
+ case v2i1:
case v2i8:
case v2i16:
case v2i32:
@@ -287,6 +303,8 @@ namespace llvm {
case v2f16:
case v2f32:
case v2f64: return 2;
+ case v1i16:
+ case v1i32:
case v1i64: return 1;
}
}
@@ -302,15 +320,21 @@ namespace llvm {
default:
llvm_unreachable("getSizeInBits called on extended MVT.");
case i1 : return 1;
- case i8 : return 8;
+ case v2i1: return 2;
+ case v4i1: return 4;
+ case i8 :
+ case v8i1: return 8;
case i16 :
case f16:
- case v2i8: return 16;
+ case v16i1:
+ case v2i8:
+ case v1i16: return 16;
case f32 :
case i32 :
case v4i8:
case v2i16:
- case v2f16: return 32;
+ case v2f16:
+ case v1i32: return 32;
case x86mmx:
case f64 :
case i64 :
@@ -393,6 +417,12 @@ namespace llvm {
switch (VT.SimpleTy) {
default:
break;
+ case MVT::i1:
+ if (NumElements == 2) return MVT::v2i1;
+ if (NumElements == 4) return MVT::v4i1;
+ if (NumElements == 8) return MVT::v8i1;
+ if (NumElements == 16) return MVT::v16i1;
+ break;
case MVT::i8:
if (NumElements == 2) return MVT::v2i8;
if (NumElements == 4) return MVT::v4i8;
@@ -401,12 +431,14 @@ namespace llvm {
if (NumElements == 32) return MVT::v32i8;
break;
case MVT::i16:
+ if (NumElements == 1) return MVT::v1i16;
if (NumElements == 2) return MVT::v2i16;
if (NumElements == 4) return MVT::v4i16;
if (NumElements == 8) return MVT::v8i16;
if (NumElements == 16) return MVT::v16i16;
break;
case MVT::i32:
+ if (NumElements == 1) return MVT::v1i32;
if (NumElements == 2) return MVT::v2i32;
if (NumElements == 4) return MVT::v4i32;
if (NumElements == 8) return MVT::v8i32;
@@ -529,6 +561,23 @@ namespace llvm {
return isSimple() ? V.isVector() : isExtendedVector();
}
+ /// is16BitVector - Return true if this is a 16-bit vector type.
+ bool is16BitVector() const {
+ if (!isSimple())
+ return isExtended16BitVector();
+
+ return (V == MVT::v2i8 || V == MVT::v1i16 || V == MVT::v16i1);
+ }
+
+ /// is32BitVector - Return true if this is a 32-bit vector type.
+ bool is32BitVector() const {
+ if (!isSimple())
+ return isExtended32BitVector();
+
+ return (V == MVT::v4i8 || V == MVT::v2i16 ||
+         V == MVT::v1i32);
+ }
+
/// is64BitVector - Return true if this is a 64-bit vector type.
bool is64BitVector() const {
return isSimple() ? V.is64BitVector() : isExtended64BitVector();
@@ -740,6 +789,8 @@ namespace llvm {
bool isExtendedFloatingPoint() const;
bool isExtendedInteger() const;
bool isExtendedVector() const;
+ bool isExtended16BitVector() const;
+ bool isExtended32BitVector() const;
bool isExtended64BitVector() const;
bool isExtended128BitVector() const;
bool isExtended256BitVector() const;
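With the new i1 vector types above, a predicate/mask type can be requested
directly. A small sketch using the MVT helpers from this hunk:

    MVT MaskVT = MVT::getVectorVT(MVT::i1, 8);       // yields MVT::v8i1
    assert(MaskVT.getVectorElementType() == MVT::i1);
    assert(MaskVT.getVectorNumElements() == 8);
    assert(MaskVT.getSizeInBits() == 8);             // one bit per lane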
diff --git a/include/llvm/CodeGen/ValueTypes.td b/include/llvm/CodeGen/ValueTypes.td
index f4b75bd1b1..a707f887aa 100644
--- a/include/llvm/CodeGen/ValueTypes.td
+++ b/include/llvm/CodeGen/ValueTypes.td
@@ -33,36 +33,42 @@ def f80 : ValueType<80 , 10>; // 80-bit floating point value
def f128 : ValueType<128, 11>; // 128-bit floating point value
def ppcf128: ValueType<128, 12>; // PPC 128-bit floating point value
-def v2i8 : ValueType<16 , 13>; // 2 x i8 vector value
-def v4i8 : ValueType<32 , 14>; // 4 x i8 vector value
-def v8i8 : ValueType<64 , 15>; // 8 x i8 vector value
-def v16i8 : ValueType<128, 16>; // 16 x i8 vector value
-def v32i8 : ValueType<256, 17>; // 32 x i8 vector value
-def v2i16 : ValueType<32 , 18>; // 2 x i16 vector value
-def v4i16 : ValueType<64 , 19>; // 4 x i16 vector value
-def v8i16 : ValueType<128, 20>; // 8 x i16 vector value
-def v16i16 : ValueType<256, 21>; // 16 x i16 vector value
-def v2i32 : ValueType<64 , 22>; // 2 x i32 vector value
-def v4i32 : ValueType<128, 23>; // 4 x i32 vector value
-def v8i32 : ValueType<256, 24>; // 8 x i32 vector value
-def v16i32 : ValueType<512, 25>; // 16 x i32 vector value
-def v1i64 : ValueType<64 , 26>; // 1 x i64 vector value
-def v2i64 : ValueType<128, 27>; // 2 x i64 vector value
-def v4i64 : ValueType<256, 28>; // 4 x i64 vector value
-def v8i64 : ValueType<512, 29>; // 8 x i64 vector value
-def v16i64 : ValueType<1024,30>; // 16 x i64 vector value
+def v2i1 : ValueType<2 , 13>; // 2 x i1 vector value
+def v4i1 : ValueType<4 , 14>; // 4 x i1 vector value
+def v8i1 : ValueType<8 , 15>; // 8 x i1 vector value
+def v16i1 : ValueType<16, 16>; // 16 x i1 vector value
+def v2i8 : ValueType<16 , 17>; // 2 x i8 vector value
+def v4i8 : ValueType<32 , 18>; // 4 x i8 vector value
+def v8i8 : ValueType<64 , 19>; // 8 x i8 vector value
+def v16i8 : ValueType<128, 20>; // 16 x i8 vector value
+def v32i8 : ValueType<256, 21>; // 32 x i8 vector value
+def v1i16 : ValueType<16 , 22>; // 1 x i16 vector value
+def v2i16 : ValueType<32 , 23>; // 2 x i16 vector value
+def v4i16 : ValueType<64 , 24>; // 4 x i16 vector value
+def v8i16 : ValueType<128, 25>; // 8 x i16 vector value
+def v16i16 : ValueType<256, 26>; // 16 x i16 vector value
+def v1i32 : ValueType<32 , 27>; // 1 x i32 vector value
+def v2i32 : ValueType<64 , 28>; // 2 x i32 vector value
+def v4i32 : ValueType<128, 29>; // 4 x i32 vector value
+def v8i32 : ValueType<256, 30>; // 8 x i32 vector value
+def v16i32 : ValueType<512, 31>; // 16 x i32 vector value
+def v1i64 : ValueType<64 , 32>; // 1 x i64 vector value
+def v2i64 : ValueType<128, 33>; // 2 x i64 vector value
+def v4i64 : ValueType<256, 34>; // 4 x i64 vector value
+def v8i64 : ValueType<512, 35>; // 8 x i64 vector value
+def v16i64 : ValueType<1024,36>; // 16 x i64 vector value
-def v2f16 : ValueType<32 , 31>; // 2 x f16 vector value
-def v2f32 : ValueType<64 , 32>; // 2 x f32 vector value
-def v4f32 : ValueType<128, 33>; // 4 x f32 vector value
-def v8f32 : ValueType<256, 34>; // 8 x f32 vector value
-def v2f64 : ValueType<128, 35>; // 2 x f64 vector value
-def v4f64 : ValueType<256, 36>; // 4 x f64 vector value
+def v2f16 : ValueType<32 , 37>; // 2 x f16 vector value
+def v2f32 : ValueType<64 , 38>; // 2 x f32 vector value
+def v4f32 : ValueType<128, 39>; // 4 x f32 vector value
+def v8f32 : ValueType<256, 40>; // 8 x f32 vector value
+def v2f64 : ValueType<128, 41>; // 2 x f64 vector value
+def v4f64 : ValueType<256, 42>; // 4 x f64 vector value
-def x86mmx : ValueType<64 , 37>; // X86 MMX value
-def FlagVT : ValueType<0 , 38>; // Pre-RA sched glue
-def isVoid : ValueType<0 , 39>; // Produces no value
-def untyped: ValueType<8 , 40>; // Produces an untyped value
+def x86mmx : ValueType<64 , 43>; // X86 MMX value
+def FlagVT : ValueType<0 , 44>; // Pre-RA sched glue
+def isVoid : ValueType<0 , 45>; // Produces no value
+def untyped: ValueType<8 , 46>; // Produces an untyped value
def MetadataVT: ValueType<0, 250>; // Metadata
diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in
index 5a60ba565f..a4f8af4db0 100644
--- a/include/llvm/Config/config.h.in
+++ b/include/llvm/Config/config.h.in
@@ -18,6 +18,9 @@
/* Default <path> to all compiler invocations for --sysroot=<path>. */
#undef DEFAULT_SYSROOT
+/* Define if you want backtraces on crash */
+#undef ENABLE_BACKTRACES
+
/* Define if position independent code is enabled */
#undef ENABLE_PIC
diff --git a/include/llvm/Constant.h b/include/llvm/Constant.h
index e0e516d55c..7464dce330 100644
--- a/include/llvm/Constant.h
+++ b/include/llvm/Constant.h
@@ -39,8 +39,8 @@ namespace llvm {
/// don't have to worry about the lifetime of the objects.
/// @brief LLVM Constant Representation
class Constant : public User {
- void operator=(const Constant &); // Do not implement
- Constant(const Constant &); // Do not implement
+ void operator=(const Constant &) LLVM_DELETED_FUNCTION;
+ Constant(const Constant &) LLVM_DELETED_FUNCTION;
virtual void anchor();
protected:
diff --git a/include/llvm/Constants.h b/include/llvm/Constants.h
index fdd53823aa..85fed4259d 100644
--- a/include/llvm/Constants.h
+++ b/include/llvm/Constants.h
@@ -49,8 +49,8 @@ struct ConvertConstantType;
/// @brief Class for constant integers.
class ConstantInt : public Constant {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantInt(const ConstantInt &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantInt(const ConstantInt &) LLVM_DELETED_FUNCTION;
ConstantInt(IntegerType *Ty, const APInt& V);
APInt Val;
protected:
@@ -234,8 +234,8 @@ public:
class ConstantFP : public Constant {
APFloat Val;
virtual void anchor();
- void *operator new(size_t, unsigned);// DO NOT IMPLEMENT
- ConstantFP(const ConstantFP &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantFP(const ConstantFP &) LLVM_DELETED_FUNCTION;
friend class LLVMContextImpl;
protected:
ConstantFP(Type *Ty, const APFloat& V);
@@ -301,8 +301,8 @@ public:
/// ConstantAggregateZero - All zero aggregate value
///
class ConstantAggregateZero : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantAggregateZero(const ConstantAggregateZero &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantAggregateZero(const ConstantAggregateZero &) LLVM_DELETED_FUNCTION;
protected:
explicit ConstantAggregateZero(Type *ty)
: Constant(ty, ConstantAggregateZeroVal, 0, 0) {}
@@ -346,7 +346,7 @@ public:
///
class ConstantArray : public Constant {
friend struct ConstantArrayCreator<ConstantArray, ArrayType>;
- ConstantArray(const ConstantArray &); // DO NOT IMPLEMENT
+ ConstantArray(const ConstantArray &) LLVM_DELETED_FUNCTION;
protected:
ConstantArray(ArrayType *T, ArrayRef<Constant *> Val);
public:
@@ -385,7 +385,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantArray, Constant)
//
class ConstantStruct : public Constant {
friend struct ConstantArrayCreator<ConstantStruct, StructType>;
- ConstantStruct(const ConstantStruct &); // DO NOT IMPLEMENT
+ ConstantStruct(const ConstantStruct &) LLVM_DELETED_FUNCTION;
protected:
ConstantStruct(StructType *T, ArrayRef<Constant *> Val);
public:
@@ -445,7 +445,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantStruct, Constant)
///
class ConstantVector : public Constant {
friend struct ConstantArrayCreator<ConstantVector, VectorType>;
- ConstantVector(const ConstantVector &); // DO NOT IMPLEMENT
+ ConstantVector(const ConstantVector &) LLVM_DELETED_FUNCTION;
protected:
ConstantVector(VectorType *T, ArrayRef<Constant *> Val);
public:
@@ -491,8 +491,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantVector, Constant)
/// ConstantPointerNull - a constant pointer value that points to null
///
class ConstantPointerNull : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantPointerNull(const ConstantPointerNull &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantPointerNull(const ConstantPointerNull &) LLVM_DELETED_FUNCTION;
protected:
explicit ConstantPointerNull(PointerType *T)
: Constant(reinterpret_cast<Type*>(T),
@@ -543,8 +543,8 @@ class ConstantDataSequential : public Constant {
/// element array of i8, or a 1-element array of i32. They'll both end up in
/// the same StringMap bucket, linked up.
ConstantDataSequential *Next;
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantDataSequential(const ConstantDataSequential &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantDataSequential(const ConstantDataSequential &) LLVM_DELETED_FUNCTION;
protected:
explicit ConstantDataSequential(Type *ty, ValueTy VT, const char *Data)
: Constant(ty, VT, 0, 0), DataElements(Data), Next(0) {}
@@ -655,8 +655,8 @@ private:
/// operands because it stores all of the elements of the constant as densely
/// packed data, instead of as Value*'s.
class ConstantDataArray : public ConstantDataSequential {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantDataArray(const ConstantDataArray &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantDataArray(const ConstantDataArray &) LLVM_DELETED_FUNCTION;
virtual void anchor();
friend class ConstantDataSequential;
explicit ConstantDataArray(Type *ty, const char *Data)
@@ -708,8 +708,8 @@ public:
/// operands because it stores all of the elements of the constant as densely
/// packed data, instead of as Value*'s.
class ConstantDataVector : public ConstantDataSequential {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- ConstantDataVector(const ConstantDataVector &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ ConstantDataVector(const ConstantDataVector &) LLVM_DELETED_FUNCTION;
virtual void anchor();
friend class ConstantDataSequential;
explicit ConstantDataVector(Type *ty, const char *Data)
@@ -760,7 +760,7 @@ public:
/// BlockAddress - The address of a basic block.
///
class BlockAddress : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void *operator new(size_t s) { return User::operator new(s, 2); }
BlockAddress(Function *F, BasicBlock *BB);
public:
@@ -1125,8 +1125,8 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ConstantExpr, Constant)
/// LangRef.html#undefvalues for details.
///
class UndefValue : public Constant {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- UndefValue(const UndefValue &); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ UndefValue(const UndefValue &) LLVM_DELETED_FUNCTION;
protected:
explicit UndefValue(Type *T) : Constant(T, UndefValueVal, 0, 0) {}
protected:
diff --git a/include/llvm/DIBuilder.h b/include/llvm/DIBuilder.h
index dd4ea96ae2..80880094d2 100644
--- a/include/llvm/DIBuilder.h
+++ b/include/llvm/DIBuilder.h
@@ -63,8 +63,8 @@ namespace llvm {
SmallVector<Value *, 4> AllSubprograms;
SmallVector<Value *, 4> AllGVs;
- DIBuilder(const DIBuilder &); // DO NOT IMPLEMENT
- void operator=(const DIBuilder &); // DO NOT IMPLEMENT
+ DIBuilder(const DIBuilder &) LLVM_DELETED_FUNCTION;
+ void operator=(const DIBuilder &) LLVM_DELETED_FUNCTION;
public:
explicit DIBuilder(Module &M);
@@ -347,6 +347,10 @@ namespace llvm {
/// createArtificialType - Create a new DIType with "artificial" flag set.
DIType createArtificialType(DIType Ty);
+ /// createObjectPointerType - Create a new DIType with the "object pointer"
+ /// flag set.
+ DIType createObjectPointerType(DIType Ty);
+
/// createTemporaryType - Create a temporary forward-declared type.
DIType createTemporaryType();
DIType createTemporaryType(DIFile F);
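
createObjectPointerType is the builder-side half of the object-pointer support introduced in DebugInfo.h below: it returns a copy of the given DIType with the new flag set, intended for the type of a method's implicit this/self parameter. A minimal usage sketch, assuming ClassPtrTy was built with the same DIBuilder:

    // Sketch: tag the debug type of the implicit 'this' parameter.
    llvm::DIType tagThisParameter(llvm::DIBuilder &DIB,
                                  llvm::DIType ClassPtrTy) {
      return DIB.createObjectPointerType(ClassPtrTy);
    }
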
diff --git a/include/llvm/DebugInfo.h b/include/llvm/DebugInfo.h
index 618220fcb0..4c1d045fa0 100644
--- a/include/llvm/DebugInfo.h
+++ b/include/llvm/DebugInfo.h
@@ -60,7 +60,8 @@ namespace llvm {
FlagArtificial = 1 << 6,
FlagExplicit = 1 << 7,
FlagPrototyped = 1 << 8,
- FlagObjcClassComplete = 1 << 9
+ FlagObjcClassComplete = 1 << 9,
+ FlagObjectPointer = 1 << 10
};
protected:
const MDNode *DbgNode;
@@ -287,6 +288,9 @@ namespace llvm {
bool isArtificial() const {
return (getFlags() & FlagArtificial) != 0;
}
+ bool isObjectPointer() const {
+ return (getFlags() & FlagObjectPointer) != 0;
+ }
bool isObjcClassComplete() const {
return (getFlags() & FlagObjcClassComplete) != 0;
}
@@ -644,6 +648,10 @@ namespace llvm {
return (getUnsignedField(6) & FlagArtificial) != 0;
}
+ bool isObjectPointer() const {
+ return (getUnsignedField(6) & FlagObjectPointer) != 0;
+ }
+
/// getInlinedAt - If this variable is inlined then return inline location.
MDNode *getInlinedAt() const;
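
On the query side the new flag follows the same masked-test pattern as the other DIDescriptor flags: each flag is a distinct bit, so predicates are single AND-and-compare tests. A sketch of a consumer, assuming the second hunk lands in DIVariable (as getInlinedAt suggests); the function name is hypothetical:

    // Hypothetical consumer: remember the object-pointer parameter so a
    // DWARF writer can later reference it from the subprogram DIE.
    void noteIfObjectPointer(const llvm::DIVariable &Var) {
      if (Var.isObjectPointer()) {
        // ... record Var for DW_AT_object_pointer emission ...
      }
    }
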
diff --git a/include/llvm/DefaultPasses.h b/include/llvm/DefaultPasses.h
index 929569d543..9f1ade86ab 100644
--- a/include/llvm/DefaultPasses.h
+++ b/include/llvm/DefaultPasses.h
@@ -14,7 +14,7 @@
#ifndef LLVM_DEFAULT_PASS_SUPPORT_H
#define LLVM_DEFAULT_PASS_SUPPORT_H
-#include <llvm/PassSupport.h>
+#include "llvm/PassSupport.h"
namespace llvm {
diff --git a/include/llvm/DerivedTypes.h b/include/llvm/DerivedTypes.h
index da5ad27b1f..9a723084a6 100644
--- a/include/llvm/DerivedTypes.h
+++ b/include/llvm/DerivedTypes.h
@@ -19,6 +19,7 @@
#define LLVM_DERIVED_TYPES_H
#include "llvm/Type.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -94,8 +95,8 @@ public:
/// FunctionType - Class to represent function types
///
class FunctionType : public Type {
- FunctionType(const FunctionType &); // Do not implement
- const FunctionType &operator=(const FunctionType &); // Do not implement
+ FunctionType(const FunctionType &) LLVM_DELETED_FUNCTION;
+ const FunctionType &operator=(const FunctionType &) LLVM_DELETED_FUNCTION;
FunctionType(Type *Result, ArrayRef<Type*> Params, bool IsVarArgs);
public:
@@ -187,8 +188,8 @@ public:
/// generator for a target expects).
///
class StructType : public CompositeType {
- StructType(const StructType &); // Do not implement
- const StructType &operator=(const StructType &); // Do not implement
+ StructType(const StructType &) LLVM_DELETED_FUNCTION;
+ const StructType &operator=(const StructType &) LLVM_DELETED_FUNCTION;
StructType(LLVMContext &C)
: CompositeType(C, StructTyID), SymbolTableEntry(0) {}
enum {
@@ -308,8 +309,8 @@ public:
///
class SequentialType : public CompositeType {
Type *ContainedType; ///< Storage for the single contained type.
- SequentialType(const SequentialType &); // Do not implement!
- const SequentialType &operator=(const SequentialType &); // Do not implement!
+ SequentialType(const SequentialType &) LLVM_DELETED_FUNCTION;
+ const SequentialType &operator=(const SequentialType &) LLVM_DELETED_FUNCTION;
protected:
SequentialType(TypeID TID, Type *ElType)
@@ -336,8 +337,8 @@ public:
class ArrayType : public SequentialType {
uint64_t NumElements;
- ArrayType(const ArrayType &); // Do not implement
- const ArrayType &operator=(const ArrayType &); // Do not implement
+ ArrayType(const ArrayType &) LLVM_DELETED_FUNCTION;
+ const ArrayType &operator=(const ArrayType &) LLVM_DELETED_FUNCTION;
ArrayType(Type *ElType, uint64_t NumEl);
public:
/// ArrayType::get - This static method is the primary way to construct an
@@ -363,8 +364,8 @@ public:
class VectorType : public SequentialType {
unsigned NumElements;
- VectorType(const VectorType &); // Do not implement
- const VectorType &operator=(const VectorType &); // Do not implement
+ VectorType(const VectorType &) LLVM_DELETED_FUNCTION;
+ const VectorType &operator=(const VectorType &) LLVM_DELETED_FUNCTION;
VectorType(Type *ElType, unsigned NumEl);
public:
/// VectorType::get - This static method is the primary way to construct an
@@ -429,8 +430,8 @@ public:
/// PointerType - Class to represent pointers.
///
class PointerType : public SequentialType {
- PointerType(const PointerType &); // Do not implement
- const PointerType &operator=(const PointerType &); // Do not implement
+ PointerType(const PointerType &) LLVM_DELETED_FUNCTION;
+ const PointerType &operator=(const PointerType &) LLVM_DELETED_FUNCTION;
explicit PointerType(Type *ElType, unsigned AddrSpace);
public:
/// PointerType::get - This constructs a pointer to an object of the specified
diff --git a/include/llvm/ExecutionEngine/ExecutionEngine.h b/include/llvm/ExecutionEngine/ExecutionEngine.h
index ae8b68d024..3c6d3b8dbf 100644
--- a/include/llvm/ExecutionEngine/ExecutionEngine.h
+++ b/include/llvm/ExecutionEngine/ExecutionEngine.h
@@ -88,7 +88,7 @@ public:
/// \brief Erase an entry from the mapping table.
///
- /// \returns The address that \arg ToUnmap was happed to.
+ /// \returns The address that \p ToUnmap was mapped to.
void *RemoveMapping(const MutexGuard &, const GlobalValue *ToUnmap);
};
@@ -244,7 +244,7 @@ public:
/// Map the address of a JIT section as returned from the memory manager
/// to the address in the target process as the running code will see it.
/// This is the address which will be used for relocation resolution.
- virtual void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress) {
+ virtual void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress) {
llvm_unreachable("Re-mapping of section addresses not supported with this "
"EE!");
}
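
The mapSectionAddress change is a const-correctness fix: the engine only reads through LocalAddress to identify which section is being remapped, so the parameter can be const-qualified. Hedged sketch of the remote-target call site, with allocation and copying to the target elided:

    // Sketch: remap a JIT'd section to the address the target process sees.
    void remapSection(llvm::ExecutionEngine &EE,
                      const void *LocalBuf, uint64_t TargetAddr) {
      EE.mapSectionAddress(LocalBuf, TargetAddr);  // buffer is read-only here
    }
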
diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h
index 9e5ad2feb0..a02cbe2d90 100644
--- a/include/llvm/ExecutionEngine/RuntimeDyld.h
+++ b/include/llvm/ExecutionEngine/RuntimeDyld.h
@@ -29,8 +29,8 @@ class MemoryBuffer;
// FIXME: As the RuntimeDyld fills out, additional routines will be needed
// for the varying types of objects to be allocated.
class RTDyldMemoryManager {
- RTDyldMemoryManager(const RTDyldMemoryManager&); // DO NOT IMPLEMENT
- void operator=(const RTDyldMemoryManager&); // DO NOT IMPLEMENT
+ RTDyldMemoryManager(const RTDyldMemoryManager&) LLVM_DELETED_FUNCTION;
+ void operator=(const RTDyldMemoryManager&) LLVM_DELETED_FUNCTION;
public:
RTDyldMemoryManager() {}
virtual ~RTDyldMemoryManager();
@@ -50,8 +50,8 @@ public:
};
class RuntimeDyld {
- RuntimeDyld(const RuntimeDyld &); // DO NOT IMPLEMENT
- void operator=(const RuntimeDyld &); // DO NOT IMPLEMENT
+ RuntimeDyld(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
+ void operator=(const RuntimeDyld &) LLVM_DELETED_FUNCTION;
// RuntimeDyldImpl is the actual class. RuntimeDyld is just the public
// interface.
@@ -84,7 +84,7 @@ public:
/// Map the address of a JIT section as returned from the memory manager
/// to the address in the target process as the running code will see it.
/// This is the address which will be used for relocation resolution.
- void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress);
+ void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress);
StringRef getErrorString();
};
diff --git a/include/llvm/Function.h b/include/llvm/Function.h
index fdd90d1d8f..fbd2594a45 100644
--- a/include/llvm/Function.h
+++ b/include/llvm/Function.h
@@ -109,9 +109,9 @@ private:
BuildLazyArguments();
}
void BuildLazyArguments() const;
-
- Function(const Function&); // DO NOT IMPLEMENT
- void operator=(const Function&); // DO NOT IMPLEMENT
+
+ Function(const Function&) LLVM_DELETED_FUNCTION;
+ void operator=(const Function&) LLVM_DELETED_FUNCTION;
/// Function ctor - If the (optional) Module argument is specified, the
/// function is automatically inserted into the end of the function list for
diff --git a/include/llvm/GlobalAlias.h b/include/llvm/GlobalAlias.h
index 164d976588..a97ecd30c9 100644
--- a/include/llvm/GlobalAlias.h
+++ b/include/llvm/GlobalAlias.h
@@ -28,8 +28,8 @@ template<typename ValueSubClass, typename ItemParentClass>
class GlobalAlias : public GlobalValue, public ilist_node<GlobalAlias> {
friend class SymbolTableListTraits<GlobalAlias, Module>;
- void operator=(const GlobalAlias &); // Do not implement
- GlobalAlias(const GlobalAlias &); // Do not implement
+ void operator=(const GlobalAlias &) LLVM_DELETED_FUNCTION;
+ GlobalAlias(const GlobalAlias &) LLVM_DELETED_FUNCTION;
void setParent(Module *parent);
diff --git a/include/llvm/GlobalValue.h b/include/llvm/GlobalValue.h
index a3942054f4..bd193b97c9 100644
--- a/include/llvm/GlobalValue.h
+++ b/include/llvm/GlobalValue.h
@@ -26,7 +26,7 @@ class PointerType;
class Module;
class GlobalValue : public Constant {
- GlobalValue(const GlobalValue &); // do not implement
+ GlobalValue(const GlobalValue &) LLVM_DELETED_FUNCTION;
public:
/// @brief An enumeration for the kinds of linkage for global values.
enum LinkageTypes {
diff --git a/include/llvm/GlobalVariable.h b/include/llvm/GlobalVariable.h
index 99b7a73b35..27a2ea7fb9 100644
--- a/include/llvm/GlobalVariable.h
+++ b/include/llvm/GlobalVariable.h
@@ -34,9 +34,9 @@ template<typename ValueSubClass, typename ItemParentClass>
class GlobalVariable : public GlobalValue, public ilist_node<GlobalVariable> {
friend class SymbolTableListTraits<GlobalVariable, Module>;
- void *operator new(size_t, unsigned); // Do not implement
- void operator=(const GlobalVariable &); // Do not implement
- GlobalVariable(const GlobalVariable &); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ void operator=(const GlobalVariable &) LLVM_DELETED_FUNCTION;
+ GlobalVariable(const GlobalVariable &) LLVM_DELETED_FUNCTION;
void setParent(Module *parent);
diff --git a/include/llvm/IRBuilder.h b/include/llvm/IRBuilder.h
index d5b6f47f8a..ae82c25e3d 100644
--- a/include/llvm/IRBuilder.h
+++ b/include/llvm/IRBuilder.h
@@ -1052,7 +1052,7 @@ public:
private:
// Provided to resolve 'CreateIntCast(Ptr, Ptr, "...")', giving a compile time
// error, instead of converting the string to bool for the isSigned parameter.
- Value *CreateIntCast(Value *, Type *, const char *); // DO NOT IMPLEMENT
+ Value *CreateIntCast(Value *, Type *, const char *) LLVM_DELETED_FUNCTION;
public:
Value *CreateFPCast(Value *V, Type *DestTy, const Twine &Name = "") {
if (V->getType() == DestTy)
@@ -1261,13 +1261,13 @@ public:
// Utility creation methods
//===--------------------------------------------------------------------===//
- /// CreateIsNull - Return an i1 value testing if \arg Arg is null.
+ /// CreateIsNull - Return an i1 value testing if \p Arg is null.
Value *CreateIsNull(Value *Arg, const Twine &Name = "") {
return CreateICmpEQ(Arg, Constant::getNullValue(Arg->getType()),
Name);
}
- /// CreateIsNotNull - Return an i1 value testing if \arg Arg is not null.
+ /// CreateIsNotNull - Return an i1 value testing if \p Arg is not null.
Value *CreateIsNotNull(Value *Arg, const Twine &Name = "") {
return CreateICmpNE(Arg, Constant::getNullValue(Arg->getType()),
Name);
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index b3cd79b8cc..4c93d480dd 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -167,6 +167,7 @@ void initializeMachineBlockPlacementStatsPass(PassRegistry&);
void initializeMachineBranchProbabilityInfoPass(PassRegistry&);
void initializeMachineCSEPass(PassRegistry&);
void initializeMachineDominatorTreePass(PassRegistry&);
+void initializeMachinePostDominatorTreePass(PassRegistry&);
void initializeMachineLICMPass(PassRegistry&);
void initializeMachineLoopInfoPass(PassRegistry&);
void initializeMachineLoopRangesPass(PassRegistry&);
@@ -221,6 +222,7 @@ void initializeRegionOnlyViewerPass(PassRegistry&);
void initializeRegionPrinterPass(PassRegistry&);
void initializeRegionViewerPass(PassRegistry&);
void initializeSCCPPass(PassRegistry&);
+void initializeSROAPass(PassRegistry&);
void initializeSROA_DTPass(PassRegistry&);
void initializeSROA_SSAUpPass(PassRegistry&);
void initializeScalarEvolutionAliasAnalysisPass(PassRegistry&);
diff --git a/include/llvm/InlineAsm.h b/include/llvm/InlineAsm.h
index a0aecfd8e4..58c1e84e53 100644
--- a/include/llvm/InlineAsm.h
+++ b/include/llvm/InlineAsm.h
@@ -44,8 +44,8 @@ private:
friend class ConstantUniqueMap<InlineAsmKeyType, const InlineAsmKeyType&,
PointerType, InlineAsm, false>;
- InlineAsm(const InlineAsm &); // do not implement
- void operator=(const InlineAsm&); // do not implement
+ InlineAsm(const InlineAsm &) LLVM_DELETED_FUNCTION;
+ void operator=(const InlineAsm&) LLVM_DELETED_FUNCTION;
std::string AsmString, Constraints;
bool HasSideEffects;
diff --git a/include/llvm/InstrTypes.h b/include/llvm/InstrTypes.h
index 6291a6d988..e957d759e7 100644
--- a/include/llvm/InstrTypes.h
+++ b/include/llvm/InstrTypes.h
@@ -88,7 +88,7 @@ public:
//===----------------------------------------------------------------------===//
class UnaryInstruction : public Instruction {
- void *operator new(size_t, unsigned); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
UnaryInstruction(Type *Ty, unsigned iType, Value *V,
@@ -138,14 +138,14 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(UnaryInstruction, Value)
//===----------------------------------------------------------------------===//
class BinaryOperator : public Instruction {
- void *operator new(size_t, unsigned); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
void init(BinaryOps iType);
BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
const Twine &Name, Instruction *InsertBefore);
BinaryOperator(BinaryOps iType, Value *S1, Value *S2, Type *Ty,
const Twine &Name, BasicBlock *InsertAtEnd);
- virtual BinaryOperator *clone_impl() const;
+ virtual BinaryOperator *clone_impl() const LLVM_OVERRIDE;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -388,7 +388,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BinaryOperator, Value)
/// if (isa<CastInst>(Instr)) { ... }
/// @brief Base class of casting instructions.
class CastInst : public UnaryInstruction {
- virtual void anchor();
+ virtual void anchor() LLVM_OVERRIDE;
protected:
/// @brief Constructor with insert-before-instruction semantics for subclasses
CastInst(Type *Ty, unsigned iType, Value *S,
@@ -627,8 +627,8 @@ public:
/// This class is the base class for the comparison instructions.
/// @brief Abstract base class of comparison instructions.
class CmpInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
- CmpInst(); // do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ CmpInst() LLVM_DELETED_FUNCTION;
protected:
CmpInst(Type *ty, Instruction::OtherOps op, unsigned short pred,
Value *LHS, Value *RHS, const Twine &Name = "",
@@ -638,7 +638,7 @@ protected:
Value *LHS, Value *RHS, const Twine &Name,
BasicBlock *InsertAtEnd);
- virtual void Anchor() const; // Out of line virtual method.
+ virtual void anchor() LLVM_OVERRIDE; // Out of line virtual method.
public:
/// This enumeration lists the possible predicates for CmpInst subclasses.
/// Values in the range 0-31 are reserved for FCmpInst, while values in the
diff --git a/include/llvm/Instruction.h b/include/llvm/Instruction.h
index 5512dcc9e6..c85eda28f4 100644
--- a/include/llvm/Instruction.h
+++ b/include/llvm/Instruction.h
@@ -28,8 +28,8 @@ template<typename ValueSubClass, typename ItemParentClass>
class SymbolTableListTraits;
class Instruction : public User, public ilist_node<Instruction> {
- void operator=(const Instruction &); // Do not implement
- Instruction(const Instruction &); // Do not implement
+ void operator=(const Instruction &) LLVM_DELETED_FUNCTION;
+ Instruction(const Instruction &) LLVM_DELETED_FUNCTION;
BasicBlock *Parent;
DebugLoc DbgLoc; // 'dbg' Metadata cache.
diff --git a/include/llvm/Instructions.h b/include/llvm/Instructions.h
index f5187e6832..7a0379e371 100644
--- a/include/llvm/Instructions.h
+++ b/include/llvm/Instructions.h
@@ -255,7 +255,7 @@ private:
/// StoreInst - an instruction for storing to memory
///
class StoreInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void AssertOK();
protected:
virtual StoreInst *clone_impl() const;
@@ -382,7 +382,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(StoreInst, Value)
/// FenceInst - an instruction for ordering other memory operations
///
class FenceInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void Init(AtomicOrdering Ordering, SynchronizationScope SynchScope);
protected:
virtual FenceInst *clone_impl() const;
@@ -450,7 +450,7 @@ private:
/// there. Returns the value that was loaded.
///
class AtomicCmpXchgInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
void Init(Value *Ptr, Value *Cmp, Value *NewVal,
AtomicOrdering Ordering, SynchronizationScope SynchScope);
protected:
@@ -557,7 +557,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(AtomicCmpXchgInst, Value)
/// the old value.
///
class AtomicRMWInst : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
virtual AtomicRMWInst *clone_impl() const;
public:
@@ -1839,7 +1839,7 @@ ExtractValueInst::ExtractValueInst(Value *Agg,
class InsertValueInst : public Instruction {
SmallVector<unsigned, 4> Indices;
- void *operator new(size_t, unsigned); // Do not implement
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
InsertValueInst(const InsertValueInst &IVI);
void init(Value *Agg, Value *Val, ArrayRef<unsigned> Idxs,
const Twine &NameStr);
@@ -1970,7 +1970,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertValueInst, Value)
// scientist's overactive imagination.
//
class PHINode : public Instruction {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
/// ReservedSpace - The number of operands actually allocated. NumOperands is
/// the number actually in use.
unsigned ReservedSpace;
@@ -2178,7 +2178,7 @@ class LandingPadInst : public Instruction {
public:
enum ClauseType { Catch, Filter };
private:
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
// Allocate space for exactly zero operands.
void *operator new(size_t s) {
return User::operator new(s, 0);
@@ -2445,7 +2445,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(BranchInst, Value)
/// SwitchInst - Multiway switch
///
class SwitchInst : public TerminatorInst {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
unsigned ReservedSpace;
// Operands format:
// Operand[0] = Value to switch on
@@ -2613,7 +2613,7 @@ public:
}
/// addCase - Add an entry to the switch instruction...
- /// @Deprecated
+ /// @deprecated
/// Note:
/// This action invalidates case_end(). Old case_end() iterator will
/// point to the added case.
@@ -2699,7 +2699,7 @@ public:
}
/// Resolves case value for current case.
- /// @Deprecated
+ /// @deprecated
ConstantIntTy *getCaseValue() {
assert(Index < SI->getNumCases() && "Index exceeds the number of cases.");
IntegersSubsetRef CaseRanges = *SubsetIt;
@@ -2803,7 +2803,7 @@ public:
CaseIt(const ParentTy& Src) : ParentTy(Src) {}
/// Sets the new value for current case.
- /// @Deprecated.
+ /// @deprecated.
void setValue(ConstantInt *V) {
assert(Index < SI->getNumCases() && "Index exceeds the number of cases.");
IntegersSubsetToBB Mapping;
@@ -2857,7 +2857,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(SwitchInst, Value)
/// IndirectBrInst - Indirect Branch Instruction.
///
class IndirectBrInst : public TerminatorInst {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
unsigned ReservedSpace;
// Operand[0] = Value to switch on
// Operand[1] = Default basic block destination
@@ -3251,7 +3251,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ResumeInst, Value)
/// end of the block cannot be reached.
///
class UnreachableInst : public TerminatorInst {
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
protected:
virtual UnreachableInst *clone_impl() const;
diff --git a/include/llvm/IntrinsicInst.h b/include/llvm/IntrinsicInst.h
index 1cebdd2ee6..e9bf0f6759 100644
--- a/include/llvm/IntrinsicInst.h
+++ b/include/llvm/IntrinsicInst.h
@@ -34,9 +34,9 @@ namespace llvm {
/// functions. This allows the standard isa/dyncast/cast functionality to
/// work with calls to intrinsic functions.
class IntrinsicInst : public CallInst {
- IntrinsicInst(); // DO NOT IMPLEMENT
- IntrinsicInst(const IntrinsicInst&); // DO NOT IMPLEMENT
- void operator=(const IntrinsicInst&); // DO NOT IMPLEMENT
+ IntrinsicInst() LLVM_DELETED_FUNCTION;
+ IntrinsicInst(const IntrinsicInst&) LLVM_DELETED_FUNCTION;
+ void operator=(const IntrinsicInst&) LLVM_DELETED_FUNCTION;
public:
/// getIntrinsicID - Return the intrinsic ID of this intrinsic.
///
diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td
index 116d80586c..d960f8c789 100644
--- a/include/llvm/Intrinsics.td
+++ b/include/llvm/Intrinsics.td
@@ -121,15 +121,21 @@ def llvm_metadata_ty : LLVMType<MetadataVT>; // !{...}
def llvm_x86mmx_ty : LLVMType<x86mmx>;
def llvm_ptrx86mmx_ty : LLVMPointerType<llvm_x86mmx_ty>; // <1 x i64>*
+def llvm_v2i1_ty : LLVMType<v2i1>; // 2 x i1
+def llvm_v4i1_ty : LLVMType<v4i1>; // 4 x i1
+def llvm_v8i1_ty : LLVMType<v8i1>; // 8 x i1
+def llvm_v16i1_ty : LLVMType<v16i1>; // 16 x i1
def llvm_v2i8_ty : LLVMType<v2i8>; // 2 x i8
def llvm_v4i8_ty : LLVMType<v4i8>; // 4 x i8
def llvm_v8i8_ty : LLVMType<v8i8>; // 8 x i8
def llvm_v16i8_ty : LLVMType<v16i8>; // 16 x i8
def llvm_v32i8_ty : LLVMType<v32i8>; // 32 x i8
+def llvm_v1i16_ty : LLVMType<v1i16>; // 1 x i16
def llvm_v2i16_ty : LLVMType<v2i16>; // 2 x i16
def llvm_v4i16_ty : LLVMType<v4i16>; // 4 x i16
def llvm_v8i16_ty : LLVMType<v8i16>; // 8 x i16
def llvm_v16i16_ty : LLVMType<v16i16>; // 16 x i16
+def llvm_v1i32_ty : LLVMType<v1i32>; // 1 x i32
def llvm_v2i32_ty : LLVMType<v2i32>; // 2 x i32
def llvm_v4i32_ty : LLVMType<v4i32>; // 4 x i32
def llvm_v8i32_ty : LLVMType<v8i32>; // 8 x i32
diff --git a/include/llvm/IntrinsicsARM.td b/include/llvm/IntrinsicsARM.td
index fa8034e0c2..93b1ae1dc8 100644
--- a/include/llvm/IntrinsicsARM.td
+++ b/include/llvm/IntrinsicsARM.td
@@ -16,147 +16,136 @@
// TLS
let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
- Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
-}
+
+def int_arm_thread_pointer : GCCBuiltin<"__builtin_thread_pointer">,
+ Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
//===----------------------------------------------------------------------===//
// Saturating Arithmetic
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
- [IntrNoMem, Commutative]>;
- def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
- def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-}
+def int_arm_qadd : GCCBuiltin<"__builtin_arm_qadd">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty],
+ [IntrNoMem, Commutative]>;
+def int_arm_qsub : GCCBuiltin<"__builtin_arm_qsub">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_arm_ssat : GCCBuiltin<"__builtin_arm_ssat">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
+def int_arm_usat : GCCBuiltin<"__builtin_arm_usat">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
//===----------------------------------------------------------------------===//
// Load and Store exclusive doubleword
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
- llvm_ptr_ty], [IntrReadWriteArgMem]>;
- def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty],
- [IntrReadArgMem]>;
-}
+def int_arm_strexd : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty,
+ llvm_ptr_ty], [IntrReadWriteArgMem]>;
+def int_arm_ldrexd : Intrinsic<[llvm_i32_ty, llvm_i32_ty], [llvm_ptr_ty],
+ [IntrReadArgMem]>;
//===----------------------------------------------------------------------===//
// VFP
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">,
- Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
- def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">,
- Intrinsic<[], [llvm_i32_ty], []>;
- def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
- [IntrNoMem]>;
- def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
- [IntrNoMem]>;
-}
+def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">,
+ Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">,
+ Intrinsic<[], [llvm_i32_ty], []>;
+def int_arm_vcvtr : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
+ [IntrNoMem]>;
+def int_arm_vcvtru : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
+ [IntrNoMem]>;
//===----------------------------------------------------------------------===//
// Coprocessor
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
- // Move to coprocessor
- def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
-
- // Move from coprocessor
- def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
- Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
-
- // Coprocessor data processing
- def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
-
- // Move from two registers to coprocessor
- def int_arm_mcrr : GCCBuiltin<"__builtin_arm_mcrr">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
- def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
- Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
- llvm_i32_ty, llvm_i32_ty], []>;
-}
+// Move to coprocessor
+def int_arm_mcr : GCCBuiltin<"__builtin_arm_mcr">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_mcr2 : GCCBuiltin<"__builtin_arm_mcr2">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+// Move from coprocessor
+def int_arm_mrc : GCCBuiltin<"__builtin_arm_mrc">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_mrc2 : GCCBuiltin<"__builtin_arm_mrc2">,
+ Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
+
+// Coprocessor data processing
+def int_arm_cdp : GCCBuiltin<"__builtin_arm_cdp">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_cdp2 : GCCBuiltin<"__builtin_arm_cdp2">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], []>;
+
+// Move from two registers to coprocessor
+def int_arm_mcrr : GCCBuiltin<"__builtin_arm_mcrr">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
+def int_arm_mcrr2 : GCCBuiltin<"__builtin_arm_mcrr2">,
+ Intrinsic<[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
+ llvm_i32_ty, llvm_i32_ty], []>;
//===----------------------------------------------------------------------===//
// Advanced SIMD (NEON)
-let TargetPrefix = "arm" in { // All intrinsics start with "llvm.arm.".
-
- // The following classes do not correspond directly to GCC builtins.
- class Neon_1Arg_Intrinsic
- : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
- class Neon_1Arg_Narrow_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
- class Neon_2Arg_Intrinsic
- : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem]>;
- class Neon_2Arg_Narrow_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMExtendedElementVectorType<0>,
- LLVMExtendedElementVectorType<0>],
- [IntrNoMem]>;
- class Neon_2Arg_Long_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMTruncatedElementVectorType<0>,
- LLVMTruncatedElementVectorType<0>],
- [IntrNoMem]>;
- class Neon_3Arg_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
- [IntrNoMem]>;
- class Neon_3Arg_Long_Intrinsic
- : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>,
- LLVMTruncatedElementVectorType<0>,
- LLVMTruncatedElementVectorType<0>],
- [IntrNoMem]>;
- class Neon_CvtFxToFP_Intrinsic
- : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
- class Neon_CvtFPToFx_Intrinsic
- : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
-
- // The table operands for VTBL and VTBX consist of 1 to 4 v8i8 vectors.
- // Besides the table, VTBL has one other v8i8 argument and VTBX has two.
- // Overall, the classes range from 2 to 6 v8i8 arguments.
- class Neon_Tbl2Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
- class Neon_Tbl3Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
- class Neon_Tbl4Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty],
- [IntrNoMem]>;
- class Neon_Tbl5Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
- llvm_v8i8_ty], [IntrNoMem]>;
- class Neon_Tbl6Arg_Intrinsic
- : Intrinsic<[llvm_v8i8_ty],
- [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
- llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
-}
+// The following classes do not correspond directly to GCC builtins.
+class Neon_1Arg_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>], [IntrNoMem]>;
+class Neon_1Arg_Narrow_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMExtendedElementVectorType<0>], [IntrNoMem]>;
+class Neon_2Arg_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+class Neon_2Arg_Narrow_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMExtendedElementVectorType<0>,
+ LLVMExtendedElementVectorType<0>],
+ [IntrNoMem]>;
+class Neon_2Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMTruncatedElementVectorType<0>,
+ LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+class Neon_3Arg_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+class Neon_3Arg_Long_Intrinsic
+ : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>,
+ LLVMTruncatedElementVectorType<0>,
+ LLVMTruncatedElementVectorType<0>],
+ [IntrNoMem]>;
+class Neon_CvtFxToFP_Intrinsic
+ : Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
+class Neon_CvtFPToFx_Intrinsic
+ : Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]>;
+
+// The table operands for VTBL and VTBX consist of 1 to 4 v8i8 vectors.
+// Besides the table, VTBL has one other v8i8 argument and VTBX has two.
+// Overall, the classes range from 2 to 6 v8i8 arguments.
+class Neon_Tbl2Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
+class Neon_Tbl3Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
+class Neon_Tbl4Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty],
+ [IntrNoMem]>;
+class Neon_Tbl5Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
+ llvm_v8i8_ty], [IntrNoMem]>;
+class Neon_Tbl6Arg_Intrinsic
+ : Intrinsic<[llvm_v8i8_ty],
+ [llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty, llvm_v8i8_ty,
+ llvm_v8i8_ty, llvm_v8i8_ty], [IntrNoMem]>;
// Arithmetic ops
@@ -209,20 +198,18 @@ def int_arm_neon_vsubhn : Neon_2Arg_Narrow_Intrinsic;
def int_arm_neon_vrsubhn : Neon_2Arg_Narrow_Intrinsic;
// Vector Absolute Compare.
-let TargetPrefix = "arm" in {
- def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
- [llvm_v2f32_ty, llvm_v2f32_ty],
- [IntrNoMem]>;
- def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
- def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
- [llvm_v2f32_ty, llvm_v2f32_ty],
- [IntrNoMem]>;
- def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
- [llvm_v4f32_ty, llvm_v4f32_ty],
- [IntrNoMem]>;
-}
+def int_arm_neon_vacged : Intrinsic<[llvm_v2i32_ty],
+ [llvm_v2f32_ty, llvm_v2f32_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vacgeq : Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vacgtd : Intrinsic<[llvm_v2i32_ty],
+ [llvm_v2f32_ty, llvm_v2f32_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vacgtq : Intrinsic<[llvm_v4i32_ty],
+ [llvm_v4f32_ty, llvm_v4f32_ty],
+ [IntrNoMem]>;
// Vector Absolute Differences.
def int_arm_neon_vabds : Neon_2Arg_Intrinsic;
@@ -235,24 +222,20 @@ def int_arm_neon_vpadd : Neon_2Arg_Intrinsic;
// Note: This is different than the other "long" NEON intrinsics because
// the result vector has half as many elements as the source vector.
// The source and destination vector types must be specified separately.
-let TargetPrefix = "arm" in {
- def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
- [IntrNoMem]>;
- def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
- [IntrNoMem]>;
-}
+def int_arm_neon_vpaddls : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vpaddlu : Intrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty],
+ [IntrNoMem]>;
// Vector Pairwise Add and Accumulate Long.
// Note: This is similar to vpaddl but the destination vector also appears
// as the first argument.
-let TargetPrefix = "arm" in {
- def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty],
- [IntrNoMem]>;
- def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty],
- [LLVMMatchType<0>, llvm_anyvector_ty],
- [IntrNoMem]>;
-}
+def int_arm_neon_vpadals : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty],
+ [IntrNoMem]>;
+def int_arm_neon_vpadalu : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, llvm_anyvector_ty],
+ [IntrNoMem]>;
// Vector Pairwise Maximum and Minimum.
def int_arm_neon_vpmaxs : Neon_2Arg_Intrinsic;
@@ -364,79 +347,83 @@ def int_arm_neon_vtbx2 : Neon_Tbl4Arg_Intrinsic;
def int_arm_neon_vtbx3 : Neon_Tbl5Arg_Intrinsic;
def int_arm_neon_vtbx4 : Neon_Tbl6Arg_Intrinsic;
-let TargetPrefix = "arm" in {
-
- // De-interleaving vector loads from N-element structures.
- // Source operands are the address and alignment.
- def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>],
- [llvm_ptr_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
-
- // Vector load N-element structure to one lane.
- // Source operands are: the address, the N input vectors (since only one
- // lane is assigned), the lane number, and the alignment.
- def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
- [llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadArgMem]>;
- def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>],
- [llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty, llvm_i32_ty],
- [IntrReadArgMem]>;
- def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>],
- [llvm_ptr_ty, LLVMMatchType<0>,
- LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadArgMem]>;
-
- // Interleaving vector stores from N-element structures.
- // Source operands are: the address, the N vectors, and the alignment.
- def int_arm_neon_vst1 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
- def int_arm_neon_vst2 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_arm_neon_vst3 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
- def int_arm_neon_vst4 : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
-
- // Vector store N-element structure from one lane.
- // Source operands are: the address, the N vectors, the lane number, and
- // the alignment.
- def int_arm_neon_vst2lane : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
- def int_arm_neon_vst3lane : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- llvm_i32_ty, llvm_i32_ty],
- [IntrReadWriteArgMem]>;
- def int_arm_neon_vst4lane : Intrinsic<[],
- [llvm_ptr_ty, llvm_anyvector_ty,
- LLVMMatchType<0>, LLVMMatchType<0>,
- LLVMMatchType<0>, llvm_i32_ty,
- llvm_i32_ty], [IntrReadWriteArgMem]>;
-}
+// De-interleaving vector loads from N-element structures.
+// Source operands are the address and alignment.
+def int_arm_neon_vld1 : Intrinsic<[llvm_anyvector_ty],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld2 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld3 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld4 : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_ptr_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+
+// Vector load N-element structure to one lane.
+// Source operands are: the address, the N input vectors (since only one
+// lane is assigned), the lane number, and the alignment.
+def int_arm_neon_vld2lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+ [llvm_ptr_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadArgMem]>;
+def int_arm_neon_vld3lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>],
+ [llvm_ptr_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadArgMem]>;
+def int_arm_neon_vld4lane : Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>],
+ [llvm_ptr_ty, LLVMMatchType<0>,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadArgMem]>;
+
+// Interleaving vector stores from N-element structures.
+// Source operands are: the address, the N vectors, and the alignment.
+def int_arm_neon_vst1 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_arm_neon_vst2 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+def int_arm_neon_vst3 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_arm_neon_vst4 : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+
+// Vector store N-element structure from one lane.
+// Source operands are: the address, the N vectors, the lane number, and
+// the alignment.
+def int_arm_neon_vst2lane : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+def int_arm_neon_vst3lane : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ llvm_i32_ty, llvm_i32_ty],
+ [IntrReadWriteArgMem]>;
+def int_arm_neon_vst4lane : Intrinsic<[],
+ [llvm_ptr_ty, llvm_anyvector_ty,
+ LLVMMatchType<0>, LLVMMatchType<0>,
+ LLVMMatchType<0>, llvm_i32_ty,
+ llvm_i32_ty], [IntrReadWriteArgMem]>;
+
+// Vector bitwise select.
+def int_arm_neon_vbsl : Intrinsic<[llvm_anyvector_ty],
+ [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+ [IntrNoMem]>;
+
+} // end TargetPrefix
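
Structurally this hunk folds the file's many small 'let TargetPrefix = "arm"' regions into one file-scoped block, closed by the new '} // end TargetPrefix' line, and adds int_arm_neon_vbsl (vector bitwise select). A hedged sketch of emitting a call to the new overloaded intrinsic from C++:

    #include "llvm/IRBuilder.h"
    #include "llvm/Intrinsics.h"
    #include "llvm/Module.h"

    // Sketch: vbsl takes bits from A where Mask is set, from C elsewhere.
    llvm::Value *emitVbsl(llvm::IRBuilder<> &B, llvm::Module *M,
                          llvm::Value *Mask, llvm::Value *A, llvm::Value *C) {
      llvm::Type *Tys[] = { Mask->getType() };   // single overload type
      llvm::Function *Vbsl = llvm::Intrinsic::getDeclaration(
          M, llvm::Intrinsic::arm_neon_vbsl, Tys);
      return B.CreateCall3(Vbsl, Mask, A, C, "vbsl");
    }
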
diff --git a/include/llvm/LLVMContext.h b/include/llvm/LLVMContext.h
index a8306a9e76..5903e2e55e 100644
--- a/include/llvm/LLVMContext.h
+++ b/include/llvm/LLVMContext.h
@@ -15,6 +15,8 @@
#ifndef LLVM_LLVMCONTEXT_H
#define LLVM_LLVMCONTEXT_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class LLVMContextImpl;
@@ -43,7 +45,8 @@ public:
MD_tbaa = 1, // "tbaa"
MD_prof = 2, // "prof"
MD_fpmath = 3, // "fpmath"
- MD_range = 4 // "range"
+ MD_range = 4, // "range"
+ MD_tbaa_struct = 5 // "tbaa.struct"
};
/// getMDKindID - Return a unique non-zero ID for the specified metadata kind.
@@ -87,9 +90,8 @@ public:
void emitError(const Twine &ErrorStr);
private:
- // DO NOT IMPLEMENT
- LLVMContext(LLVMContext&);
- void operator=(LLVMContext&);
+ LLVMContext(LLVMContext&) LLVM_DELETED_FUNCTION;
+ void operator=(LLVMContext&) LLVM_DELETED_FUNCTION;
/// addModule - Register a module as being instantiated in this context. If
/// the context is deleted, the module will be deleted as well.
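
MD_tbaa_struct extends the block of fixed metadata-kind IDs; these are just pre-registered results of getMDKindID, so the enumerator and the string lookup must agree. A small consistency sketch:

    #include "llvm/LLVMContext.h"
    #include <cassert>

    void checkTbaaStructKind(llvm::LLVMContext &Ctx) {
      unsigned ID = Ctx.getMDKindID("tbaa.struct");
      assert(ID == llvm::LLVMContext::MD_tbaa_struct &&
             "fixed metadata kind IDs must match their registered strings");
    }
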
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index 048d469fd5..ebe5542555 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -100,7 +100,7 @@ public:
/// @}
- /// applyFixup - Apply the \arg Value for given \arg Fixup into the provided
+ /// applyFixup - Apply the \p Value for the given \p Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
@@ -127,9 +127,9 @@ public:
/// RelaxInstruction - Relax the instruction in the given fragment to the next
/// wider instruction.
///
- /// \param Inst - The instruction to relax, which may be the same as the
+ /// \param Inst The instruction to relax, which may be the same as the
/// output.
- /// \parm Res [output] - On return, the relaxed instruction.
+ /// \param [out] Res On return, the relaxed instruction.
virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const = 0;
/// @}
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index a38b9a85c2..6fc685cb33 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -282,7 +282,7 @@ class MCAlignFragment : public MCFragment {
/// Value - Value to use for filling padding bytes.
int64_t Value;
- /// ValueSize - The size of the integer (in bytes) of \arg Value.
+ /// ValueSize - The size of the integer (in bytes) of \p Value.
unsigned ValueSize;
/// MaxBytesToEmit - The maximum number of bytes to emit; if the alignment
@@ -329,7 +329,7 @@ class MCFillFragment : public MCFragment {
/// Value - Value to use for filling bytes.
int64_t Value;
- /// ValueSize - The size (in bytes) of \arg Value to use when filling, or 0 if
+ /// ValueSize - The size (in bytes) of \p Value to use when filling, or 0 if
/// this is a virtual fill fragment.
unsigned ValueSize;
@@ -497,8 +497,8 @@ public:
class MCSectionData : public ilist_node<MCSectionData> {
friend class MCAsmLayout;
- MCSectionData(const MCSectionData&); // DO NOT IMPLEMENT
- void operator=(const MCSectionData&); // DO NOT IMPLEMENT
+ MCSectionData(const MCSectionData&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCSectionData&) LLVM_DELETED_FUNCTION;
public:
typedef iplist<MCFragment> FragmentListType;
@@ -782,8 +782,8 @@ public:
typedef std::vector<DataRegionData>::iterator data_region_iterator;
private:
- MCAssembler(const MCAssembler&); // DO NOT IMPLEMENT
- void operator=(const MCAssembler&); // DO NOT IMPLEMENT
+ MCAssembler(const MCAssembler&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCAssembler&) LLVM_DELETED_FUNCTION;
MCContext &Context;
@@ -837,7 +837,7 @@ private:
/// \param Value [out] On return, the value of the fixup as currently laid
/// out.
/// \return Whether the fixup value was fully resolved. This is true if the
- /// \arg Value result is fixed, otherwise the value may change due to
+ /// \p Value result is fixed, otherwise the value may change due to
/// relocation.
bool evaluateFixup(const MCAsmLayout &Layout,
const MCFixup &Fixup, const MCFragment *DF,
@@ -881,7 +881,7 @@ private:
public:
/// Compute the effective fragment size assuming it is laid out at the given
- /// \arg SectionAddress and \arg FragmentOffset.
+ /// \p SectionAddress and \p FragmentOffset.
uint64_t computeFragmentSize(const MCAsmLayout &Layout,
const MCFragment &F) const;
@@ -910,7 +910,7 @@ public:
public:
/// Construct a new assembler instance.
///
- /// \arg OS - The stream to output to.
+ /// \param OS The stream to output to.
//
// FIXME: How are we going to parameterize this? Two obvious options are stay
// concrete and require clients to pass in a target like object. The other
@@ -936,7 +936,7 @@ public:
MCObjectWriter &getWriter() const { return Writer; }
/// Finish - Do final processing and write the object to the output stream.
- /// \arg Writer is used for custom object writer (as the MCJIT does),
+ /// \p Writer is used for a custom object writer (as the MCJIT does);
/// if not specified, one is created automatically from the backend.
void Finish();
diff --git a/include/llvm/MC/MCCodeEmitter.h b/include/llvm/MC/MCCodeEmitter.h
index 4f7d103060..0574890902 100644
--- a/include/llvm/MC/MCCodeEmitter.h
+++ b/include/llvm/MC/MCCodeEmitter.h
@@ -29,8 +29,8 @@ protected: // Can only create subclasses.
public:
virtual ~MCCodeEmitter();
- /// EncodeInstruction - Encode the given \arg Inst to bytes on the output
- /// stream \arg OS.
+ /// EncodeInstruction - Encode the given \p Inst to bytes on the output
+ /// stream \p OS.
virtual void EncodeInstruction(const MCInst &Inst, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups) const = 0;
};
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 23652f00d0..5a8830cb66 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -183,6 +183,7 @@ namespace llvm {
/// LookupSymbol - Get the symbol for \p Name, or null.
MCSymbol *LookupSymbol(StringRef Name) const;
+ MCSymbol *LookupSymbol(const Twine &Name) const;
/// getSymbols - Get a reference for the symbol table for clients that
/// want to, for example, iterate over all symbols. 'const' because we
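
The Twine overload of LookupSymbol spares callers a temporary std::string when the symbol name is assembled from pieces. Usage sketch:

    #include "llvm/ADT/Twine.h"
    #include "llvm/MC/MCContext.h"

    // Sketch: the concatenation stays lazy inside the Twine.
    llvm::MCSymbol *lookupNumbered(llvm::MCContext &Ctx,
                                   llvm::StringRef Base, unsigned N) {
      return Ctx.LookupSymbol(llvm::Twine(Base) + llvm::Twine(N));
    }
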
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 9e09ddf411..8fc437f3e6 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -59,7 +59,7 @@ namespace llvm {
unsigned getDirIndex() const { return DirIndex; }
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
/// dump - Print the value to stderr.
@@ -178,8 +178,8 @@ namespace llvm {
class MCLineSection {
private:
- MCLineSection(const MCLineSection&); // DO NOT IMPLEMENT
- void operator=(const MCLineSection&); // DO NOT IMPLEMENT
+ MCLineSection(const MCLineSection&) LLVM_DELETED_FUNCTION;
+ void operator=(const MCLineSection&) LLVM_DELETED_FUNCTION;
public:
// Constructor to create an MCLineSection with an empty MCLineEntries
diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
index 397a37d3ce..e91c6a2e8e 100644
--- a/include/llvm/MC/MCInst.h
+++ b/include/llvm/MC/MCInst.h
@@ -182,7 +182,7 @@ public:
void dump() const;
/// \brief Dump the MCInst as prettily as possible using the additional MC
- /// structures, if given. Operators are separated by the \arg Separator
+ /// structures, if given. Operators are separated by the \p Separator
/// string.
void dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI = 0,
const MCInstPrinter *Printer = 0,
diff --git a/include/llvm/MC/MCLabel.h b/include/llvm/MC/MCLabel.h
index c72aabd03a..f531de8b40 100644
--- a/include/llvm/MC/MCLabel.h
+++ b/include/llvm/MC/MCLabel.h
@@ -42,7 +42,7 @@ namespace llvm {
/// Label.
unsigned incInstance() { return ++Instance; }
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
/// dump - Print the value to stderr.
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 949d90700e..efaabfb9e8 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -153,8 +153,8 @@ public:
/// WriteSegmentLoadCommand - Write a segment load command.
///
- /// \arg NumSections - The number of sections in this segment.
- /// \arg SectionDataSize - The total size of the sections.
+ /// \param NumSections The number of sections in this segment.
+ /// \param SectionDataSize The total size of the sections.
void WriteSegmentLoadCommand(unsigned NumSections,
uint64_t VMSize,
uint64_t SectionDataStartOffset,
@@ -233,6 +233,8 @@ public:
void computeSectionAddresses(const MCAssembler &Asm,
const MCAsmLayout &Layout);
+ void markAbsoluteVariableSymbols(MCAssembler &Asm,
+ const MCAsmLayout &Layout);
void ExecutePostLayoutBinding(MCAssembler &Asm, const MCAsmLayout &Layout);
virtual bool IsSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index ca163c50e3..0a961d6d09 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -40,6 +40,7 @@ public:
// No-value.
EndOfStatement,
Colon,
+ Space,
Plus, Minus, Tilde,
Slash, // '/'
BackSlash, // '\'
@@ -126,6 +127,7 @@ class MCAsmLexer {
void operator=(const MCAsmLexer &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
const char *TokStart;
+ bool SkipSpace;
MCAsmLexer();
@@ -170,11 +172,14 @@ public:
/// getKind - Get the kind of current token.
AsmToken::TokenKind getKind() const { return CurTok.getKind(); }
- /// is - Check if the current token has kind \arg K.
+ /// is - Check if the current token has kind \p K.
bool is(AsmToken::TokenKind K) const { return CurTok.is(K); }
- /// isNot - Check if the current token has kind \arg K.
+ /// isNot - Check if the current token has kind \p K.
bool isNot(AsmToken::TokenKind K) const { return CurTok.isNot(K); }
+
+  /// setSkipSpace - Set whether spaces should be ignored by the lexer.
+ void setSkipSpace(bool val) { SkipSpace = val; }
};
} // End llvm namespace
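
The new Space token kind and the SkipSpace flag work together: by default the lexer swallows whitespace, but a client that needs to see it can toggle skipping off around a parse. A hedged sketch, where "Lexer" is assumed to be a reference to some concrete MCAsmLexer:

    // Sketch: make the lexer report AsmToken::Space, then restore default.
    Lexer.setSkipSpace(false);
    AsmToken Tok = Lexer.Lex();
    bool SawSpace = Tok.is(AsmToken::Space);
    Lexer.setSkipSpace(true);
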
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index c673a79bd4..adc960d27e 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -73,15 +73,13 @@ public:
/// Run - Run the parser on the input source buffer.
virtual bool Run(bool NoInitialTextSection, bool NoFinalize = false) = 0;
- /// Warning - Emit a warning at the location \arg L, with the message \arg
- /// Msg.
+ /// Warning - Emit a warning at the location \p L, with the message \p Msg.
///
/// \return The return value is true, if warnings are fatal.
virtual bool Warning(SMLoc L, const Twine &Msg,
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>()) = 0;
- /// Error - Emit an error at the location \arg L, with the message \arg
- /// Msg.
+ /// Error - Emit an error at the location \p L, with the message \p Msg.
///
/// \return The return value is always true, as an idiomatic convenience to
/// clients.
@@ -100,7 +98,7 @@ public:
ArrayRef<SMRange> Ranges = ArrayRef<SMRange>());
/// ParseIdentifier - Parse an identifier or string (as a quoted identifier)
- /// and set \arg Res to the identifier contents.
+ /// and set \p Res to the identifier contents.
virtual bool ParseIdentifier(StringRef &Res) = 0;
/// \brief Parse up to the end of statement and return the contents from the
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index 59593a88e1..0918c93bdf 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -43,8 +43,8 @@ protected:
public:
virtual ~MCAsmParserExtension();
- /// \brief Initialize the extension for parsing using the given \arg
- /// Parser. The extension should use the AsmParser interfaces to register its
+ /// \brief Initialize the extension for parsing using the given \p Parser.
+ /// The extension should use the AsmParser interfaces to register its
/// parsing routines.
virtual void Initialize(MCAsmParser &Parser);
diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
index 2556e5f27a..369bc72ff6 100644
--- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h
+++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
@@ -23,6 +23,15 @@ public:
MCParsedAsmOperand() {}
virtual ~MCParsedAsmOperand() {}
+ /// isToken - Is this a token operand?
+ virtual bool isToken() const = 0;
+ /// isImm - Is this an immediate operand?
+ virtual bool isImm() const = 0;
+ /// isReg - Is this a register operand?
+ virtual bool isReg() const = 0;
+ /// isMem - Is this a memory operand?
+ virtual bool isMem() const = 0;
+
/// getStartLoc - Get the location of the first token of this operand.
virtual SMLoc getStartLoc() const = 0;
/// getEndLoc - Get the location of the last token of this operand.
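
These four predicates give target-independent matcher code a way to classify a parsed operand without knowing the target's concrete operand class. A minimal illustrative subclass ("ToyOperand" is hypothetical; real targets carry far more state):

    class ToyOperand : public MCParsedAsmOperand {
      enum KindTy { Immediate, Register } Kind;
      SMLoc Start, End;
    public:
      virtual bool isToken() const { return false; }
      virtual bool isImm() const { return Kind == Immediate; }
      virtual bool isReg() const { return Kind == Register; }
      virtual bool isMem() const { return false; }
      virtual SMLoc getStartLoc() const { return Start; }
      virtual SMLoc getEndLoc() const { return End; }
      virtual void print(raw_ostream &OS) const { OS << "<toy operand>"; }
    };
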
diff --git a/include/llvm/MC/MCSchedule.h b/include/llvm/MC/MCSchedule.h
index 3b1cdf1cd2..41bdb02f61 100644
--- a/include/llvm/MC/MCSchedule.h
+++ b/include/llvm/MC/MCSchedule.h
@@ -16,17 +16,102 @@
#define LLVM_MC_MCSCHEDMODEL_H
#include "llvm/Support/DataTypes.h"
+#include <cassert>
namespace llvm {
struct InstrItinerary;
+/// Define a kind of processor resource that will be modeled by the scheduler.
+struct MCProcResourceDesc {
+#ifndef NDEBUG
+ const char *Name;
+#endif
+  unsigned Count; // Number of resources of this kind.
+  unsigned SuperIdx; // Index of the resource kind that contains this kind.
+
+ bool operator==(const MCProcResourceDesc &Other) const {
+ return Count == Other.Count && SuperIdx == Other.SuperIdx;
+ }
+};
+
+/// Identify one of the processor resource kinds consumed by a particular
+/// scheduling class for the specified number of cycles.
+struct MCWriteProcResEntry {
+ unsigned ProcResourceIdx;
+ unsigned Cycles;
+
+ bool operator==(const MCWriteProcResEntry &Other) const {
+ return ProcResourceIdx == Other.ProcResourceIdx && Cycles == Other.Cycles;
+ }
+};
+
+/// Specify the latency in CPU cycles for a particular scheduling class and def
+/// index. Also identify the WriteResources of this def. When the operand
+/// expands to a sequence of writes, this ID is the last write in the sequence.
+struct MCWriteLatencyEntry {
+ unsigned Cycles;
+ unsigned WriteResourceID;
+
+ bool operator==(const MCWriteLatencyEntry &Other) const {
+ return Cycles == Other.Cycles && WriteResourceID == Other.WriteResourceID;
+ }
+};
+
+/// Specify the number of cycles allowed after instruction issue before a
+/// particular use operand reads its registers. This effectively reduces the
+/// write's latency. Here we allow negative cycles for corner cases where
+/// latency increases. This rule only applies when the entry's WriteResource
+/// matches the write's WriteResource.
+///
+/// MCReadAdvanceEntries are sorted first by operand index (UseIdx), then by
+/// WriteResourceIdx.
+struct MCReadAdvanceEntry {
+ unsigned UseIdx;
+ unsigned WriteResourceID;
+ int Cycles;
+
+ bool operator==(const MCReadAdvanceEntry &Other) const {
+ return UseIdx == Other.UseIdx && WriteResourceID == Other.WriteResourceID
+ && Cycles == Other.Cycles;
+ }
+};
+
+/// Summarize the scheduling resources required for an instruction of a
+/// particular scheduling class.
+///
+/// Defined as an aggregate struct for creating tables with initializer lists.
+struct MCSchedClassDesc {
+ static const unsigned short InvalidNumMicroOps = UINT16_MAX;
+ static const unsigned short VariantNumMicroOps = UINT16_MAX - 1;
+
+#ifndef NDEBUG
+ const char* Name;
+#endif
+ unsigned short NumMicroOps;
+ bool BeginGroup;
+ bool EndGroup;
+ unsigned WriteProcResIdx; // First index into WriteProcResTable.
+ unsigned NumWriteProcResEntries;
+ unsigned WriteLatencyIdx; // First index into WriteLatencyTable.
+ unsigned NumWriteLatencyEntries;
+ unsigned ReadAdvanceIdx; // First index into ReadAdvanceTable.
+ unsigned NumReadAdvanceEntries;
+
+ bool isValid() const {
+ return NumMicroOps != InvalidNumMicroOps;
+ }
+ bool isVariant() const {
+ return NumMicroOps == VariantNumMicroOps;
+ }
+};
+
/// Machine model for scheduling, bundling, and heuristics.
///
/// The machine model directly provides basic information about the
/// microarchitecture to the scheduler in the form of properties. It also
-/// optionally refers to scheduler resources tables and itinerary
-/// tables. Scheduler resources tables model the latency and cost for each
+/// optionally refers to scheduler resource tables and itinerary
+/// tables. Scheduler resource tables model the latency and cost for each
/// instruction type. Itinerary tables are an independent mechanism that
/// provides a detailed reservation table describing each cycle of instruction
/// execution. Subtargets may define any or all of the above categories of data
@@ -84,8 +169,11 @@ public:
static const unsigned DefaultMispredictPenalty = 10;
private:
- // TODO: Add a reference to proc resource types and sched resource tables.
-
+ unsigned ProcID;
+ const MCProcResourceDesc *ProcResourceTable;
+ const MCSchedClassDesc *SchedClassTable;
+ unsigned NumProcResourceKinds;
+ unsigned NumSchedClasses;
// Instruction itinerary tables used by InstrItineraryData.
friend class InstrItineraryData;
const InstrItinerary *InstrItineraries;
@@ -100,13 +188,41 @@ public:
LoadLatency(DefaultLoadLatency),
HighLatency(DefaultHighLatency),
MispredictPenalty(DefaultMispredictPenalty),
- InstrItineraries(0) {}
+ ProcID(0), ProcResourceTable(0), SchedClassTable(0),
+ NumProcResourceKinds(0), NumSchedClasses(0),
+ InstrItineraries(0) {
+ (void)NumProcResourceKinds;
+ (void)NumSchedClasses;
+ }
// Table-gen driven ctor.
MCSchedModel(unsigned iw, int ml, unsigned ll, unsigned hl, unsigned mp,
+ unsigned pi, const MCProcResourceDesc *pr,
+ const MCSchedClassDesc *sc, unsigned npr, unsigned nsc,
const InstrItinerary *ii):
IssueWidth(iw), MinLatency(ml), LoadLatency(ll), HighLatency(hl),
- MispredictPenalty(mp), InstrItineraries(ii){}
+ MispredictPenalty(mp), ProcID(pi), ProcResourceTable(pr),
+ SchedClassTable(sc), NumProcResourceKinds(npr), NumSchedClasses(nsc),
+ InstrItineraries(ii) {}
+
+ unsigned getProcessorID() const { return ProcID; }
+
+  /// Does this machine model include instruction-level scheduling?
+ bool hasInstrSchedModel() const { return SchedClassTable; }
+
+ const MCProcResourceDesc *getProcResource(unsigned ProcResourceIdx) const {
+ assert(hasInstrSchedModel() && "No scheduling machine model");
+
+ assert(ProcResourceIdx < NumProcResourceKinds && "bad proc resource idx");
+ return &ProcResourceTable[ProcResourceIdx];
+ }
+
+ const MCSchedClassDesc *getSchedClassDesc(unsigned SchedClassIdx) const {
+ assert(hasInstrSchedModel() && "No scheduling machine model");
+
+ assert(SchedClassIdx < NumSchedClasses && "bad scheduling class idx");
+ return &SchedClassTable[SchedClassIdx];
+ }
};
} // End llvm namespace
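
The new tables are deliberately flat: MCSchedClassDesc stores only (first index, count) pairs into shared arrays of MCWriteProcResEntry, MCWriteLatencyEntry, and MCReadAdvanceEntry, which keeps the TableGen-emitted data compact. A sketch of walking one class's resource usage, assuming "SM" is an MCSchedModel with hasInstrSchedModel() true, "SchedClassIdx" an in-range index, and "WriteProcResTable" the target's shared table (in the real layout that table is owned by MCSubtargetInfo, below):

    const MCSchedClassDesc *SC = SM.getSchedClassDesc(SchedClassIdx);
    if (SC->isValid() && !SC->isVariant()) {
      for (unsigned i = 0; i != SC->NumWriteProcResEntries; ++i) {
        const MCWriteProcResEntry &E =
          WriteProcResTable[SC->WriteProcResIdx + i];
        // E.Cycles cycles are consumed on resource kind E.ProcResourceIdx.
        const MCProcResourceDesc *PRD = SM.getProcResource(E.ProcResourceIdx);
        (void)PRD;
      }
    }
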
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 391e1d59e8..ad30b9ca60 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -342,7 +342,7 @@ namespace llvm {
/// @name Generating Data
/// @{
- /// EmitBytes - Emit the bytes in \arg Data into the output.
+ /// EmitBytes - Emit the bytes in \p Data into the output.
///
/// This is used to implement assembler directives such as .byte, .ascii,
/// etc.
@@ -594,11 +594,11 @@ namespace llvm {
/// InstPrint.
///
/// \param CE - If given, a code emitter to use to show the instruction
- /// encoding inline with the assembly. This method takes ownership of \arg CE.
+ /// encoding inline with the assembly. This method takes ownership of \p CE.
///
/// \param TAB - If given, a target asm backend to use to show the fixup
/// information in conjunction with encoding information. This method takes
- /// ownership of \arg TAB.
+ /// ownership of \p TAB.
///
/// \param ShowInst - Whether to show the MCInst representation inline with
/// the assembly.
@@ -615,7 +615,7 @@ namespace llvm {
/// createMachOStreamer - Create a machine code streamer which will generate
/// Mach-O format object files.
///
- /// Takes ownership of \arg TAB and \arg CE.
+ /// Takes ownership of \p TAB and \p CE.
MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *CE,
bool RelaxAll = false);
@@ -623,7 +623,7 @@ namespace llvm {
/// createWinCOFFStreamer - Create a machine code streamer which will
/// generate Microsoft COFF format object files.
///
- /// Takes ownership of \arg TAB and \arg CE.
+ /// Takes ownership of \p TAB and \p CE.
MCStreamer *createWinCOFFStreamer(MCContext &Ctx,
MCAsmBackend &TAB,
MCCodeEmitter &CE, raw_ostream &OS,
@@ -638,7 +638,7 @@ namespace llvm {
/// createPureStreamer - Create a machine code streamer which will generate
/// "pure" MC object files, for use with MC-JIT and testing tools.
///
- /// Takes ownership of \arg TAB and \arg CE.
+ /// Takes ownership of \p TAB and \p CE.
MCStreamer *createPureStreamer(MCContext &Ctx, MCAsmBackend &TAB,
raw_ostream &OS, MCCodeEmitter *CE);
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 6c96f49716..451f435fe3 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -30,7 +30,14 @@ class MCSubtargetInfo {
std::string TargetTriple; // Target triple
const SubtargetFeatureKV *ProcFeatures; // Processor feature list
const SubtargetFeatureKV *ProcDesc; // Processor descriptions
- const SubtargetInfoKV *ProcSchedModel; // Scheduler machine model
+
+ // Scheduler machine model
+ const SubtargetInfoKV *ProcSchedModels;
+ const MCWriteProcResEntry *WriteProcResTable;
+ const MCWriteLatencyEntry *WriteLatencyTable;
+ const MCReadAdvanceEntry *ReadAdvanceTable;
+ const MCSchedModel *CPUSchedModel;
+
const InstrStage *Stages; // Instruction itinerary stages
const unsigned *OperandCycles; // Itinerary operand cycles
const unsigned *ForwardingPaths; // Forwarding paths
@@ -43,6 +50,9 @@ public:
const SubtargetFeatureKV *PF,
const SubtargetFeatureKV *PD,
const SubtargetInfoKV *ProcSched,
+ const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA,
const InstrStage *IS,
const unsigned *OC, const unsigned *FP,
unsigned NF, unsigned NP);
@@ -58,9 +68,9 @@ public:
return FeatureBits;
}
- /// ReInitMCSubtargetInfo - Change CPU (and optionally supplemented with
- /// feature string), recompute and return feature bits.
- uint64_t ReInitMCSubtargetInfo(StringRef CPU, StringRef FS);
+ /// InitMCProcessorInfo - Set or change the CPU (optionally supplemented with
+ /// feature string). Recompute feature bits and scheduling model.
+ void InitMCProcessorInfo(StringRef CPU, StringRef FS);
/// ToggleFeature - Toggle a feature and returns the re-computed feature
/// bits. This version does not change the implied bits.
@@ -74,9 +84,51 @@ public:
///
const MCSchedModel *getSchedModelForCPU(StringRef CPU) const;
+ /// getSchedModel - Get the machine model for this subtarget's CPU.
+ ///
+ const MCSchedModel *getSchedModel() const { return CPUSchedModel; }
+
+  /// Return an iterator at the first processor resource consumed by the given
+ /// scheduling class.
+ const MCWriteProcResEntry *getWriteProcResBegin(
+ const MCSchedClassDesc *SC) const {
+ return &WriteProcResTable[SC->WriteProcResIdx];
+ }
+ const MCWriteProcResEntry *getWriteProcResEnd(
+ const MCSchedClassDesc *SC) const {
+ return getWriteProcResBegin(SC) + SC->NumWriteProcResEntries;
+ }
+
+ const MCWriteLatencyEntry *getWriteLatencyEntry(const MCSchedClassDesc *SC,
+ unsigned DefIdx) const {
+ assert(DefIdx < SC->NumWriteLatencyEntries &&
+ "MachineModel does not specify a WriteResource for DefIdx");
+
+ return &WriteLatencyTable[SC->WriteLatencyIdx + DefIdx];
+ }
+
+ int getReadAdvanceCycles(const MCSchedClassDesc *SC, unsigned UseIdx,
+ unsigned WriteResID) const {
+ for (const MCReadAdvanceEntry *I = &ReadAdvanceTable[SC->ReadAdvanceIdx],
+ *E = I + SC->NumReadAdvanceEntries; I != E; ++I) {
+ if (I->UseIdx < UseIdx)
+ continue;
+ if (I->UseIdx > UseIdx)
+ break;
+ // Find the first WriteResIdx match, which has the highest cycle count.
+ if (!I->WriteResourceID || I->WriteResourceID == WriteResID) {
+ return I->Cycles;
+ }
+ }
+ return 0;
+ }
+
/// getInstrItineraryForCPU - Get scheduling itinerary of a CPU.
///
InstrItineraryData getInstrItineraryForCPU(StringRef CPU) const;
+
+ /// Initialize an InstrItineraryData instance.
+ void initInstrItins(InstrItineraryData &InstrItins) const;
};
} // End llvm namespace
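
Taken together, these accessors answer the scheduler's central question: how many cycles separate a def from a dependent use. The write latency comes from the def's MCWriteLatencyEntry, and a matching MCReadAdvanceEntry lets the use read early (or late, when its cycle count is negative). A sketch, assuming "STI" is an initialized MCSubtargetInfo, "SC" a resolved non-variant MCSchedClassDesc, and DefIdx/UseIdx in range:

    const MCWriteLatencyEntry *WLE = STI.getWriteLatencyEntry(SC, DefIdx);
    int Latency = WLE->Cycles;
    Latency -= STI.getReadAdvanceCycles(SC, UseIdx, WLE->WriteResourceID);
    // Latency is now the effective def-to-use distance in cycles.
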
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 4c9e7f5ffd..fe927555c4 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -113,7 +113,7 @@ namespace llvm {
return *Section;
}
- /// setSection - Mark the symbol as defined in the section \arg S.
+ /// setSection - Mark the symbol as defined in the section \p S.
void setSection(const MCSection &S) { Section = &S; }
/// setUndefined - Mark the symbol as undefined.
@@ -133,7 +133,7 @@ namespace llvm {
return Value != 0;
}
- /// getValue() - Get the value for variable symbols.
+ /// getVariableValue() - Get the value for variable symbols.
const MCExpr *getVariableValue() const {
assert(isVariable() && "Invalid accessor!");
IsUsed = true;
@@ -149,7 +149,7 @@ namespace llvm {
/// @}
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS) const;
/// dump - Print the value to stderr.
diff --git a/include/llvm/MC/MCTargetAsmLexer.h b/include/llvm/MC/MCTargetAsmLexer.h
index d09fe0498e..b1cc546e1e 100644
--- a/include/llvm/MC/MCTargetAsmLexer.h
+++ b/include/llvm/MC/MCTargetAsmLexer.h
@@ -45,7 +45,7 @@ public:
const Target &getTarget() const { return TheTarget; }
- /// InstallLexer - Set the lexer to get tokens from lower-level lexer \arg L.
+ /// InstallLexer - Set the lexer to get tokens from lower-level lexer \p L.
void InstallLexer(MCAsmLexer &L) {
Lexer = &L;
}
@@ -77,10 +77,10 @@ public:
/// getKind - Get the kind of current token.
AsmToken::TokenKind getKind() const { return CurTok.getKind(); }
- /// is - Check if the current token has kind \arg K.
+ /// is - Check if the current token has kind \p K.
bool is(AsmToken::TokenKind K) const { return CurTok.is(K); }
- /// isNot - Check if the current token has kind \arg K.
+ /// isNot - Check if the current token has kind \p K.
bool isNot(AsmToken::TokenKind K) const { return CurTok.isNot(K); }
};
diff --git a/include/llvm/MC/MCTargetAsmParser.h b/include/llvm/MC/MCTargetAsmParser.h
index 709c2d245c..a771ed7a9d 100644
--- a/include/llvm/MC/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCTargetAsmParser.h
@@ -78,6 +78,10 @@ public:
/// \param DirectiveID - the identifier token of the directive.
virtual bool ParseDirective(AsmToken DirectiveID) = 0;
+  /// mnemonicIsValid - Return true if the given string is a valid mnemonic,
+  /// false otherwise.
+ virtual bool mnemonicIsValid(StringRef Mnemonic) = 0;
+
/// MatchInstruction - Recognize a series of operands of a parsed instruction
/// as an actual MCInst. This returns false on success and returns true on
/// failure to match.
diff --git a/include/llvm/MC/MCValue.h b/include/llvm/MC/MCValue.h
index 8352ed183f..f9af8bcfbf 100644
--- a/include/llvm/MC/MCValue.h
+++ b/include/llvm/MC/MCValue.h
@@ -46,7 +46,7 @@ public:
/// isAbsolute - Is this an absolute (as opposed to relocatable) value.
bool isAbsolute() const { return !SymA && !SymB; }
- /// print - Print the value to the stream \arg OS.
+ /// print - Print the value to the stream \p OS.
void print(raw_ostream &OS, const MCAsmInfo *MAI) const;
/// dump - Print the value to stderr.
diff --git a/include/llvm/MDBuilder.h b/include/llvm/MDBuilder.h
index 2aa48b0b47..1867a63923 100644
--- a/include/llvm/MDBuilder.h
+++ b/include/llvm/MDBuilder.h
@@ -134,6 +134,27 @@ namespace llvm {
}
}
+ struct TBAAStructField {
+ uint64_t Offset;
+ uint64_t Size;
+ MDNode *TBAA;
+ TBAAStructField(uint64_t Offset, uint64_t Size, MDNode *TBAA) :
+ Offset(Offset), Size(Size), TBAA(TBAA) {}
+ };
+
+ /// \brief Return metadata for a tbaa.struct node with the given
+ /// struct field descriptions.
+ MDNode *createTBAAStructNode(ArrayRef<TBAAStructField> Fields) {
+ SmallVector<Value *, 4> Vals(Fields.size() * 3);
+ Type *Int64 = IntegerType::get(Context, 64);
+ for (unsigned i = 0, e = Fields.size(); i != e; ++i) {
+ Vals[i * 3 + 0] = ConstantInt::get(Int64, Fields[i].Offset);
+ Vals[i * 3 + 1] = ConstantInt::get(Int64, Fields[i].Size);
+ Vals[i * 3 + 2] = Fields[i].TBAA;
+ }
+ return MDNode::get(Context, Vals);
+ }
+
};
} // end namespace llvm
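
createTBAAStructNode flattens each field into an (offset, size, tbaa-tag) triple, so the resulting node carries 3*N operands for N fields. A hedged usage sketch for a struct of two 4-byte fields, where "IntTBAA" and "FloatTBAA" are assumed to be previously built scalar TBAA nodes:

    MDBuilder MDB(Context);
    SmallVector<MDBuilder::TBAAStructField, 2> Fields;
    Fields.push_back(MDBuilder::TBAAStructField(0, 4, IntTBAA));
    Fields.push_back(MDBuilder::TBAAStructField(4, 4, FloatTBAA));
    MDNode *TBAAStruct = MDB.createTBAAStructNode(Fields);
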
diff --git a/include/llvm/Metadata.h b/include/llvm/Metadata.h
index b40549bed6..d0e6524623 100644
--- a/include/llvm/Metadata.h
+++ b/include/llvm/Metadata.h
@@ -37,7 +37,7 @@ template<typename ValueSubClass, typename ItemParentClass>
/// MDString is always unnamed.
class MDString : public Value {
virtual void anchor();
- MDString(const MDString &); // DO NOT IMPLEMENT
+ MDString(const MDString &) LLVM_DELETED_FUNCTION;
explicit MDString(LLVMContext &C);
public:
@@ -71,8 +71,8 @@ class MDNodeOperand;
//===----------------------------------------------------------------------===//
/// MDNode - a tuple of other values.
class MDNode : public Value, public FoldingSetNode {
- MDNode(const MDNode &); // DO NOT IMPLEMENT
- void operator=(const MDNode &); // DO NOT IMPLEMENT
+ MDNode(const MDNode &) LLVM_DELETED_FUNCTION;
+ void operator=(const MDNode &) LLVM_DELETED_FUNCTION;
friend class MDNodeOperand;
friend class LLVMContextImpl;
friend struct FoldingSetTrait<MDNode>;
@@ -195,7 +195,7 @@ class NamedMDNode : public ilist_node<NamedMDNode> {
friend struct ilist_traits<NamedMDNode>;
friend class LLVMContextImpl;
friend class Module;
- NamedMDNode(const NamedMDNode &); // DO NOT IMPLEMENT
+ NamedMDNode(const NamedMDNode &) LLVM_DELETED_FUNCTION;
std::string Name;
Module *Parent;
diff --git a/include/llvm/Object/Binary.h b/include/llvm/Object/Binary.h
index befe812a36..baed81827d 100644
--- a/include/llvm/Object/Binary.h
+++ b/include/llvm/Object/Binary.h
@@ -26,8 +26,8 @@ namespace object {
class Binary {
private:
- Binary(); // = delete
- Binary(const Binary &other); // = delete
+ Binary() LLVM_DELETED_FUNCTION;
+ Binary(const Binary &other) LLVM_DELETED_FUNCTION;
unsigned int TypeID;
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index 06edf432f5..2e6a7d8c1f 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -462,6 +462,59 @@ private:
// This is set the first time getLoadName is called.
mutable const char *dt_soname;
+public:
+ /// \brief Iterate over relocations in a .rel or .rela section.
+ template<class RelocT>
+ class ELFRelocationIterator {
+ public:
+ typedef void difference_type;
+ typedef const RelocT value_type;
+ typedef std::forward_iterator_tag iterator_category;
+ typedef value_type &reference;
+ typedef value_type *pointer;
+
+ /// \brief Default construct iterator.
+ ELFRelocationIterator() : Section(0), Current(0) {}
+ ELFRelocationIterator(const Elf_Shdr *Sec, const char *Start)
+ : Section(Sec)
+ , Current(Start) {}
+
+ reference operator *() {
+ assert(Current && "Attempted to dereference an invalid iterator!");
+ return *reinterpret_cast<const RelocT*>(Current);
+ }
+
+ pointer operator ->() {
+ assert(Current && "Attempted to dereference an invalid iterator!");
+ return reinterpret_cast<const RelocT*>(Current);
+ }
+
+ bool operator ==(const ELFRelocationIterator &Other) {
+ return Section == Other.Section && Current == Other.Current;
+ }
+
+ bool operator !=(const ELFRelocationIterator &Other) {
+ return !(*this == Other);
+ }
+
+    ELFRelocationIterator &operator ++() {
+      assert(Current && "Attempted to increment an invalid iterator!");
+      Current += Section->sh_entsize;
+      return *this;
+    }
+
+    ELFRelocationIterator operator ++(int) {
+      ELFRelocationIterator Tmp = *this;
+      ++*this;
+      return Tmp;
+    }
+
+ private:
+ const Elf_Shdr *Section;
+ const char *Current;
+ };
+
+private:
// Records for each version index the corresponding Verdef or Vernaux entry.
// This is filled the first time LoadVersionMap() is called.
class VersionMapEntry : public PointerIntPair<const void*, 1> {
@@ -597,6 +650,27 @@ public:
virtual dyn_iterator begin_dynamic_table() const;
virtual dyn_iterator end_dynamic_table() const;
+ typedef ELFRelocationIterator<Elf_Rela> Elf_Rela_Iter;
+ typedef ELFRelocationIterator<Elf_Rel> Elf_Rel_Iter;
+
+ virtual Elf_Rela_Iter beginELFRela(const Elf_Shdr *sec) const {
+ return Elf_Rela_Iter(sec, (const char *)(base() + sec->sh_offset));
+ }
+
+ virtual Elf_Rela_Iter endELFRela(const Elf_Shdr *sec) const {
+ return Elf_Rela_Iter(sec, (const char *)
+ (base() + sec->sh_offset + sec->sh_size));
+ }
+
+ virtual Elf_Rel_Iter beginELFRel(const Elf_Shdr *sec) const {
+ return Elf_Rel_Iter(sec, (const char *)(base() + sec->sh_offset));
+ }
+
+ virtual Elf_Rel_Iter endELFRel(const Elf_Shdr *sec) const {
+ return Elf_Rel_Iter(sec, (const char *)
+ (base() + sec->sh_offset + sec->sh_size));
+ }
+
virtual uint8_t getBytesInAddress() const;
virtual StringRef getFileFormatName() const;
virtual StringRef getObjectType() const { return "ELF"; }
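
Because the iterator's stride is taken from sh_entsize, the same template serves both REL and RELA sections. A sketch of walking a .rela section, assuming "Obj" points at an ELFObjectFile<support::little, false> and "Sec" is that section's Elf_Shdr:

    typedef ELFObjectFile<support::little, false>::Elf_Rela_Iter RelaIter;
    for (RelaIter I = Obj->beginELFRela(Sec), E = Obj->endELFRela(Sec);
         I != E; ++I) {
      // Each record exposes the usual ELF fields.
      uint64_t Offset = I->r_offset;
      (void)Offset;
    }
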
@@ -611,6 +685,7 @@ public:
const Elf_Shdr *getSection(const Elf_Sym *symb) const;
const Elf_Shdr *getElfSection(section_iterator &It) const;
const Elf_Sym *getElfSymbol(symbol_iterator &It) const;
+ const Elf_Sym *getElfSymbol(uint32_t index) const;
// Methods for type inquiry through isa, cast, and dyn_cast
bool isDyldType() const { return isDyldELFObject; }
@@ -807,6 +882,16 @@ ELFObjectFile<target_endianness, is64Bits>
}
template<support::endianness target_endianness, bool is64Bits>
+const typename ELFObjectFile<target_endianness, is64Bits>::Elf_Sym *
+ELFObjectFile<target_endianness, is64Bits>
+ ::getElfSymbol(uint32_t index) const {
+ DataRefImpl SymbolData;
+ SymbolData.d.a = index;
+ SymbolData.d.b = 1;
+ return getSymbol(SymbolData);
+}
+
+template<support::endianness target_endianness, bool is64Bits>
error_code ELFObjectFile<target_endianness, is64Bits>
::getSymbolFileOffset(DataRefImpl Symb,
uint64_t &Result) const {
@@ -866,7 +951,18 @@ error_code ELFObjectFile<target_endianness, is64Bits>
case ELF::STT_FUNC:
case ELF::STT_OBJECT:
case ELF::STT_NOTYPE:
- Result = symb->st_value + (Section ? Section->sh_addr : 0);
+ bool IsRelocatable;
+ switch(Header->e_type) {
+ case ELF::ET_EXEC:
+ case ELF::ET_DYN:
+ IsRelocatable = false;
+ break;
+ default:
+ IsRelocatable = true;
+ }
+ Result = symb->st_value;
+ if (IsRelocatable && Section != 0)
+ Result += Section->sh_addr;
return object_error::success;
default:
Result = UnknownAddressOrSize;
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index 2ec656b012..1709974ac5 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -212,6 +212,8 @@ public:
error_code getNext(SymbolRef &Result) const;
error_code getName(StringRef &Result) const;
+ /// Returns the symbol virtual address (i.e. address at which it will be
+ /// mapped).
error_code getAddress(uint64_t &Result) const;
error_code getFileOffset(uint64_t &Result) const;
error_code getSize(uint64_t &Result) const;
@@ -266,8 +268,8 @@ const uint64_t UnknownAddressOrSize = ~0ULL;
/// figure out which type to create.
class ObjectFile : public Binary {
virtual void anchor();
- ObjectFile(); // = delete
- ObjectFile(const ObjectFile &other); // = delete
+ ObjectFile() LLVM_DELETED_FUNCTION;
+ ObjectFile(const ObjectFile &other) LLVM_DELETED_FUNCTION;
protected:
ObjectFile(unsigned int Type, MemoryBuffer *source, error_code &ec);
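
The documented contract pairs with the ELF change above: executables and shared objects report st_value as-is, while relocatable objects get the containing section's address folded in. Iteration over symbols is error_code-driven throughout (a sketch, with "Obj" assumed to be a valid ObjectFile*):

    error_code ec;
    for (symbol_iterator I = Obj->begin_symbols(), E = Obj->end_symbols();
         I != E; I.increment(ec)) {
      if (ec) break;
      StringRef Name;
      uint64_t Addr;
      if (I->getName(Name) || I->getAddress(Addr))
        continue;              // skip symbols that cannot be read
      outs() << Name << " @ " << Addr << "\n";
    }
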
diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h
index 1e86980cf3..cf6d8e2c37 100644
--- a/include/llvm/Operator.h
+++ b/include/llvm/Operator.h
@@ -32,10 +32,10 @@ class Operator : public User {
private:
// Do not implement any of these. The Operator class is intended to be used
// as a utility, and is never itself instantiated.
- void *operator new(size_t, unsigned);
- void *operator new(size_t s);
- Operator();
- ~Operator();
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
+ void *operator new(size_t s) LLVM_DELETED_FUNCTION;
+ Operator() LLVM_DELETED_FUNCTION;
+ ~Operator() LLVM_DELETED_FUNCTION;
public:
/// getOpcode - Return the opcode for this Instruction or ConstantExpr.
@@ -77,7 +77,7 @@ public:
};
private:
- ~OverflowingBinaryOperator(); // do not implement
+ ~OverflowingBinaryOperator() LLVM_DELETED_FUNCTION;
friend class BinaryOperator;
friend class ConstantExpr;
@@ -131,7 +131,7 @@ public:
};
private:
- ~PossiblyExactOperator(); // do not implement
+ ~PossiblyExactOperator() LLVM_DELETED_FUNCTION;
friend class BinaryOperator;
friend class ConstantExpr;
@@ -168,7 +168,7 @@ public:
/// information about relaxed accuracy requirements attached to them.
class FPMathOperator : public Operator {
private:
- ~FPMathOperator(); // do not implement
+ ~FPMathOperator() LLVM_DELETED_FUNCTION;
public:
@@ -191,7 +191,7 @@ public:
/// opcodes.
template<typename SuperClass, unsigned Opc>
class ConcreteOperator : public SuperClass {
- ~ConcreteOperator(); // DO NOT IMPLEMENT
+ ~ConcreteOperator() LLVM_DELETED_FUNCTION;
public:
static inline bool classof(const ConcreteOperator<SuperClass, Opc> *) {
return true;
@@ -210,44 +210,44 @@ public:
class AddOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> {
- ~AddOperator(); // DO NOT IMPLEMENT
+ ~AddOperator() LLVM_DELETED_FUNCTION;
};
class SubOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> {
- ~SubOperator(); // DO NOT IMPLEMENT
+ ~SubOperator() LLVM_DELETED_FUNCTION;
};
class MulOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> {
- ~MulOperator(); // DO NOT IMPLEMENT
+ ~MulOperator() LLVM_DELETED_FUNCTION;
};
class ShlOperator
: public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> {
- ~ShlOperator(); // DO NOT IMPLEMENT
+ ~ShlOperator() LLVM_DELETED_FUNCTION;
};
class SDivOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> {
- ~SDivOperator(); // DO NOT IMPLEMENT
+ ~SDivOperator() LLVM_DELETED_FUNCTION;
};
class UDivOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> {
- ~UDivOperator(); // DO NOT IMPLEMENT
+ ~UDivOperator() LLVM_DELETED_FUNCTION;
};
class AShrOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> {
- ~AShrOperator(); // DO NOT IMPLEMENT
+ ~AShrOperator() LLVM_DELETED_FUNCTION;
};
class LShrOperator
: public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> {
- ~LShrOperator(); // DO NOT IMPLEMENT
+ ~LShrOperator() LLVM_DELETED_FUNCTION;
};
class GEPOperator
: public ConcreteOperator<Operator, Instruction::GetElementPtr> {
- ~GEPOperator(); // DO NOT IMPLEMENT
+ ~GEPOperator() LLVM_DELETED_FUNCTION;
enum {
IsInBounds = (1 << 0)
diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h
index 888537daa4..cd651db1f1 100644
--- a/include/llvm/Pass.h
+++ b/include/llvm/Pass.h
@@ -29,6 +29,7 @@
#ifndef LLVM_PASS_H
#define LLVM_PASS_H
+#include "llvm/Support/Compiler.h"
#include <string>
namespace llvm {
@@ -82,8 +83,8 @@ class Pass {
AnalysisResolver *Resolver; // Used to resolve analysis
const void *PassID;
PassKind Kind;
- void operator=(const Pass&); // DO NOT IMPLEMENT
- Pass(const Pass &); // DO NOT IMPLEMENT
+ void operator=(const Pass&) LLVM_DELETED_FUNCTION;
+ Pass(const Pass &) LLVM_DELETED_FUNCTION;
public:
explicit Pass(PassKind K, char &pid) : Resolver(0), PassID(&pid), Kind(K) { }
diff --git a/include/llvm/PassAnalysisSupport.h b/include/llvm/PassAnalysisSupport.h
index 5c6a2d7a92..d14d73b1b1 100644
--- a/include/llvm/PassAnalysisSupport.h
+++ b/include/llvm/PassAnalysisSupport.h
@@ -120,7 +120,7 @@ public:
class PMDataManager;
class AnalysisResolver {
private:
- AnalysisResolver(); // DO NOT IMPLEMENT
+ AnalysisResolver() LLVM_DELETED_FUNCTION;
public:
explicit AnalysisResolver(PMDataManager &P) : PM(P) { }
diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h
index c50c2cc184..c6ad44f5f4 100644
--- a/include/llvm/PassSupport.h
+++ b/include/llvm/PassSupport.h
@@ -126,8 +126,8 @@ public:
}
private:
- void operator=(const PassInfo &); // do not implement
- PassInfo(const PassInfo &); // do not implement
+ void operator=(const PassInfo &) LLVM_DELETED_FUNCTION;
+ PassInfo(const PassInfo &) LLVM_DELETED_FUNCTION;
};
#define CALL_ONCE_INITIALIZATION(function) \
diff --git a/include/llvm/Support/AlignOf.h b/include/llvm/Support/AlignOf.h
index 8c389afa80..22c07d04fa 100644
--- a/include/llvm/Support/AlignOf.h
+++ b/include/llvm/Support/AlignOf.h
@@ -68,10 +68,7 @@ inline unsigned alignOf() { return AlignOf<T>::Alignment; }
/// integer literal can be used to specify an alignment constraint. Once built
/// up here, we can then begin to indirect between these using normal C++
/// template parameters.
-template <size_t Alignment> struct AlignedCharArrayImpl {};
-template <> struct AlignedCharArrayImpl<0> {
- typedef char type;
-};
+template <size_t Alignment> struct AlignedCharArrayImpl;
// MSVC requires special handling here.
#ifndef _MSC_VER
@@ -79,12 +76,12 @@ template <> struct AlignedCharArrayImpl<0> {
#if __has_feature(cxx_alignas)
#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
template <> struct AlignedCharArrayImpl<x> { \
- typedef char alignas(x) type; \
+ char alignas(x) aligned; \
}
-#elif defined(__clang__) || defined(__GNUC__)
+#elif defined(__GNUC__)
#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
template <> struct AlignedCharArrayImpl<x> { \
- typedef char type __attribute__((aligned(x))); \
+ char aligned __attribute__((aligned(x))); \
}
#else
# error No supported align as directive.
@@ -112,14 +109,14 @@ LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(8192);
// alignments because __declspec(align(...)) doesn't actually work when it is
// a member of a by-value function argument in MSVC, even if the alignment
// request is something reasonably like 8-byte or 16-byte.
-template <> struct AlignedCharArrayImpl<1> { typedef char type; };
-template <> struct AlignedCharArrayImpl<2> { typedef short type; };
-template <> struct AlignedCharArrayImpl<4> { typedef int type; };
-template <> struct AlignedCharArrayImpl<8> { typedef double type; };
+template <> struct AlignedCharArrayImpl<1> { char aligned; };
+template <> struct AlignedCharArrayImpl<2> { short aligned; };
+template <> struct AlignedCharArrayImpl<4> { int aligned; };
+template <> struct AlignedCharArrayImpl<8> { double aligned; };
#define LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(x) \
template <> struct AlignedCharArrayImpl<x> { \
- typedef __declspec(align(x)) char type; \
+ __declspec(align(x)) char aligned; \
}
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(16);
LLVM_ALIGNEDCHARARRAY_TEMPLATE_ALIGNMENT(32);
@@ -162,17 +159,11 @@ public:
/// constrain the layout of this character array.
char buffer[sizeof(SizerImpl)];
- // Sadly, Clang and GCC both fail to align a character array properly even
- // with an explicit alignment attribute. To work around this, we union
- // the character array that will actually be used with a struct that contains
- // a single aligned character member. Tests seem to indicate that both Clang
- // and GCC will properly register the alignment of a struct containing an
- // aligned member, and this alignment should carry over to the character
- // array in the union.
- struct {
- typename llvm::AlignedCharArrayImpl<AlignOf<AlignerImpl>::Alignment>::type
- nonce_inner_member;
- } nonce_member;
+private:
+ // Tests seem to indicate that both Clang and GCC will properly register the
+ // alignment of a struct containing an aligned member, and this alignment
+ // should carry over to the character array in the union.
+ llvm::AlignedCharArrayImpl<AlignOf<AlignerImpl>::Alignment> nonce_member;
};
} // end namespace llvm
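
The switch from a typedef to a real aligned data member is what makes the simplification above work: AlignedCharArrayImpl<N> is now itself an N-aligned type, so embedding it transfers the alignment to any enclosing union. A small illustration (a sketch relying only on the template above):

    // A raw buffer with at least the alignment of a double.
    union AlignedDoubleBuffer {
      char buffer[sizeof(double)];
      llvm::AlignedCharArrayImpl<8> aligner; // forces 8-byte alignment
    };
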
diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index a2ad24ffea..a644b13366 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h
@@ -79,8 +79,8 @@ class MallocSlabAllocator : public SlabAllocator {
public:
MallocSlabAllocator() : Allocator() { }
virtual ~MallocSlabAllocator();
- virtual MemSlab *Allocate(size_t Size);
- virtual void Deallocate(MemSlab *Slab);
+ virtual MemSlab *Allocate(size_t Size) LLVM_OVERRIDE;
+ virtual void Deallocate(MemSlab *Slab) LLVM_OVERRIDE;
};
/// BumpPtrAllocator - This allocator is useful for containers that need
@@ -88,8 +88,8 @@ public:
/// allocating memory, and never deletes it until the entire block is dead. This
/// makes allocation speedy, but must only be used when the trade-off is ok.
class BumpPtrAllocator {
- BumpPtrAllocator(const BumpPtrAllocator &); // do not implement
- void operator=(const BumpPtrAllocator &); // do not implement
+ BumpPtrAllocator(const BumpPtrAllocator &) LLVM_DELETED_FUNCTION;
+ void operator=(const BumpPtrAllocator &) LLVM_DELETED_FUNCTION;
/// SlabSize - Allocate data into slabs of this size unless we get an
/// allocation above SizeThreshold.
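
Typical BumpPtrAllocator use is bulk allocation with a single release point, which is why the copy operations are deleted rather than implemented. A sketch using only the public Allocate(Size, Alignment) interface (assumed signature from this era of the header):

    llvm::BumpPtrAllocator Alloc;
    struct Node { int Value; Node *Next; };
    Node *N = static_cast<Node *>(
        Alloc.Allocate(sizeof(Node), llvm::alignOf<Node>()));
    N->Value = 42;
    N->Next = 0;
    // No per-node free: all slabs are released when Alloc is destroyed.
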
diff --git a/include/llvm/Support/Casting.h b/include/llvm/Support/Casting.h
index 3aab4367f5..d35febbe6d 100644
--- a/include/llvm/Support/Casting.h
+++ b/include/llvm/Support/Casting.h
@@ -65,18 +65,21 @@ template <typename To, typename From> struct isa_impl_cl<To, const From> {
template <typename To, typename From> struct isa_impl_cl<To, From*> {
static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
return isa_impl<To, From>::doit(*Val);
}
};
template <typename To, typename From> struct isa_impl_cl<To, const From*> {
static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
return isa_impl<To, From>::doit(*Val);
}
};
template <typename To, typename From> struct isa_impl_cl<To, const From*const> {
static inline bool doit(const From *Val) {
+ assert(Val && "isa<> used on a null pointer");
return isa_impl<To, From>::doit(*Val);
}
};
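
With these asserts, isa<> on a null pointer now fails loudly in +Asserts builds instead of silently dereferencing null; callers are expected to test the pointer first. A sketch, where "V" is assumed to be a possibly-null Value*:

    if (V && isa<Instruction>(V)) {
      // Safe: V is known non-null and of the right type.
      Instruction *I = cast<Instruction>(V);
      (void)I;
    }
    Instruction *I2 = V ? dyn_cast<Instruction>(V) : 0; // null propagates
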
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index ae1570da9c..d0c304712a 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -1608,15 +1608,16 @@ public:
class alias : public Option {
Option *AliasFor;
virtual bool handleOccurrence(unsigned pos, StringRef /*ArgName*/,
- StringRef Arg) {
+ StringRef Arg) LLVM_OVERRIDE {
return AliasFor->handleOccurrence(pos, AliasFor->ArgStr, Arg);
}
// Handle printing stuff...
- virtual size_t getOptionWidth() const;
- virtual void printOptionInfo(size_t GlobalWidth) const;
+ virtual size_t getOptionWidth() const LLVM_OVERRIDE;
+ virtual void printOptionInfo(size_t GlobalWidth) const LLVM_OVERRIDE;
// Aliases do not need to print their values.
- virtual void printOptionValue(size_t /*GlobalWidth*/, bool /*Force*/) const {}
+ virtual void printOptionValue(size_t /*GlobalWidth*/,
+ bool /*Force*/) const LLVM_OVERRIDE {}
void done() {
if (!hasArgStr())
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index 1136ff70c3..7ceeb32121 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -40,7 +40,7 @@
/// LLVM_DELETED_FUNCTION - Expands to = delete if the compiler supports it.
/// Use to mark functions as uncallable. Member functions with this should
-/// be declared private so that some behaivor is kept in C++03 mode.
+/// be declared private so that some behavior is kept in C++03 mode.
///
/// class DontCopy {
/// private:
@@ -57,6 +57,22 @@
#define LLVM_DELETED_FUNCTION
#endif
+/// LLVM_FINAL - Expands to 'final' if the compiler supports it.
+/// Use to mark classes or virtual methods as final.
+#if (__has_feature(cxx_override_control))
+#define LLVM_FINAL final
+#else
+#define LLVM_FINAL
+#endif
+
+/// LLVM_OVERRIDE - Expands to 'override' if the compiler supports it.
+/// Use to mark virtual methods as overriding a base class method.
+#if (__has_feature(cxx_override_control))
+#define LLVM_OVERRIDE override
+#else
+#define LLVM_OVERRIDE
+#endif
+
/// LLVM_LIBRARY_VISIBILITY - If a class marked with this attribute is linked
/// into a shared library, then the class should be private to the library and
/// not accessible from outside it. Can also be used to mark variables and
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 705ff679f8..dbd1091629 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -442,6 +442,7 @@ enum {
R_MICROBLAZE_COPY = 21
};
+// ELF Relocation types for PPC32
enum {
R_PPC_NONE = 0, /* No relocation. */
R_PPC_ADDR32 = 1,
@@ -460,6 +461,20 @@ enum {
R_PPC_REL32 = 26
};
+// ELF Relocation types for PPC64
+enum {
+ R_PPC64_ADDR16_LO = 4,
+ R_PPC64_ADDR16_HI = 5,
+ R_PPC64_ADDR14 = 7,
+ R_PPC64_REL24 = 10,
+ R_PPC64_ADDR64 = 38,
+ R_PPC64_ADDR16_HIGHER = 39,
+ R_PPC64_ADDR16_HIGHEST = 41,
+ R_PPC64_TOC16 = 47,
+ R_PPC64_TOC = 51,
+ R_PPC64_TOC16_DS = 63
+};
+
// ARM Specific e_flags
enum { EF_ARM_EABIMASK = 0xFF000000U };
diff --git a/include/llvm/Support/FileOutputBuffer.h b/include/llvm/Support/FileOutputBuffer.h
index 0f07164eb8..bcd35e3c1e 100644
--- a/include/llvm/Support/FileOutputBuffer.h
+++ b/include/llvm/Support/FileOutputBuffer.h
@@ -78,10 +78,11 @@ public:
~FileOutputBuffer();
+private:
+ FileOutputBuffer(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
+ FileOutputBuffer &operator=(const FileOutputBuffer &) LLVM_DELETED_FUNCTION;
protected:
- FileOutputBuffer(const FileOutputBuffer &); // DO NOT IMPLEMENT
- FileOutputBuffer &operator=(const FileOutputBuffer &); // DO NOT IMPLEMENT
- FileOutputBuffer(uint8_t *Start, uint8_t *End,
+ FileOutputBuffer(uint8_t *Start, uint8_t *End,
StringRef Path, StringRef TempPath);
uint8_t *BufferStart;
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index 5d6020502d..b455b28b81 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -280,7 +280,7 @@ error_code create_symlink(const Twine &to, const Twine &from);
/// @brief Get the current path.
///
/// @param result Holds the current path on return.
-/// @results errc::success if the current path has been stored in result,
+/// @returns errc::success if the current path has been stored in result,
/// otherwise a platform specific error_code.
error_code current_path(SmallVectorImpl<char> &result);
@@ -289,7 +289,7 @@ error_code current_path(SmallVectorImpl<char> &result);
/// @param path Input path.
/// @param existed Set to true if \a path existed, false if it did not.
/// undefined otherwise.
-/// @results errc::success if path has been removed and existed has been
+/// @returns errc::success if path has been removed and existed has been
/// successfully set, otherwise a platform specific error_code.
error_code remove(const Twine &path, bool &existed);
@@ -298,7 +298,7 @@ error_code remove(const Twine &path, bool &existed);
///
/// @param path Input path.
/// @param num_removed Number of files removed.
-/// @results errc::success if path has been removed and num_removed has been
+/// @returns errc::success if path has been removed and num_removed has been
/// successfully set, otherwise a platform specific error_code.
error_code remove_all(const Twine &path, uint32_t &num_removed);
@@ -323,7 +323,7 @@ error_code resize_file(const Twine &path, uint64_t size);
/// @brief Does file exist?
///
/// @param status A file_status previously returned from stat.
-/// @results True if the file represented by status exists, false if it does
+/// @returns True if the file represented by status exists, false if it does
/// not.
bool exists(file_status status);
@@ -332,7 +332,7 @@ bool exists(file_status status);
/// @param path Input path.
/// @param result Set to true if the file represented by status exists, false if
/// it does not. Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code exists(const Twine &path, bool &result);
@@ -350,7 +350,7 @@ inline bool exists(const Twine &path) {
///
/// assert(status_known(A) || status_known(B));
///
-/// @results True if A and B both represent the same file system entity, false
+/// @returns True if A and B both represent the same file system entity, false
/// otherwise.
bool equivalent(file_status A, file_status B);
@@ -362,7 +362,7 @@ bool equivalent(file_status A, file_status B);
/// @param B Input path B.
/// @param result Set to true if stat(A) and stat(B) have the same device and
/// inode (or equivalent).
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code equivalent(const Twine &A, const Twine &B, bool &result);
@@ -384,7 +384,7 @@ error_code file_size(const Twine &path, uint64_t &result);
/// @brief Does status represent a directory?
///
/// @param status A file_status previously returned from status.
-/// @results status.type() == file_type::directory_file.
+/// @returns status.type() == file_type::directory_file.
bool is_directory(file_status status);
/// @brief Is path a directory?
@@ -392,14 +392,14 @@ bool is_directory(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path is a directory, false if it is not.
/// Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_directory(const Twine &path, bool &result);
/// @brief Does status represent a regular file?
///
/// @param status A file_status previously returned from status.
-/// @results status_known(status) && status.type() == file_type::regular_file.
+/// @returns status_known(status) && status.type() == file_type::regular_file.
bool is_regular_file(file_status status);
/// @brief Is path a regular file?
@@ -407,7 +407,7 @@ bool is_regular_file(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path is a regular file, false if it is not.
/// Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_regular_file(const Twine &path, bool &result);
@@ -415,7 +415,7 @@ error_code is_regular_file(const Twine &path, bool &result);
/// directory, regular file, or symlink?
///
/// @param status A file_status previously returned from status.
-/// @results exists(s) && !is_regular_file(s) && !is_directory(s) &&
+/// @returns exists(s) && !is_regular_file(s) && !is_directory(s) &&
/// !is_symlink(s)
bool is_other(file_status status);
@@ -425,7 +425,7 @@ bool is_other(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path exists, but is not a directory, regular
/// file, or a symlink, false if it does not. Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_other(const Twine &path, bool &result);
@@ -440,7 +440,7 @@ bool is_symlink(file_status status);
/// @param path Input path.
/// @param result Set to true if \a path is a symlink, false if it is not.
/// Undefined otherwise.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code is_symlink(const Twine &path, bool &result);
@@ -448,28 +448,28 @@ error_code is_symlink(const Twine &path, bool &result);
///
/// @param path Input path.
/// @param result Set to the file status.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code status(const Twine &path, file_status &result);
/// @brief Modifies permission bits on a file
///
/// @param path Input path.
-/// @results errc::success if permissions have been changed, otherwise a
+/// @returns errc::success if permissions have been changed, otherwise a
/// platform specific error_code.
error_code permissions(const Twine &path, perms prms);
/// @brief Is status available?
///
/// @param s Input file status.
-/// @results True if status() != status_error.
+/// @returns True if status() != status_error.
bool status_known(file_status s);
/// @brief Is status available?
///
/// @param path Input path.
/// @param result Set to true if status() != status_error.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code status_known(const Twine &path, bool &result);
@@ -488,9 +488,9 @@ error_code status_known(const Twine &path, bool &result);
/// @param model Name to base unique path off of.
/// @param result_fd Set to the opened file's file descriptor.
/// @param result_path Set to the opened file's absolute path.
-/// @param makeAbsolute If true and @model is not an absolute path, a temp
+/// @param makeAbsolute If true and \a model is not an absolute path, a temp
/// directory will be prepended.
-/// @results errc::success if result_{fd,path} have been successfully set,
+/// @returns errc::success if result_{fd,path} have been successfully set,
/// otherwise a platform specific error_code.
error_code unique_file(const Twine &model, int &result_fd,
SmallVectorImpl<char> &result_path,
@@ -503,7 +503,7 @@ error_code unique_file(const Twine &model, int &result_fd,
///
/// @param path Input path.
/// @param result Set to the canonicalized version of \a path.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code canonicalize(const Twine &path, SmallVectorImpl<char> &result);
@@ -511,7 +511,7 @@ error_code canonicalize(const Twine &path, SmallVectorImpl<char> &result);
///
/// @param path Input path.
/// @param magic Byte sequence to compare \a path's first len(magic) bytes to.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code has_magic(const Twine &path, const Twine &magic, bool &result);
@@ -522,7 +522,7 @@ error_code has_magic(const Twine &path, const Twine &magic, bool &result);
/// @param result Set to the first \a len bytes in the file pointed to by
/// \a path. Or the entire file if file_size(path) < len, in which
/// case result.size() returns the size of the file.
-/// @results errc::success if result has been successfully set,
+/// @returns errc::success if result has been successfully set,
/// errc::value_too_large if len is larger than the file pointed to by
/// \a path, otherwise a platform specific error_code.
error_code get_magic(const Twine &path, uint32_t len,
@@ -535,14 +535,14 @@ file_magic identify_magic(StringRef magic);
///
/// @param path Input path.
/// @param result Set to the type of file, or LLVMFileType::Unknown_FileType.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code identify_magic(const Twine &path, file_magic &result);
/// @brief Get library paths the system linker uses.
///
/// @param result Set to the list of system library paths.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code GetSystemLibraryPaths(SmallVectorImpl<std::string> &result);
@@ -550,7 +550,7 @@ error_code GetSystemLibraryPaths(SmallVectorImpl<std::string> &result);
/// + LLVM_LIB_SEARCH_PATH + LLVM_LIBDIR.
///
/// @param result Set to the list of bitcode library paths.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code GetBitcodeLibraryPaths(SmallVectorImpl<std::string> &result);
@@ -563,7 +563,7 @@ error_code GetBitcodeLibraryPaths(SmallVectorImpl<std::string> &result);
///
/// @param short_name Library name one would give to the system linker.
/// @param result Set to the absolute path \a short_name represents.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code FindLibrary(const Twine &short_name, SmallVectorImpl<char> &result);
@@ -572,7 +572,7 @@ error_code FindLibrary(const Twine &short_name, SmallVectorImpl<char> &result);
/// @param argv0 The program name as it was spelled on the command line.
/// @param MainAddr Address of some symbol in the executable (not in a library).
/// @param result Set to the absolute path of the current executable.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code GetMainExecutable(const char *argv0, void *MainAddr,
SmallVectorImpl<char> &result);
@@ -664,7 +664,7 @@ public:
/// to the file. If false, the file will be mapped read-only
/// and the buffer will be read-only.
/// @param result Set to the start address of the mapped buffer.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
bool map_writable, void *&result);
@@ -674,7 +674,7 @@ error_code map_file_pages(const Twine &path, off_t file_offset, size_t size,
///
/// @param base Pointer to the start of the buffer.
/// @param size Byte length of the range to unmmap.
-/// @results errc::success if result has been successfully set, otherwise a
+/// @returns errc::success if result has been successfully set, otherwise a
/// platform specific error_code.
error_code unmap_file_pages(void *base, size_t size);
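
Every function above follows the same contract: out-parameters carry the data and the return value is errc::success or a platform-specific error. A sketch of the idiomatic call pattern:

    #include "llvm/ADT/Twine.h"
    #include "llvm/Support/FileSystem.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void reportIsDirectory(const Twine &Path) {
      bool IsDir = false;
      if (error_code ec = sys::fs::is_directory(Path, IsDir)) {
        errs() << "is_directory failed: " << ec.message() << "\n";
        return;
      }
      outs() << Path << (IsDir ? " is" : " is not") << " a directory\n";
    }
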
diff --git a/include/llvm/Support/Format.h b/include/llvm/Support/Format.h
index 59812d98f5..aaa54e1090 100644
--- a/include/llvm/Support/Format.h
+++ b/include/llvm/Support/Format.h
@@ -170,31 +170,47 @@ public:
}
};
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T>
inline format_object1<T> format(const char *Fmt, const T &Val) {
return format_object1<T>(Fmt, Val);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2>
inline format_object2<T1, T2> format(const char *Fmt, const T1 &Val1,
const T2 &Val2) {
return format_object2<T1, T2>(Fmt, Val1, Val2);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2, typename T3>
inline format_object3<T1, T2, T3> format(const char *Fmt, const T1 &Val1,
const T2 &Val2, const T3 &Val3) {
return format_object3<T1, T2, T3>(Fmt, Val1, Val2, Val3);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2, typename T3, typename T4>
inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
const T2 &Val2, const T3 &Val3,
@@ -202,8 +218,12 @@ inline format_object4<T1, T2, T3, T4> format(const char *Fmt, const T1 &Val1,
return format_object4<T1, T2, T3, T4>(Fmt, Val1, Val2, Val3, Val4);
}
-/// format - This is a helper function that is used to produce formatted output.
-/// This is typically used like: OS << format("%0.4f", myfloat) << '\n';
+/// This is a helper function that is used to produce formatted output.
+///
+/// This is typically used like:
+/// \code
+/// OS << format("%0.4f", myfloat) << '\n';
+/// \endcode
template <typename T1, typename T2, typename T3, typename T4, typename T5>
inline format_object5<T1, T2, T3, T4, T5> format(const char *Fmt,const T1 &Val1,
const T2 &Val2, const T3 &Val3,
diff --git a/include/llvm/Support/FormattedStream.h b/include/llvm/Support/FormattedStream.h
index 58a1885168..21635dcfb6 100644
--- a/include/llvm/Support/FormattedStream.h
+++ b/include/llvm/Support/FormattedStream.h
@@ -55,14 +55,15 @@ namespace llvm
///
const char *Scanned;
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream,
/// not counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const {
- // This has the same effect as calling TheStream.current_pos(),
- // but that interface is private.
- return TheStream->tell() - TheStream->GetNumBytesInBuffer();
+ virtual uint64_t current_pos() const LLVM_OVERRIDE {
+ // Our current position in the stream is all the contents which have been
+ // written to the underlying stream (*not* the current position of the
+ // underlying stream).
+ return TheStream->tell();
}
/// ComputeColumn - Examine the given output buffer and figure out which
diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h
index 410edd4dc7..b52e5bc9ad 100644
--- a/include/llvm/Support/LEB128.h
+++ b/include/llvm/Support/LEB128.h
@@ -15,7 +15,7 @@
#ifndef LLVM_SYSTEM_LEB128_H
#define LLVM_SYSTEM_LEB128_H
-#include <llvm/Support/raw_ostream.h>
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
diff --git a/include/llvm/Support/LockFileManager.h b/include/llvm/Support/LockFileManager.h
index e2fa8ebc56..8c4a760291 100644
--- a/include/llvm/Support/LockFileManager.h
+++ b/include/llvm/Support/LockFileManager.h
@@ -47,8 +47,8 @@ private:
Optional<std::pair<std::string, int> > Owner;
Optional<error_code> Error;
- LockFileManager(const LockFileManager &);
- LockFileManager &operator=(const LockFileManager &);
+ LockFileManager(const LockFileManager &) LLVM_DELETED_FUNCTION;
+ LockFileManager &operator=(const LockFileManager &) LLVM_DELETED_FUNCTION;
static Optional<std::pair<std::string, int> >
readLockFile(StringRef LockFileName);
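This hunk shows the two mechanical substitutions that recur throughout the patch: "DO NOT IMPLEMENT" comments become LLVM_DELETED_FUNCTION, and virtual overrides gain LLVM_OVERRIDE. A sketch of what these macros presumably expand to in llvm/Support/Compiler.h (assumed shape, not verbatim from that header):

#if __has_feature(cxx_deleted_functions)
#define LLVM_DELETED_FUNCTION = delete  // C++11: any use is a hard error.
#else
#define LLVM_DELETED_FUNCTION           // C++03: private and never defined.
#endif

#if __has_feature(cxx_override_control)
#define LLVM_OVERRIDE override          // C++11: compiler-checked override.
#else
#define LLVM_OVERRIDE
#endif

On pre-C++11 compilers both presumably expand to nothing, preserving the old behavior: a private, unimplemented copy constructor still fails at link time if anything manages to call it.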
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 35c2694cff..11f9e63c9b 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -431,21 +431,22 @@ inline uint64_t NextPowerOf2(uint64_t A) {
return A + 1;
}
-/// RoundUpToAlignment - Returns the next integer (mod 2**64) that is
-/// greater than or equal to \arg Value and is a multiple of \arg
-/// Align. Align must be non-zero.
+/// Returns the next integer (mod 2**64) that is greater than or equal to
+/// \p Value and is a multiple of \p Align. \p Align must be non-zero.
///
/// Examples:
-/// RoundUpToAlignment(5, 8) = 8
-/// RoundUpToAlignment(17, 8) = 24
-/// RoundUpToAlignment(~0LL, 8) = 0
+/// \code
+/// RoundUpToAlignment(5, 8) = 8
+/// RoundUpToAlignment(17, 8) = 24
+/// RoundUpToAlignment(~0LL, 8) = 0
+/// \endcode
inline uint64_t RoundUpToAlignment(uint64_t Value, uint64_t Align) {
return ((Value + Align - 1) / Align) * Align;
}
-/// OffsetToAlignment - Return the offset to the next integer (mod 2**64) that
-/// is greater than or equal to \arg Value and is a multiple of \arg
-/// Align. Align must be non-zero.
+/// Returns the offset to the next integer (mod 2**64) that is greater than
+/// or equal to \p Value and is a multiple of \p Align. \p Align must be
+/// non-zero.
inline uint64_t OffsetToAlignment(uint64_t Value, uint64_t Align) {
return RoundUpToAlignment(Value, Align) - Value;
}
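A worked instance of the two helpers above:

assert(RoundUpToAlignment(17, 8) == 24); // ((17 + 8 - 1) / 8) * 8
assert(OffsetToAlignment(17, 8) == 7);   // 24 - 17: padding to the boundary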
diff --git a/include/llvm/Support/Memory.h b/include/llvm/Support/Memory.h
index 37890e7e4a..8227c84bff 100644
--- a/include/llvm/Support/Memory.h
+++ b/include/llvm/Support/Memory.h
@@ -15,6 +15,7 @@
#define LLVM_SYSTEM_MEMORY_H
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/system_error.h"
#include <string>
namespace llvm {
@@ -43,6 +44,70 @@ namespace sys {
/// @brief An abstraction for memory operations.
class Memory {
public:
+ enum ProtectionFlags {
+ MF_READ = 0x1000000,
+ MF_WRITE = 0x2000000,
+ MF_EXEC = 0x4000000
+ };
+
+ /// This method allocates a block of memory that is suitable for loading
+ /// dynamically generated code (e.g. JIT). An attempt to allocate
+ /// \p NumBytes bytes of virtual memory is made.
+ /// \p NearBlock may point to an existing allocation in which case
+ /// an attempt is made to allocate more memory near the existing block.
+ /// The actual allocated address is not guaranteed to be near the requested
+ /// address.
+ /// \p Flags is used to set the initial protection flags for the block
+ /// of the memory.
+ /// \p EC [out] returns an object describing any error that occurs.
+ ///
+ /// This method may allocate more than the number of bytes requested. The
+ /// actual number of bytes allocated is indicated in the returned
+ /// MemoryBlock.
+ ///
+ /// The start of the allocated block must be aligned with the
+ /// system allocation granularity (64K on Windows, page size on Linux).
+ /// If the address following \p NearBlock is not so aligned, it will be
+ /// rounded up to the next allocation granularity boundary.
+ ///
+ /// \returns a non-null MemoryBlock if the function was successful,
+ /// otherwise a null MemoryBlock with \p EC describing the error.
+ ///
+ /// @brief Allocate mapped memory.
+ static MemoryBlock allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned Flags,
+ error_code &EC);
+
+ /// This method releases a block of memory that was allocated with the
+ /// allocateMappedMemory method. It should not be used to release any
+ /// memory block allocated any other way.
+ /// \p Block describes the memory to be released.
+ ///
+ /// \returns errc::success if the function was successful, or an error_code
+ /// describing the failure if an error occurred.
+ ///
+ /// @brief Release mapped memory.
+ static error_code releaseMappedMemory(MemoryBlock &Block);
+
+ /// This method sets the protection flags for a block of memory to the
+ /// state specified by \p Flags. The behavior is not specified if the
+ /// memory was not allocated using the allocateMappedMemory method.
+ /// \p Block describes the memory block to be protected.
+ /// \p Flags specifies the new protection state to be assigned to the block.
+ /// Errors are reported via the returned error_code.
+ ///
+ /// If \p Flags is MF_WRITE, the actual behavior varies
+ /// with the operating system (e.g. MF_READWRITE on Windows) and the
+ /// target architecture (e.g. MF_WRITE -> MF_READWRITE on i386).
+ ///
+ /// \r error_success if the function was successful, or an error_code
+ /// describing the failure if an error occurred.
+ ///
+ /// @brief Set memory protection state.
+ static error_code protectMappedMemory(const MemoryBlock &Block,
+ unsigned Flags);
+
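A hedged end-to-end sketch of the new interface: allocate a writable block, fill it with generated code, flip it to executable, run it, and release it. Error handling is simplified, the 4096-byte size is a placeholder, and llvm/Support/ErrorHandling.h is assumed for report_fatal_error.

#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Memory.h"

using namespace llvm;

void runGeneratedCode() {
  error_code EC;
  sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(
      4096, /*NearBlock=*/0,
      sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
  if (EC != errc::success)
    report_fatal_error(EC.message());   // Null block; EC has the details.

  // ... emit machine code into MB.base(), at most MB.size() bytes ...

  // Flip the block from writable to executable before running it.
  EC = sys::Memory::protectMappedMemory(
      MB, sys::Memory::MF_READ | sys::Memory::MF_EXEC);
  if (EC != errc::success)
    report_fatal_error(EC.message());

  // ... call into the generated code ...

  sys::Memory::releaseMappedMemory(MB);
}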
/// This method allocates a block of Read/Write/Execute memory that is
/// suitable for executing dynamically generated code (e.g. JIT). An
/// attempt to allocate \p NumBytes bytes of virtual memory is made.
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index 06816de971..1f02907d9f 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -15,6 +15,7 @@
#define LLVM_SUPPORT_MEMORYBUFFER_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -36,8 +37,8 @@ class MemoryBuffer {
const char *BufferStart; // Start of the buffer.
const char *BufferEnd; // End of the buffer.
- MemoryBuffer(const MemoryBuffer &); // DO NOT IMPLEMENT
- MemoryBuffer &operator=(const MemoryBuffer &); // DO NOT IMPLEMENT
+ MemoryBuffer(const MemoryBuffer &) LLVM_DELETED_FUNCTION;
+ MemoryBuffer &operator=(const MemoryBuffer &) LLVM_DELETED_FUNCTION;
protected:
MemoryBuffer() {}
void init(const char *BufStart, const char *BufEnd,
diff --git a/include/llvm/Support/Mutex.h b/include/llvm/Support/Mutex.h
index 42ea63060f..6abc533d28 100644
--- a/include/llvm/Support/Mutex.h
+++ b/include/llvm/Support/Mutex.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SYSTEM_MUTEX_H
#define LLVM_SYSTEM_MUTEX_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Threading.h"
#include <cassert>
@@ -75,8 +76,8 @@ namespace llvm
/// @name Do Not Implement
/// @{
private:
- MutexImpl(const MutexImpl & original);
- void operator=(const MutexImpl &);
+ MutexImpl(const MutexImpl &) LLVM_DELETED_FUNCTION;
+ void operator=(const MutexImpl &) LLVM_DELETED_FUNCTION;
/// @}
};
diff --git a/include/llvm/Support/MutexGuard.h b/include/llvm/Support/MutexGuard.h
index cd13bfe6ee..6bb162277e 100644
--- a/include/llvm/Support/MutexGuard.h
+++ b/include/llvm/Support/MutexGuard.h
@@ -26,8 +26,8 @@ namespace llvm {
/// @brief Guard a section of code with a Mutex.
class MutexGuard {
sys::Mutex &M;
- MutexGuard(const MutexGuard &); // DO NOT IMPLEMENT
- void operator=(const MutexGuard &); // DO NOT IMPLEMENT
+ MutexGuard(const MutexGuard &) LLVM_DELETED_FUNCTION;
+ void operator=(const MutexGuard &) LLVM_DELETED_FUNCTION;
public:
MutexGuard(sys::Mutex &m) : M(m) { M.acquire(); }
~MutexGuard() { M.release(); }
diff --git a/include/llvm/Support/PathV2.h b/include/llvm/Support/PathV2.h
index 967ea1e1d1..ae1a21c7ce 100644
--- a/include/llvm/Support/PathV2.h
+++ b/include/llvm/Support/PathV2.h
@@ -39,13 +39,14 @@ namespace path {
/// The backwards traversal order is the reverse of forward traversal.
///
/// Iteration examples. Each component is separated by ',':
-/// / => /
-/// /foo => /,foo
-/// foo/ => foo,.
-/// /foo/bar => /,foo,bar
-/// ../ => ..,.
-/// C:\foo\bar => C:,/,foo,bar
-///
+/// @code
+/// / => /
+/// /foo => /,foo
+/// foo/ => foo,.
+/// /foo/bar => /,foo,bar
+/// ../ => ..,.
+/// C:\foo\bar => C:,/,foo,bar
+/// @endcode
class const_iterator {
StringRef Path; ///< The entire path.
StringRef Component; ///< The current component. Not necessarily in Path.
@@ -107,18 +108,22 @@ inline reverse_iterator rend(StringRef path) {
/// @brief Remove the last component from \a path unless it is the root dir.
///
-/// directory/filename.cpp => directory/
-/// directory/ => directory
-/// / => /
+/// @code
+/// directory/filename.cpp => directory/
+/// directory/ => directory
+/// / => /
+/// @endcode
///
/// @param path A path that is modified to not have a file component.
void remove_filename(SmallVectorImpl<char> &path);
/// @brief Replace the file extension of \a path with \a extension.
///
-/// ./filename.cpp => ./filename.extension
-/// ./filename => ./filename.extension
-/// ./ => ./.extension
+/// @code
+/// ./filename.cpp => ./filename.extension
+/// ./filename => ./filename.extension
+/// ./ => ./.extension
+/// @endcode
///
/// @param path A path that has its extension replaced with \a extension.
/// @param extension The extension to be added. It may be empty. It may also
@@ -128,9 +133,11 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension);
/// @brief Append to path.
///
-/// /foo + bar/f => /foo/bar/f
-/// /foo/ + bar/f => /foo/bar/f
-/// foo + bar/f => foo/bar/f
+/// @code
+/// /foo + bar/f => /foo/bar/f
+/// /foo/ + bar/f => /foo/bar/f
+/// foo + bar/f => foo/bar/f
+/// @endcode
///
/// @param path Set to \a path + \a component.
/// @param a The component to be appended to \a path.
@@ -141,9 +148,11 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
/// @brief Append to path.
///
-/// /foo + [bar,f] => /foo/bar/f
-/// /foo/ + [bar,f] => /foo/bar/f
-/// foo + [bar,f] => foo/bar/f
+/// @code
+/// /foo + [bar,f] => /foo/bar/f
+/// /foo/ + [bar,f] => /foo/bar/f
+/// foo + [bar,f] => foo/bar/f
+/// @endcode
///
/// @param path Set to \a path + [\a begin, \a end).
/// @param begin Start of components to append.
@@ -169,9 +178,11 @@ void native(const Twine &path, SmallVectorImpl<char> &result);
/// @brief Get root name.
///
-/// //net/hello => //net
-/// c:/hello => c: (on Windows, on other platforms nothing)
-/// /hello => <empty>
+/// @code
+/// //net/hello => //net
+/// c:/hello => c: (on Windows, on other platforms nothing)
+/// /hello => <empty>
+/// @endcode
///
/// @param path Input path.
/// @result The root name of \a path if it has one, otherwise "".
@@ -179,9 +190,11 @@ const StringRef root_name(StringRef path);
/// @brief Get root directory.
///
-/// /goo/hello => /
-/// c:/hello => /
-/// d/file.txt => <empty>
+/// @code
+/// /goo/hello => /
+/// c:/hello => /
+/// d/file.txt => <empty>
+/// @endcode
///
/// @param path Input path.
/// @result The root directory of \a path if it has one, otherwise
@@ -198,9 +211,11 @@ const StringRef root_path(StringRef path);
/// @brief Get relative path.
///
-/// C:\hello\world => hello\world
-/// foo/bar => foo/bar
-/// /foo/bar => foo/bar
+/// @code
+/// C:\hello\world => hello\world
+/// foo/bar => foo/bar
+/// /foo/bar => foo/bar
+/// @endcode
///
/// @param path Input path.
/// @result The path starting after root_path if one exists, otherwise "".
@@ -208,9 +223,11 @@ const StringRef relative_path(StringRef path);
/// @brief Get parent path.
///
-/// / => <empty>
-/// /foo => /
-/// foo/../bar => foo/..
+/// @code
+/// / => <empty>
+/// /foo => /
+/// foo/../bar => foo/..
+/// @endcode
///
/// @param path Input path.
/// @result The parent path of \a path if one exists, otherwise "".
@@ -218,10 +235,12 @@ const StringRef parent_path(StringRef path);
/// @brief Get filename.
///
-/// /foo.txt => foo.txt
-/// . => .
-/// .. => ..
-/// / => /
+/// @code
+/// /foo.txt => foo.txt
+/// . => .
+/// .. => ..
+/// / => /
+/// @endcode
///
/// @param path Input path.
/// @result The filename part of \a path. This is defined as the last component
@@ -234,11 +253,13 @@ const StringRef filename(StringRef path);
/// substring of filename ending at (but not including) the last dot. Otherwise
/// it is filename.
///
-/// /foo/bar.txt => bar
-/// /foo/bar => bar
-/// /foo/.txt => <empty>
-/// /foo/. => .
-/// /foo/.. => ..
+/// @code
+/// /foo/bar.txt => bar
+/// /foo/bar => bar
+/// /foo/.txt => <empty>
+/// /foo/. => .
+/// /foo/.. => ..
+/// @endcode
///
/// @param path Input path.
/// @result The stem of \a path.
@@ -250,9 +271,11 @@ const StringRef stem(StringRef path);
/// substring of filename starting at (and including) the last dot, and ending
/// at the end of \a path. Otherwise "".
///
-/// /foo/bar.txt => .txt
-/// /foo/bar => <empty>
-/// /foo/.txt => .txt
+/// @code
+/// /foo/bar.txt => .txt
+/// /foo/bar => <empty>
+/// /foo/.txt => .txt
+/// @endcode
///
/// @param path Input path.
/// @result The extension of \a path.
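Taken together, the accessors above decompose a path into its pieces. A quick reference, assuming POSIX separators:

// With StringRef P = "/foo/bar.txt":
//   sys::path::root_path(P)   == "/"
//   sys::path::parent_path(P) == "/foo"
//   sys::path::filename(P)    == "bar.txt"
//   sys::path::stem(P)        == "bar"
//   sys::path::extension(P)   == ".txt"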
diff --git a/include/llvm/Support/PrettyStackTrace.h b/include/llvm/Support/PrettyStackTrace.h
index 9b3ecda50c..2122e06d53 100644
--- a/include/llvm/Support/PrettyStackTrace.h
+++ b/include/llvm/Support/PrettyStackTrace.h
@@ -16,6 +16,8 @@
#ifndef LLVM_SUPPORT_PRETTYSTACKTRACE_H
#define LLVM_SUPPORT_PRETTYSTACKTRACE_H
+#include "llvm/Support/Compiler.h"
+
namespace llvm {
class raw_ostream;
@@ -32,8 +34,8 @@ namespace llvm {
/// virtual stack trace. This gets dumped out if the program crashes.
class PrettyStackTraceEntry {
const PrettyStackTraceEntry *NextEntry;
- PrettyStackTraceEntry(const PrettyStackTraceEntry &); // DO NOT IMPLEMENT
- void operator=(const PrettyStackTraceEntry&); // DO NOT IMPLEMENT
+ PrettyStackTraceEntry(const PrettyStackTraceEntry &) LLVM_DELETED_FUNCTION;
+ void operator=(const PrettyStackTraceEntry&) LLVM_DELETED_FUNCTION;
public:
PrettyStackTraceEntry();
virtual ~PrettyStackTraceEntry();
@@ -52,7 +54,7 @@ namespace llvm {
const char *Str;
public:
PrettyStackTraceString(const char *str) : Str(str) {}
- virtual void print(raw_ostream &OS) const;
+ virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
};
/// PrettyStackTraceProgram - This object prints a specified program arguments
@@ -63,7 +65,7 @@ namespace llvm {
public:
PrettyStackTraceProgram(int argc, const char * const*argv)
: ArgC(argc), ArgV(argv) {}
- virtual void print(raw_ostream &OS) const;
+ virtual void print(raw_ostream &OS) const LLVM_OVERRIDE;
};
} // end namespace llvm
diff --git a/include/llvm/Support/Program.h b/include/llvm/Support/Program.h
index a85f23550e..7c9a951031 100644
--- a/include/llvm/Support/Program.h
+++ b/include/llvm/Support/Program.h
@@ -34,8 +34,8 @@ namespace sys {
void *Data_;
// Noncopyable.
- Program(const Program& other);
- Program& operator=(const Program& other);
+ Program(const Program& other) LLVM_DELETED_FUNCTION;
+ Program& operator=(const Program& other) LLVM_DELETED_FUNCTION;
/// @name Methods
/// @{
diff --git a/include/llvm/Support/RWMutex.h b/include/llvm/Support/RWMutex.h
index 0d4cb81de3..935b3075df 100644
--- a/include/llvm/Support/RWMutex.h
+++ b/include/llvm/Support/RWMutex.h
@@ -14,6 +14,7 @@
#ifndef LLVM_SYSTEM_RWMUTEX_H
#define LLVM_SYSTEM_RWMUTEX_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Threading.h"
#include <cassert>
@@ -75,8 +76,8 @@ namespace llvm
/// @name Do Not Implement
/// @{
private:
- RWMutexImpl(const RWMutexImpl & original);
- void operator=(const RWMutexImpl &);
+ RWMutexImpl(const RWMutexImpl & original) LLVM_DELETED_FUNCTION;
+ void operator=(const RWMutexImpl &) LLVM_DELETED_FUNCTION;
/// @}
};
diff --git a/include/llvm/Support/Regex.h b/include/llvm/Support/Regex.h
index 7648e77bfb..ffe09b19b6 100644
--- a/include/llvm/Support/Regex.h
+++ b/include/llvm/Support/Regex.h
@@ -36,7 +36,7 @@ namespace llvm {
Newline=2
};
- /// Compiles the given POSIX Extended Regular Expression \arg Regex.
+ /// Compiles the given POSIX Extended Regular Expression \p Regex.
/// This implementation supports regexes and matching strings with embedded
/// NUL characters.
Regex(StringRef Regex, unsigned Flags = NoFlags);
@@ -51,17 +51,17 @@ namespace llvm {
/// many entries plus one for the whole regex (as element 0).
unsigned getNumMatches() const;
- /// matches - Match the regex against a given \arg String.
+ /// matches - Match the regex against a given \p String.
///
/// \param Matches - If given, on a successful match this will be filled in
- /// with references to the matched group expressions (inside \arg String),
+ /// with references to the matched group expressions (inside \p String),
/// the first group is always the entire pattern.
///
/// This returns true on a successful match.
bool match(StringRef String, SmallVectorImpl<StringRef> *Matches = 0);
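A brief usage sketch of match() with capture groups; the pattern and input are illustrative only:

llvm::Regex R("([a-z]+)=([0-9]+)");
llvm::SmallVector<llvm::StringRef, 4> Matches;
if (R.match("width=42", &Matches)) {
  // Matches[0] == "width=42"  (the whole pattern, element 0)
  // Matches[1] == "width"     (first group)
  // Matches[2] == "42"        (second group)
}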
/// sub - Return the result of replacing the first match of the regex in
- /// \arg String with the \arg Repl string. Backreferences like "\0" in the
+ /// \p String with the \p Repl string. Backreferences like "\0" in the
/// replacement string are replaced with the appropriate match substring.
///
/// Note that the replacement string has backslash escaping performed on
diff --git a/include/llvm/Support/Registry.h b/include/llvm/Support/Registry.h
index d0375bedd9..29eafb63ca 100644
--- a/include/llvm/Support/Registry.h
+++ b/include/llvm/Support/Registry.h
@@ -37,7 +37,7 @@ namespace llvm {
/// is necessary to define an alternate traits class.
template <typename T>
class RegistryTraits {
- RegistryTraits(); // Do not implement.
+ RegistryTraits() LLVM_DELETED_FUNCTION;
public:
typedef SimpleRegistryEntry<T> entry;
@@ -63,7 +63,7 @@ namespace llvm {
class iterator;
private:
- Registry(); // Do not implement.
+ Registry() LLVM_DELETED_FUNCTION;
static void Announce(const entry &E) {
for (listener *Cur = ListenerHead; Cur; Cur = Cur->Next)
@@ -120,6 +120,7 @@ namespace llvm {
/// Abstract base class for registry listeners, which are informed when new
/// entries are added to the registry. Simply subclass and instantiate:
///
+ /// \code
/// class CollectorPrinter : public Registry<Collector>::listener {
/// protected:
/// void registered(const Registry<Collector>::entry &e) {
@@ -131,7 +132,7 @@ namespace llvm {
/// };
///
/// CollectorPrinter Printer;
- ///
+ /// \endcode
class listener {
listener *Prev, *Next;
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index 3835e84592..bcf95f2f6e 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -64,9 +64,9 @@ private:
DiagHandlerTy DiagHandler;
void *DiagContext;
-
- SourceMgr(const SourceMgr&); // DO NOT IMPLEMENT
- void operator=(const SourceMgr&); // DO NOT IMPLEMENT
+
+ SourceMgr(const SourceMgr&) LLVM_DELETED_FUNCTION;
+ void operator=(const SourceMgr&) LLVM_DELETED_FUNCTION;
public:
SourceMgr() : LineNoCache(0), DiagHandler(0), DiagContext(0) {}
~SourceMgr();
diff --git a/include/llvm/Support/StreamableMemoryObject.h b/include/llvm/Support/StreamableMemoryObject.h
index 531dbb216d..a2b4bcb9aa 100644
--- a/include/llvm/Support/StreamableMemoryObject.h
+++ b/include/llvm/Support/StreamableMemoryObject.h
@@ -12,6 +12,7 @@
#define STREAMABLEMEMORYOBJECT_H_
#include "llvm/ADT/OwningPtr.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/MemoryObject.h"
#include "llvm/Support/DataStream.h"
#include <vector>
@@ -107,14 +108,15 @@ class StreamableMemoryObject : public MemoryObject {
class StreamingMemoryObject : public StreamableMemoryObject {
public:
StreamingMemoryObject(DataStreamer *streamer);
- virtual uint64_t getBase() const { return 0; }
- virtual uint64_t getExtent() const;
- virtual int readByte(uint64_t address, uint8_t* ptr) const;
+ virtual uint64_t getBase() const LLVM_OVERRIDE { return 0; }
+ virtual uint64_t getExtent() const LLVM_OVERRIDE;
+ virtual int readByte(uint64_t address, uint8_t* ptr) const LLVM_OVERRIDE;
virtual int readBytes(uint64_t address,
uint64_t size,
uint8_t* buf,
- uint64_t* copied) const ;
- virtual const uint8_t *getPointer(uint64_t address, uint64_t size) const {
+ uint64_t* copied) const LLVM_OVERRIDE;
+ virtual const uint8_t *getPointer(uint64_t address,
+ uint64_t size) const LLVM_OVERRIDE {
// This could be fixed by ensuring the bytes are fetched and making a copy,
// requiring that the bitcode size be known, or otherwise ensuring that
// the memory doesn't go away/get reallocated, but it's
@@ -122,8 +124,8 @@ public:
assert(0 && "getPointer in streaming memory objects not allowed");
return NULL;
}
- virtual bool isValidAddress(uint64_t address) const;
- virtual bool isObjectEnd(uint64_t address) const;
+ virtual bool isValidAddress(uint64_t address) const LLVM_OVERRIDE;
+ virtual bool isObjectEnd(uint64_t address) const LLVM_OVERRIDE;
/// Drop s bytes from the front of the stream, pushing the positions of the
/// remaining bytes down by s. This is used to skip past the bitcode header,
@@ -170,8 +172,8 @@ private:
return true;
}
- StreamingMemoryObject(const StreamingMemoryObject&); // DO NOT IMPLEMENT
- void operator=(const StreamingMemoryObject&); // DO NOT IMPLEMENT
+ StreamingMemoryObject(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
+ void operator=(const StreamingMemoryObject&) LLVM_DELETED_FUNCTION;
};
StreamableMemoryObject *getNonStreamedMemoryObject(
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index c0be8f130a..ca58bfb0d7 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -93,7 +93,9 @@ namespace llvm {
CodeGenOpt::Level OL);
typedef AsmPrinter *(*AsmPrinterCtorTy)(TargetMachine &TM,
MCStreamer &Streamer);
- typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T, StringRef TT);
+ typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T,
+ StringRef TT,
+ StringRef CPU);
typedef MCTargetAsmLexer *(*MCAsmLexerCtorTy)(const Target &T,
const MCRegisterInfo &MRI,
const MCAsmInfo &MAI);
@@ -271,7 +273,7 @@ namespace llvm {
/// createMCAsmInfo - Create a MCAsmInfo implementation for the specified
/// target triple.
///
- /// \arg Triple - This argument is used to determine the target machine
+ /// \param Triple This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
@@ -317,12 +319,12 @@ namespace llvm {
/// createMCSubtargetInfo - Create a MCSubtargetInfo implementation.
///
- /// \arg Triple - This argument is used to determine the target machine
+ /// \param Triple This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
- /// \arg CPU - This specifies the name of the target CPU.
- /// \arg Features - This specifies the string representation of the
+ /// \param CPU This specifies the name of the target CPU.
+ /// \param Features This specifies the string representation of the
/// additional target features.
MCSubtargetInfo *createMCSubtargetInfo(StringRef Triple, StringRef CPU,
StringRef Features) const {
@@ -332,9 +334,9 @@ namespace llvm {
}
/// createTargetMachine - Create a target specific machine implementation
- /// for the specified \arg Triple.
+ /// for the specified \p Triple.
///
- /// \arg Triple - This argument is used to determine the target machine
+ /// \param Triple This argument is used to determine the target machine
/// feature set; it should always be provided. Generally this should be
/// either the target triple from the module, or the target triple of the
/// host if that does not exist.
@@ -351,12 +353,11 @@ namespace llvm {
/// createMCAsmBackend - Create a target specific assembly parser.
///
- /// \arg Triple - The target triple string.
- /// \arg Backend - The target independent assembler object.
- MCAsmBackend *createMCAsmBackend(StringRef Triple) const {
+ /// \param Triple The target triple string.
+ MCAsmBackend *createMCAsmBackend(StringRef Triple, StringRef CPU) const {
if (!MCAsmBackendCtorFn)
return 0;
- return MCAsmBackendCtorFn(*this, Triple);
+ return MCAsmBackendCtorFn(*this, Triple, CPU);
}
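Under the new signature, call sites pass the CPU name alongside the triple. A hypothetical caller (both strings are placeholders):

// TheTarget is a const Target* obtained from the registry.
MCAsmBackend *MAB =
    TheTarget->createMCAsmBackend("x86_64-unknown-linux-gnu", "generic");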
/// createMCAsmLexer - Create a target specific assembly lexer.
@@ -370,7 +371,7 @@ namespace llvm {
/// createMCAsmParser - Create a target specific assembly parser.
///
- /// \arg Parser - The target independent parser implementation to use for
+ /// \param Parser The target independent parser implementation to use for
/// parsing and lexing.
MCTargetAsmParser *createMCAsmParser(MCSubtargetInfo &STI,
MCAsmParser &Parser) const {
@@ -416,13 +417,13 @@ namespace llvm {
/// createMCObjectStreamer - Create a target specific MCStreamer.
///
- /// \arg TT - The target triple.
- /// \arg Ctx - The target context.
- /// \arg TAB - The target assembler backend object. Takes ownership.
- /// \arg _OS - The stream object.
- /// \arg _Emitter - The target independent assembler object.Takes ownership.
- /// \arg RelaxAll - Relax all fixups?
- /// \arg NoExecStack - Mark file as not needing a executable stack.
+ /// \param TT The target triple.
+ /// \param Ctx The target context.
+ /// \param TAB The target assembler backend object. Takes ownership.
+ /// \param _OS The stream object.
+ /// \param _Emitter The target independent assembler object. Takes ownership.
+ /// \param RelaxAll Relax all fixups?
+ /// \param NoExecStack Mark file as not needing an executable stack.
MCStreamer *createMCObjectStreamer(StringRef TT, MCContext &Ctx,
MCAsmBackend &TAB,
raw_ostream &_OS,
@@ -1063,8 +1064,9 @@ namespace llvm {
}
private:
- static MCAsmBackend *Allocator(const Target &T, StringRef Triple) {
- return new MCAsmBackendImpl(T, Triple);
+ static MCAsmBackend *Allocator(const Target &T, StringRef Triple,
+ StringRef CPU) {
+ return new MCAsmBackendImpl(T, Triple, CPU);
}
};
diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h
index c0e842c2fe..9017afb890 100644
--- a/include/llvm/Support/Threading.h
+++ b/include/llvm/Support/Threading.h
@@ -41,8 +41,8 @@ namespace llvm {
/// before llvm_start_multithreaded().
void llvm_release_global_lock();
- /// llvm_execute_on_thread - Execute the given \arg UserFn on a separate
- /// thread, passing it the provided \arg UserData.
+ /// llvm_execute_on_thread - Execute the given \p UserFn on a separate
+ /// thread, passing it the provided \p UserData.
///
/// This function does not guarantee that the code will actually be executed
/// on a separate thread or honoring the requested stack size, but tries to do
diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h
index 404cb6d6c8..a7418827ca 100644
--- a/include/llvm/Support/Timer.h
+++ b/include/llvm/Support/Timer.h
@@ -15,6 +15,7 @@
#ifndef LLVM_SUPPORT_TIMER_H
#define LLVM_SUPPORT_TIMER_H
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/ADT/StringRef.h"
#include <cassert>
@@ -130,7 +131,7 @@ private:
///
class TimeRegion {
Timer *T;
- TimeRegion(const TimeRegion &); // DO NOT IMPLEMENT
+ TimeRegion(const TimeRegion &) LLVM_DELETED_FUNCTION;
public:
explicit TimeRegion(Timer &t) : T(&t) {
T->startTimer();
@@ -168,8 +169,8 @@ class TimerGroup {
std::vector<std::pair<TimeRecord, std::string> > TimersToPrint;
TimerGroup **Prev, *Next; // Doubly linked list of TimerGroup's.
- TimerGroup(const TimerGroup &TG); // DO NOT IMPLEMENT
- void operator=(const TimerGroup &TG); // DO NOT IMPLEMENT
+ TimerGroup(const TimerGroup &TG) LLVM_DELETED_FUNCTION;
+ void operator=(const TimerGroup &TG) LLVM_DELETED_FUNCTION;
public:
explicit TimerGroup(StringRef name);
~TimerGroup();
diff --git a/include/llvm/Support/ValueHandle.h b/include/llvm/Support/ValueHandle.h
index dfa95798b3..5e98fbd07a 100644
--- a/include/llvm/Support/ValueHandle.h
+++ b/include/llvm/Support/ValueHandle.h
@@ -59,8 +59,8 @@ private:
// pair. The 'setValPtrInt' and 'getValPtrInt' methods below give them this
// access.
PointerIntPair<Value*, 2> VP;
-
- explicit ValueHandleBase(const ValueHandleBase&); // DO NOT IMPLEMENT.
+
+ ValueHandleBase(const ValueHandleBase&) LLVM_DELETED_FUNCTION;
public:
explicit ValueHandleBase(HandleBaseKind Kind)
: PrevPair(0, Kind), Next(0), VP(0, 0) {}
diff --git a/include/llvm/Support/YAMLParser.h b/include/llvm/Support/YAMLParser.h
index 98910eb757..eacd651394 100644
--- a/include/llvm/Support/YAMLParser.h
+++ b/include/llvm/Support/YAMLParser.h
@@ -241,7 +241,7 @@ public:
/// @returns The value, or nullptr if failed() == true.
Node *getValue();
- virtual void skip() {
+ virtual void skip() LLVM_OVERRIDE {
getKey()->skip();
getValue()->skip();
}
@@ -358,7 +358,7 @@ public:
iterator end() { return iterator(); }
- virtual void skip() {
+ virtual void skip() LLVM_OVERRIDE {
yaml::skip(*this);
}
@@ -421,7 +421,7 @@ public:
iterator end() { return iterator(); }
- virtual void skip() {
+ virtual void skip() LLVM_OVERRIDE {
yaml::skip(*this);
}
diff --git a/include/llvm/Support/circular_raw_ostream.h b/include/llvm/Support/circular_raw_ostream.h
index 2b3c329b58..2823af33b7 100644
--- a/include/llvm/Support/circular_raw_ostream.h
+++ b/include/llvm/Support/circular_raw_ostream.h
@@ -81,12 +81,12 @@ namespace llvm
Filled = false;
}
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream,
/// not counting the bytes currently in the buffer.
///
- virtual uint64_t current_pos() const {
+ virtual uint64_t current_pos() const LLVM_OVERRIDE {
// This has the same effect as calling TheStream.current_pos(),
// but that interface is private.
return TheStream->tell() - TheStream->GetNumBytesInBuffer();
diff --git a/include/llvm/Support/raw_os_ostream.h b/include/llvm/Support/raw_os_ostream.h
index 4f5d3612da..4385721e82 100644
--- a/include/llvm/Support/raw_os_ostream.h
+++ b/include/llvm/Support/raw_os_ostream.h
@@ -24,14 +24,14 @@ namespace llvm {
/// use the underlying stream to detect errors.
class raw_os_ostream : public raw_ostream {
std::ostream &OS;
-
+
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
-
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
+
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const;
-
+ virtual uint64_t current_pos() const LLVM_OVERRIDE;
+
public:
raw_os_ostream(std::ostream &O) : OS(O) {}
~raw_os_ostream();
diff --git a/include/llvm/Support/raw_ostream.h b/include/llvm/Support/raw_ostream.h
index 9913f989fc..eab0f2d805 100644
--- a/include/llvm/Support/raw_ostream.h
+++ b/include/llvm/Support/raw_ostream.h
@@ -15,6 +15,7 @@
#define LLVM_SUPPORT_RAW_OSTREAM_H
#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/DataTypes.h"
namespace llvm {
@@ -29,8 +30,8 @@ namespace llvm {
class raw_ostream {
private:
// Do not implement. raw_ostream is noncopyable.
- void operator=(const raw_ostream &);
- raw_ostream(const raw_ostream &);
+ void operator=(const raw_ostream &) LLVM_DELETED_FUNCTION;
+ raw_ostream(const raw_ostream &) LLVM_DELETED_FUNCTION;
/// The buffer is handled in such a way that the buffer is
/// uninitialized, unbuffered, or out of space when OutBufCur >=
@@ -191,10 +192,10 @@ public:
raw_ostream &operator<<(double N);
- /// write_hex - Output \arg N in hexadecimal, without any prefix or padding.
+ /// write_hex - Output \p N in hexadecimal, without any prefix or padding.
raw_ostream &write_hex(unsigned long long N);
- /// write_escaped - Output \arg Str, turning '\\', '\t', '\n', '"', and
+ /// write_escaped - Output \p Str, turning '\\', '\t', '\n', '"', and
/// anything that doesn't satisfy std::isprint into an escape sequence.
raw_ostream &write_escaped(StringRef Str, bool UseHexEscapes = false);
@@ -218,6 +219,9 @@ public:
virtual raw_ostream &changeColor(enum Colors Color,
bool Bold = false,
bool BG = false) {
+ (void)Color;
+ (void)Bold;
+ (void)BG;
return *this;
}
@@ -242,15 +246,16 @@ public:
private:
/// write_impl - This is the piece of the class that is implemented
- /// by subclasses. This writes the \args Size bytes starting at
- /// \arg Ptr to the underlying stream.
+ /// by subclasses. This writes the \p Size bytes starting at
+ /// \p Ptr to the underlying stream.
///
/// This function is guaranteed to only be called at a point at which it is
/// safe for the subclass to install a new buffer via SetBuffer.
///
- /// \arg Ptr - The start of the data to be written. For buffered streams this
+ /// \param Ptr The start of the data to be written. For buffered streams this
/// is guaranteed to be the start of the buffer.
- /// \arg Size - The number of bytes to be written.
+ ///
+ /// \param Size The number of bytes to be written.
///
/// \invariant { Size > 0 }
virtual void write_impl(const char *Ptr, size_t Size) = 0;
@@ -317,14 +322,14 @@ class raw_fd_ostream : public raw_ostream {
uint64_t pos;
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const { return pos; }
+ virtual uint64_t current_pos() const LLVM_OVERRIDE { return pos; }
/// preferred_buffer_size - Determine an efficient buffer size.
- virtual size_t preferred_buffer_size() const;
+ virtual size_t preferred_buffer_size() const LLVM_OVERRIDE;
/// error_detected - Set the flag indicating that an output error has
/// been encountered.
@@ -385,14 +390,14 @@ public:
}
virtual raw_ostream &changeColor(enum Colors colors, bool bold=false,
- bool bg=false);
- virtual raw_ostream &resetColor();
+ bool bg=false) LLVM_OVERRIDE;
+ virtual raw_ostream &resetColor() LLVM_OVERRIDE;
- virtual raw_ostream &reverseColor();
+ virtual raw_ostream &reverseColor() LLVM_OVERRIDE;
- virtual bool is_displayed() const;
+ virtual bool is_displayed() const LLVM_OVERRIDE;
- virtual bool has_colors() const;
+ virtual bool has_colors() const LLVM_OVERRIDE;
/// has_error - Return the value of the flag in this raw_fd_ostream indicating
/// whether an output error has been encountered.
@@ -438,11 +443,11 @@ class raw_string_ostream : public raw_ostream {
std::string &OS;
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const { return OS.size(); }
+ virtual uint64_t current_pos() const LLVM_OVERRIDE { return OS.size(); }
public:
explicit raw_string_ostream(std::string &O) : OS(O) {}
~raw_string_ostream();
@@ -462,15 +467,15 @@ class raw_svector_ostream : public raw_ostream {
SmallVectorImpl<char> &OS;
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t Size);
+ virtual void write_impl(const char *Ptr, size_t Size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const;
+ virtual uint64_t current_pos() const LLVM_OVERRIDE;
public:
/// Construct a new raw_svector_ostream.
///
- /// \arg O - The vector to write to; this should generally have at least 128
+ /// \param O The vector to write to; this should generally have at least 128
/// bytes free to avoid any extraneous memory overhead.
explicit raw_svector_ostream(SmallVectorImpl<char> &O);
~raw_svector_ostream();
@@ -488,11 +493,11 @@ public:
/// raw_null_ostream - A raw_ostream that discards all output.
class raw_null_ostream : public raw_ostream {
/// write_impl - See raw_ostream::write_impl.
- virtual void write_impl(const char *Ptr, size_t size);
+ virtual void write_impl(const char *Ptr, size_t size) LLVM_OVERRIDE;
/// current_pos - Return the current position within the stream, not
/// counting the bytes currently in the buffer.
- virtual uint64_t current_pos() const;
+ virtual uint64_t current_pos() const LLVM_OVERRIDE;
public:
explicit raw_null_ostream() {}
diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h
index 959fb3fff1..844013ed5d 100644
--- a/include/llvm/Support/system_error.h
+++ b/include/llvm/Support/system_error.h
@@ -17,6 +17,8 @@
#ifndef LLVM_SYSTEM_SYSTEM_ERROR_H
#define LLVM_SYSTEM_SYSTEM_ERROR_H
+#include "llvm/Support/Compiler.h"
+
/*
system_error synopsis
@@ -629,8 +631,8 @@ public:
private:
error_category();
- error_category(const error_category&);// = delete;
- error_category& operator=(const error_category&);// = delete;
+ error_category(const error_category&) LLVM_DELETED_FUNCTION;
+ error_category& operator=(const error_category&) LLVM_DELETED_FUNCTION;
public:
virtual const char* name() const = 0;
@@ -651,7 +653,7 @@ public:
class _do_message : public error_category
{
public:
- virtual std::string message(int ev) const;
+ virtual std::string message(int ev) const LLVM_OVERRIDE;
};
const error_category& generic_category();
diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h
index 7b97547be5..f9306395fc 100644
--- a/include/llvm/Support/type_traits.h
+++ b/include/llvm/Support/type_traits.h
@@ -54,8 +54,9 @@ struct is_class
// is_class<> metafunction due to Paul Mensonides (leavings@attbi.com). For
// more details:
// http://groups.google.com/groups?hl=en&selm=000001c1cc83%24e154d5e0%247772e50c%40c161550a&rnum=1
- public:
- enum { value = sizeof(char) == sizeof(dont_use::is_class_helper<T>(0)) };
+public:
+ static const bool value =
+ sizeof(char) == sizeof(dont_use::is_class_helper<T>(0));
};
@@ -162,12 +163,11 @@ template <typename T> class is_integral_or_enum {
static UnderlyingT &nonce_instance;
public:
- enum {
+ static const bool
value = (!is_class<UnderlyingT>::value && !is_pointer<UnderlyingT>::value &&
!is_same<UnderlyingT, float>::value &&
!is_same<UnderlyingT, double>::value &&
- sizeof(char) != sizeof(check_int_convertible(nonce_instance)))
- };
+ sizeof(char) != sizeof(check_int_convertible(nonce_instance)));
};
// enable_if_c - Enable/disable a template based on a metafunction
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index adb1a77ae4..8c4893dd04 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -152,9 +152,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "bit"; }
+ virtual std::string getAsString() const { return "bit"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
virtual bool baseClassOf(const BitRecTy *RHS) const { return true; }
@@ -195,9 +195,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const;
+ virtual std::string getAsString() const;
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
virtual bool baseClassOf(const BitRecTy *RHS) const { return Size == 1; }
@@ -237,9 +237,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "int"; }
+ virtual std::string getAsString() const { return "int"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -278,9 +278,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "string"; }
+ virtual std::string getAsString() const { return "string"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -322,9 +322,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const;
+ virtual std::string getAsString() const;
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -363,9 +363,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const { return "dag"; }
+ virtual std::string getAsString() const { return "dag"; }
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
@@ -407,9 +407,9 @@ public:
virtual Init *convertValue( VarInit *VI) { return RecTy::convertValue(VI);}
virtual Init *convertValue( FieldInit *FI) { return RecTy::convertValue(FI);}
- std::string getAsString() const;
+ virtual std::string getAsString() const;
- bool typeIsConvertibleTo(const RecTy *RHS) const {
+ virtual bool typeIsConvertibleTo(const RecTy *RHS) const {
return RHS->baseClassOf(this);
}
virtual bool baseClassOf(const BitRecTy *RHS) const { return false; }
@@ -431,8 +431,8 @@ RecTy *resolveTypes(RecTy *T1, RecTy *T2);
//===----------------------------------------------------------------------===//
class Init {
- Init(const Init &); // Do not define.
- Init &operator=(const Init &); // Do not define.
+ Init(const Init &) LLVM_DELETED_FUNCTION;
+ Init &operator=(const Init &) LLVM_DELETED_FUNCTION;
virtual void anchor();
protected:
@@ -533,8 +533,8 @@ inline raw_ostream &operator<<(raw_ostream &OS, const Init &I) {
class TypedInit : public Init {
RecTy *Ty;
- TypedInit(const TypedInit &Other); // Do not define.
- TypedInit &operator=(const TypedInit &Other); // Do not define.
+ TypedInit(const TypedInit &Other) LLVM_DELETED_FUNCTION;
+ TypedInit &operator=(const TypedInit &Other) LLVM_DELETED_FUNCTION;
protected:
explicit TypedInit(RecTy *T) : Ty(T) {}
@@ -565,8 +565,8 @@ public:
///
class UnsetInit : public Init {
UnsetInit() : Init() {}
- UnsetInit(const UnsetInit &); // Do not define.
- UnsetInit &operator=(const UnsetInit &Other); // Do not define.
+ UnsetInit(const UnsetInit &) LLVM_DELETED_FUNCTION;
+ UnsetInit &operator=(const UnsetInit &Other) LLVM_DELETED_FUNCTION;
virtual void anchor();
public:
@@ -591,8 +591,8 @@ class BitInit : public Init {
bool Value;
explicit BitInit(bool V) : Value(V) {}
- BitInit(const BitInit &Other); // Do not define.
- BitInit &operator=(BitInit &Other); // Do not define.
+ BitInit(const BitInit &Other) LLVM_DELETED_FUNCTION;
+ BitInit &operator=(BitInit &Other) LLVM_DELETED_FUNCTION;
virtual void anchor();
public:
@@ -620,8 +620,8 @@ class BitsInit : public Init, public FoldingSetNode {
BitsInit(ArrayRef<Init *> Range) : Bits(Range.begin(), Range.end()) {}
- BitsInit(const BitsInit &Other); // Do not define.
- BitsInit &operator=(const BitsInit &Other); // Do not define.
+ BitsInit(const BitsInit &Other) LLVM_DELETED_FUNCTION;
+ BitsInit &operator=(const BitsInit &Other) LLVM_DELETED_FUNCTION;
public:
static BitsInit *get(ArrayRef<Init *> Range);
@@ -664,8 +664,8 @@ class IntInit : public TypedInit {
explicit IntInit(int64_t V) : TypedInit(IntRecTy::get()), Value(V) {}
- IntInit(const IntInit &Other); // Do not define.
- IntInit &operator=(const IntInit &Other); // Do note define.
+ IntInit(const IntInit &Other) LLVM_DELETED_FUNCTION;
+ IntInit &operator=(const IntInit &Other) LLVM_DELETED_FUNCTION;
public:
static IntInit *get(int64_t V);
@@ -702,8 +702,8 @@ class StringInit : public TypedInit {
explicit StringInit(const std::string &V)
: TypedInit(StringRecTy::get()), Value(V) {}
- StringInit(const StringInit &Other); // Do not define.
- StringInit &operator=(const StringInit &Other); // Do not define.
+ StringInit(const StringInit &Other) LLVM_DELETED_FUNCTION;
+ StringInit &operator=(const StringInit &Other) LLVM_DELETED_FUNCTION;
virtual void anchor();
public:
@@ -742,8 +742,8 @@ private:
explicit ListInit(ArrayRef<Init *> Range, RecTy *EltTy)
: TypedInit(ListRecTy::get(EltTy)), Values(Range.begin(), Range.end()) {}
- ListInit(const ListInit &Other); // Do not define.
- ListInit &operator=(const ListInit &Other); // Do not define.
+ ListInit(const ListInit &Other) LLVM_DELETED_FUNCTION;
+ ListInit &operator=(const ListInit &Other) LLVM_DELETED_FUNCTION;
public:
static ListInit *get(ArrayRef<Init *> Range, RecTy *EltTy);
@@ -758,7 +758,8 @@ public:
Record *getElementAsRecord(unsigned i) const;
- Init *convertInitListSlice(const std::vector<unsigned> &Elements) const;
+ virtual Init *
+ convertInitListSlice(const std::vector<unsigned> &Elements) const;
virtual Init *convertInitializerTo(RecTy *Ty) const {
return Ty->convertValue(const_cast<ListInit *>(this));
@@ -796,8 +797,8 @@ public:
/// OpInit - Base class for operators
///
class OpInit : public TypedInit {
- OpInit(const OpInit &Other); // Do not define.
- OpInit &operator=(OpInit &Other); // Do not define.
+ OpInit(const OpInit &Other) LLVM_DELETED_FUNCTION;
+ OpInit &operator=(OpInit &Other) LLVM_DELETED_FUNCTION;
protected:
explicit OpInit(RecTy *Type) : TypedInit(Type) {}
@@ -836,8 +837,8 @@ private:
UnOpInit(UnaryOp opc, Init *lhs, RecTy *Type)
: OpInit(Type), Opc(opc), LHS(lhs) {}
- UnOpInit(const UnOpInit &Other); // Do not define.
- UnOpInit &operator=(const UnOpInit &Other); // Do not define.
+ UnOpInit(const UnOpInit &Other) LLVM_DELETED_FUNCTION;
+ UnOpInit &operator=(const UnOpInit &Other) LLVM_DELETED_FUNCTION;
public:
static UnOpInit *get(UnaryOp opc, Init *lhs, RecTy *Type);
@@ -849,8 +850,8 @@ public:
return UnOpInit::get(getOpcode(), *Operands.begin(), getType());
}
- int getNumOperands() const { return 1; }
- Init *getOperand(int i) const {
+ virtual int getNumOperands() const { return 1; }
+ virtual Init *getOperand(int i) const {
assert(i == 0 && "Invalid operand id for unary operator");
return getOperand();
}
@@ -860,7 +861,7 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
+ virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
@@ -879,8 +880,8 @@ private:
BinOpInit(BinaryOp opc, Init *lhs, Init *rhs, RecTy *Type) :
OpInit(Type), Opc(opc), LHS(lhs), RHS(rhs) {}
- BinOpInit(const BinOpInit &Other); // Do not define.
- BinOpInit &operator=(const BinOpInit &Other); // Do not define.
+ BinOpInit(const BinOpInit &Other) LLVM_DELETED_FUNCTION;
+ BinOpInit &operator=(const BinOpInit &Other) LLVM_DELETED_FUNCTION;
public:
static BinOpInit *get(BinaryOp opc, Init *lhs, Init *rhs,
@@ -893,8 +894,8 @@ public:
return BinOpInit::get(getOpcode(), Operands[0], Operands[1], getType());
}
- int getNumOperands() const { return 2; }
- Init *getOperand(int i) const {
+ virtual int getNumOperands() const { return 2; }
+ virtual Init *getOperand(int i) const {
assert((i == 0 || i == 1) && "Invalid operand id for binary operator");
if (i == 0) {
return getLHS();
@@ -909,7 +910,7 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
+ virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
virtual Init *resolveReferences(Record &R, const RecordVal *RV) const;
@@ -929,8 +930,8 @@ private:
RecTy *Type) :
OpInit(Type), Opc(opc), LHS(lhs), MHS(mhs), RHS(rhs) {}
- TernOpInit(const TernOpInit &Other); // Do not define.
- TernOpInit &operator=(const TernOpInit &Other); // Do not define.
+ TernOpInit(const TernOpInit &Other) LLVM_DELETED_FUNCTION;
+ TernOpInit &operator=(const TernOpInit &Other) LLVM_DELETED_FUNCTION;
public:
static TernOpInit *get(TernaryOp opc, Init *lhs,
@@ -945,8 +946,8 @@ public:
getType());
}
- int getNumOperands() const { return 3; }
- Init *getOperand(int i) const {
+ virtual int getNumOperands() const { return 3; }
+ virtual Init *getOperand(int i) const {
assert((i == 0 || i == 1 || i == 2) &&
"Invalid operand id for ternary operator");
if (i == 0) {
@@ -965,7 +966,7 @@ public:
// Fold - If possible, fold this to a simpler init. Return this if not
// possible to fold.
- Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
+ virtual Init *Fold(Record *CurRec, MultiClass *CurMultiClass) const;
virtual bool isComplete() const { return false; }
@@ -985,8 +986,8 @@ class VarInit : public TypedInit {
explicit VarInit(Init *VN, RecTy *T)
: TypedInit(T), VarName(VN) {}
- VarInit(const VarInit &Other); // Do not define.
- VarInit &operator=(const VarInit &Other); // Do not define.
+ VarInit(const VarInit &Other) LLVM_DELETED_FUNCTION;
+ VarInit &operator=(const VarInit &Other) LLVM_DELETED_FUNCTION;
public:
static VarInit *get(const std::string &VN, RecTy *T);
@@ -1036,8 +1037,8 @@ class VarBitInit : public Init {
"Illegal VarBitInit expression!");
}
- VarBitInit(const VarBitInit &Other); // Do not define.
- VarBitInit &operator=(const VarBitInit &Other); // Do not define.
+ VarBitInit(const VarBitInit &Other) LLVM_DELETED_FUNCTION;
+ VarBitInit &operator=(const VarBitInit &Other) LLVM_DELETED_FUNCTION;
public:
static VarBitInit *get(TypedInit *T, unsigned B);
@@ -1071,10 +1072,8 @@ class VarListElementInit : public TypedInit {
"Illegal VarBitInit expression!");
}
- VarListElementInit(const VarListElementInit &Other); // Do not define.
- VarListElementInit &operator=(const VarListElementInit &Other); // Do
- // not
- // define.
+ VarListElementInit(const VarListElementInit &Other) LLVM_DELETED_FUNCTION;
+ void operator=(const VarListElementInit &Other) LLVM_DELETED_FUNCTION;
public:
static VarListElementInit *get(TypedInit *T, unsigned E);
@@ -1107,8 +1106,8 @@ class DefInit : public TypedInit {
DefInit(Record *D, RecordRecTy *T) : TypedInit(T), Def(D) {}
friend class Record;
- DefInit(const DefInit &Other); // Do not define.
- DefInit &operator=(const DefInit &Other); // Do not define.
+ DefInit(const DefInit &Other) LLVM_DELETED_FUNCTION;
+ DefInit &operator=(const DefInit &Other) LLVM_DELETED_FUNCTION;
public:
static DefInit *get(Record*);
@@ -1152,8 +1151,8 @@ class FieldInit : public TypedInit {
assert(getType() && "FieldInit with non-record type!");
}
- FieldInit(const FieldInit &Other); // Do not define.
- FieldInit &operator=(const FieldInit &Other); // Do not define.
+ FieldInit(const FieldInit &Other) LLVM_DELETED_FUNCTION;
+ FieldInit &operator=(const FieldInit &Other) LLVM_DELETED_FUNCTION;
public:
static FieldInit *get(Init *R, const std::string &FN);
@@ -1193,8 +1192,8 @@ class DagInit : public TypedInit, public FoldingSetNode {
Args(ArgRange.begin(), ArgRange.end()),
ArgNames(NameRange.begin(), NameRange.end()) {}
- DagInit(const DagInit &Other); // Do not define.
- DagInit &operator=(const DagInit &Other); // Do not define.
+ DagInit(const DagInit &Other) LLVM_DELETED_FUNCTION;
+ DagInit &operator=(const DagInit &Other) LLVM_DELETED_FUNCTION;
public:
static DagInit *get(Init *V, const std::string &VN,
@@ -1329,6 +1328,14 @@ public:
TrackedRecords(records), TheInit(0) {
init();
}
+
+ // When copy-constructing a Record, we must still guarantee a globally unique
+ // ID number. All other fields can be copied normally.
+ Record(const Record &O) :
+ ID(LastID++), Name(O.Name), Locs(O.Locs), TemplateArgs(O.TemplateArgs),
+ Values(O.Values), SuperClasses(O.SuperClasses),
+ TrackedRecords(O.TrackedRecords), TheInit(O.TheInit) { }
+
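A sketch of the consequence, with hypothetical records: copying duplicates every field except the ID, so copies never collide in ID-keyed containers.

// Record A(...);   // suppose A.getID() == N
// Record B(A);     // Name, Locs, Values, ... copied; B.getID() == N + 1
// assert(A.getID() != B.getID());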
~Record() {}
@@ -1610,6 +1617,16 @@ struct LessRecord {
}
};
+/// LessRecordByID - Sorting predicate to sort record pointers by their
+/// unique ID. If you just need a deterministic order, use this, since it
+/// just compares two `unsigned`; the other sorting predicates require
+/// string manipulation.
+struct LessRecordByID {
+ bool operator()(const Record *LHS, const Record *RHS) const {
+ return LHS->getID() < RHS->getID();
+ }
+};
+
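A sketch of the intended use, assuming a RecordKeeper named Records and the usual TableGen backend setup:

#include <algorithm>
#include <vector>

std::vector<Record*> Defs = Records.getAllDerivedDefinitions("Instruction");
// Deterministic and cheap: one unsigned comparison per element pair.
std::sort(Defs.begin(), Defs.end(), LessRecordByID());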
/// LessRecordFieldName - Sorting predicate to sort record pointers by their
/// name field.
///
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index 09f6929bd2..ed3db69a4e 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -495,7 +495,8 @@ def ptr_rc : PointerLikeRegClass<0>;
/// unknown definition - Mark this operand as being of unknown type, causing
/// it to be resolved by inference in the context it is used.
-def unknown;
+class unknown_class;
+def unknown : unknown_class;
/// AsmOperandClass - Representation for the kinds of operands which the target
/// specific parser can create and the assembly matcher may need to distinguish.
@@ -1022,6 +1023,9 @@ class Processor<string n, ProcessorItineraries pi, list<SubtargetFeature> f> {
// ProcessorModel allows subtargets to specify the more general
// SchedMachineModel instead of a ProcessorItinerary. Subtargets will
// gradually move to this newer form.
+//
+// Although this class always passes NoItineraries to the Processor
+// class, the SchedMachineModel may still define valid Itineraries.
class ProcessorModel<string n, SchedMachineModel m, list<SubtargetFeature> f>
: Processor<n, NoItineraries, f> {
let SchedModel = m;
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index d7cc1cf45a..c5c5a7a9c9 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -45,8 +45,8 @@ template<class T> class SmallVectorImpl;
/// TargetInstrInfo - Interface to description of machine instruction set
///
class TargetInstrInfo : public MCInstrInfo {
- TargetInstrInfo(const TargetInstrInfo &); // DO NOT IMPLEMENT
- void operator=(const TargetInstrInfo &); // DO NOT IMPLEMENT
+ TargetInstrInfo(const TargetInstrInfo &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetInstrInfo &) LLVM_DELETED_FUNCTION;
public:
TargetInstrInfo(int CFSetupOpcode = -1, int CFDestroyOpcode = -1)
: CallFrameSetupOpcode(CFSetupOpcode),
@@ -824,6 +824,9 @@ public:
unsigned defaultDefLatency(const MCSchedModel *SchedModel,
const MachineInstr *DefMI) const;
+ int computeDefOperandLatency(const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, bool FindMin) const;
+
/// isHighLatencyDef - Return true if this opcode has high latency to its
/// result.
virtual bool isHighLatencyDef(int opc) const { return false; }
diff --git a/include/llvm/Target/TargetIntrinsicInfo.h b/include/llvm/Target/TargetIntrinsicInfo.h
index c44b9230c0..ce21349693 100644
--- a/include/llvm/Target/TargetIntrinsicInfo.h
+++ b/include/llvm/Target/TargetIntrinsicInfo.h
@@ -14,6 +14,7 @@
#ifndef LLVM_TARGET_TARGETINTRINSICINFO_H
#define LLVM_TARGET_TARGETINTRINSICINFO_H
+#include "llvm/Support/Compiler.h"
#include <string>
namespace llvm {
@@ -27,8 +28,8 @@ class Type;
/// TargetIntrinsicInfo - Interface to description of machine instruction set
///
class TargetIntrinsicInfo {
- TargetIntrinsicInfo(const TargetIntrinsicInfo &); // DO NOT IMPLEMENT
- void operator=(const TargetIntrinsicInfo &); // DO NOT IMPLEMENT
+ TargetIntrinsicInfo(const TargetIntrinsicInfo &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetIntrinsicInfo &) LLVM_DELETED_FUNCTION;
public:
TargetIntrinsicInfo();
virtual ~TargetIntrinsicInfo();
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 03f7dcd864..c1043aad37 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -77,8 +77,8 @@ namespace llvm {
/// target-specific constructs to SelectionDAG operators.
///
class TargetLowering {
- TargetLowering(const TargetLowering&); // DO NOT IMPLEMENT
- void operator=(const TargetLowering&); // DO NOT IMPLEMENT
+ TargetLowering(const TargetLowering&) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetLowering&) LLVM_DELETED_FUNCTION;
public:
/// LegalizeAction - This enum indicates whether operations are valid for a
/// target, and if not, what action should be used to make them valid.
@@ -508,8 +508,12 @@ public:
assert((unsigned)CC < array_lengthof(CondCodeActions) &&
(unsigned)VT.getSimpleVT().SimpleTy < sizeof(CondCodeActions[0])*4 &&
"Table isn't big enough!");
+    // The lower 5 bits of SimpleTy select the Nth 2-bit slot within a 64-bit
+    // word, and the remaining upper bits select which 64-bit word in the
+    // second dimension of the array to use.
LegalizeAction Action = (LegalizeAction)
- ((CondCodeActions[CC] >> (2*VT.getSimpleVT().SimpleTy)) & 3);
+ ((CondCodeActions[CC][VT.getSimpleVT().SimpleTy >> 5]
+ >> (2*(VT.getSimpleVT().SimpleTy & 0x1F))) & 3);
assert(Action != Promote && "Can't promote condition code!");
return Action;
}
@@ -566,6 +570,7 @@ public:
}
return EVT::getEVT(Ty, AllowUnknown);
}
+
/// getByValTypeAlignment - Return the desired alignment for ByVal aggregate
/// function arguments in the caller parameter area. This is the actual
@@ -1165,8 +1170,13 @@ protected:
assert(VT < MVT::LAST_VALUETYPE &&
(unsigned)CC < array_lengthof(CondCodeActions) &&
"Table isn't big enough!");
- CondCodeActions[(unsigned)CC] &= ~(uint64_t(3UL) << VT.SimpleTy*2);
- CondCodeActions[(unsigned)CC] |= (uint64_t)Action << VT.SimpleTy*2;
+    // The lower 5 bits of SimpleTy select the Nth 2-bit slot within a 64-bit
+    // word, and the remaining upper bits select which 64-bit word in the
+    // second dimension of the array to use.
+ CondCodeActions[(unsigned)CC][VT.SimpleTy >> 5]
+ &= ~(uint64_t(3UL) << (VT.SimpleTy & 0x1F)*2);
+ CondCodeActions[(unsigned)CC][VT.SimpleTy >> 5]
+ |= (uint64_t)Action << (VT.SimpleTy & 0x1F)*2;
}
/// AddPromotedToType - If Opc/OrigVT is specified as being promoted, the
@@ -1945,7 +1955,10 @@ private:
/// CondCodeActions - For each condition code (ISD::CondCode) keep a
/// LegalizeAction that indicates how instruction selection should
/// deal with the condition code.
- uint64_t CondCodeActions[ISD::SETCC_INVALID];
+  /// Because each CC action takes up 2 bits, a single 64-bit word holds the
+  /// actions for 32 value types. The second array dimension therefore needs
+  /// (MVT::LAST_VALUETYPE / 32) + 1 words to cover all of the value types.
+ uint64_t CondCodeActions[ISD::SETCC_INVALID][(MVT::LAST_VALUETYPE / 32) + 1];
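+  //
+  // Illustrative sketch of the packing (not part of this patch): for a given
+  // condition code CC and value type VT,
+  //   unsigned Word = VT.SimpleTy >> 5;    // which 64-bit word
+  //   unsigned Slot = VT.SimpleTy & 0x1F;  // which 2-bit slot in that word
+  //   Action = (CondCodeActions[CC][Word] >> (2 * Slot)) & 3;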
ValueTypeActionImpl ValueTypeActions;
diff --git a/include/llvm/Target/TargetLoweringObjectFile.h b/include/llvm/Target/TargetLoweringObjectFile.h
index d631f58aab..13a6fe37d7 100644
--- a/include/llvm/Target/TargetLoweringObjectFile.h
+++ b/include/llvm/Target/TargetLoweringObjectFile.h
@@ -33,10 +33,11 @@ namespace llvm {
class TargetLoweringObjectFile : public MCObjectFileInfo {
MCContext *Ctx;
-
- TargetLoweringObjectFile(const TargetLoweringObjectFile&); // DO NOT IMPLEMENT
- void operator=(const TargetLoweringObjectFile&); // DO NOT IMPLEMENT
-
+
+ TargetLoweringObjectFile(
+ const TargetLoweringObjectFile&) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetLoweringObjectFile&) LLVM_DELETED_FUNCTION;
+
public:
MCContext &getContext() const { return *Ctx; }
diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h
index e4bf32bd86..b6fa817542 100644
--- a/include/llvm/Target/TargetMachine.h
+++ b/include/llvm/Target/TargetMachine.h
@@ -52,8 +52,8 @@ class raw_ostream;
/// through this interface.
///
class TargetMachine {
- TargetMachine(const TargetMachine &); // DO NOT IMPLEMENT
- void operator=(const TargetMachine &); // DO NOT IMPLEMENT
+ TargetMachine(const TargetMachine &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetMachine &) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses.
TargetMachine(const Target &T, StringRef TargetTriple,
StringRef CPU, StringRef FS, const TargetOptions &Options);
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 4dc488dbae..cf3d250590 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -10,25 +10,68 @@
// This file defines the target-independent scheduling interfaces which should
// be implemented by each target which is using TableGen based scheduling.
//
+// The SchedMachineModel is defined by subtargets for three categories of data:
+// 1. Basic properties for coarse grained instruction cost model.
+// 2. Scheduler Read/Write resources for simple per-opcode cost model.
+// 3. Instruction itineraries for detailed reservation tables.
+//
+// (1) Basic properties are defined by the SchedMachineModel
+// class. Target hooks allow subtargets to associate opcodes with
+// those properties.
+//
+// (2) A per-operand machine model can be implemented in any
+// combination of the following ways:
+//
+// A. Associate per-operand SchedReadWrite types with Instructions by
+// modifying the Instruction definition to inherit from Sched. For
+// each subtarget, define WriteRes and ReadAdvance to associate
+// processor resources and latency with each SchedReadWrite type.
+//
+// B. In each instruction definition, name an ItineraryClass. For each
+// subtarget, define ItinRW entries to map ItineraryClass to
+// per-operand SchedReadWrite types. Unlike method A, these types may
+// be subtarget specific and can be directly associated with resources
+// by defining SchedWriteRes and SchedReadAdvance.
+//
+// C. In the subtarget, map SchedReadWrite types to specific
+// opcodes. This overrides any SchedReadWrite types or
+// ItineraryClasses defined by the Instruction. As in method B, the
+// subtarget can directly associate resources with SchedReadWrite
+// types by defining SchedWriteRes and SchedReadAdvance.
+//
+// D. In either the target or subtarget, define SchedWriteVariant or
+// SchedReadVariant to map one SchedReadWrite type onto another
+// sequence of SchedReadWrite types. This allows dynamic selection of
+// an instruction's machine model via custom C++ code. It also allows
+// a machine-independent SchedReadWrite type to map to a sequence of
+// machine-dependent types.
+//
+// (3) A per-pipeline-stage machine model can be implemented by providing
+// Itineraries in addition to mapping instructions to ItineraryClasses.
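+//
+// For example (an illustrative sketch of method A, using hypothetical
+// names): the target defines
+//   def WriteALU : SchedWrite;
+//   def MYADD : Instruction, Sched<[WriteALU]>;
+// and each subtarget maps that type onto its own resources:
+//   def : WriteRes<WriteALU, [MyALUUnit]> { let Latency = 1; }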
//===----------------------------------------------------------------------===//
+// Include legacy support for instruction itineraries.
include "llvm/Target/TargetItinerary.td"
-// The SchedMachineModel is defined by subtargets for three categories of data:
-// 1) Basic properties for coarse grained instruction cost model.
-// 2) Scheduler Read/Write resources for simple per-opcode cost model.
-// 3) Instruction itineraties for detailed reservation tables.
+class Instruction; // Forward def
+
+// Define the SchedMachineModel and provide basic properties for
+// coarse grained instruction cost model. Default values for the
+// properties are defined in MCSchedModel. A value of "-1" in the
+// target description's SchedMachineModel indicates that the property
+// is not overridden by the target.
//
-// Default values for basic properties are defined in MCSchedModel. "-1"
-// indicates that the property is not overriden by the target description.
+// Target hooks allow subtargets to associate LoadLatency and
+// HighLatency with groups of opcodes.
class SchedMachineModel {
- int IssueWidth = -1; // Max instructions that may be scheduled per cycle.
+ int IssueWidth = -1; // Max micro-ops that may be scheduled per cycle.
int MinLatency = -1; // Determines which instructions are allowed in a group.
// (-1) in-order, (0) out-of-order, (1) in-order with variable latencies.
int LoadLatency = -1; // Cycles for loads to access the cache.
int HighLatency = -1; // Approximation of cycles for "high latency" ops.
int MispredictPenalty = -1; // Extra cycles for a mispredicted branch.
+  // Per-cycle resource tables.
ProcessorItineraries Itineraries = NoItineraries;
bit NoModel = 0; // Special tag to indicate missing machine model.
@@ -38,4 +81,276 @@ def NoSchedModel : SchedMachineModel {
let NoModel = 1;
}
-// TODO: Define classes for processor and scheduler resources.
+// Define a kind of processor resource that may be common across
+// similar subtargets.
+class ProcResourceKind;
+
+// Define a number of interchangeable processor resources. NumUnits
+// determines the throughput of instructions that require the resource.
+//
+// An optional Super resource may be given to model these resources as
+// a subset of the more general super resources. Using one of these
+// resources implies using one of the super resources.
+//
+// ProcResourceUnits normally model a few buffered resources within an
+// out-of-order engine that the compiler attempts to conserve.
+// Buffered resources may be held for multiple clock cycles, but the
+// scheduler does not pin them to a particular clock cycle relative to
+// instruction dispatch. Setting Buffered=0 changes this to an
+// in-order resource. In this case, the scheduler counts down from the
+// cycle that the instruction issues in-order, forcing an interlock
+// with subsequent instructions that require the same resource until
+// the number of ResourceCycles specified in WriteRes expires.
+//
+// SchedModel ties these units to a processor for any stand-alone defs
+// of this class. Instances of subclass ProcResource will be automatically
+// attached to a processor, so SchedModel is not needed.
+class ProcResourceUnits<ProcResourceKind kind, int num> {
+ ProcResourceKind Kind = kind;
+ int NumUnits = num;
+ ProcResourceKind Super = ?;
+ bit Buffered = 1;
+ SchedMachineModel SchedModel = ?;
+}
+
+// EponymousProcResourceKind helps implement ProcResourceUnits by
+// allowing a ProcResourceUnits definition to reference itself. It
+// should not be referenced anywhere else.
+def EponymousProcResourceKind : ProcResourceKind;
+
+// Subtargets typically define processor resource kind and number of
+// units in one place.
+class ProcResource<int num> : ProcResourceKind,
+ ProcResourceUnits<EponymousProcResourceKind, num>;
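+
+// For example (illustrative sketch with hypothetical names): a subtarget
+// with two ALU pipes that are a subset of three generic issue ports:
+//   def MyPorts : ProcResource<3>;
+//   def MyALU   : ProcResource<2> { let Super = MyPorts; }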
+
+// A target architecture may define SchedReadWrite types and associate
+// them with instruction operands.
+class SchedReadWrite;
+
+// List the per-operand types that map to the machine model of an
+// instruction. One SchedWrite type must be listed for each explicit
+// def operand in order. Additional SchedWrite types may optionally be
+// listed for implicit def operands. SchedRead types may optionally
+// be listed for use operands in order. The order of defs relative to
+// uses is insignificant. This way, the same SchedReadWrite list may
+// be used for multiple forms of an operation. For example, a
+// two-address instruction could have two tied operands or a single
+// operand that both reads and writes a reg. In both cases we have a
+// single SchedWrite and single SchedRead in any order.
+class Sched<list<SchedReadWrite> schedrw> {
+ list<SchedReadWrite> SchedRW = schedrw;
+}
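+
+// For example (sketch with hypothetical names): an instruction with one def
+// and two use operands might inherit Sched<[WriteALU, ReadALU, ReadALU]>,
+// pairing the def with WriteALU and each use with ReadALU.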
+
+// Define a scheduler resource associated with a def operand.
+class SchedWrite : SchedReadWrite;
+def NoWrite : SchedWrite;
+
+// Define a scheduler resource associated with a use operand.
+class SchedRead : SchedReadWrite;
+
+// Define a SchedWrite that is modeled as a sequence of other
+// SchedWrites with additive latency. This allows a single operand to
+// be mapped to the resources composed from a set of previously defined
+// SchedWrites.
+//
+// If the final write in this sequence is a SchedWriteVariant marked
+// Variadic, then the list of prior writes are distributed across all
+// operands after resolving the predicate for the final write.
+//
+// SchedModel silences warnings but is ignored.
+class WriteSequence<list<SchedWrite> writes, int rep = 1> : SchedWrite {
+ list<SchedWrite> Writes = writes;
+ int Repeat = rep;
+ SchedMachineModel SchedModel = ?;
+}
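+
+// For example (sketch): a double-pumped operation could be modeled as
+//   def WriteALU2x : WriteSequence<[WriteALU], 2>;
+// which repeats WriteALU twice with additive latency.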
+
+// Define values common to WriteRes and SchedWriteRes.
+//
+// SchedModel ties these resources to a processor.
+class ProcWriteResources<list<ProcResourceKind> resources> {
+ list<ProcResourceKind> ProcResources = resources;
+ list<int> ResourceCycles = [];
+ int Latency = 1;
+ int NumMicroOps = 1;
+ bit BeginGroup = 0;
+ bit EndGroup = 0;
+ // Allow a processor to mark some scheduling classes as unsupported
+ // for stronger verification.
+ bit Unsupported = 0;
+ SchedMachineModel SchedModel = ?;
+}
+
+// Define the resources and latency of a SchedWrite. This will be used
+// directly by targets that have no itinerary classes. In this case,
+// SchedWrite is defined by the target, while WriteResources is
+// defined by the subtarget, and maps the SchedWrite to processor
+// resources.
+//
+// If a target already has itinerary classes, SchedWriteResources can
+// be used instead to define subtarget specific SchedWrites and map
+// them to processor resources in one place. Then ItinRW can map
+// itinerary classes to the subtarget's SchedWrites.
+//
+// ProcResources indicates the set of resources consumed by the write.
+// Optionally, ResourceCycles indicates the number of cycles the
+// resource is consumed. Each ResourceCycles item is paired with the
+// ProcResource item at the same position in its list. Since
+// ResourceCycles are rarely specialized, the list may be
+// incomplete. By default, resources are consumed for a single cycle,
+// regardless of latency, which models a fully pipelined processing
+// unit. A value of 0 for ResourceCycles means that the resource must
+// be available but is not consumed, which is only relevant for
+// unbuffered resources.
+//
+// By default, each SchedWrite takes one micro-op, which is counted
+// against the processor's IssueWidth limit. If an instruction can
+// write multiple registers with a single micro-op, the subtarget
+// should define one of the writes to be zero micro-ops. If a
+// subtarget requires multiple micro-ops to write a single result, it
+// should either override the write's NumMicroOps to be greater than 1
+// or require additional writes. Extra writes can be required either
+// by defining a WriteSequence, or simply listing extra writes in the
+// instruction's list of writes beyond the number of "def"
+// operands. The scheduler assumes that all micro-ops must be
+// dispatched in the same cycle. These micro-ops may be required to
+// begin or end the current dispatch group.
+class WriteRes<SchedWrite write, list<ProcResourceKind> resources>
+ : ProcWriteResources<resources> {
+ SchedWrite WriteType = write;
+}
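+
+// For example (sketch, hypothetical names): a divide that occupies MyDivUnit
+// for 20 cycles and produces its result after 25:
+//   def : WriteRes<WriteDiv, [MyDivUnit]> {
+//     let Latency = 25;
+//     let ResourceCycles = [20];
+//   }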
+
+// Directly name a set of WriteResources defining a new SchedWrite
+// type at the same time. This class is unaware of its SchedModel so
+// must be referenced by InstRW or ItinRW.
+class SchedWriteRes<list<ProcResourceKind> resources> : SchedWrite,
+ ProcWriteResources<resources>;
+
+// Define values common to ReadAdvance and SchedReadAdvance.
+//
+// SchedModel ties these resources to a processor.
+class ProcReadAdvance<int cycles, list<SchedWrite> writes = []> {
+ int Cycles = cycles;
+ list<SchedWrite> ValidWrites = writes;
+ // Allow a processor to mark some scheduling classes as unsupported
+ // for stronger verification.
+ bit Unsupported = 0;
+ SchedMachineModel SchedModel = ?;
+}
+
+// A processor may define a ReadAdvance associated with a SchedRead
+// to reduce latency of a prior write by N cycles. A negative advance
+// effectively increases latency, which may be used for cross-domain
+// stalls.
+//
+// A ReadAdvance may be associated with a list of SchedWrites
+// to implement pipeline bypass. The Writes list may be empty to
+// indicate operands that are always read this number of Cycles later
+// than a normal register read, allowing the read's parent instruction
+// to issue earlier relative to the writer.
+class ReadAdvance<SchedRead read, int cycles, list<SchedWrite> writes = []>
+ : ProcReadAdvance<cycles, writes> {
+ SchedRead ReadType = read;
+}
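+
+// For example (sketch, hypothetical names): an accumulator read that sees
+// multiply results three cycles early via a bypass network:
+//   def : ReadAdvance<ReadMAC, 3, [WriteMUL]>;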
+
+// Directly associate a new SchedRead type with a delay and optional
+// pipeline bypass. For use with InstRW or ItinRW.
+class SchedReadAdvance<int cycles, list<SchedWrite> writes = []> : SchedRead,
+ ProcReadAdvance<cycles, writes>;
+
+// Define SchedRead defaults. Reads seldom need special treatment.
+def ReadDefault : SchedRead;
+def NoReadAdvance : SchedReadAdvance<0>;
+
+// Define shared code that will be in the same scope as all
+// SchedPredicates. Available variables are:
+// (const MachineInstr *MI, const TargetSchedModel *SchedModel)
+class PredicateProlog<code c> {
+ code Code = c;
+}
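+
+// For example (sketch): hoisting a shared value into the scope of all
+// predicates:
+//   def MyProlog : PredicateProlog<[{
+//     unsigned MyNumOps = MI->getNumOperands();  // hypothetical shared value
+//   }]>;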
+
+// Define a predicate to determine which SchedVariant applies to a
+// particular MachineInstr. The code snippet is used as an
+// if-statement's expression. Available variables are MI, SchedModel,
+// and anything defined in a PredicateProlog.
+//
+// SchedModel silences warnings but is ignored.
+class SchedPredicate<code pred> {
+ SchedMachineModel SchedModel = ?;
+ code Predicate = pred;
+}
+def NoSchedPred : SchedPredicate<[{true}]>;
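+
+// For example (sketch): a predicate matching forms whose third operand is an
+// immediate:
+//   def ImmPred : SchedPredicate<[{MI->getOperand(2).isImm()}]>;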
+
+// Associate a predicate with a list of SchedReadWrites. By default,
+// the selected SchedReadWrites are still associated with a single
+// operand and assumed to execute sequentially with additive
+// latency. However, if the parent SchedWriteVariant or
+// SchedReadVariant is marked "Variadic", then each Selected
+// SchedReadWrite is mapped in place to the instruction's variadic
+// operands. In this case, latency is not additive. If the current Variant
+// is already part of a Sequence, then that entire chain leading up to
+// the Variant is distributed over the variadic operands.
+class SchedVar<SchedPredicate pred, list<SchedReadWrite> selected> {
+ SchedPredicate Predicate = pred;
+ list<SchedReadWrite> Selected = selected;
+}
+
+// SchedModel silences warnings but is ignored.
+class SchedVariant<list<SchedVar> variants> {
+ list<SchedVar> Variants = variants;
+ bit Variadic = 0;
+ SchedMachineModel SchedModel = ?;
+}
+
+// A SchedWriteVariant is a single SchedWrite type that maps to a list
+// of SchedWrite types under the conditions defined by its predicates.
+//
+// A Variadic write is expanded to cover multiple "def" operands. The
+// SchedVariant's Expansion list is then interpreted as one write
+// per-operand instead of the usual sequential writes feeding a single
+// operand.
+class SchedWriteVariant<list<SchedVar> variants> : SchedWrite,
+ SchedVariant<variants> {
+}
+
+// A SchedReadVariant is a single SchedRead type that maps to a list
+// of SchedRead types under the conditions defined by its predicates.
+//
+// A Variadic read is expanded to cover multiple "readsReg" operands as
+// explained above.
+class SchedReadVariant<list<SchedVar> variants> : SchedRead,
+ SchedVariant<variants> {
+}
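+
+// For example (sketch, hypothetical names): selecting a cheaper write for
+// immediate forms and falling back otherwise:
+//   def WriteEither : SchedWriteVariant<[
+//     SchedVar<ImmPred, [WriteFast]>,
+//     SchedVar<NoSchedPred, [WriteSlow]>]>;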
+
+// Map a set of opcodes to a list of SchedReadWrite types. This allows
+// the subtarget to easily override specific operations.
+//
+// SchedModel ties this opcode mapping to a processor.
+class InstRW<list<SchedReadWrite> rw, list<Instruction> instrs> {
+ list<SchedReadWrite> OperandReadWrites = rw;
+ list<Instruction> Instrs = instrs;
+ SchedMachineModel SchedModel = ?;
+}
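+
+// For example (sketch): overriding two hypothetical opcodes on a subtarget:
+//   def : InstRW<[WriteSlow], [MYADD, MYSUB]>;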
+
+// Map a set of itinerary classes to SchedReadWrite resources. This is
+// used to bootstrap a target (e.g. ARM) when itineraries already
+// exist and changing InstrInfo is undesirable.
+//
+// SchedModel ties this ItineraryClass mapping to a processor.
+class ItinRW<list<SchedReadWrite> rw, list<InstrItinClass> iic> {
+ list<InstrItinClass> MatchedItinClasses = iic;
+ list<SchedReadWrite> OperandReadWrites = rw;
+ SchedMachineModel SchedModel = ?;
+}
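+
+// For example (sketch, hypothetical names): mapping an existing itinerary
+// class onto per-operand types:
+//   def : ItinRW<[WriteALU, ReadALU], [IIC_MyALU]>;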
+
+// Alias a target-defined SchedReadWrite to a processor specific
+// SchedReadWrite. This allows a subtarget to easily map a
+// SchedReadWrite type onto a WriteSequence, SchedWriteVariant, or
+// SchedReadVariant.
+//
+// SchedModel will usually be provided by the surrounding let statement
+// and ties this SchedAlias mapping to a processor.
+class SchedAlias<SchedReadWrite match, SchedReadWrite alias> {
+ SchedReadWrite MatchRW = match;
+ SchedReadWrite AliasRW = alias;
+ SchedMachineModel SchedModel = ?;
+}
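+
+// For example (sketch, hypothetical names): aliasing the target's generic
+// write onto a subtarget-defined sequence:
+//   def : SchedAlias<WriteALU, MyWriteALUSeq>;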
diff --git a/include/llvm/Target/TargetSelectionDAGInfo.h b/include/llvm/Target/TargetSelectionDAGInfo.h
index c9ca7223b5..8c5874e38f 100644
--- a/include/llvm/Target/TargetSelectionDAGInfo.h
+++ b/include/llvm/Target/TargetSelectionDAGInfo.h
@@ -28,8 +28,8 @@ class TargetMachine;
/// SelectionDAG lowering and instruction selection process.
///
class TargetSelectionDAGInfo {
- TargetSelectionDAGInfo(const TargetSelectionDAGInfo &); // DO NOT IMPLEMENT
- void operator=(const TargetSelectionDAGInfo &); // DO NOT IMPLEMENT
+ TargetSelectionDAGInfo(const TargetSelectionDAGInfo &) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetSelectionDAGInfo &) LLVM_DELETED_FUNCTION;
const TargetData *TD;
diff --git a/include/llvm/Target/TargetSubtargetInfo.h b/include/llvm/Target/TargetSubtargetInfo.h
index fc23b2c6b5..08c78f2564 100644
--- a/include/llvm/Target/TargetSubtargetInfo.h
+++ b/include/llvm/Target/TargetSubtargetInfo.h
@@ -19,9 +19,11 @@
namespace llvm {
+class MachineInstr;
class SDep;
class SUnit;
class TargetRegisterClass;
+class TargetSchedModel;
template <typename T> class SmallVectorImpl;
//===----------------------------------------------------------------------===//
@@ -31,8 +33,8 @@ template <typename T> class SmallVectorImpl;
/// be exposed through a TargetSubtargetInfo-derived class.
///
class TargetSubtargetInfo : public MCSubtargetInfo {
- TargetSubtargetInfo(const TargetSubtargetInfo&); // DO NOT IMPLEMENT
- void operator=(const TargetSubtargetInfo&); // DO NOT IMPLEMENT
+ TargetSubtargetInfo(const TargetSubtargetInfo&) LLVM_DELETED_FUNCTION;
+ void operator=(const TargetSubtargetInfo&) LLVM_DELETED_FUNCTION;
protected: // Can only create subclasses...
TargetSubtargetInfo();
public:
@@ -43,6 +45,15 @@ public:
virtual ~TargetSubtargetInfo();
+ /// Resolve a SchedClass at runtime, where SchedClass identifies an
+ /// MCSchedClassDesc with the isVariant property. This may return the ID of
+ /// another variant SchedClass, but repeated invocation must quickly terminate
+ /// in a nonvariant SchedClass.
+ virtual unsigned resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,
+ const TargetSchedModel* SchedModel) const {
+ return 0;
+ }
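+
+  // A typical driver (illustrative sketch, not part of this patch): callers
+  // re-invoke this until the returned class is no longer variant, e.g.
+  //   while (isVariantClass(SC))  // hypothetical helper
+  //     SC = STI.resolveSchedClass(SC, MI, SchedModel);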
+
/// getSpecialAddressLatency - For targets where it is beneficial to
/// backschedule instructions that compute addresses, return a value
/// indicating the number of scheduling cycles of backscheduling that
@@ -53,13 +64,13 @@ public:
// scheduling and the specified optimization level meets the requirement
// return true to enable post-register-allocation scheduling. In
// CriticalPathRCs return any register classes that should only be broken
- // if on the critical path.
+ // if on the critical path.
virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel,
AntiDepBreakMode& Mode,
RegClassVector& CriticalPathRCs) const;
// adjustSchedDependency - Perform target specific adjustments to
// the latency of a schedule dependency.
- virtual void adjustSchedDependency(SUnit *def, SUnit *use,
+ virtual void adjustSchedDependency(SUnit *def, SUnit *use,
SDep& dep) const { }
};
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index 29b5233e22..1ddca844c9 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -70,6 +70,12 @@ FunctionPass *createAggressiveDCEPass();
//===----------------------------------------------------------------------===//
//
+// SROA - Replace aggregates or pieces of aggregates with scalar SSA values.
+//
+FunctionPass *createSROAPass(bool RequiresDomTree = true);
+
+//===----------------------------------------------------------------------===//
+//
// ScalarReplAggregates - Break up alloca's of aggregates into multiple allocas
// if possible.
//
diff --git a/include/llvm/Transforms/Utils/BypassSlowDivision.h b/include/llvm/Transforms/Utils/BypassSlowDivision.h
index c262434b66..dceda48587 100644
--- a/include/llvm/Transforms/Utils/BypassSlowDivision.h
+++ b/include/llvm/Transforms/Utils/BypassSlowDivision.h
@@ -26,7 +26,7 @@ namespace llvm {
/// profitably bypassed and carried out with a shorter, faster divide.
bool bypassSlowDivision(Function &F,
Function::iterator &I,
- const DenseMap<Type *, Type *> &BypassTypeMap);
+ const DenseMap<Type*, Type*> &BypassTypeMap);
} // End llvm namespace
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index b7b5d29b32..92e15ce932 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -116,13 +116,6 @@ Function *CloneFunction(const Function *F,
bool ModuleLevelChanges,
ClonedCodeInfo *CodeInfo = 0);
-/// CloneFunction - Version of the function that doesn't need the VMap.
-///
-inline Function *CloneFunction(const Function *F, ClonedCodeInfo *CodeInfo = 0){
- ValueToValueMapTy VMap;
- return CloneFunction(F, VMap, CodeInfo);
-}
-
/// Clone OldFunc into NewFunc, transforming the old arguments into references
/// to VMap values. Note that if NewFunc already has basic blocks, the ones
/// cloned into it will be added to the end of the function. This function
diff --git a/include/llvm/Transforms/Utils/IntegerDivision.h b/include/llvm/Transforms/Utils/IntegerDivision.h
new file mode 100644
index 0000000000..8d3f53e6f9
--- /dev/null
+++ b/include/llvm/Transforms/Utils/IntegerDivision.h
@@ -0,0 +1,38 @@
+//===- llvm/Transforms/Utils/IntegerDivision.h ------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32-bit integer division for targets
+// that don't have native support. It's largely derived from compiler-rt's
+// implementation of __udivsi3, but hand-tuned for targets that prefer less
+// control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TRANSFORMS_UTILS_INTEGERDIVISION_H
+#define TRANSFORMS_UTILS_INTEGERDIVISION_H
+
+namespace llvm {
+ class BinaryOperator;
+}
+
+namespace llvm {
+
+  /// Generate code to divide two integers, replacing Div with the generated
+  /// code. This currently generates code similar to compiler-rt's
+  /// implementations, but future work includes generating more specialized
+  /// code when more information about the operands is known. It currently
+  /// only implements 32-bit scalar division; removing this limitation is
+  /// also future work.
+ ///
+ /// @brief Replace Div with generated code.
+ bool expandDivision(BinaryOperator* Div);
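+
+  // A minimal usage sketch (illustrative, not part of this patch): expand a
+  // 32-bit udiv instruction reached during a pass:
+  //   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+  //     if (BO->getOpcode() == Instruction::UDiv)
+  //       expandDivision(BO);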
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/Transforms/Utils/SSAUpdater.h b/include/llvm/Transforms/Utils/SSAUpdater.h
index 4c821491b2..db65a47e97 100644
--- a/include/llvm/Transforms/Utils/SSAUpdater.h
+++ b/include/llvm/Transforms/Utils/SSAUpdater.h
@@ -109,8 +109,8 @@ public:
private:
Value *GetValueAtEndOfBlockInternal(BasicBlock *BB);
- void operator=(const SSAUpdater&); // DO NOT IMPLEMENT
- SSAUpdater(const SSAUpdater&); // DO NOT IMPLEMENT
+ void operator=(const SSAUpdater&) LLVM_DELETED_FUNCTION;
+ SSAUpdater(const SSAUpdater&) LLVM_DELETED_FUNCTION;
};
/// LoadAndStorePromoter - This little helper class provides a convenient way to
diff --git a/include/llvm/Use.h b/include/llvm/Use.h
index a496325c1f..80804459cc 100644
--- a/include/llvm/Use.h
+++ b/include/llvm/Use.h
@@ -26,6 +26,7 @@
#define LLVM_USE_H
#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/Support/Compiler.h"
#include <cstddef>
#include <iterator>
@@ -66,7 +67,7 @@ public:
private:
/// Copy ctor - do not implement
- Use(const Use &U);
+ Use(const Use &U) LLVM_DELETED_FUNCTION;
/// Destructor - Only for zap()
~Use() {
diff --git a/include/llvm/User.h b/include/llvm/User.h
index 5d5460cd6f..abd80afb70 100644
--- a/include/llvm/User.h
+++ b/include/llvm/User.h
@@ -31,8 +31,8 @@ template <class>
struct OperandTraits;
class User : public Value {
- User(const User &); // Do not implement
- void *operator new(size_t); // Do not implement
+ User(const User &) LLVM_DELETED_FUNCTION;
+ void *operator new(size_t) LLVM_DELETED_FUNCTION;
template <unsigned>
friend struct HungoffOperandTraits;
virtual void anchor();
diff --git a/include/llvm/Value.h b/include/llvm/Value.h
index d7ccd4dccc..41acc84415 100644
--- a/include/llvm/Value.h
+++ b/include/llvm/Value.h
@@ -16,6 +16,7 @@
#include "llvm/Use.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
namespace llvm {
@@ -80,8 +81,8 @@ private:
friend class ValueHandleBase;
ValueName *Name;
- void operator=(const Value &); // Do not implement
- Value(const Value &); // Do not implement
+ void operator=(const Value &) LLVM_DELETED_FUNCTION;
+ Value(const Value &) LLVM_DELETED_FUNCTION;
protected:
/// printCustom - Value subclasses can override this to implement custom
@@ -126,7 +127,7 @@ public:
/// setName() - Change the name of the value, choosing a new unique name if
/// the provided name is taken.
///
- /// \arg Name - The new name; or "" if the value's name should be removed.
+ /// \param Name The new name; or "" if the value's name should be removed.
void setName(const Twine &Name);
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index e9dcb37903..3199a118eb 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -590,7 +590,7 @@ void AliasSetTracker::print(raw_ostream &OS) const {
OS << "\n";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void AliasSet::dump() const { print(dbgs()); }
void AliasSetTracker::dump() const { print(dbgs()); }
#endif
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index a3bc06a80f..7fad8fecb0 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -1086,7 +1086,7 @@ BasicAliasAnalysis::aliasPHI(const PHINode *PN, uint64_t PNSize,
// We assume for the recursion that the phis (ptr_phi, ptr2_phi) do
// not alias each other.
bool ArePhisAssumedNoAlias = false;
- AliasResult OrigAliasResult;
+ AliasResult OrigAliasResult = NoAlias;
if (Alias == NoAlias) {
// Pretend the phis do not alias.
assert(AliasCache.count(Locs) &&
diff --git a/lib/Analysis/DominanceFrontier.cpp b/lib/Analysis/DominanceFrontier.cpp
index 5536a9b705..3e537e9f1a 100644
--- a/lib/Analysis/DominanceFrontier.cpp
+++ b/lib/Analysis/DominanceFrontier.cpp
@@ -133,7 +133,7 @@ void DominanceFrontierBase::print(raw_ostream &OS, const Module* ) const {
}
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void DominanceFrontierBase::dump() const {
print(dbgs());
}
diff --git a/lib/Analysis/IPA/CallGraph.cpp b/lib/Analysis/IPA/CallGraph.cpp
index 947ad519c6..17631ddb30 100644
--- a/lib/Analysis/IPA/CallGraph.cpp
+++ b/lib/Analysis/IPA/CallGraph.cpp
@@ -198,7 +198,7 @@ void CallGraph::print(raw_ostream &OS, Module*) const {
for (CallGraph::const_iterator I = begin(), E = end(); I != E; ++I)
I->second->print(OS);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void CallGraph::dump() const {
print(dbgs(), 0);
}
@@ -269,7 +269,7 @@ void CallGraphNode::print(raw_ostream &OS) const {
OS << '\n';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void CallGraphNode::dump() const { print(dbgs()); }
#endif
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
index f70518165a..3807ccdbf8 100644
--- a/lib/Analysis/IVUsers.cpp
+++ b/lib/Analysis/IVUsers.cpp
@@ -273,7 +273,7 @@ void IVUsers::print(raw_ostream &OS, const Module *M) const {
}
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void IVUsers::dump() const {
print(dbgs());
}
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 12be7fdc14..1a94665096 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -51,9 +51,12 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
int Cost;
const bool AlwaysInline;
- bool IsRecursive;
+ bool IsCallerRecursive;
+ bool IsRecursiveCall;
bool ExposesReturnsTwice;
bool HasDynamicAlloca;
+ /// Number of bytes allocated statically by the callee.
+ uint64_t AllocatedSize;
unsigned NumInstructions, NumVectorInstructions;
int FiftyPercentVectorBonus, TenPercentVectorBonus;
int VectorBonus;
@@ -126,7 +129,8 @@ public:
CallAnalyzer(const TargetData *TD, Function &Callee, int Threshold)
: TD(TD), F(Callee), Threshold(Threshold), Cost(0),
AlwaysInline(F.hasFnAttr(Attribute::AlwaysInline)),
- IsRecursive(false), ExposesReturnsTwice(false), HasDynamicAlloca(false),
+ IsCallerRecursive(false), IsRecursiveCall(false),
+ ExposesReturnsTwice(false), HasDynamicAlloca(false), AllocatedSize(0),
NumInstructions(0), NumVectorInstructions(0),
FiftyPercentVectorBonus(0), TenPercentVectorBonus(0), VectorBonus(0),
NumConstantArgs(0), NumConstantOffsetPtrArgs(0), NumAllocaArgs(0),
@@ -269,6 +273,13 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
// FIXME: Check whether inlining will turn a dynamic alloca into a static
// alloca, and handle that case.
+ // Accumulate the allocated size.
+ if (I.isStaticAlloca()) {
+ Type *Ty = I.getAllocatedType();
+ AllocatedSize += (TD ? TD->getTypeAllocSize(Ty) :
+ Ty->getPrimitiveSizeInBits());
+ }
+
// We will happily inline static alloca instructions or dynamic alloca
// instructions in always-inline situations.
if (AlwaysInline || I.isStaticAlloca())
@@ -625,7 +636,7 @@ bool CallAnalyzer::visitCallSite(CallSite CS) {
if (F == CS.getInstruction()->getParent()->getParent()) {
// This flag will fully abort the analysis, so don't bother with anything
// else.
- IsRecursive = true;
+ IsRecursiveCall = true;
return false;
}
@@ -712,7 +723,14 @@ bool CallAnalyzer::analyzeBlock(BasicBlock *BB) {
Cost += InlineConstants::InstrCost;
// If visiting this instruction detected an uninlinable pattern, abort.
- if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+ if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
+ return false;
+
+ // If the caller is a recursive function then we don't want to inline
+ // functions which allocate a lot of stack space because it would increase
+ // the caller stack usage dramatically.
+ if (IsCallerRecursive &&
+ AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
return false;
if (NumVectorInstructions > NumInstructions/2)
@@ -831,12 +849,14 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
Cost += InlineConstants::LastCallToStaticBonus;
// If the instruction after the call, or if the normal destination of the
- // invoke is an unreachable instruction, the function is noreturn. As such,
- // there is little point in inlining this unless there is literally zero cost.
- if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ // invoke is an unreachable instruction, the function is noreturn. As such,
+ // there is little point in inlining this unless there is literally zero
+ // cost.
+ Instruction *Instr = CS.getInstruction();
+ if (InvokeInst *II = dyn_cast<InvokeInst>(Instr)) {
if (isa<UnreachableInst>(II->getNormalDest()->begin()))
Threshold = 1;
- } else if (isa<UnreachableInst>(++BasicBlock::iterator(CS.getInstruction())))
+ } else if (isa<UnreachableInst>(++BasicBlock::iterator(Instr)))
Threshold = 1;
// If this function uses the coldcc calling convention, prefer not to inline
@@ -852,6 +872,20 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
if (F.empty())
return true;
+ Function *Caller = CS.getInstruction()->getParent()->getParent();
+ // Check if the caller function is recursive itself.
+ for (Value::use_iterator U = Caller->use_begin(), E = Caller->use_end();
+ U != E; ++U) {
+ CallSite Site(cast<Value>(*U));
+ if (!Site)
+ continue;
+ Instruction *I = Site.getInstruction();
+ if (I->getParent()->getParent() == Caller) {
+ IsCallerRecursive = true;
+ break;
+ }
+ }
+
// Track whether we've seen a return instruction. The first return
// instruction is free, as at least one will usually disappear in inlining.
bool HasReturn = false;
@@ -908,9 +942,9 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// We never want to inline functions that contain an indirectbr. This is
// incorrect because all the blockaddress's (in static global initializers
- // for example) would be referring to the original function, and this indirect
- // jump would jump from the inlined copy of the function into the original
- // function which is extremely undefined behavior.
+ // for example) would be referring to the original function, and this
+ // indirect jump would jump from the inlined copy of the function into the
+ // original function which is extremely undefined behavior.
// FIXME: This logic isn't really right; we can safely inline functions
// with indirectbr's as long as no other function or global references the
// blockaddress of a block within the current function. And as a QOI issue,
@@ -928,8 +962,16 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// Analyze the cost of this block. If we blow through the threshold, this
// returns false, and we can bail on out.
if (!analyzeBlock(BB)) {
- if (IsRecursive || ExposesReturnsTwice || HasDynamicAlloca)
+ if (IsRecursiveCall || ExposesReturnsTwice || HasDynamicAlloca)
return false;
+
+ // If the caller is a recursive function then we don't want to inline
+ // functions which allocate a lot of stack space because it would increase
+ // the caller stack usage dramatically.
+ if (IsCallerRecursive &&
+ AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller)
+ return false;
+
break;
}
@@ -955,7 +997,8 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
// If we're unable to select a particular successor, just count all of
// them.
- for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize; ++TIdx)
+ for (unsigned TIdx = 0, TSize = TI->getNumSuccessors(); TIdx != TSize;
+ ++TIdx)
BBWorklist.insert(TI->getSuccessor(TIdx));
// If we had any successors at this point, then post-inlining is likely to
@@ -974,7 +1017,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
return AlwaysInline || Cost < Threshold;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// \brief Dump stats about this call's analysis.
void CallAnalyzer::dump() {
#define DEBUG_PRINT_STAT(x) llvm::dbgs() << " " #x ": " << x << "\n"
@@ -1003,7 +1046,8 @@ InlineCost InlineCostAnalyzer::getInlineCost(CallSite CS, Function *Callee,
Callee->hasFnAttr(Attribute::NoInline) || CS.isNoInline())
return llvm::InlineCost::getNever();
- DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "...\n");
+ DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
+ << "...\n");
CallAnalyzer CA(TD, *Callee, Threshold);
bool ShouldInline = CA.analyzeCall(CS);
diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp
index 4a18104776..8341f9d830 100644
--- a/lib/Analysis/LoopInfo.cpp
+++ b/lib/Analysis/LoopInfo.cpp
@@ -306,7 +306,7 @@ BasicBlock *Loop::getUniqueExitBlock() const {
return 0;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Loop::dump() const {
print(dbgs());
}
diff --git a/lib/Analysis/PHITransAddr.cpp b/lib/Analysis/PHITransAddr.cpp
index d6a17ca725..c35737e472 100644
--- a/lib/Analysis/PHITransAddr.cpp
+++ b/lib/Analysis/PHITransAddr.cpp
@@ -41,7 +41,7 @@ static bool CanPHITrans(Instruction *Inst) {
return false;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void PHITransAddr::dump() const {
if (Addr == 0) {
dbgs() << "PHITransAddr: null\n";
diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp
index 0f9a8b3ac4..30f0d2f10d 100644
--- a/lib/Analysis/RegionInfo.cpp
+++ b/lib/Analysis/RegionInfo.cpp
@@ -427,7 +427,7 @@ void Region::print(raw_ostream &OS, bool print_tree, unsigned level,
OS.indent(level*2) << "} \n";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Region::dump() const {
print(dbgs(), true, getDepth(), printStyle.getValue());
}
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 84e147bf8e..9b9c889496 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -122,7 +122,7 @@ char ScalarEvolution::ID = 0;
// Implementation of the SCEV class.
//
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void SCEV::dump() const {
print(dbgs());
dbgs() << '\n';
diff --git a/lib/Analysis/Trace.cpp b/lib/Analysis/Trace.cpp
index dbb9535728..22da857620 100644
--- a/lib/Analysis/Trace.cpp
+++ b/lib/Analysis/Trace.cpp
@@ -43,7 +43,7 @@ void Trace::print(raw_ostream &O) const {
O << "; Trace parent function: \n" << *F;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Debugger convenience method; writes trace to standard error
/// output stream.
///
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index b0b64d89d9..eedec8383a 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -967,7 +967,7 @@ bool LLParser::ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind) {
unsigned Alignment;
if (ParseOptionalStackAlignment(Alignment))
return true;
- Attrs |= Attribute::constructStackAlignmentFromInt(Alignment);
+ Attrs |= Attributes::constructStackAlignmentFromInt(Alignment);
continue;
}
@@ -975,7 +975,7 @@ bool LLParser::ParseOptionalAttrs(Attributes &Attrs, unsigned AttrKind) {
unsigned Alignment;
if (ParseOptionalAlignment(Alignment))
return true;
- Attrs |= Attribute::constructAlignmentFromInt(Alignment);
+ Attrs |= Attributes::constructAlignmentFromInt(Alignment);
continue;
}
@@ -2717,7 +2717,7 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
// If the alignment was parsed as an attribute, move to the alignment field.
if (FuncAttrs & Attribute::Alignment) {
- Alignment = Attribute::getAlignmentFromAttrs(FuncAttrs);
+ Alignment = FuncAttrs.getAlignment();
FuncAttrs &= ~Attribute::Alignment;
}
@@ -2726,16 +2726,16 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
std::vector<Type*> ParamTypeList;
SmallVector<AttributeWithIndex, 8> Attrs;
- if (RetAttrs != Attribute::None)
+ if (RetAttrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
ParamTypeList.push_back(ArgList[i].Ty);
- if (ArgList[i].Attrs != Attribute::None)
+ if (ArgList[i].Attrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
}
- if (FuncAttrs != Attribute::None)
+ if (FuncAttrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(~0, FuncAttrs));
AttrListPtr PAL = AttrListPtr::get(Attrs);
@@ -3253,7 +3253,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
// Set up the Attributes for the function.
SmallVector<AttributeWithIndex, 8> Attrs;
- if (RetAttrs != Attribute::None)
+ if (RetAttrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
SmallVector<Value*, 8> Args;
@@ -3274,14 +3274,14 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs != Attribute::None)
+ if (ArgList[i].Attrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs != Attribute::None)
+ if (FnAttrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
// Finish off the Attributes and check them
@@ -3649,7 +3649,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
// Set up the Attributes for the function.
SmallVector<AttributeWithIndex, 8> Attrs;
- if (RetAttrs != Attribute::None)
+ if (RetAttrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(0, RetAttrs));
SmallVector<Value*, 8> Args;
@@ -3670,14 +3670,14 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
return Error(ArgList[i].Loc, "argument is not of expected type '" +
getTypeString(ExpectedTy) + "'");
Args.push_back(ArgList[i].V);
- if (ArgList[i].Attrs != Attribute::None)
+ if (ArgList[i].Attrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(i+1, ArgList[i].Attrs));
}
if (I != E)
return Error(CallLoc, "not enough parameters specified for call");
- if (FnAttrs != Attribute::None)
+ if (FnAttrs.hasAttributes())
Attrs.push_back(AttributeWithIndex::get(~0, FnAttrs));
// Finish off the Attributes and check them
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index 9130ddef16..4a11223711 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -52,6 +52,8 @@ void BitcodeReader::FreeState() {
std::vector<Function*>().swap(FunctionsWithBodies);
DeferredFunctionInfo.clear();
MDKindMap.clear();
+
+ assert(BlockAddrFwdRefs.empty() && "Unresolved blockaddress fwd references");
}
//===----------------------------------------------------------------------===//
@@ -197,7 +199,7 @@ namespace {
/// @brief A class for maintaining the slot number definition
/// as a placeholder for the actual definition for forward constants defs.
class ConstantPlaceHolder : public ConstantExpr {
- void operator=(const ConstantPlaceHolder &); // DO NOT IMPLEMENT
+ void operator=(const ConstantPlaceHolder &) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -1300,13 +1302,27 @@ bool BitcodeReader::ParseConstants() {
Function *Fn =
dyn_cast_or_null<Function>(ValueList.getConstantFwdRef(Record[1],FnTy));
if (Fn == 0) return Error("Invalid CE_BLOCKADDRESS record");
-
- GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(),
- Type::getInt8Ty(Context),
+
+ // If the function is already parsed we can insert the block address right
+ // away.
+ if (!Fn->empty()) {
+ Function::iterator BBI = Fn->begin(), BBE = Fn->end();
+ for (size_t I = 0, E = Record[2]; I != E; ++I) {
+ if (BBI == BBE)
+ return Error("Invalid blockaddress block #");
+ ++BBI;
+ }
+ V = BlockAddress::get(Fn, BBI);
+ } else {
+ // Otherwise insert a placeholder and remember it so it can be inserted
+ // when the function is parsed.
+ GlobalVariable *FwdRef = new GlobalVariable(*Fn->getParent(),
+ Type::getInt8Ty(Context),
false, GlobalValue::InternalLinkage,
- 0, "");
- BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef));
- V = FwdRef;
+ 0, "");
+ BlockAddrFwdRefs[Fn].push_back(std::make_pair(Record[2], FwdRef));
+ V = FwdRef;
+ }
break;
}
}
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index a6ca536062..75468e6c5e 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -78,9 +78,9 @@ private:
unsigned FirstFuncConstantID;
unsigned FirstInstID;
-
- ValueEnumerator(const ValueEnumerator &); // DO NOT IMPLEMENT
- void operator=(const ValueEnumerator &); // DO NOT IMPLEMENT
+
+ ValueEnumerator(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
+ void operator=(const ValueEnumerator &) LLVM_DELETED_FUNCTION;
public:
ValueEnumerator(const Module *M);
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 447f3981b5..1f3f5a5f38 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -318,7 +318,7 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, Attributes CalleeRetAttr,
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt))
+ if (CallerRetAttr.hasZExtAttr() || CallerRetAttr.hasSExtAttr())
return false;
// Otherwise, make sure the unmodified return value of I is the return value.
@@ -358,7 +358,7 @@ bool llvm::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
return false;
// It's not safe to eliminate the sign / zero extension of the return value.
- if ((CallerRetAttr & Attribute::ZExt) || (CallerRetAttr & Attribute::SExt))
+ if (CallerRetAttr.hasZExtAttr() || CallerRetAttr.hasSExtAttr())
return false;
// Check if the only use is a function return node.
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 260871d33b..50f0fc30a0 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -137,80 +137,113 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MDNode *LocMDNode,
report_fatal_error("Error parsing inline asm\n");
}
+static void EmitMSInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, int InlineAsmVariant,
+ AsmPrinter *AP, unsigned LocCookie,
+ raw_ostream &OS) {
+ // Switch to the inline assembly variant.
+ OS << "\t.intel_syntax\n\t";
-/// EmitInlineAsm - This method formats and emits the specified machine
-/// instruction that is an inline asm.
-void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
- assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
-
+ const char *LastEmitted = AsmStr; // One past the last character emitted.
unsigned NumOperands = MI->getNumOperands();
- // Count the number of register definitions to find the asm string.
- unsigned NumDefs = 0;
- for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
- ++NumDefs)
- assert(NumDefs != NumOperands-2 && "No asm string?");
-
- assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+ while (*LastEmitted) {
+ switch (*LastEmitted) {
+ default: {
+ // Not a special case, emit the string section literally.
+ const char *LiteralEnd = LastEmitted+1;
+ while (*LiteralEnd && *LiteralEnd != '{' && *LiteralEnd != '|' &&
+ *LiteralEnd != '}' && *LiteralEnd != '$' && *LiteralEnd != '\n')
+ ++LiteralEnd;
- // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
- const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+ OS.write(LastEmitted, LiteralEnd-LastEmitted);
+ LastEmitted = LiteralEnd;
+ break;
+ }
+ case '\n':
+ ++LastEmitted; // Consume newline character.
+ OS << '\n'; // Indent code with newline.
+ break;
+ case '$': {
+ ++LastEmitted; // Consume '$' character.
+ bool Done = true;
- // If this asmstr is empty, just print the #APP/#NOAPP markers.
- // These are useful to see where empty asm's wound up.
- if (AsmStr[0] == 0) {
- // Don't emit the comments if writing to a .o file.
- if (!OutStreamer.hasRawTextSupport()) return;
+ // Handle escapes.
+ switch (*LastEmitted) {
+ default: Done = false; break;
+ case '$':
+ ++LastEmitted; // Consume second '$' character.
+ break;
+ }
+ if (Done) break;
- OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
- MAI->getInlineAsmStart());
- OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
- MAI->getInlineAsmEnd());
- return;
- }
+ const char *IDStart = LastEmitted;
+ const char *IDEnd = IDStart;
+ while (*IDEnd >= '0' && *IDEnd <= '9') ++IDEnd;
- // Emit the #APP start marker. This has to happen even if verbose-asm isn't
- // enabled, so we use EmitRawText.
- if (OutStreamer.hasRawTextSupport())
- OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
- MAI->getInlineAsmStart());
+ unsigned Val;
+ if (StringRef(IDStart, IDEnd-IDStart).getAsInteger(10, Val))
+ report_fatal_error("Bad $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
+ LastEmitted = IDEnd;
- // Get the !srcloc metadata node if we have it, and decode the loc cookie from
- // it.
- unsigned LocCookie = 0;
- const MDNode *LocMD = 0;
- for (unsigned i = MI->getNumOperands(); i != 0; --i) {
- if (MI->getOperand(i-1).isMetadata() &&
- (LocMD = MI->getOperand(i-1).getMetadata()) &&
- LocMD->getNumOperands() != 0) {
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
- LocCookie = CI->getZExtValue();
- break;
- }
- }
- }
+ if (Val >= NumOperands-1)
+ report_fatal_error("Invalid $ operand number in inline asm string: '" +
+ Twine(AsmStr) + "'");
- // Emit the inline asm to a temporary string so we can emit it through
- // EmitInlineAsm.
- SmallString<256> StringData;
- raw_svector_ostream OS(StringData);
+ // Okay, we finally have a value number. Ask the target to print this
+ // operand!
+ unsigned OpNo = InlineAsm::MIOp_FirstOperand;
- OS << '\t';
+ bool Error = false;
- // The variant of the current asmprinter.
- int AsmPrinterVariant = MAI->getAssemblerDialect();
- int InlineAsmVariant = MI->getInlineAsmDialect();
+ // Scan to find the machine operand number for the operand.
+ for (; Val; --Val) {
+ if (OpNo >= MI->getNumOperands()) break;
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ OpNo += InlineAsm::getNumOperandRegisters(OpFlags) + 1;
+ }
- // Switch to the inline assembly variant.
- if (AsmPrinterVariant != InlineAsmVariant) {
- if (InlineAsmVariant == 0)
- OS << ".att_syntax\n\t";
- else
- OS << ".intel_syntax\n\t";
+      // We may have location metadata attached to the end of the
+      // instruction, and at no point should we see metadata at any
+      // other point while processing. It's an error if so.
+ if (OpNo >= MI->getNumOperands() ||
+ MI->getOperand(OpNo).isMetadata()) {
+ Error = true;
+ } else {
+ unsigned OpFlags = MI->getOperand(OpNo).getImm();
+ ++OpNo; // Skip over the ID number.
+
+ if (InlineAsm::isMemKind(OpFlags)) {
+ Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
+ /*Modifier*/ 0, OS);
+ } else {
+ Error = AP->PrintAsmOperand(MI, OpNo, InlineAsmVariant,
+ /*Modifier*/ 0, OS);
+ }
+ }
+ if (Error) {
+ std::string msg;
+ raw_string_ostream Msg(msg);
+ Msg << "invalid operand in inline asm: '" << AsmStr << "'";
+ MMI->getModule()->getContext().emitError(LocCookie, Msg.str());
+ }
+ break;
+ }
+ }
}
+ OS << "\n\t.att_syntax\n" << (char)0; // null terminate string.
+}
+static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
+ MachineModuleInfo *MMI, int InlineAsmVariant,
+ int AsmPrinterVariant, AsmPrinter *AP,
+ unsigned LocCookie, raw_ostream &OS) {
int CurVariant = -1; // The number of the {.|.|.} region we are in.
const char *LastEmitted = AsmStr; // One past the last character emitted.
+ unsigned NumOperands = MI->getNumOperands();
+
+ OS << '\t';
while (*LastEmitted) {
switch (*LastEmitted) {
@@ -283,7 +316,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
" string: '" + Twine(AsmStr) + "'");
std::string Val(StrStart, StrEnd);
- PrintSpecial(MI, OS, Val.c_str());
+ AP->PrintSpecial(MI, OS, Val.c_str());
LastEmitted = StrEnd+1;
break;
}
@@ -351,7 +384,6 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
// FIXME: What if the operand isn't an MBB, report error?
OS << *MI->getOperand(OpNo).getMBB()->getSymbol();
else {
- AsmPrinter *AP = const_cast<AsmPrinter*>(this);
if (InlineAsm::isMemKind(OpFlags)) {
Error = AP->PrintAsmMemoryOperand(MI, OpNo, InlineAsmVariant,
Modifier[0] ? Modifier : 0,
@@ -373,15 +405,74 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
}
}
}
- // Switch to the AsmPrinter variant.
- if (AsmPrinterVariant != InlineAsmVariant) {
- if (AsmPrinterVariant == 0)
- OS << "\n\t.att_syntax";
- else
- OS << "\n\t.intel_syntax";
+ OS << '\n' << (char)0; // null terminate string.
+}
+
+/// EmitInlineAsm - This method formats and emits the specified machine
+/// instruction that is an inline asm.
+void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
+ assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
+
+ // Count the number of register definitions to find the asm string.
+ unsigned NumDefs = 0;
+ for (; MI->getOperand(NumDefs).isReg() && MI->getOperand(NumDefs).isDef();
+ ++NumDefs)
+ assert(NumDefs != MI->getNumOperands()-2 && "No asm string?");
+
+ assert(MI->getOperand(NumDefs).isSymbol() && "No asm string?");
+
+ // Disassemble the AsmStr, printing out the literal pieces, the operands, etc.
+ const char *AsmStr = MI->getOperand(NumDefs).getSymbolName();
+
+ // If this asmstr is empty, just print the #APP/#NOAPP markers.
+ // These are useful to see where empty asm's wound up.
+ if (AsmStr[0] == 0) {
+ // Don't emit the comments if writing to a .o file.
+ if (!OutStreamer.hasRawTextSupport()) return;
+
+ OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+ MAI->getInlineAsmStart());
+ OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+ MAI->getInlineAsmEnd());
+ return;
}
- OS << '\n' << (char)0; // null terminate string.
+ // Emit the #APP start marker. This has to happen even if verbose-asm isn't
+ // enabled, so we use EmitRawText.
+ if (OutStreamer.hasRawTextSupport())
+ OutStreamer.EmitRawText(Twine("\t")+MAI->getCommentString()+
+ MAI->getInlineAsmStart());
+
+ // Get the !srcloc metadata node if we have it, and decode the loc cookie from
+ // it.
+ unsigned LocCookie = 0;
+ const MDNode *LocMD = 0;
+ for (unsigned i = MI->getNumOperands(); i != 0; --i) {
+ if (MI->getOperand(i-1).isMetadata() &&
+ (LocMD = MI->getOperand(i-1).getMetadata()) &&
+ LocMD->getNumOperands() != 0) {
+ if (const ConstantInt *CI = dyn_cast<ConstantInt>(LocMD->getOperand(0))) {
+ LocCookie = CI->getZExtValue();
+ break;
+ }
+ }
+ }
+
+ // Emit the inline asm to a temporary string so we can emit it through
+ // EmitInlineAsm.
+ SmallString<256> StringData;
+ raw_svector_ostream OS(StringData);
+
+ // The variant of the current asmprinter.
+ int AsmPrinterVariant = MAI->getAssemblerDialect();
+ InlineAsm::AsmDialect InlineAsmVariant = MI->getInlineAsmDialect();
+ AsmPrinter *AP = const_cast<AsmPrinter*>(this);
+ if (InlineAsmVariant == InlineAsm::AD_ATT)
+ EmitGCCInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AsmPrinterVariant,
+ AP, LocCookie, OS);
+ else
+ EmitMSInlineAsmStr(AsmStr, MI, MMI, InlineAsmVariant, AP, LocCookie, OS);
+
EmitInlineAsm(OS.str(), LocMD, MI->getInlineAsmDialect());
// Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
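The operand lookup in EmitMSInlineAsmStr relies on the inline-asm operand layout: each constraint is encoded as one immediate flag word followed by its register operands, so reaching asm operand N means skipping N whole groups. A minimal standalone sketch of that walk, assuming a toy flag encoding (low four bits hold the register count) rather than InlineAsm's real bit layout:

    #include <cstdio>
    #include <vector>

    // Toy flag word: low four bits give the number of register operands that
    // follow (the idea behind InlineAsm::getNumOperandRegisters, not its
    // actual encoding).
    static unsigned numRegs(unsigned Flags) { return Flags & 0xF; }

    // Return the index of the flag word for asm operand Val, or -1 when the
    // operand stream runs out first (the error path in the code above).
    static int findOperandGroup(const std::vector<unsigned> &Ops, unsigned Val) {
      unsigned Idx = 0;
      for (; Val; --Val) {
        if (Idx >= Ops.size())
          return -1;
        Idx += numRegs(Ops[Idx]) + 1; // skip the flag word and its registers
      }
      return Idx < Ops.size() ? (int)Idx : -1;
    }

    int main() {
      // Three groups holding 2, 1, and 3 registers respectively.
      std::vector<unsigned> Ops = {2, 100, 101, 1, 102, 3, 103, 104, 105};
      std::printf("%d %d %d\n", findOperandGroup(Ops, 0),  // 0
                  findOperandGroup(Ops, 1),                // 3
                  findOperandGroup(Ops, 9));               // -1
    }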
diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
index 963b8cdf34..92d1bbe4f7 100644
--- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
+++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.h
@@ -237,8 +237,8 @@ private:
#endif
};
- DwarfAccelTable(const DwarfAccelTable&); // DO NOT IMPLEMENT
- void operator=(const DwarfAccelTable&); // DO NOT IMPLEMENT
+ DwarfAccelTable(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
+ void operator=(const DwarfAccelTable&) LLVM_DELETED_FUNCTION;
// Internal Functions
void EmitHeader(AsmPrinter *);
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 5e5d9c83d5..ee8a6f36e7 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -330,6 +330,9 @@ DIE *DwarfDebug::updateSubprogramScopeDIE(CompileUnit *SPCU,
SPCU->addType(Arg, ATy);
if (ATy.isArtificial())
SPCU->addFlag(Arg, dwarf::DW_AT_artificial);
+ if (ATy.isObjectPointer())
+ SPCU->addDIEEntry(SPDie, dwarf::DW_AT_object_pointer, dwarf::DW_FORM_ref4,
+ Arg);
SPDie->addChild(Arg);
}
DIE *SPDeclDie = SPDie;
@@ -496,21 +499,26 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
return NULL;
SmallVector<DIE *, 8> Children;
+ DIE *ObjectPointer = NULL;
// Collect arguments for current function.
if (LScopes.isCurrentFunctionScope(Scope))
for (unsigned i = 0, N = CurrentFnArguments.size(); i < N; ++i)
if (DbgVariable *ArgDV = CurrentFnArguments[i])
if (DIE *Arg =
- TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope()))
+ TheCU->constructVariableDIE(ArgDV, Scope->isAbstractScope())) {
Children.push_back(Arg);
+ if (ArgDV->isObjectPointer()) ObjectPointer = Arg;
+ }
// Collect lexical scope children first.
const SmallVector<DbgVariable *, 8> &Variables = ScopeVariables.lookup(Scope);
for (unsigned i = 0, N = Variables.size(); i < N; ++i)
if (DIE *Variable =
- TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope()))
+ TheCU->constructVariableDIE(Variables[i], Scope->isAbstractScope())) {
Children.push_back(Variable);
+ if (Variables[i]->isObjectPointer()) ObjectPointer = Variable;
+ }
const SmallVector<LexicalScope *, 4> &Scopes = Scope->getChildren();
for (unsigned j = 0, M = Scopes.size(); j < M; ++j)
if (DIE *Nested = constructScopeDIE(TheCU, Scopes[j]))
@@ -544,6 +552,10 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) {
E = Children.end(); I != E; ++I)
ScopeDIE->addChild(*I);
+ if (DS.isSubprogram() && ObjectPointer != NULL)
+ TheCU->addDIEEntry(ScopeDIE, dwarf::DW_AT_object_pointer,
+ dwarf::DW_FORM_ref4, ObjectPointer);
+
if (DS.isSubprogram())
TheCU->addPubTypes(DISubprogram(DS));
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index df4aedb540..5508674bc9 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -159,11 +159,19 @@ public:
bool isArtificial() const {
if (Var.isArtificial())
return true;
- if (Var.getTag() == dwarf::DW_TAG_arg_variable
- && getType().isArtificial())
+ if (getType().isArtificial())
return true;
return false;
}
+
+ bool isObjectPointer() const {
+ if (Var.isObjectPointer())
+ return true;
+ if (getType().isObjectPointer())
+ return true;
+ return false;
+ }
+
bool variableHasComplexAddress() const {
assert(Var.Verify() && "Invalid complex DbgVariable!");
return Var.hasComplexAddress();
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 386509b702..fa6d4e16cf 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -45,6 +45,7 @@ add_llvm_library(LLVMCodeGen
MachineCopyPropagation.cpp
MachineCSE.cpp
MachineDominators.cpp
+ MachinePostDominators.cpp
MachineFunction.cpp
MachineFunctionAnalysis.cpp
MachineFunctionPass.cpp
@@ -102,6 +103,7 @@ add_llvm_library(LLVMCodeGen
TargetInstrInfoImpl.cpp
TargetLoweringObjectFileImpl.cpp
TargetOptionsImpl.cpp
+ TargetSchedule.cpp
TwoAddressInstructionPass.cpp
UnreachableBlockElim.cpp
VirtRegMap.cpp
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 65f0941287..a53f6f8d0f 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -41,6 +41,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeMachineCopyPropagationPass(Registry);
initializeMachineCSEPass(Registry);
initializeMachineDominatorTreePass(Registry);
+ initializeMachinePostDominatorTreePass(Registry);
initializeMachineLICMPass(Registry);
initializeMachineLoopInfoPass(Registry);
initializeMachineModuleInfoPass(Registry);
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 622127cc74..37828a70b5 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -863,7 +863,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg,
// If the instruction also writes VirtReg.reg, it had better not require the
// same register for uses and defs.
SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
- MIBundleOperands::RegInfo RI =
+ MIBundleOperands::VirtRegInfo RI =
MIBundleOperands(MI).analyzeVirtReg(VirtReg.reg, &Ops);
if (RI.Tied) {
markValueUsed(&VirtReg, ParentVNI);
@@ -1142,7 +1142,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
// Analyze instruction.
SmallVector<std::pair<MachineInstr*, unsigned>, 8> Ops;
- MIBundleOperands::RegInfo RI =
+ MIBundleOperands::VirtRegInfo RI =
MIBundleOperands(MI).analyzeVirtReg(Reg, &Ops);
// Find the slot index where this instruction reads and writes OldLI.
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index cac0c83bca..24daafaa62 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -172,7 +172,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI, STI,
*Context);
- MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
}
MCStreamer *S = getTarget().createAsmStreamer(*Context, Out,
@@ -191,7 +191,7 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM,
// emission fails.
MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
STI, *Context);
- MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
if (MCE == 0 || MAB == 0)
return true;
@@ -266,7 +266,7 @@ bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM,
const MCSubtargetInfo &STI = getSubtarget<MCSubtargetInfo>();
MCCodeEmitter *MCE = getTarget().createMCCodeEmitter(*getInstrInfo(), MRI,
STI, *Ctx);
- MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple());
+ MCAsmBackend *MAB = getTarget().createMCAsmBackend(getTargetTriple(), TargetCPU);
if (MCE == 0 || MAB == 0)
return true;
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 3e9b485ca8..f4ebcd6fa4 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -69,21 +69,6 @@ VNInfo *LiveInterval::createDeadDef(SlotIndex Def,
return VNI;
}
-/// killedInRange - Return true if the interval has kills in [Start,End).
-bool LiveInterval::killedInRange(SlotIndex Start, SlotIndex End) const {
- Ranges::const_iterator r =
- std::lower_bound(ranges.begin(), ranges.end(), End);
-
- // Now r points to the first interval with start >= End, or ranges.end().
- if (r == ranges.begin())
- return false;
-
- --r;
- // Now r points to the last interval with end <= End.
- // r->end is the kill point.
- return r->end >= Start && r->end < End;
-}
-
// overlaps - Return true if the intersection of the two live intervals is
// not empty.
//
@@ -716,27 +701,6 @@ VNInfo* LiveInterval::MergeValueNumberInto(VNInfo *V1, VNInfo *V2) {
return V2;
}
-void LiveInterval::Copy(const LiveInterval &RHS,
- MachineRegisterInfo *MRI,
- VNInfo::Allocator &VNInfoAllocator) {
- ranges.clear();
- valnos.clear();
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(RHS.reg);
- MRI->setRegAllocationHint(reg, Hint.first, Hint.second);
-
- weight = RHS.weight;
- for (unsigned i = 0, e = RHS.getNumValNums(); i != e; ++i) {
- const VNInfo *VNI = RHS.getValNumInfo(i);
- createValueCopy(VNI, VNInfoAllocator);
- }
- for (unsigned i = 0, e = RHS.ranges.size(); i != e; ++i) {
- const LiveRange &LR = RHS.ranges[i];
- addRange(LiveRange(LR.start, LR.end, getValNumInfo(LR.valno->id)));
- }
-
- verify();
-}
-
unsigned LiveInterval::getSize() const {
unsigned Sum = 0;
for (const_iterator I = begin(), E = end(); I != E; ++I)
@@ -748,7 +712,7 @@ raw_ostream& llvm::operator<<(raw_ostream& os, const LiveRange &LR) {
return os << '[' << LR.start << ',' << LR.end << ':' << LR.valno->id << ")";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveRange::dump() const {
dbgs() << *this << "\n";
}
@@ -785,7 +749,7 @@ void LiveInterval::print(raw_ostream &OS) const {
}
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveInterval::dump() const {
dbgs() << *this << "\n";
}
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 17f9d9e982..dca305849e 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -156,7 +156,7 @@ void LiveIntervals::printInstrs(raw_ostream &OS) const {
MF->print(OS, Indexes);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LiveIntervals::dumpInstrs() const {
printInstrs(dbgs());
}
@@ -731,6 +731,70 @@ bool LiveIntervals::shrinkToUses(LiveInterval *li,
return CanSeparate;
}
+void LiveIntervals::extendToIndices(LiveInterval *LI,
+ ArrayRef<SlotIndex> Indices) {
+ assert(LRCalc && "LRCalc not initialized.");
+ LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ for (unsigned i = 0, e = Indices.size(); i != e; ++i)
+ LRCalc->extend(LI, Indices[i]);
+}
+
+void LiveIntervals::pruneValue(LiveInterval *LI, SlotIndex Kill,
+ SmallVectorImpl<SlotIndex> *EndPoints) {
+ LiveRangeQuery LRQ(*LI, Kill);
+ VNInfo *VNI = LRQ.valueOut();
+ if (!VNI)
+ return;
+
+ MachineBasicBlock *KillMBB = Indexes->getMBBFromIndex(Kill);
+ SlotIndex MBBStart, MBBEnd;
+ tie(MBBStart, MBBEnd) = Indexes->getMBBRange(KillMBB);
+
+ // If VNI isn't live out from KillMBB, the value is trivially pruned.
+ if (LRQ.endPoint() < MBBEnd) {
+ LI->removeRange(Kill, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ return;
+ }
+
+ // VNI is live out of KillMBB.
+ LI->removeRange(Kill, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+
+ // Find all blocks that are reachable from MBB without leaving VNI's live
+ // range.
+ for (df_iterator<MachineBasicBlock*>
+ I = df_begin(KillMBB), E = df_end(KillMBB); I != E;) {
+ MachineBasicBlock *MBB = *I;
+ // KillMBB itself was already handled.
+ if (MBB == KillMBB) {
+ ++I;
+ continue;
+ }
+
+ // Check if VNI is live in to MBB.
+ tie(MBBStart, MBBEnd) = Indexes->getMBBRange(MBB);
+ LiveRangeQuery LRQ(*LI, MBBStart);
+ if (LRQ.valueIn() != VNI) {
+ // This block isn't part of the VNI live range. Prune the search.
+ I.skipChildren();
+ continue;
+ }
+
+ // Prune the search if VNI is killed in MBB.
+ if (LRQ.endPoint() < MBBEnd) {
+ LI->removeRange(MBBStart, LRQ.endPoint());
+ if (EndPoints) EndPoints->push_back(LRQ.endPoint());
+ I.skipChildren();
+ continue;
+ }
+
+ // VNI is live through MBB.
+ LI->removeRange(MBBStart, MBBEnd);
+ if (EndPoints) EndPoints->push_back(MBBEnd);
+ ++I;
+ }
+}
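The block walk in pruneValue is a depth-first search that cuts off whole subtrees: once it reaches a block where the value is not live-in, or where the value dies, that block's successors need not be visited (df_iterator's skipChildren above). A minimal sketch of the traversal on a toy CFG, with plain booleans standing in for the LiveRangeQuery checks:

    #include <cstdio>
    #include <vector>

    // Toy CFG: Succ[b] lists the successors of block b; LiveIn/KilledIn model
    // "value live into b" and "value killed inside b".
    struct CFG {
      std::vector<std::vector<int>> Succ;
      std::vector<bool> LiveIn, KilledIn;
    };

    // Visit blocks reachable from Kill while the value stays live, mimicking
    // df_iterator with skipChildren(): a block that is not part of the live
    // range, or that kills the value, does not have its successors explored.
    static void pruneFrom(const CFG &G, int Kill) {
      std::vector<bool> Seen(G.Succ.size(), false);
      std::vector<int> Stack{Kill};
      Seen[Kill] = true;
      while (!Stack.empty()) {
        int B = Stack.back();
        Stack.pop_back();
        if (B != Kill) { // the kill block itself was already handled
          if (!G.LiveIn[B])
            continue; // not in the live range: skip children
          std::printf("prune block %d\n", B);
          if (G.KilledIn[B])
            continue; // value dies here: skip children
        }
        for (int S : G.Succ[B])
          if (!Seen[S]) {
            Seen[S] = true;
            Stack.push_back(S);
          }
      }
    }

    int main() {
      // 0 -> 1, 1 -> {2,3}; the value is live into 1 and 3, and dies in 3.
      CFG G{{{1}, {2, 3}, {}, {}},
            {false, true, false, true},
            {false, false, false, true}};
      pruneFrom(G, 0); // prunes 1 and 3; 2 is skipped (value not live-in)
    }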
//===----------------------------------------------------------------------===//
// Register allocator hooks.
@@ -1196,14 +1260,35 @@ private:
// Return the last use of reg between NewIdx and OldIdx.
SlotIndex findLastUseBefore(unsigned Reg, SlotIndex OldIdx) {
SlotIndex LastUse = NewIdx;
- for (MachineRegisterInfo::use_nodbg_iterator
- UI = MRI.use_nodbg_begin(Reg),
- UE = MRI.use_nodbg_end();
- UI != UE; UI.skipInstruction()) {
- const MachineInstr* MI = &*UI;
- SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
- if (InstSlot > LastUse && InstSlot < OldIdx)
- LastUse = InstSlot;
+
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ for (MachineRegisterInfo::use_nodbg_iterator
+ UI = MRI.use_nodbg_begin(Reg),
+ UE = MRI.use_nodbg_end();
+ UI != UE; UI.skipInstruction()) {
+ const MachineInstr* MI = &*UI;
+ SlotIndex InstSlot = LIS.getSlotIndexes()->getInstructionIndex(MI);
+ if (InstSlot > LastUse && InstSlot < OldIdx)
+ LastUse = InstSlot;
+ }
+ } else {
+ MachineInstr* MI = LIS.getSlotIndexes()->getInstructionFromIndex(NewIdx);
+ MachineBasicBlock::iterator MII(MI);
+ ++MII;
+ MachineBasicBlock* MBB = MI->getParent();
+ for (; MII != MBB->end() && LIS.getInstructionIndex(MII) < OldIdx; ++MII){
+ for (MachineInstr::mop_iterator MOI = MII->operands_begin(),
+ MOE = MII->operands_end();
+ MOI != MOE; ++MOI) {
+ const MachineOperand& mop = *MOI;
+ if (!mop.isReg() || mop.getReg() == 0 ||
+ TargetRegisterInfo::isVirtualRegister(mop.getReg()))
+ continue;
+
+ if (TRI.hasRegUnit(mop.getReg(), Reg))
+ LastUse = LIS.getInstructionIndex(MII);
+ }
+ }
}
return LastUse;
}
diff --git a/lib/CodeGen/LiveIntervalUnion.h b/lib/CodeGen/LiveIntervalUnion.h
index cd4e690c37..4d41fca85a 100644
--- a/lib/CodeGen/LiveIntervalUnion.h
+++ b/lib/CodeGen/LiveIntervalUnion.h
@@ -178,8 +178,8 @@ public:
bool checkLoopInterference(MachineLoopRange*);
private:
- Query(const Query&); // DO NOT IMPLEMENT
- void operator=(const Query&); // DO NOT IMPLEMENT
+ Query(const Query&) LLVM_DELETED_FUNCTION;
+ void operator=(const Query&) LLVM_DELETED_FUNCTION;
};
// Array of LiveIntervalUnions.
diff --git a/lib/CodeGen/LiveRegMatrix.h b/lib/CodeGen/LiveRegMatrix.h
index b3e2d7f4b4..8f22c24478 100644
--- a/lib/CodeGen/LiveRegMatrix.h
+++ b/lib/CodeGen/LiveRegMatrix.h
@@ -15,7 +15,7 @@
// Register units are defined in MCRegisterInfo.h, they represent the smallest
// unit of interference when dealing with overlapping physical registers. The
// LiveRegMatrix is represented as a LiveIntervalUnion per register unit. When
-// a virtual register is assigned to a physicval register, the live range for
+// a virtual register is assigned to a physical register, the live range for
// the virtual register is inserted into the LiveIntervalUnion for each regunit
// in the physreg.
//
diff --git a/lib/CodeGen/LiveStackAnalysis.cpp b/lib/CodeGen/LiveStackAnalysis.cpp
index 939e795b4a..f0b522bd7d 100644
--- a/lib/CodeGen/LiveStackAnalysis.cpp
+++ b/lib/CodeGen/LiveStackAnalysis.cpp
@@ -25,7 +25,10 @@
using namespace llvm;
char LiveStacks::ID = 0;
-INITIALIZE_PASS(LiveStacks, "livestacks",
+INITIALIZE_PASS_BEGIN(LiveStacks, "livestacks",
+ "Live Stack Slot Analysis", false, false)
+INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
+INITIALIZE_PASS_END(LiveStacks, "livestacks",
"Live Stack Slot Analysis", false, false)
char &llvm::LiveStacksID = LiveStacks::ID;
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index f294124228..7359bb92a1 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -65,7 +65,7 @@ LiveVariables::VarInfo::findKill(const MachineBasicBlock *MBB) const {
}
void LiveVariables::VarInfo::dump() const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " Alive in blocks: ";
for (SparseBitVector<>::iterator I = AliveBlocks.begin(),
E = AliveBlocks.end(); I != E; ++I)
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index 0d44fd9938..22a64e5509 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -229,7 +229,7 @@ const MachineBasicBlock *MachineBasicBlock::getLandingPadSuccessor() const {
return 0;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineBasicBlock::dump() const {
print(dbgs());
}
@@ -972,6 +972,80 @@ getWeightIterator(MachineBasicBlock::const_succ_iterator I) const {
return Weights.begin() + index;
}
+/// Return whether (physical) register "Reg" has been <def>ined and not
+/// <kill>ed as of just before "MI".
+///
+/// The search is limited to a neighborhood of "Neighborhood" instructions
+/// before MI (looking for defs or kills) and "Neighborhood" instructions
+/// after MI (looking for reads or defs).
+MachineBasicBlock::LivenessQueryResult
+MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
+ unsigned Reg, MachineInstr *MI,
+ unsigned Neighborhood) {
+
+ unsigned N = Neighborhood;
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Start by searching backwards from MI, looking for kills, reads or defs.
+
+ MachineBasicBlock::iterator I(MI);
+ // If this is the first insn in the block, don't search backwards.
+ if (I != MBB->begin()) {
+ do {
+ --I;
+
+ MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MIOperands(I).analyzePhysReg(Reg, TRI);
+
+ if (Analysis.Kills)
+ // Register killed, so isn't live.
+ return LQR_Dead;
+
+ else if (Analysis.DefinesOverlap || Analysis.ReadsOverlap)
+ // Defined or read without a previous kill - live.
+ return (Analysis.Defines || Analysis.Reads) ?
+ LQR_Live : LQR_OverlappingLive;
+
+ } while (I != MBB->begin() && --N > 0);
+ }
+
+ // Did we get to the start of the block?
+ if (I == MBB->begin()) {
+ // If so, the register's state is definitely defined by the live-in state.
+ for (MCRegAliasIterator RAI(Reg, TRI, /*IncludeSelf=*/true);
+ RAI.isValid(); ++RAI) {
+ if (MBB->isLiveIn(*RAI))
+ return (*RAI == Reg) ? LQR_Live : LQR_OverlappingLive;
+ }
+
+ return LQR_Dead;
+ }
+
+ N = Neighborhood;
+
+ // Try searching forwards from MI, looking for reads or defs.
+ I = MachineBasicBlock::iterator(MI);
+ // If this is the last insn in the block, don't search forwards.
+ if (I != MBB->end()) {
+ for (++I; I != MBB->end() && N > 0; ++I, --N) {
+ MachineOperandIteratorBase::PhysRegInfo Analysis =
+ MIOperands(I).analyzePhysReg(Reg, TRI);
+
+ if (Analysis.ReadsOverlap)
+ // Used, therefore must have been live.
+ return (Analysis.Reads) ?
+ LQR_Live : LQR_OverlappingLive;
+
+ else if (Analysis.DefinesOverlap)
+ // Defined (but not read) therefore cannot have been live.
+ return LQR_Dead;
+ }
+ }
+
+ // At this point we have no idea of the liveness of the register.
+ return LQR_Unknown;
+}
+
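computeRegisterLiveness decides from three sources in order: a bounded backward scan for kills or defs/reads, the block's live-in set when the scan reaches the top, and a bounded forward scan for reads or defs; if none of those settles it, the answer is LQR_Unknown. A standalone sketch of the same decision procedure over a toy single-register instruction model with no aliasing (so the Overlapping results are folded away):

    #include <cstdio>
    #include <vector>

    enum LivenessResult { Dead, Live, Unknown };

    // Per-instruction effect on the one register we care about.
    struct Insn { bool Defines, Kills, Reads; };

    static LivenessResult liveness(const std::vector<Insn> &Block, size_t MI,
                                   unsigned Neighborhood, bool LiveIn) {
      // Backward: a kill means dead; a def or read means live.
      size_t I = MI;
      for (unsigned N = Neighborhood; I != 0 && N != 0; --N) {
        --I;
        if (Block[I].Kills)
          return Dead;
        if (Block[I].Defines || Block[I].Reads)
          return Live;
      }
      if (I == 0) // reached the block top: defer to the live-in state
        return LiveIn ? Live : Dead;
      // Forward: a read before any def means MI saw a live value; a def
      // first means the value was dead at MI.
      for (size_t J = MI + 1, N = Neighborhood; J < Block.size() && N; ++J, --N) {
        if (Block[J].Reads)
          return Live;
        if (Block[J].Defines)
          return Dead;
      }
      return Unknown;
    }

    int main() {
      std::vector<Insn> B = {{true, false, false},   // def
                             {false, false, false},  // unrelated
                             {false, true, false}};  // kill
      std::printf("%d\n", liveness(B, 1, 1, false)); // 1: backward scan hits the def
      std::printf("%d\n", liveness(B, 2, 0, false)); // 2: nothing decides -> Unknown
    }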
void llvm::WriteAsOperand(raw_ostream &OS, const MachineBasicBlock *MBB,
bool t) {
OS << "BB#" << MBB->getNumber();
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index c4dca2cd15..9a8cc48172 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -500,11 +500,10 @@ void MachineBlockPlacement::buildChain(
assert(BB);
assert(BlockToChain[BB] == &Chain);
assert(*llvm::prior(Chain.end()) == BB);
- MachineBasicBlock *BestSucc = 0;
// Look for the best viable successor if there is one to place immediately
// after this block.
- BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+ MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
// If an immediate successor isn't available, look for the best viable
// block among those we've identified as not violating the loop's CFG at
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index c282332037..304e39e159 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -60,8 +60,8 @@ MachineFunction::MachineFunction(const Function *F, const TargetMachine &TM,
MFInfo = 0;
FrameInfo = new (Allocator) MachineFrameInfo(*TM.getFrameLowering());
if (Fn->hasFnAttr(Attribute::StackAlignment))
- FrameInfo->ensureMaxAlignment(Attribute::getStackAlignmentFromAttrs(
- Fn->getAttributes().getFnAttributes()));
+ FrameInfo->ensureMaxAlignment(Fn->getAttributes().
+ getFnAttributes().getStackAlignment());
ConstantPool = new (Allocator) MachineConstantPool(TM.getTargetData());
Alignment = TM.getTargetLowering()->getMinFunctionAlignment();
// FIXME: Shouldn't use pref alignment if explicit alignment is set on Fn.
@@ -284,7 +284,7 @@ MachineFunction::extractStoreMemRefs(MachineInstr::mmo_iterator Begin,
return std::make_pair(Result, Result + Num);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineFunction::dump() const {
print(dbgs());
}
@@ -534,7 +534,7 @@ void MachineFrameInfo::print(const MachineFunction &MF, raw_ostream &OS) const{
}
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineFrameInfo::dump(const MachineFunction &MF) const {
print(MF, dbgs());
}
@@ -633,7 +633,7 @@ void MachineJumpTableInfo::print(raw_ostream &OS) const {
OS << '\n';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineJumpTableInfo::dump() const { print(dbgs()); }
#endif
@@ -768,6 +768,6 @@ void MachineConstantPool::print(raw_ostream &OS) const {
}
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineConstantPool::dump() const { print(dbgs()); }
#endif
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 0508b9f612..b71b37ac6f 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -198,7 +198,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
return !strcmp(getSymbolName(), Other.getSymbolName()) &&
getOffset() == Other.getOffset();
case MachineOperand::MO_BlockAddress:
- return getBlockAddress() == Other.getBlockAddress();
+ return getBlockAddress() == Other.getBlockAddress() &&
+ getOffset() == Other.getOffset();
case MO_RegisterMask:
return getRegMask() == Other.getRegMask();
case MachineOperand::MO_MCSymbol:
@@ -239,7 +240,7 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
MO.getOffset());
case MachineOperand::MO_BlockAddress:
return hash_combine(MO.getType(), MO.getTargetFlags(),
- MO.getBlockAddress());
+ MO.getBlockAddress(), MO.getOffset());
case MachineOperand::MO_RegisterMask:
return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getRegMask());
case MachineOperand::MO_Metadata:
@@ -362,6 +363,7 @@ void MachineOperand::print(raw_ostream &OS, const TargetMachine *TM) const {
case MachineOperand::MO_BlockAddress:
OS << '<';
WriteAsOperand(OS, getBlockAddress(), /*PrintType=*/false);
+ if (getOffset()) OS << "+" << getOffset();
OS << '>';
break;
case MachineOperand::MO_RegisterMask:
@@ -1496,7 +1498,7 @@ void MachineInstr::copyImplicitOps(const MachineInstr *MI) {
}
void MachineInstr::dump() const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
#endif
}
diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp
index b7de7bfb49..1f7fbfc719 100644
--- a/lib/CodeGen/MachineInstrBundle.cpp
+++ b/lib/CodeGen/MachineInstrBundle.cpp
@@ -109,10 +109,10 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
MachineInstrBuilder MIB = BuildMI(MBB, FirstMI, FirstMI->getDebugLoc(),
TII->get(TargetOpcode::BUNDLE));
- SmallVector<unsigned, 8> LocalDefs;
- SmallSet<unsigned, 8> LocalDefSet;
+ SmallVector<unsigned, 32> LocalDefs;
+ SmallSet<unsigned, 32> LocalDefSet;
SmallSet<unsigned, 8> DeadDefSet;
- SmallSet<unsigned, 8> KilledDefSet;
+ SmallSet<unsigned, 16> KilledDefSet;
SmallVector<unsigned, 8> ExternUses;
SmallSet<unsigned, 8> ExternUseSet;
SmallSet<unsigned, 8> KilledUseSet;
@@ -181,7 +181,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
Defs.clear();
}
- SmallSet<unsigned, 8> Added;
+ SmallSet<unsigned, 32> Added;
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
unsigned Reg = LocalDefs[i];
if (Added.insert(Reg)) {
@@ -248,10 +248,10 @@ bool llvm::finalizeBundles(MachineFunction &MF) {
// MachineOperand iterator
//===----------------------------------------------------------------------===//
-MachineOperandIteratorBase::RegInfo
+MachineOperandIteratorBase::VirtRegInfo
MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg,
SmallVectorImpl<std::pair<MachineInstr*, unsigned> > *Ops) {
- RegInfo RI = { false, false, false };
+ VirtRegInfo RI = { false, false, false };
for(; isValid(); ++*this) {
MachineOperand &MO = deref();
if (!MO.isReg() || MO.getReg() != Reg)
@@ -276,3 +276,53 @@ MachineOperandIteratorBase::analyzeVirtReg(unsigned Reg,
}
return RI;
}
+
+MachineOperandIteratorBase::PhysRegInfo
+MachineOperandIteratorBase::analyzePhysReg(unsigned Reg,
+ const TargetRegisterInfo *TRI) {
+ bool AllDefsDead = true;
+ PhysRegInfo PRI = {false, false, false, false, false, false, false};
+
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg) &&
+ "analyzePhysReg not given a physical register!");
+ for (; isValid(); ++*this) {
+ MachineOperand &MO = deref();
+
+ if (MO.isRegMask() && MO.clobbersPhysReg(Reg))
+ PRI.Clobbers = true; // Regmask clobbers Reg.
+
+ if (!MO.isReg())
+ continue;
+
+ unsigned MOReg = MO.getReg();
+ if (!MOReg || !TargetRegisterInfo::isPhysicalRegister(MOReg))
+ continue;
+
+ bool IsRegOrSuperReg = MOReg == Reg || TRI->isSubRegister(MOReg, Reg);
+ bool IsRegOrOverlapping = MOReg == Reg || TRI->regsOverlap(MOReg, Reg);
+
+ if (IsRegOrSuperReg && MO.readsReg()) {
+ // Reg or a super-reg is read, and perhaps killed also.
+ PRI.Reads = true;
+ PRI.Kills = MO.isKill();
+    }
+    if (IsRegOrOverlapping && MO.readsReg()) {
+      PRI.ReadsOverlap = true; // Reg or an overlapping register is read.
+    }
+
+ if (!MO.isDef())
+ continue;
+
+ if (IsRegOrSuperReg) {
+ PRI.Defines = true; // Reg or a super-register is defined.
+ if (!MO.isDead())
+ AllDefsDead = false;
+ }
+ if (IsRegOrOverlapping)
+ PRI.Clobbers = true; // Reg or an overlapping reg is defined.
+ }
+
+ if (AllDefsDead && PRI.Defines)
+ PRI.DefinesDead = true; // Reg or super-register was defined and was dead.
+
+ return PRI;
+}
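The classification above turns on two predicates per operand: an exact or super-register access versus a merely overlapping one. A compressed sketch of the same bookkeeping, assuming a toy aliasing rule (registers alias when their low bytes match) in place of TRI->isSubRegister and TRI->regsOverlap:

    #include <cstdio>
    #include <vector>

    struct Operand { unsigned Reg; bool IsDef, IsDead, IsKill; };
    struct PhysRegInfo {
      bool Reads, ReadsOverlap, Kills, Defines, Clobbers, DefinesDead;
    };

    // Toy stand-ins for the TRI queries: exact match only, and aliasing by
    // shared low byte.
    static bool isRegOrSuper(unsigned A, unsigned B) { return A == B; }
    static bool overlaps(unsigned A, unsigned B) { return (A & 0xFF) == (B & 0xFF); }

    static PhysRegInfo analyze(const std::vector<Operand> &Ops, unsigned Reg) {
      PhysRegInfo PRI = {};
      bool AllDefsDead = true;
      for (const Operand &MO : Ops) {
        bool Exact = isRegOrSuper(MO.Reg, Reg), Over = overlaps(MO.Reg, Reg);
        if (!MO.IsDef) { // a use
          if (Exact) { PRI.Reads = true; PRI.Kills = MO.IsKill; }
          if (Over) PRI.ReadsOverlap = true;
          continue;
        }
        if (Exact) { PRI.Defines = true; if (!MO.IsDead) AllDefsDead = false; }
        if (Over) PRI.Clobbers = true;
      }
      PRI.DefinesDead = PRI.Defines && AllDefsDead;
      return PRI;
    }

    int main() {
      // A use of an aliasing register, then a def of the register itself.
      std::vector<Operand> Ops = {{0x101, false, false, false},
                                  {0x001, true, false, false}};
      PhysRegInfo P = analyze(Ops, 0x001);
      std::printf("ReadsOverlap=%d Defines=%d Clobbers=%d\n",
                  P.ReadsOverlap, P.Defines, P.Clobbers); // 1 1 1
    }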
diff --git a/lib/CodeGen/MachineLoopInfo.cpp b/lib/CodeGen/MachineLoopInfo.cpp
index 05d2f2a885..27afeec1d9 100644
--- a/lib/CodeGen/MachineLoopInfo.cpp
+++ b/lib/CodeGen/MachineLoopInfo.cpp
@@ -74,7 +74,7 @@ MachineBasicBlock *MachineLoop::getBottomBlock() {
return BotMBB;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MachineLoop::dump() const {
print(dbgs());
}
diff --git a/lib/CodeGen/MachinePostDominators.cpp b/lib/CodeGen/MachinePostDominators.cpp
new file mode 100644
index 0000000000..c3f6e9249e
--- /dev/null
+++ b/lib/CodeGen/MachinePostDominators.cpp
@@ -0,0 +1,55 @@
+//===- MachinePostDominators.cpp - Machine Post Dominator Calculation ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements simple dominator construction algorithms for finding
+// post dominators on machine functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachinePostDominators.h"
+
+using namespace llvm;
+
+char MachinePostDominatorTree::ID = 0;
+
+// Declares initializeMachinePostDominatorTreePass.
+INITIALIZE_PASS(MachinePostDominatorTree, "machinepostdomtree",
+ "MachinePostDominator Tree Construction", true, true)
+
+MachinePostDominatorTree::MachinePostDominatorTree() : MachineFunctionPass(ID) {
+ initializeMachinePostDominatorTreePass(*PassRegistry::getPassRegistry());
+  // The "true" argument selects postdominator construction.
+  DT = new DominatorTreeBase<MachineBasicBlock>(true);
+}
+
+FunctionPass *
+MachinePostDominatorTree::createMachinePostDominatorTreePass() {
+ return new MachinePostDominatorTree();
+}
+
+bool
+MachinePostDominatorTree::runOnMachineFunction(MachineFunction &F) {
+ DT->recalculate(F);
+ return false;
+}
+
+MachinePostDominatorTree::~MachinePostDominatorTree() {
+ delete DT;
+}
+
+void
+MachinePostDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void
+MachinePostDominatorTree::print(llvm::raw_ostream &OS, const Module *M) const {
+ DT->print(OS);
+}
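All the real work is delegated to DominatorTreeBase constructed for postdominance, i.e. dominance computed on the reversed CFG. For orientation, post-dominator sets satisfy the fixpoint pdom(b) = {b} ∪ ⋂ pdom(s) over successors s; a naive iterative solver over bitmasks makes that concrete (illustration only, DominatorTreeBase uses a much faster algorithm):

    #include <cstdio>
    #include <vector>

    int main() {
      // CFG: 0 -> {1,2}, 1 -> {3}, 2 -> {3}; block 3 is the exit.
      std::vector<std::vector<int>> Succ = {{1, 2}, {3}, {3}, {}};
      const int N = (int)Succ.size(), Exit = 3;
      std::vector<unsigned> PDom(N, (1u << N) - 1); // start from "all blocks"
      PDom[Exit] = 1u << Exit; // the exit post-dominates only itself
      for (bool Changed = true; Changed;) {
        Changed = false;
        for (int B = 0; B < N; ++B) {
          if (B == Exit)
            continue;
          unsigned Meet = (1u << N) - 1;
          for (int S : Succ[B])
            Meet &= PDom[S]; // intersect over successors
          unsigned New = Meet | (1u << B);
          if (New != PDom[B]) {
            PDom[B] = New;
            Changed = true;
          }
        }
      }
      for (int B = 0; B < N; ++B) // bit 3 (the exit) appears in every set
        std::printf("pdom(%d) = 0x%x\n", B, PDom[B]);
    }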
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index e36c7d4315..d7ecec4163 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -279,7 +279,7 @@ void MachineScheduler::print(raw_ostream &O, const Module* m) const {
// unimplemented
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ReadyQueue::dump() {
dbgs() << Name << ": ";
for (unsigned i = 0, e = Queue.size(); i < e; ++i)
@@ -484,6 +484,8 @@ void ScheduleDAGMI::releaseRoots() {
void ScheduleDAGMI::schedule() {
buildDAGWithRegPressure();
+ postprocessDAG();
+
DEBUG(for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
SUnits[su].dumpAll(this));
@@ -522,6 +524,13 @@ void ScheduleDAGMI::buildDAGWithRegPressure() {
initRegPressure();
}
+/// Apply each ScheduleDAGMutation step in order.
+void ScheduleDAGMI::postprocessDAG() {
+ for (unsigned i = 0, e = Mutations.size(); i < e; ++i) {
+ Mutations[i]->apply(this);
+ }
+}
+
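postprocessDAG is the hook where target-registered ScheduleDAGMutation objects rewrite the DAG between construction and scheduling, in registration order. The pattern in miniature, with illustrative names rather than LLVM's actual interface:

    #include <cstdio>
    #include <memory>
    #include <vector>

    struct DAG { int Edges = 0; };

    // Opaque mutation step, applied in the order it was registered.
    struct Mutation {
      virtual ~Mutation() = default;
      virtual void apply(DAG &D) = 0;
    };

    struct AddClusterEdges : Mutation {
      void apply(DAG &D) override { D.Edges += 2; } // e.g. cluster loads/stores
    };

    int main() {
      DAG D;
      std::vector<std::unique_ptr<Mutation>> Mutations;
      Mutations.push_back(std::make_unique<AddClusterEdges>());
      for (auto &M : Mutations) // the postprocessDAG loop
        M->apply(D);
      std::printf("edges added: %d\n", D.Edges);
    }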
/// Identify DAG roots and setup scheduler queues.
void ScheduleDAGMI::initQueues() {
// Initialize the strategy before modifying the DAG.
diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp
index 6090752081..32c02bf0f0 100644
--- a/lib/CodeGen/PostRASchedulerList.cpp
+++ b/lib/CodeGen/PostRASchedulerList.cpp
@@ -240,7 +240,7 @@ void SchedulePostRATDList::exitRegion() {
ScheduleDAGInstrs::exitRegion();
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dumpSchedule - dump the scheduled Sequence.
void SchedulePostRATDList::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index c021a937b6..06f69c1e0d 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -330,9 +330,9 @@ void RAGreedy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
AU.addPreserved<LiveDebugVariables>();
- AU.addRequired<CalculateSpillWeights>();
AU.addRequired<LiveStacks>();
AU.addPreserved<LiveStacks>();
+ AU.addRequired<CalculateSpillWeights>();
AU.addRequired<MachineDominatorTree>();
AU.addPreserved<MachineDominatorTree>();
AU.addRequired<MachineLoopInfo>();
@@ -508,7 +508,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
///
/// @param VirtReg Live range that is about to be assigned.
/// @param PhysReg Desired register for assignment.
-/// @prarm IsHint True when PhysReg is VirtReg's preferred register.
+/// @param IsHint True when PhysReg is VirtReg's preferred register.
/// @param MaxCost Only look for cheaper candidates and update with new cost
/// when returning true.
/// @returns True when interference can be evicted cheaper than MaxCost.
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index d018835456..dd0f548867 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -55,6 +55,8 @@ STATISTIC(numCommutes , "Number of instruction commuting performed");
STATISTIC(numExtends , "Number of copies extended");
STATISTIC(NumReMats , "Number of instructions re-materialized");
STATISTIC(NumInflated , "Number of register classes inflated");
+STATISTIC(NumLaneConflicts, "Number of dead lane conflicts tested");
+STATISTIC(NumLaneResolves, "Number of dead lane conflicts resolved");
static cl::opt<bool>
EnableJoining("join-liveintervals",
@@ -66,6 +68,11 @@ VerifyCoalescing("verify-coalescing",
cl::desc("Verify machine instrs before and after register coalescing"),
cl::Hidden);
+// Temporary option for testing new coalescer algo.
+static cl::opt<bool>
+NewCoalescer("new-coalescer", cl::Hidden,
+ cl::desc("Use new coalescer algorithm"));
+
namespace {
class RegisterCoalescer : public MachineFunctionPass,
private LiveRangeEdit::Delegate {
@@ -123,6 +130,9 @@ namespace {
/// can use this information below to update aliases.
bool joinIntervals(CoalescerPair &CP);
+ /// Attempt joining two virtual registers. Return true on success.
+ bool joinVirtRegs(CoalescerPair &CP);
+
/// Attempt joining with a reserved physreg.
bool joinReservedPhysReg(CoalescerPair &CP);
@@ -583,7 +593,7 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
MachineOperand &NewDstMO = DefMI->getOperand(NewDstIdx);
unsigned NewReg = NewDstMO.getReg();
- if (NewReg != IntB.reg || !NewDstMO.isKill())
+ if (NewReg != IntB.reg || !LiveRangeQuery(IntB, AValNo->def).isKill())
return false;
// Make sure there are no other definitions of IntB that would reach the
@@ -1102,6 +1112,728 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
return true;
}
+//===----------------------------------------------------------------------===//
+// Interference checking and interval joining
+//===----------------------------------------------------------------------===//
+//
+// In the easiest case, the two live ranges being joined are disjoint, and
+// there is no interference to consider. It is quite common, though, to have
+// overlapping live ranges, and we need to check if the interference can be
+// resolved.
+//
+// The live range of a single SSA value forms a sub-tree of the dominator tree.
+// This means that two SSA values overlap if and only if the def of one value
+// is contained in the live range of the other value. As a special case, the
+// overlapping values can be defined at the same index.
+//
+// The interference from an overlapping def can be resolved in these cases:
+//
+// 1. Coalescable copies. The value is defined by a copy that would become an
+// identity copy after joining SrcReg and DstReg. The copy instruction will
+// be removed, and the value will be merged with the source value.
+//
+// There can be several copies back and forth, causing many values to be
+// merged into one. We compute a list of ultimate values in the joined live
+//    range as well as a mapping from the old value numbers.
+//
+// 2. IMPLICIT_DEF. This instruction is only inserted to ensure all PHI
+// predecessors have a live out value. It doesn't cause real interference,
+// and can be merged into the value it overlaps. Like a coalescable copy, it
+// can be erased after joining.
+//
+// 3. Copy of external value. The overlapping def may be a copy of a value that
+// is already in the other register. This is like a coalescable copy, but
+// the live range of the source register must be trimmed after erasing the
+// copy instruction:
+//
+// %src = COPY %ext
+// %dst = COPY %ext <-- Remove this COPY, trim the live range of %ext.
+//
+// 4. Clobbering undefined lanes. Vector registers are sometimes built by
+// defining one lane at a time:
+//
+// %dst:ssub0<def,read-undef> = FOO
+// %src = BAR
+// %dst:ssub1<def> = COPY %src
+//
+// The live range of %src overlaps the %dst value defined by FOO, but
+// merging %src into %dst:ssub1 is only going to clobber the ssub1 lane
+// which was undef anyway.
+//
+// The value mapping is more complicated in this case. The final live range
+// will have different value numbers for both FOO and BAR, but there is no
+// simple mapping from old to new values. It may even be necessary to add
+// new PHI values.
+//
+// 5. Clobbering dead lanes. A def may clobber a lane of a vector register that
+// is live, but never read. This can happen because we don't compute
+// individual live ranges per lane.
+//
+// %dst<def> = FOO
+// %src = BAR
+// %dst:ssub1<def> = COPY %src
+//
+// This kind of interference is only resolved locally. If the clobbered
+// lane value escapes the block, the join is aborted.
+
+namespace {
+/// Track information about values in a single virtual register about to be
+/// joined. Objects of this class are always created in pairs - one for each
+/// side of the CoalescerPair.
+class JoinVals {
+ LiveInterval &LI;
+
+ // Location of this register in the final joined register.
+ // Either CP.DstIdx or CP.SrcIdx.
+ unsigned SubIdx;
+
+ // Values that will be present in the final live range.
+ SmallVectorImpl<VNInfo*> &NewVNInfo;
+
+ const CoalescerPair &CP;
+ LiveIntervals *LIS;
+ SlotIndexes *Indexes;
+ const TargetRegisterInfo *TRI;
+
+ // Value number assignments. Maps value numbers in LI to entries in NewVNInfo.
+ // This is suitable for passing to LiveInterval::join().
+ SmallVector<int, 8> Assignments;
+
+ // Conflict resolution for overlapping values.
+ enum ConflictResolution {
+ // No overlap, simply keep this value.
+ CR_Keep,
+
+ // Merge this value into OtherVNI and erase the defining instruction.
+ // Used for IMPLICIT_DEF, coalescable copies, and copies from external
+ // values.
+ CR_Erase,
+
+ // Merge this value into OtherVNI but keep the defining instruction.
+ // This is for the special case where OtherVNI is defined by the same
+ // instruction.
+ CR_Merge,
+
+ // Keep this value, and have it replace OtherVNI where possible. This
+ // complicates value mapping since OtherVNI maps to two different values
+ // before and after this def.
+ // Used when clobbering undefined or dead lanes.
+ CR_Replace,
+
+ // Unresolved conflict. Visit later when all values have been mapped.
+ CR_Unresolved,
+
+ // Unresolvable conflict. Abort the join.
+ CR_Impossible
+ };
+
+ // Per-value info for LI. The lane bit masks are all relative to the final
+ // joined register, so they can be compared directly between SrcReg and
+ // DstReg.
+ struct Val {
+ ConflictResolution Resolution;
+
+ // Lanes written by this def, 0 for unanalyzed values.
+ unsigned WriteLanes;
+
+ // Lanes with defined values in this register. Other lanes are undef and
+ // safe to clobber.
+ unsigned ValidLanes;
+
+ // Value in LI being redefined by this def.
+ VNInfo *RedefVNI;
+
+ // Value in the other live range that overlaps this def, if any.
+ VNInfo *OtherVNI;
+
+ // True when the live range of this value will be pruned because of an
+ // overlapping CR_Replace value in the other live range.
+ bool Pruned;
+
+ // True once Pruned above has been computed.
+ bool PrunedComputed;
+
+ Val() : Resolution(CR_Keep), WriteLanes(0), ValidLanes(0),
+ RedefVNI(0), OtherVNI(0), Pruned(false), PrunedComputed(false) {}
+
+ bool isAnalyzed() const { return WriteLanes != 0; }
+ };
+
+ // One entry per value number in LI.
+ SmallVector<Val, 8> Vals;
+
+ unsigned computeWriteLanes(const MachineInstr *DefMI, bool &Redef);
+ VNInfo *stripCopies(VNInfo *VNI);
+ ConflictResolution analyzeValue(unsigned ValNo, JoinVals &Other);
+ void computeAssignment(unsigned ValNo, JoinVals &Other);
+ bool taintExtent(unsigned, unsigned, JoinVals&,
+ SmallVectorImpl<std::pair<SlotIndex, unsigned> >&);
+ bool usesLanes(MachineInstr *MI, unsigned, unsigned, unsigned);
+ bool isPrunedValue(unsigned ValNo, JoinVals &Other);
+
+public:
+ JoinVals(LiveInterval &li, unsigned subIdx,
+ SmallVectorImpl<VNInfo*> &newVNInfo,
+ const CoalescerPair &cp,
+ LiveIntervals *lis,
+ const TargetRegisterInfo *tri)
+ : LI(li), SubIdx(subIdx), NewVNInfo(newVNInfo), CP(cp), LIS(lis),
+ Indexes(LIS->getSlotIndexes()), TRI(tri),
+ Assignments(LI.getNumValNums(), -1), Vals(LI.getNumValNums())
+ {}
+
+ /// Analyze defs in LI and compute a value mapping in NewVNInfo.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool mapValues(JoinVals &Other);
+
+ /// Try to resolve conflicts that require all values to be mapped.
+ /// Returns false if any conflicts were impossible to resolve.
+ bool resolveConflicts(JoinVals &Other);
+
+ /// Prune the live range of values in Other.LI where they would conflict with
+ /// CR_Replace values in LI. Collect end points for restoring the live range
+ /// after joining.
+ void pruneValues(JoinVals &Other, SmallVectorImpl<SlotIndex> &EndPoints);
+
+ /// Erase any machine instructions that have been coalesced away.
+ /// Add erased instructions to ErasedInstrs.
+ /// Add foreign virtual registers to ShrinkRegs if their live range ended at
+ /// the erased instrs.
+ void eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+ SmallVectorImpl<unsigned> &ShrinkRegs);
+
+ /// Get the value assignments suitable for passing to LiveInterval::join.
+ const int *getAssignments() const { return &Assignments[0]; }
+};
+} // end anonymous namespace
+
+/// Compute the bitmask of lanes actually written by DefMI.
+/// Set Redef if there are any partial register definitions that depend on the
+/// previous value of the register.
+unsigned JoinVals::computeWriteLanes(const MachineInstr *DefMI, bool &Redef) {
+ unsigned L = 0;
+ for (ConstMIOperands MO(DefMI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || MO->getReg() != LI.reg || !MO->isDef())
+ continue;
+ L |= TRI->getSubRegIndexLaneMask(compose(*TRI, SubIdx, MO->getSubReg()));
+ if (MO->readsReg())
+ Redef = true;
+ }
+ return L;
+}
+
+/// Find the ultimate value that VNI was copied from.
+VNInfo *JoinVals::stripCopies(VNInfo *VNI) {
+ while (!VNI->isPHIDef()) {
+ MachineInstr *MI = Indexes->getInstructionFromIndex(VNI->def);
+ assert(MI && "No defining instruction");
+ if (!MI->isFullCopy())
+ break;
+ unsigned Reg = MI->getOperand(1).getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ break;
+ LiveRangeQuery LRQ(LIS->getInterval(Reg), VNI->def);
+ if (!LRQ.valueIn())
+ break;
+ VNI = LRQ.valueIn();
+ }
+ return VNI;
+}
+
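stripCopies follows full-copy definitions upward until it reaches a non-copy, so two values that are both plain copies of the same external value compare equal and the later copy can be erased (case 3 above). The chain walk in miniature, assuming an acyclic "defined by a full copy of" map in place of querying defining instructions:

    #include <cstdio>
    #include <map>

    // Walk up the copy chain; values absent from the map are "real" defs.
    // Assumes the chain is acyclic (SSA guarantees this for real code).
    static int stripCopies(const std::map<int, int> &CopyOf, int V) {
      for (auto It = CopyOf.find(V); It != CopyOf.end(); It = CopyOf.find(V))
        V = It->second;
      return V;
    }

    int main() {
      std::map<int, int> CopyOf = {{3, 2}, {2, 1}}; // v3 = COPY v2, v2 = COPY v1
      std::printf("%d\n", stripCopies(CopyOf, 3));  // 1: ultimately a copy of v1
    }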
+/// Analyze ValNo in this live range, and set all fields of Vals[ValNo].
+/// Return a conflict resolution when possible, but leave the hard cases as
+/// CR_Unresolved.
+/// Recursively calls computeAssignment() on this and Other, guaranteeing that
+/// both OtherVNI and RedefVNI have been analyzed and mapped before returning.
+/// The recursion always goes upwards in the dominator tree, making loops
+/// impossible.
+JoinVals::ConflictResolution
+JoinVals::analyzeValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ assert(!V.isAnalyzed() && "Value has already been analyzed!");
+ VNInfo *VNI = LI.getValNumInfo(ValNo);
+ if (VNI->isUnused()) {
+ V.WriteLanes = ~0u;
+ return CR_Keep;
+ }
+
+ // Get the instruction defining this value, compute the lanes written.
+ const MachineInstr *DefMI = 0;
+ if (VNI->isPHIDef()) {
+ // Conservatively assume that all lanes in a PHI are valid.
+ V.ValidLanes = V.WriteLanes = TRI->getSubRegIndexLaneMask(SubIdx);
+ } else {
+ DefMI = Indexes->getInstructionFromIndex(VNI->def);
+ bool Redef = false;
+ V.ValidLanes = V.WriteLanes = computeWriteLanes(DefMI, Redef);
+
+ // If this is a read-modify-write instruction, there may be more valid
+ // lanes than the ones written by this instruction.
+ // This only covers partial redef operands. DefMI may have normal use
+ // operands reading the register. They don't contribute valid lanes.
+ //
+ // This adds ssub1 to the set of valid lanes in %src:
+ //
+ // %src:ssub1<def> = FOO
+ //
+ // This leaves only ssub1 valid, making any other lanes undef:
+ //
+ // %src:ssub1<def,read-undef> = FOO %src:ssub2
+ //
+ // The <read-undef> flag on the def operand means that old lane values are
+ // not important.
+ if (Redef) {
+ V.RedefVNI = LiveRangeQuery(LI, VNI->def).valueIn();
+ assert(V.RedefVNI && "Instruction is reading nonexistent value");
+ computeAssignment(V.RedefVNI->id, Other);
+ V.ValidLanes |= Vals[V.RedefVNI->id].ValidLanes;
+ }
+
+ // An IMPLICIT_DEF writes undef values.
+ if (DefMI->isImplicitDef())
+ V.ValidLanes &= ~V.WriteLanes;
+ }
+
+ // Find the value in Other that overlaps VNI->def, if any.
+ LiveRangeQuery OtherLRQ(Other.LI, VNI->def);
+
+ // It is possible that both values are defined by the same instruction, or
+ // the values are PHIs defined in the same block. When that happens, the two
+ // values should be merged into one, but not into any preceding value.
+ // The first value defined or visited gets CR_Keep, the other gets CR_Merge.
+ if (VNInfo *OtherVNI = OtherLRQ.valueDefined()) {
+ assert(SlotIndex::isSameInstr(VNI->def, OtherVNI->def) && "Broken LRQ");
+
+ // One value stays, the other is merged. Keep the earlier one, or the first
+ // one we see.
+ if (OtherVNI->def < VNI->def)
+ Other.computeAssignment(OtherVNI->id, *this);
+ else if (VNI->def < OtherVNI->def && OtherLRQ.valueIn()) {
+ // This is an early-clobber def overlapping a live-in value in the other
+ // register. Not mergeable.
+ V.OtherVNI = OtherLRQ.valueIn();
+ return CR_Impossible;
+ }
+ V.OtherVNI = OtherVNI;
+ Val &OtherV = Other.Vals[OtherVNI->id];
+ // Keep this value, check for conflicts when analyzing OtherVNI.
+ if (!OtherV.isAnalyzed())
+ return CR_Keep;
+ // Both sides have been analyzed now.
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Merge;
+ if (V.ValidLanes & OtherV.ValidLanes)
+ // Overlapping lanes can't be resolved.
+ return CR_Impossible;
+ else
+ return CR_Merge;
+ }
+
+ // No simultaneous def. Is Other live at the def?
+ V.OtherVNI = OtherLRQ.valueIn();
+ if (!V.OtherVNI)
+ // No overlap, no conflict.
+ return CR_Keep;
+
+ assert(!SlotIndex::isSameInstr(VNI->def, V.OtherVNI->def) && "Broken LRQ");
+
+ // We have overlapping values, or possibly a kill of Other.
+ // Recursively compute assignments up the dominator tree.
+ Other.computeAssignment(V.OtherVNI->id, *this);
+ const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // Allow overlapping PHI values. Any real interference would show up in a
+ // predecessor, the PHI itself can't introduce any conflicts.
+ if (VNI->isPHIDef())
+ return CR_Replace;
+
+ // Check for simple erasable conflicts.
+ if (DefMI->isImplicitDef())
+ return CR_Erase;
+
+ // Include the non-conflict where DefMI is a coalescable copy that kills
+ // OtherVNI. We still want the copy erased and value numbers merged.
+ if (CP.isCoalescable(DefMI)) {
+ // Some of the lanes copied from OtherVNI may be undef, making them undef
+ // here too.
+ V.ValidLanes &= ~V.WriteLanes | OtherV.ValidLanes;
+ return CR_Erase;
+ }
+
+ // This may not be a real conflict if DefMI simply kills Other and defines
+ // VNI.
+ if (OtherLRQ.isKill() && OtherLRQ.endPoint() <= VNI->def)
+ return CR_Keep;
+
+ // Handle the case where VNI and OtherVNI can be proven to be identical:
+ //
+ // %other = COPY %ext
+ // %this = COPY %ext <-- Erase this copy
+ //
+ if (DefMI->isFullCopy() && !CP.isPartial() &&
+ stripCopies(VNI) == stripCopies(V.OtherVNI))
+ return CR_Erase;
+
+ // If the lanes written by this instruction were all undef in OtherVNI, it is
+ // still safe to join the live ranges. This can't be done with a simple value
+ // mapping, though - OtherVNI will map to multiple values:
+ //
+ // 1 %dst:ssub0 = FOO <-- OtherVNI
+ // 2 %src = BAR <-- VNI
+ // 3 %dst:ssub1 = COPY %src<kill> <-- Eliminate this copy.
+ // 4 BAZ %dst<kill>
+ // 5 QUUX %src<kill>
+ //
+ // Here OtherVNI will map to itself in [1;2), but to VNI in [2;5). CR_Replace
+ // handles this complex value mapping.
+ if ((V.WriteLanes & OtherV.ValidLanes) == 0)
+ return CR_Replace;
+
+ // VNI is clobbering live lanes in OtherVNI, but there is still the
+ // possibility that no instructions actually read the clobbered lanes.
+ // If we're clobbering all the lanes in OtherVNI, at least one must be read.
+ // Otherwise Other.LI wouldn't be live here.
+ if ((TRI->getSubRegIndexLaneMask(Other.SubIdx) & ~V.WriteLanes) == 0)
+ return CR_Impossible;
+
+ // We need to verify that no instructions are reading the clobbered lanes. To
+ // save compile time, we'll only check that locally. Don't allow the tainted
+ // value to escape the basic block.
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ if (OtherLRQ.endPoint() >= Indexes->getMBBEndIdx(MBB))
+ return CR_Impossible;
+
+ // There are still some things that could go wrong besides clobbered lanes
+ // being read, for example OtherVNI may be only partially redefined in MBB,
+ // and some clobbered lanes could escape the block. Save this analysis for
+ // resolveConflicts() when all values have been mapped. We need to know
+ // RedefVNI and WriteLanes for any later defs in MBB, and we can't compute
+ // that now - the recursive analyzeValue() calls must go upwards in the
+ // dominator tree.
+ return CR_Unresolved;
+}
+
+/// Compute the value assignment for ValNo in LI.
+/// This may be called recursively by analyzeValue(), but never for a ValNo on
+/// the stack.
+void JoinVals::computeAssignment(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.isAnalyzed()) {
+ // Recursion should always move up the dominator tree, so ValNo is not
+ // supposed to reappear before it has been assigned.
+ assert(Assignments[ValNo] != -1 && "Bad recursion?");
+ return;
+ }
+ switch ((V.Resolution = analyzeValue(ValNo, Other))) {
+ case CR_Erase:
+ case CR_Merge:
+ // Merge this ValNo into OtherVNI.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't merge.");
+ assert(Other.Vals[V.OtherVNI->id].isAnalyzed() && "Missing recursion");
+ Assignments[ValNo] = Other.Assignments[V.OtherVNI->id];
+ DEBUG(dbgs() << "\t\tmerge " << PrintReg(LI.reg) << ':' << ValNo << '@'
+ << LI.getValNumInfo(ValNo)->def << " into "
+ << PrintReg(Other.LI.reg) << ':' << V.OtherVNI->id << '@'
+ << V.OtherVNI->def << " --> @"
+ << NewVNInfo[Assignments[ValNo]]->def << '\n');
+ break;
+ case CR_Replace:
+ case CR_Unresolved:
+ // The other value is going to be pruned if this join is successful.
+ assert(V.OtherVNI && "OtherVNI not assigned, can't prune");
+ Other.Vals[V.OtherVNI->id].Pruned = true;
+ // Fall through.
+ default:
+ // This value number needs to go in the final joined live range.
+ Assignments[ValNo] = NewVNInfo.size();
+ NewVNInfo.push_back(LI.getValNumInfo(ValNo));
+ break;
+ }
+}
+
+bool JoinVals::mapValues(JoinVals &Other) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ computeAssignment(i, Other);
+ if (Vals[i].Resolution == CR_Impossible) {
+ DEBUG(dbgs() << "\t\tinterference at " << PrintReg(LI.reg) << ':' << i
+ << '@' << LI.getValNumInfo(i)->def << '\n');
+ return false;
+ }
+ }
+ return true;
+}
+
+/// Assuming ValNo is going to clobber some valid lanes in Other.LI, compute
+/// the extent of the tainted lanes in the block.
+///
+/// Multiple values in Other.LI can be affected since partial redefinitions can
+/// preserve previously tainted lanes.
+///
+/// 1 %dst = VLOAD <-- Define all lanes in %dst
+/// 2 %src = FOO <-- ValNo to be joined with %dst:ssub0
+/// 3 %dst:ssub1 = BAR <-- Partial redef doesn't clear taint in ssub0
+/// 4 %dst:ssub0 = COPY %src <-- Conflict resolved, ssub0 wasn't read
+///
+/// For each ValNo in Other that is affected, add an (EndIndex, TaintedLanes)
+/// entry to TaintedVals.
+///
+/// Returns false if the tainted lanes extend beyond the basic block.
+bool JoinVals::
+taintExtent(unsigned ValNo, unsigned TaintedLanes, JoinVals &Other,
+ SmallVectorImpl<std::pair<SlotIndex, unsigned> > &TaintExtent) {
+ VNInfo *VNI = LI.getValNumInfo(ValNo);
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ SlotIndex MBBEnd = Indexes->getMBBEndIdx(MBB);
+
+ // Scan Other.LI from VNI.def to MBBEnd.
+ LiveInterval::iterator OtherI = Other.LI.find(VNI->def);
+ assert(OtherI != Other.LI.end() && "No conflict?");
+ do {
+ // OtherI is pointing to a tainted value. Abort the join if the tainted
+ // lanes escape the block.
+ SlotIndex End = OtherI->end;
+ if (End >= MBBEnd) {
+ DEBUG(dbgs() << "\t\ttaints global " << PrintReg(Other.LI.reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start << '\n');
+ return false;
+ }
+ DEBUG(dbgs() << "\t\ttaints local " << PrintReg(Other.LI.reg) << ':'
+ << OtherI->valno->id << '@' << OtherI->start
+ << " to " << End << '\n');
+ // A dead def is not a problem.
+ if (End.isDead())
+ break;
+ TaintExtent.push_back(std::make_pair(End, TaintedLanes));
+
+ // Check for another def in the MBB.
+ if (++OtherI == Other.LI.end() || OtherI->start >= MBBEnd)
+ break;
+
+ // Lanes written by the new def are no longer tainted.
+ const Val &OV = Other.Vals[OtherI->valno->id];
+ TaintedLanes &= ~OV.WriteLanes;
+ if (!OV.RedefVNI)
+ break;
+ } while (TaintedLanes);
+ return true;
+}
+
+/// Return true if MI uses any of the given Lanes from Reg.
+/// This does not include partial redefinitions of Reg.
+bool JoinVals::usesLanes(MachineInstr *MI, unsigned Reg, unsigned SubIdx,
+ unsigned Lanes) {
+ if (MI->isDebugValue())
+ return false;
+ for (ConstMIOperands MO(MI); MO.isValid(); ++MO) {
+ if (!MO->isReg() || MO->isDef() || MO->getReg() != Reg)
+ continue;
+ if (!MO->readsReg())
+ continue;
+ if (Lanes &
+ TRI->getSubRegIndexLaneMask(compose(*TRI, SubIdx, MO->getSubReg())))
+ return true;
+ }
+ return false;
+}
+
+bool JoinVals::resolveConflicts(JoinVals &Other) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ Val &V = Vals[i];
+ assert (V.Resolution != CR_Impossible && "Unresolvable conflict");
+ if (V.Resolution != CR_Unresolved)
+ continue;
+ DEBUG(dbgs() << "\t\tconflict at " << PrintReg(LI.reg) << ':' << i
+ << '@' << LI.getValNumInfo(i)->def << '\n');
+ ++NumLaneConflicts;
+ assert(V.OtherVNI && "Inconsistent conflict resolution.");
+ VNInfo *VNI = LI.getValNumInfo(i);
+ const Val &OtherV = Other.Vals[V.OtherVNI->id];
+
+ // VNI is known to clobber some lanes in OtherVNI. If we go ahead with the
+ // join, those lanes will be tainted with a wrong value. Get the extent of
+ // the tainted lanes.
+ unsigned TaintedLanes = V.WriteLanes & OtherV.ValidLanes;
+ SmallVector<std::pair<SlotIndex, unsigned>, 8> TaintExtent;
+ if (!taintExtent(i, TaintedLanes, Other, TaintExtent))
+ // Tainted lanes would extend beyond the basic block.
+ return false;
+
+ assert(!TaintExtent.empty() && "There should be at least one conflict.");
+
+ // Now look at the instructions from VNI->def to TaintExtent (inclusive).
+ MachineBasicBlock *MBB = Indexes->getMBBFromIndex(VNI->def);
+ MachineBasicBlock::iterator MI = MBB->begin();
+ if (!VNI->isPHIDef()) {
+ MI = Indexes->getInstructionFromIndex(VNI->def);
+ // No need to check the instruction defining VNI for reads.
+ ++MI;
+ }
+ assert(!SlotIndex::isSameInstr(VNI->def, TaintExtent.front().first) &&
+ "Interference ends on VNI->def. Should have been handled earlier");
+ MachineInstr *LastMI =
+ Indexes->getInstructionFromIndex(TaintExtent.front().first);
+ assert(LastMI && "Range must end at a proper instruction");
+ unsigned TaintNum = 0;
+ for(;;) {
+ assert(MI != MBB->end() && "Bad LastMI");
+ if (usesLanes(MI, Other.LI.reg, Other.SubIdx, TaintedLanes)) {
+ DEBUG(dbgs() << "\t\ttainted lanes used by: " << *MI);
+ return false;
+ }
+ // LastMI is the last instruction to use the current value.
+ if (&*MI == LastMI) {
+ if (++TaintNum == TaintExtent.size())
+ break;
+ LastMI = Indexes->getInstructionFromIndex(TaintExtent[TaintNum].first);
+ assert(LastMI && "Range must end at a proper instruction");
+ TaintedLanes = TaintExtent[TaintNum].second;
+ }
+ ++MI;
+ }
+
+ // The tainted lanes are unused.
+ V.Resolution = CR_Replace;
+ ++NumLaneResolves;
+ }
+ return true;
+}
+
+// Determine if ValNo is a copy of a value number in LI or Other.LI that will
+// be pruned:
+//
+// %dst = COPY %src
+// %src = COPY %dst <-- This value to be pruned.
+// %dst = COPY %src <-- This value is a copy of a pruned value.
+//
+bool JoinVals::isPrunedValue(unsigned ValNo, JoinVals &Other) {
+ Val &V = Vals[ValNo];
+ if (V.Pruned || V.PrunedComputed)
+ return V.Pruned;
+
+ if (V.Resolution != CR_Erase && V.Resolution != CR_Merge)
+ return V.Pruned;
+
+ // Follow copies up the dominator tree and check if any intermediate value
+ // has been pruned.
+ V.PrunedComputed = true;
+ V.Pruned = Other.isPrunedValue(V.OtherVNI->id, *this);
+ return V.Pruned;
+}
+
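isPrunedValue above memoizes a walk up the copy chain, marking each value before recursing so a chain is only traversed once. The pattern, reduced to a standalone form (hypothetical Node type, acyclic links):

#include <vector>

struct Node {
  int CopyOf = -1;          // index of the value this one was copied from, or -1
  bool Pruned = false;
  bool PrunedComputed = false;
};

static bool isPruned(std::vector<Node> &Vals, int I) {
  Node &V = Vals[I];
  if (V.Pruned || V.PrunedComputed)
    return V.Pruned;                     // already known (or already pruned)
  V.PrunedComputed = true;               // mark first, as the code above does
  if (V.CopyOf >= 0)
    V.Pruned = isPruned(Vals, V.CopyOf); // inherit pruning from the source value
  return V.Pruned;
}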
+void JoinVals::pruneValues(JoinVals &Other,
+ SmallVectorImpl<SlotIndex> &EndPoints) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ SlotIndex Def = LI.getValNumInfo(i)->def;
+ switch (Vals[i].Resolution) {
+ case CR_Keep:
+ break;
+ case CR_Replace:
+ // This value takes precedence over the value in Other.LI.
+ LIS->pruneValue(&Other.LI, Def, &EndPoints);
+ DEBUG(dbgs() << "\t\tpruned " << PrintReg(Other.LI.reg) << " at " << Def
+ << ": " << Other.LI << '\n');
+ break;
+ case CR_Erase:
+ case CR_Merge:
+ if (isPrunedValue(i, Other)) {
+ // This value is ultimately a copy of a pruned value in LI or Other.LI.
+ // We can no longer trust the value mapping computed by
+ // computeAssignment(), the value that was originally copied could have
+ // been replaced.
+ LIS->pruneValue(&LI, Def, &EndPoints);
+ DEBUG(dbgs() << "\t\tpruned all of " << PrintReg(LI.reg) << " at "
+ << Def << ": " << LI << '\n');
+ }
+ break;
+ case CR_Unresolved:
+ case CR_Impossible:
+ llvm_unreachable("Unresolved conflicts");
+ }
+ }
+}
+
+void JoinVals::eraseInstrs(SmallPtrSet<MachineInstr*, 8> &ErasedInstrs,
+ SmallVectorImpl<unsigned> &ShrinkRegs) {
+ for (unsigned i = 0, e = LI.getNumValNums(); i != e; ++i) {
+ if (Vals[i].Resolution != CR_Erase)
+ continue;
+ SlotIndex Def = LI.getValNumInfo(i)->def;
+ MachineInstr *MI = Indexes->getInstructionFromIndex(Def);
+ assert(MI && "No instruction to erase");
+ if (MI->isCopy()) {
+ unsigned Reg = MI->getOperand(1).getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg) &&
+ Reg != CP.getSrcReg() && Reg != CP.getDstReg())
+ ShrinkRegs.push_back(Reg);
+ }
+ ErasedInstrs.insert(MI);
+ DEBUG(dbgs() << "\t\terased:\t" << Def << '\t' << *MI);
+ LIS->RemoveMachineInstrFromMaps(MI);
+ MI->eraseFromParent();
+ }
+}
+
+bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
+ SmallVector<VNInfo*, 16> NewVNInfo;
+ LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+ LiveInterval &LHS = LIS->getInterval(CP.getDstReg());
+ JoinVals RHSVals(RHS, CP.getSrcIdx(), NewVNInfo, CP, LIS, TRI);
+ JoinVals LHSVals(LHS, CP.getDstIdx(), NewVNInfo, CP, LIS, TRI);
+
+ DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
+ << "\n\t\tLHS = " << PrintReg(CP.getDstReg()) << ' ' << LHS
+ << '\n');
+
+ // First compute NewVNInfo and the simple value mappings.
+ // Detect impossible conflicts early.
+ if (!LHSVals.mapValues(RHSVals) || !RHSVals.mapValues(LHSVals))
+ return false;
+
+ // Some conflicts can only be resolved after all values have been mapped.
+ if (!LHSVals.resolveConflicts(RHSVals) || !RHSVals.resolveConflicts(LHSVals))
+ return false;
+
+ // All clear, the live ranges can be merged.
+
+ // The merging algorithm in LiveInterval::join() can't handle conflicting
+ // value mappings, so we need to remove any live ranges that overlap a
+ // CR_Replace resolution. Collect a set of end points that can be used to
+ // restore the live range after joining.
+ SmallVector<SlotIndex, 8> EndPoints;
+ LHSVals.pruneValues(RHSVals, EndPoints);
+ RHSVals.pruneValues(LHSVals, EndPoints);
+
+ // Erase COPY and IMPLICIT_DEF instructions. This may cause some external
+ // registers to require trimming.
+ SmallVector<unsigned, 8> ShrinkRegs;
+ LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
+ while (!ShrinkRegs.empty())
+ LIS->shrinkToUses(&LIS->getInterval(ShrinkRegs.pop_back_val()));
+
+ // Join RHS into LHS.
+ LHS.join(RHS, LHSVals.getAssignments(), RHSVals.getAssignments(), NewVNInfo,
+ MRI);
+
+ // Kill flags are going to be wrong if the live ranges were overlapping.
+ // Eventually, we should simply clear all kill flags when computing live
+ // ranges. They are reinserted after register allocation.
+ MRI->clearKillFlags(LHS.reg);
+ MRI->clearKillFlags(RHS.reg);
+
+ if (EndPoints.empty())
+ return true;
+
+ // Recompute the parts of the live range we had to remove because of
+ // CR_Replace conflicts.
+ DEBUG(dbgs() << "\t\trestoring liveness to " << EndPoints.size()
+ << " points: " << LHS << '\n');
+ LIS->extendToIndices(&LHS, EndPoints);
+ return true;
+}
+
/// ComputeUltimateVN - Assuming we are going to join two live intervals,
/// compute what the resultant value numbers for each value in the input two
/// ranges will be. This is complicated by copies between the two which can
@@ -1229,6 +1961,9 @@ bool RegisterCoalescer::joinIntervals(CoalescerPair &CP) {
if (CP.isPhys())
return joinReservedPhysReg(CP);
+ if (NewCoalescer)
+ return joinVirtRegs(CP);
+
LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
DEBUG(dbgs() << "\t\tRHS = " << PrintReg(CP.getSrcReg()) << ' ' << RHS
<< '\n');
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 6cdfe7cd72..b267aea816 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -63,7 +63,7 @@ void RegisterPressure::decrease(const TargetRegisterClass *RC,
decreaseSetPressure(MaxSetPressure, RC, TRI);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegisterPressure::dump(const TargetRegisterInfo *TRI) {
dbgs() << "Live In: ";
for (unsigned i = 0, e = LiveInRegs.size(); i < e; ++i)
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index af8cd8f347..9a65071001 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -279,7 +279,7 @@ void SUnit::ComputeHeight() {
} while (!WorkList.empty());
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// SUnit - Scheduling unit. It's a wrapper around either a single SDNode or
/// a group of nodes flagged together.
void SUnit::dump(const ScheduleDAG *G) const {
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 2d8f235c66..a1a4efd108 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -52,6 +52,9 @@ ScheduleDAGInstrs::ScheduleDAGInstrs(MachineFunction &mf,
DbgValues.clear();
assert(!(IsPostRA && MRI.getNumVirtRegs()) &&
"Virtual registers must be removed prior to PostRA scheduling");
+
+ const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
+ SchedModel.init(*ST.getSchedModel(), &ST, TII);
}
/// getUnderlyingObjectFromInt - This is the function that does the work of
@@ -274,15 +277,13 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
// perform its own adjustments.
SDep dep(SU, SDep::Data, LDataLatency, *Alias);
if (!UnitLatencies) {
- unsigned Latency =
- TII->computeOperandLatency(InstrItins, SU->getInstr(), OperIdx,
- (UseOp < 0 ? 0 : UseMI), UseOp);
- dep.setLatency(Latency);
- unsigned MinLatency =
- TII->computeOperandLatency(InstrItins, SU->getInstr(), OperIdx,
- (UseOp < 0 ? 0 : UseMI), UseOp,
- /*FindMin=*/true);
- dep.setMinLatency(MinLatency);
+ MachineInstr *RegUse = UseOp < 0 ? 0 : UseMI;
+ dep.setLatency(
+ SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+ RegUse, UseOp, /*FindMin=*/false));
+ dep.setMinLatency(
+ SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
+ RegUse, UseOp, /*FindMin=*/true));
ST.adjustSchedDependency(SU, UseSU, dep);
}
@@ -477,13 +478,10 @@ void ScheduleDAGInstrs::addVRegUseDeps(SUnit *SU, unsigned OperIdx) {
// Adjust the dependence latency using operand def/use information, then
// allow the target to perform its own adjustments.
int DefOp = Def->findRegisterDefOperandIdx(Reg);
- unsigned Latency =
- TII->computeOperandLatency(InstrItins, Def, DefOp, MI, OperIdx);
- dep.setLatency(Latency);
- unsigned MinLatency =
- TII->computeOperandLatency(InstrItins, Def, DefOp, MI, OperIdx,
- /*FindMin=*/true);
- dep.setMinLatency(MinLatency);
+ dep.setLatency(
+ SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, false));
+ dep.setMinLatency(
+ SchedModel.computeOperandLatency(Def, DefOp, MI, OperIdx, true));
const TargetSubtargetInfo &ST = TM.getSubtarget<TargetSubtargetInfo>();
ST.adjustSchedDependency(DefSU, SU, const_cast<SDep &>(dep));
@@ -1012,7 +1010,7 @@ void ScheduleDAGInstrs::computeLatency(SUnit *SU) {
}
void ScheduleDAGInstrs::dumpNode(const SUnit *SU) const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
SU->getInstr()->dump();
#endif
}
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 5ca22b23fe..2cd84d670a 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -89,7 +89,7 @@ void ScoreboardHazardRecognizer::Reset() {
ReservedScoreboard.reset();
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScoreboardHazardRecognizer::Scoreboard::dump() const {
dbgs() << "Scoreboard:\n";
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 32c97c59b3..0107ded953 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -1644,7 +1644,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
return N0.getOperand(0);
// fold C2-(A+C1) -> (C2-C1)-A
if (N1.getOpcode() == ISD::ADD && N0C && N1C1) {
- SDValue NewC = DAG.getConstant((N0C->getAPIntValue() - N1C1->getAPIntValue()), VT);
+ SDValue NewC = DAG.getConstant(N0C->getAPIntValue() - N1C1->getAPIntValue(),
+ VT);
return DAG.getNode(ISD::SUB, N->getDebugLoc(), VT, NewC,
N1.getOperand(0));
}
@@ -2346,16 +2347,19 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
// we don't want to undo this promotion.
// We also handle SCALAR_TO_VECTOR because xor/or/and operations are cheaper
// on scalars.
- if ((N0.getOpcode() == ISD::BITCAST || N0.getOpcode() == ISD::SCALAR_TO_VECTOR)
- && Level == AfterLegalizeTypes) {
+ if ((N0.getOpcode() == ISD::BITCAST ||
+ N0.getOpcode() == ISD::SCALAR_TO_VECTOR) &&
+ Level == AfterLegalizeTypes) {
SDValue In0 = N0.getOperand(0);
SDValue In1 = N1.getOperand(0);
EVT In0Ty = In0.getValueType();
EVT In1Ty = In1.getValueType();
- // If both incoming values are integers, and the original types are the same.
+ DebugLoc DL = N->getDebugLoc();
+ // If both incoming values are integers, and the original types are the
+ // same.
if (In0Ty.isInteger() && In1Ty.isInteger() && In0Ty == In1Ty) {
- SDValue Op = DAG.getNode(N->getOpcode(), N->getDebugLoc(), In0Ty, In0, In1);
- SDValue BC = DAG.getNode(N0.getOpcode(), N->getDebugLoc(), VT, Op);
+ SDValue Op = DAG.getNode(N->getOpcode(), DL, In0Ty, In0, In1);
+ SDValue BC = DAG.getNode(N0.getOpcode(), DL, VT, Op);
AddToWorkList(Op.getNode());
return BC;
}
@@ -4057,7 +4061,8 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
if (VT.isInteger() &&
(VT0 == MVT::i1 ||
(VT0.isInteger() &&
- TLI.getBooleanContents(false) == TargetLowering::ZeroOrOneBooleanContent)) &&
+ TLI.getBooleanContents(false) ==
+ TargetLowering::ZeroOrOneBooleanContent)) &&
N1C && N2C && N1C->isNullValue() && N2C->getAPIntValue() == 1) {
SDValue XORNode;
if (VT == VT0)
@@ -5441,7 +5446,8 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
// This often reduces constant pool loads.
if (((N0.getOpcode() == ISD::FNEG && !TLI.isFNegFree(VT)) ||
(N0.getOpcode() == ISD::FABS && !TLI.isFAbsFree(VT))) &&
- N0.getNode()->hasOneUse() && VT.isInteger() && !VT.isVector()) {
+ N0.getNode()->hasOneUse() && VT.isInteger() &&
+ !VT.isVector() && !N0.getValueType().isVector()) {
SDValue NewConv = DAG.getNode(ISD::BITCAST, N0.getDebugLoc(), VT,
N0.getOperand(0));
AddToWorkList(NewConv.getNode());
@@ -5675,12 +5681,12 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
return N0;
// fold (fadd A, (fneg B)) -> (fsub A, B)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+ isNegatibleForFree(N1, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N0,
GetNegatedExpression(N1, DAG, LegalOperations));
// fold (fadd (fneg A), B) -> (fsub B, A)
if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
+ isNegatibleForFree(N0, LegalOperations, TLI, &DAG.getTarget().Options) == 2)
return DAG.getNode(ISD::FSUB, N->getDebugLoc(), VT, N1,
GetNegatedExpression(N0, DAG, LegalOperations));
@@ -7710,9 +7716,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// Transform: (EXTRACT_VECTOR_ELT( VECTOR_SHUFFLE )) -> EXTRACT_VECTOR_ELT.
// We only perform this optimization before the op legalization phase because
- // we may introduce new vector instructions which are not backed by TD patterns.
- // For example on AVX, extracting elements from a wide vector without using
- // extract_subvector.
+ // we may introduce new vector instructions which are not backed by TD
+ // patterns. For example on AVX, extracting elements from a wide vector
+ // without using extract_subvector.
if (InVec.getOpcode() == ISD::VECTOR_SHUFFLE
&& ConstEltNo && !LegalOperations) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
@@ -8096,7 +8102,8 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
// If VecIn2 is unused then change it to undef.
VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
- // Check that we were able to transform all incoming values to the same type.
+ // Check that we were able to transform all incoming values to the same
+ // type.
if (VecIn2.getValueType() != VecIn1.getValueType() ||
VecIn1.getValueType() != VT)
return SDValue();
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 6d2cdeabd1..5e631c2d8e 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -314,8 +314,6 @@ InstrEmitter::AddRegisterOperand(MachineInstr *MI, SDValue Op,
const TargetRegisterClass *DstRC = 0;
if (IIOpNum < II->getNumOperands())
DstRC = TRI->getAllocatableClass(TII->getRegClass(*II,IIOpNum,TRI,*MF));
- assert((DstRC || (MI->isVariadic() && IIOpNum >= MCID.getNumOperands())) &&
- "Don't have operand info for this instruction!");
if (DstRC && !MRI->constrainRegClass(VReg, DstRC, MinRCSize)) {
unsigned NewVReg = MRI->createVirtualRegister(DstRC);
BuildMI(*MBB, InsertPos, Op.getNode()->getDebugLoc(),
@@ -412,6 +410,7 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op,
ES->getTargetFlags()));
} else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op)) {
MI->addOperand(MachineOperand::CreateBA(BA->getBlockAddress(),
+ BA->getOffset(),
BA->getTargetFlags()));
} else if (TargetIndexSDNode *TI = dyn_cast<TargetIndexSDNode>(Op)) {
MI->addOperand(MachineOperand::CreateTargetIndex(TI->getIndex(),
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 7b341700b8..8b291e9a3c 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -869,25 +869,24 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
switch (TLI.getOperationAction(Node->getOpcode(), VT)) {
default: llvm_unreachable("This action is not supported yet!");
case TargetLowering::Legal:
- // If this is an unaligned load and the target doesn't support it,
- // expand it.
- if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
- Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment =
- TLI.getTargetData()->getABITypeAlignment(Ty);
- if (LD->getAlignment() < ABIAlignment){
- ExpandUnalignedLoad(cast<LoadSDNode>(Node),
- DAG, TLI, RVal, RChain);
- }
- }
- break;
+ // If this is an unaligned load and the target doesn't support it,
+ // expand it.
+ if (!TLI.allowsUnalignedMemoryAccesses(LD->getMemoryVT())) {
+ Type *Ty = LD->getMemoryVT().getTypeForEVT(*DAG.getContext());
+ unsigned ABIAlignment =
+ TLI.getTargetData()->getABITypeAlignment(Ty);
+ if (LD->getAlignment() < ABIAlignment){
+ ExpandUnalignedLoad(cast<LoadSDNode>(Node), DAG, TLI, RVal, RChain);
+ }
+ }
+ break;
case TargetLowering::Custom: {
- SDValue Res = TLI.LowerOperation(RVal, DAG);
- if (Res.getNode()) {
- RVal = Res;
- RChain = Res.getValue(1);
- }
- break;
+ SDValue Res = TLI.LowerOperation(RVal, DAG);
+ if (Res.getNode()) {
+ RVal = Res;
+ RChain = Res.getValue(1);
+ }
+ break;
}
case TargetLowering::Promote: {
// Only promote a load of vector type to another.
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index e8e968aaef..1cccf1a057 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -2253,32 +2253,35 @@ void DAGTypeLegalizer::ExpandIntRes_UADDSUBO(SDNode *N,
void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
SDValue &Lo, SDValue &Hi) {
EVT VT = N->getValueType(0);
- Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
- EVT PtrVT = TLI.getPointerTy();
- Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
DebugLoc dl = N->getDebugLoc();
// A divide for UMULO should be faster than a function call.
if (N->getOpcode() == ISD::UMULO) {
SDValue LHS = N->getOperand(0), RHS = N->getOperand(1);
- DebugLoc DL = N->getDebugLoc();
- SDValue MUL = DAG.getNode(ISD::MUL, DL, LHS.getValueType(), LHS, RHS);
+ SDValue MUL = DAG.getNode(ISD::MUL, dl, LHS.getValueType(), LHS, RHS);
SplitInteger(MUL, Lo, Hi);
// A divide for UMULO will be faster than a function call. Select to
// make sure we aren't using 0.
SDValue isZero = DAG.getSetCC(dl, TLI.getSetCCResultType(VT),
- RHS, DAG.getConstant(0, VT), ISD::SETNE);
+ RHS, DAG.getConstant(0, VT), ISD::SETEQ);
SDValue NotZero = DAG.getNode(ISD::SELECT, dl, VT, isZero,
DAG.getConstant(1, VT), RHS);
- SDValue DIV = DAG.getNode(ISD::UDIV, DL, LHS.getValueType(), MUL, NotZero);
- SDValue Overflow;
- Overflow = DAG.getSetCC(DL, N->getValueType(1), DIV, LHS, ISD::SETNE);
+ SDValue DIV = DAG.getNode(ISD::UDIV, dl, VT, MUL, NotZero);
+ SDValue Overflow = DAG.getSetCC(dl, N->getValueType(1), DIV, LHS,
+ ISD::SETNE);
+ Overflow = DAG.getNode(ISD::SELECT, dl, N->getValueType(1), isZero,
+ DAG.getConstant(0, N->getValueType(1)),
+ Overflow);
ReplaceValueWith(SDValue(N, 1), Overflow);
return;
}
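The divide trick in the UMULO fast path above detects unsigned multiply overflow without widening: dividing the truncated product by one (nonzero) operand recovers the other exactly when no overflow occurred. A worked sketch at 8 bits (values hypothetical):

#include <cstdint>
#include <cstdio>

int main() {
  uint8_t LHS = 25, RHS = 11;              // 25 * 11 = 275 > 255: UMULO overflows
  uint8_t MUL = uint8_t(LHS * RHS);        // truncated product: 275 mod 256 = 19
  uint8_t NotZero = RHS ? RHS : 1;         // mirrors the SELECT guarding the division
  bool Overflow = (MUL / NotZero) != LHS;  // 19 / 11 = 1 != 25 -> overflow
  if (RHS == 0)
    Overflow = false;                      // mirrors the added SELECT: 0 * x never overflows
  std::printf("overflow: %d\n", Overflow); // prints 1
}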
+ Type *RetTy = VT.getTypeForEVT(*DAG.getContext());
+ EVT PtrVT = TLI.getPointerTy();
+ Type *PtrTy = PtrVT.getTypeForEVT(*DAG.getContext());
+
// Replace this with a libcall that will check overflow.
RTLIB::Libcall LC = RTLIB::UNKNOWN_LIBCALL;
if (VT == MVT::i32)
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 06f6bd63b6..7b8f138a34 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -94,14 +94,44 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
if (InVT.isVector() && OutVT.isInteger()) {
// Handle cases like i64 = BITCAST v1i64 on x86, where the operand
// is legal but the result is not.
- EVT NVT = EVT::getVectorVT(*DAG.getContext(), NOutVT, 2);
+ unsigned NumElems = 2;
+ EVT ElemVT = NOutVT;
+ EVT NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+
+ // If <ElemVT * N> is not a legal type, try <ElemVT/2 * (N*2)>.
+ while (!isTypeLegal(NVT)) {
+ unsigned NewSizeInBits = ElemVT.getSizeInBits() / 2;
+ // If the element size is smaller than byte, bail.
+ if (NewSizeInBits < 8)
+ break;
+ NumElems *= 2;
+ ElemVT = EVT::getIntegerVT(*DAG.getContext(), NewSizeInBits);
+ NVT = EVT::getVectorVT(*DAG.getContext(), ElemVT, NumElems);
+ }
if (isTypeLegal(NVT)) {
SDValue CastInOp = DAG.getNode(ISD::BITCAST, dl, NVT, InOp);
- Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
- DAG.getIntPtrConstant(0));
- Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NOutVT, CastInOp,
- DAG.getIntPtrConstant(1));
+
+ SmallVector<SDValue, 8> Vals;
+ for (unsigned i = 0; i < NumElems; ++i)
+ Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
+ CastInOp, DAG.getIntPtrConstant(i)));
+
+ // Build Lo, Hi pair by pairing extracted elements if needed.
+ unsigned Slot = 0;
+ for (unsigned e = Vals.size(); e - Slot > 2; Slot += 2, e += 1) {
+ // Each iteration will BUILD_PAIR two nodes and append the result until
+ // there are only two nodes left, i.e. Lo and Hi.
+ SDValue LHS = Vals[Slot];
+ SDValue RHS = Vals[Slot + 1];
+ Vals.push_back(DAG.getNode(ISD::BUILD_PAIR, dl,
+ EVT::getIntegerVT(
+ *DAG.getContext(),
+ LHS.getValueType().getSizeInBits() << 1),
+ LHS, RHS));
+ }
+ Lo = Vals[Slot++];
+ Hi = Vals[Slot++];
if (TLI.isBigEndian())
std::swap(Lo, Hi);
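The while loop in this hunk searches for a legal vector type by halving the element width and doubling the element count. A standalone trace of that search with a faked legality rule (assume only elements of 16 bits or fewer are legal):

#include <cstdio>

static bool isLegalElem(unsigned Bits) { return Bits <= 16; } // pretend rule

int main() {
  unsigned ElemBits = 32, NumElems = 2;            // start with <2 x i32>
  while (!isLegalElem(ElemBits)) {
    if (ElemBits / 2 < 8)
      break;                                       // never go below byte-sized elements
    ElemBits /= 2;
    NumElems *= 2;                                 // try <2N x i(Bits/2)> next
  }
  std::printf("<%u x i%u>\n", NumElems, ElemBits); // prints <4 x i16>
}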
diff --git a/lib/CodeGen/SelectionDAG/SDNodeOrdering.h b/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
index f88b26d5c4..d2269f8acc 100644
--- a/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
+++ b/lib/CodeGen/SelectionDAG/SDNodeOrdering.h
@@ -28,8 +28,8 @@ class SDNode;
class SDNodeOrdering {
DenseMap<const SDNode*, unsigned> OrderMap;
- void operator=(const SDNodeOrdering&); // Do not implement.
- SDNodeOrdering(const SDNodeOrdering&); // Do not implement.
+ void operator=(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
+ SDNodeOrdering(const SDNodeOrdering&) LLVM_DELETED_FUNCTION;
public:
SDNodeOrdering() {}
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index 2b86e36991..68be9ecc03 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -1758,7 +1758,7 @@ public:
return V;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump(ScheduleDAG *DAG) const {
// Emulate pop() without clobbering NodeQueueIds.
std::vector<SUnit*> DumpQueue = Queue;
@@ -1897,7 +1897,7 @@ unsigned RegReductionPQBase::getNodePriority(const SUnit *SU) const {
//===----------------------------------------------------------------------===//
void RegReductionPQBase::dumpRegPressure() const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
for (TargetRegisterInfo::regclass_iterator I = TRI->regclass_begin(),
E = TRI->regclass_end(); I != E; ++I) {
const TargetRegisterClass *RC = *I;
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 222dc559a2..660223a505 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -643,7 +643,7 @@ void ScheduleDAGSDNodes::computeOperandLatency(SDNode *Def, SDNode *Use,
}
void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
if (!SU->getNode()) {
dbgs() << "PHYS REG COPY\n";
return;
@@ -663,7 +663,7 @@ void ScheduleDAGSDNodes::dumpNode(const SUnit *SU) const {
#endif
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ScheduleDAGSDNodes::dumpSchedule() const {
for (unsigned i = 0, e = Sequence.size(); i != e; i++) {
if (SUnit *SU = Sequence[i])
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index d85d41bd86..bd33479b94 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -494,8 +494,10 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
}
case ISD::TargetBlockAddress:
case ISD::BlockAddress: {
- ID.AddPointer(cast<BlockAddressSDNode>(N)->getBlockAddress());
- ID.AddInteger(cast<BlockAddressSDNode>(N)->getTargetFlags());
+ const BlockAddressSDNode *BA = cast<BlockAddressSDNode>(N);
+ ID.AddPointer(BA->getBlockAddress());
+ ID.AddInteger(BA->getOffset());
+ ID.AddInteger(BA->getTargetFlags());
break;
}
} // end switch (N->getOpcode())
@@ -1470,6 +1472,7 @@ SDValue SelectionDAG::getEHLabel(DebugLoc dl, SDValue Root, MCSymbol *Label) {
SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
+ int64_t Offset,
bool isTarget,
unsigned char TargetFlags) {
unsigned Opc = isTarget ? ISD::TargetBlockAddress : ISD::BlockAddress;
@@ -1477,12 +1480,14 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), 0, 0);
ID.AddPointer(BA);
+ ID.AddInteger(Offset);
ID.AddInteger(TargetFlags);
void *IP = 0;
if (SDNode *E = CSEMap.FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, TargetFlags);
+ SDNode *N = new (NodeAllocator) BlockAddressSDNode(Opc, VT, BA, Offset,
+ TargetFlags);
CSEMap.InsertNode(N, IP);
AllNodes.push_back(N);
return SDValue(N, 0);
@@ -2454,9 +2459,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, DebugLoc DL,
}
case ISD::BITCAST:
if (VT == MVT::f32 && C->getValueType(0) == MVT::i32)
- return getConstantFP(Val.bitsToFloat(), VT);
+ return getConstantFP(APFloat(Val), VT);
else if (VT == MVT::f64 && C->getValueType(0) == MVT::i64)
- return getConstantFP(Val.bitsToDouble(), VT);
+ return getConstantFP(APFloat(Val), VT);
break;
case ISD::BSWAP:
return getConstant(Val.byteSwap(), VT);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index bf338990ce..a870ee2ac8 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -482,11 +482,16 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
OS << "<" << *M->getMemOperand() << ">";
} else if (const BlockAddressSDNode *BA =
dyn_cast<BlockAddressSDNode>(this)) {
+ int64_t offset = BA->getOffset();
OS << "<";
WriteAsOperand(OS, BA->getBlockAddress()->getFunction(), false);
OS << ", ";
WriteAsOperand(OS, BA->getBlockAddress()->getBasicBlock(), false);
OS << ">";
+ if (offset > 0)
+ OS << " + " << offset;
+ else
+ OS << " " << offset;
if (unsigned int TF = BA->getTargetFlags())
OS << " [TF=" << TF << ']';
}
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 0337492049..213b2b6c80 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -1996,7 +1996,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
return Res;
}
-/// CheckPatternPredicate - Implements OP_CheckPatternPredicate.
+/// CheckSame - Implements OP_CheckSame.
LLVM_ATTRIBUTE_ALWAYS_INLINE static bool
CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
SDValue N,
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index dcaa9ba923..56f3a45c9a 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -996,9 +996,9 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
EVT VT = ValueVTs[j];
ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
- if (attr & Attribute::SExt)
+ if (attr.hasSExtAttr())
ExtendKind = ISD::SIGN_EXTEND;
- else if (attr & Attribute::ZExt)
+ else if (attr.hasZExtAttr())
ExtendKind = ISD::ZERO_EXTEND;
// FIXME: C calling convention requires the return type to be promoted to
@@ -1016,18 +1016,17 @@ void llvm::GetReturnInfo(Type* ReturnType, Attributes attr,
// 'inreg' on function refers to return value
ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
- if (attr & Attribute::InReg)
+ if (attr.hasInRegAttr())
Flags.setInReg();
// Propagate extension type if any
- if (attr & Attribute::SExt)
+ if (attr.hasSExtAttr())
Flags.setSExt();
- else if (attr & Attribute::ZExt)
+ else if (attr.hasZExtAttr())
Flags.setZExt();
- for (unsigned i = 0; i < NumParts; ++i) {
+ for (unsigned i = 0; i < NumParts; ++i)
Outs.push_back(ISD::OutputArg(Flags, PartVT, /*isFixed=*/true));
- }
}
}
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index c98efb480c..95faafab45 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -143,7 +143,7 @@ void SlotIndexes::renumberIndexes(IndexList::iterator curItr) {
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void SlotIndexes::dump() const {
for (IndexList::const_iterator itr = indexList.begin();
itr != indexList.end(); ++itr) {
@@ -170,7 +170,7 @@ void SlotIndex::print(raw_ostream &os) const {
os << "invalid";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// Dump a SlotIndex to stderr.
void SlotIndex::dump() const {
print(dbgs());
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 96151b6363..dca15ee758 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -356,7 +356,7 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
Edit->anyRematerializable(0);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void SplitEditor::dump() const {
if (RegAssign.empty()) {
dbgs() << " empty\n";
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index 6df932c1ae..54d8c8cde7 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -59,12 +59,23 @@ using namespace llvm;
static cl::opt<bool>
DisableColoring("no-stack-coloring",
- cl::init(true), cl::Hidden,
- cl::desc("Suppress stack coloring"));
+ cl::init(false), cl::Hidden,
+ cl::desc("Disable stack coloring"));
-STATISTIC(NumMarkerSeen, "Number of life markers found.");
+/// The user may write code that uses allocas outside of the declared lifetime
+/// zone. This can happen when the user returns a reference to a local
+/// data-structure. We can detect these cases and decide not to optimize the
+/// code. If this flag is enabled, we try to save the user.
+static cl::opt<bool>
+ProtectFromEscapedAllocas("protect-from-escaped-allocas",
+ cl::init(false), cl::Hidden,
+ cl::desc("Do not optimize lifetime zones that are broken"));
+
+STATISTIC(NumMarkerSeen, "Number of lifetime markers found.");
STATISTIC(StackSpaceSaved, "Number of bytes saved due to merging slots.");
STATISTIC(StackSlotMerged, "Number of stack slots merged.");
+STATISTIC(EscapedAllocas,
+ "Number of allocas that escaped the lifetime region");
//===----------------------------------------------------------------------===//
// StackColoring Pass
@@ -104,7 +115,7 @@ class StackColoring : public MachineFunctionPass {
/// VNInfo is used for the construction of LiveIntervals.
VNInfo::Allocator VNInfoAllocator;
/// SlotIndex analysis object.
- SlotIndexes* Indexes;
+ SlotIndexes *Indexes;
/// The list of lifetime markers found. These markers are to be removed
/// once the coloring is done.
@@ -158,6 +169,14 @@ private:
/// slots to use the joint slots.
void remapInstructions(DenseMap<int, int> &SlotRemap);
+ /// The input program may contain instructions which are not inside lifetime
+ /// markers. This can happen due to a bug in the compiler or due to a bug in
+ /// user code (for example, returning a reference to a local variable).
+ /// This procedure checks all of the instructions in the function and
+ /// invalidates lifetime ranges which do not contain all of the instructions
+ /// which access that frame slot.
+ void removeInvalidSlotRanges();
+
/// Map entries which point to other entries to their destination.
/// A->B->C becomes A->C.
void expungeSlotMap(DenseMap<int, int> &SlotRemap, unsigned NumSlots);
@@ -243,8 +262,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot) {
const Value *Allocation = MFI->getObjectAllocation(Slot);
if (Allocation) {
- DEBUG(dbgs()<<"Found lifetime marker for allocation: "<<
- Allocation->getName()<<"\n");
+ DEBUG(dbgs()<<"Found a lifetime marker for slot #"<<Slot<<
+ " with allocation: "<< Allocation->getName()<<"\n");
}
if (IsStart) {
@@ -521,11 +540,17 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
// In a debug build, check that the instruction that we are modifying is
// inside the expected live range. If the instruction is not inside
// the calculated range then it means that the alloca usage moved
- // outside of the lifetime markers.
+ // outside of the lifetime markers, or that the user has a bug.
+ // NOTE: Alloca address calculations which happen outside the lifetime
+ // zone are okay, even though we don't have a good way of validating
+ // all of the uses of the calculation.
#ifndef NDEBUG
- if (!I->isDebugValue()) {
+ bool TouchesMemory = I->mayLoad() || I->mayStore();
+ // If we *don't* protect the user from escaped allocas, don't bother
+ // validating the instructions.
+ if (!I->isDebugValue() && TouchesMemory && ProtectFromEscapedAllocas) {
SlotIndex Index = Indexes->getInstructionIndex(I);
- LiveInterval* Interval = Intervals[FromSlot];
+ LiveInterval *Interval = Intervals[FromSlot];
assert(Interval->find(Index) != Interval->end() &&
"Found instruction usage outside of live range.");
}
@@ -543,6 +568,53 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
DEBUG(dbgs()<<"Fixed "<<FixedInstr<<" machine instructions.\n");
}
+void StackColoring::removeInvalidSlotRanges() {
+ MachineFunction::iterator BB, BBE;
+ MachineBasicBlock::iterator I, IE;
+ for (BB = MF->begin(), BBE = MF->end(); BB != BBE; ++BB)
+ for (I = BB->begin(), IE = BB->end(); I != IE; ++I) {
+
+ if (I->getOpcode() == TargetOpcode::LIFETIME_START ||
+ I->getOpcode() == TargetOpcode::LIFETIME_END || I->isDebugValue())
+ continue;
+
+ // Some intervals are suspicious! In some cases we find address
+ // calculations outside of the lifetime zone, but no actual memory
+ // reads or writes. Memory accesses outside of the lifetime zone are a
+ // clear violation, but address calculations are okay. This can happen
+ // when GEPs are hoisted outside of the lifetime zone.
+ // So here we only check instructions which can read or write memory.
+ if (!I->mayLoad() && !I->mayStore())
+ continue;
+
+ // Check all of the machine operands.
+ for (unsigned i = 0 ; i < I->getNumOperands(); ++i) {
+ MachineOperand &MO = I->getOperand(i);
+
+ if (!MO.isFI())
+ continue;
+
+ int Slot = MO.getIndex();
+
+ if (Slot < 0)
+ continue;
+
+ if (Intervals[Slot]->empty())
+ continue;
+
+ // Check that the used slot is inside the calculated lifetime range.
+ // If it is not, warn about it and invalidate the range.
+ LiveInterval *Interval = Intervals[Slot];
+ SlotIndex Index = Indexes->getInstructionIndex(I);
+ if (Interval->find(Index) == Interval->end()) {
+ Intervals[Slot]->clear();
+ DEBUG(dbgs()<<"Invalidating range #"<<Slot<<"\n");
+ EscapedAllocas++;
+ }
+ }
+ }
+}
+
void StackColoring::expungeSlotMap(DenseMap<int, int> &SlotRemap,
unsigned NumSlots) {
// Expunge slot remap map.
@@ -598,7 +670,7 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
DEBUG(dbgs()<<"Total Stack size: "<<TotalSize<<" bytes\n\n");
// Don't continue because there are not enough lifetime markers, or the
- // stack or too small, or we are told not to optimize the slots.
+ // stack is too small, or we are told not to optimize the slots.
if (NumMarkers < 2 || TotalSize < 16 || DisableColoring) {
DEBUG(dbgs()<<"Will not try to merge slots.\n");
return removeAllMarkers();
@@ -617,6 +689,11 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
// Propagate the liveness information.
calculateLiveIntervals(NumSlots);
+ // Search for allocas which are used outside of the declared lifetime
+ // markers.
+ if (ProtectFromEscapedAllocas)
+ removeInvalidSlotRanges();
+
// Maps old slots to new slots.
DenseMap<int, int> SlotRemap;
unsigned RemovedSlots = 0;
diff --git a/lib/CodeGen/StackSlotColoring.cpp b/lib/CodeGen/StackSlotColoring.cpp
index 9d0fd0aa20..d349abc357 100644
--- a/lib/CodeGen/StackSlotColoring.cpp
+++ b/lib/CodeGen/StackSlotColoring.cpp
@@ -11,7 +11,7 @@
//
//===----------------------------------------------------------------------===//
-#define DEBUG_TYPE "stackcoloring"
+#define DEBUG_TYPE "stackslotcoloring"
#include "llvm/Module.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp
index 7e7f835040..8ed66f7044 100644
--- a/lib/CodeGen/TargetInstrInfoImpl.cpp
+++ b/lib/CodeGen/TargetInstrInfoImpl.cpp
@@ -606,13 +606,13 @@ getOperandLatency(const InstrItineraryData *ItinData,
/// If we can determine the operand latency from the def only, without itinerary
/// lookup, do so. Otherwise return -1.
-static int computeDefOperandLatency(
- const TargetInstrInfo *TII, const InstrItineraryData *ItinData,
- const MachineInstr *DefMI, bool FindMin) {
+int TargetInstrInfo::computeDefOperandLatency(
+ const InstrItineraryData *ItinData,
+ const MachineInstr *DefMI, bool FindMin) const {
// Let the target hook getInstrLatency handle missing itineraries.
if (!ItinData)
- return TII->getInstrLatency(ItinData, DefMI);
+ return getInstrLatency(ItinData, DefMI);
// Return a latency based on the itinerary properties and defining instruction
// if possible. Some common subtargets don't require per-operand latency,
@@ -621,7 +621,7 @@ static int computeDefOperandLatency(
// If MinLatency is valid, call getInstrLatency. This uses Stage latency if
// it exists before defaulting to MinLatency.
if (ItinData->SchedModel->MinLatency >= 0)
- return TII->getInstrLatency(ItinData, DefMI);
+ return getInstrLatency(ItinData, DefMI);
// If MinLatency is invalid, OperandLatency is interpreted as MinLatency.
// For empty itineraries, short-circuit the check and default to one cycle.
@@ -629,7 +629,7 @@ static int computeDefOperandLatency(
return 1;
}
else if (ItinData->isEmpty())
- return TII->defaultDefLatency(ItinData->SchedModel, DefMI);
+ return defaultDefLatency(ItinData->SchedModel, DefMI);
// ...operand lookup required
return -1;
@@ -652,7 +652,7 @@ computeOperandLatency(const InstrItineraryData *ItinData,
const MachineInstr *UseMI, unsigned UseIdx,
bool FindMin) const {
- int DefLatency = computeDefOperandLatency(this, ItinData, DefMI, FindMin);
+ int DefLatency = computeDefOperandLatency(ItinData, DefMI, FindMin);
if (DefLatency >= 0)
return DefLatency;
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
new file mode 100644
index 0000000000..353c9cf8aa
--- /dev/null
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -0,0 +1,182 @@
+//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a wrapper around MCSchedModel that allows the interface
+// to benefit from information currently only available in TargetInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+static cl::opt<bool> EnableSchedModel("schedmodel", cl::Hidden, cl::init(false),
+ cl::desc("Use TargetSchedModel for latency lookup"));
+
+static cl::opt<bool> EnableSchedItins("scheditins", cl::Hidden, cl::init(true),
+ cl::desc("Use InstrItineraryData for latency lookup"));
+
+void TargetSchedModel::init(const MCSchedModel &sm,
+ const TargetSubtargetInfo *sti,
+ const TargetInstrInfo *tii) {
+ SchedModel = sm;
+ STI = sti;
+ TII = tii;
+ STI->initInstrItins(InstrItins);
+}
+
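For reference, a client sets the model up from its subtarget, as the ScheduleDAGInstrs change earlier in this patch does (sketch; assumes ST, TII, and the machine instructions are in scope):

TargetSchedModel SchedModel;
SchedModel.init(*ST.getSchedModel(), &ST, TII);
unsigned Latency = SchedModel.computeOperandLatency(DefMI, DefOp,
                                                    UseMI, UseOp,
                                                    /*FindMin=*/false);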
+/// If we can determine the operand latency from the def only, without machine
+/// model or itinerary lookup, do so. Otherwise return -1.
+int TargetSchedModel::getDefLatency(const MachineInstr *DefMI,
+ bool FindMin) const {
+
+ // Return a latency based on the itinerary properties and defining instruction
+ // if possible. Some common subtargets don't require per-operand latency,
+ // especially for minimum latencies.
+ if (FindMin) {
+ // If MinLatency is invalid, then use the itinerary for MinLatency. If no
+ // itinerary exists either, then use a single-cycle latency.
+ if (SchedModel.MinLatency < 0
+ && !(EnableSchedItins && hasInstrItineraries())) {
+ return 1;
+ }
+ return SchedModel.MinLatency;
+ }
+ else if (!(EnableSchedModel && hasInstrSchedModel())
+ && !(EnableSchedItins && hasInstrItineraries())) {
+ return TII->defaultDefLatency(&SchedModel, DefMI);
+ }
+ // ...operand lookup required
+ return -1;
+}
+
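Condensed, the fall-through order getDefLatency implements above (note that a negative MinLatency returned on the itinerary path also signals "look up the operand"):

//   FindMin,  MinLatency >= 0                      -> MinLatency from the model
//   FindMin,  MinLatency < 0, no itineraries       -> 1 cycle
//   FindMin,  MinLatency < 0, itineraries enabled  -> negative: operand lookup
//   !FindMin, neither model nor itineraries usable -> TII->defaultDefLatency(...)
//   !FindMin, otherwise                            -> -1: operand lookup required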
+/// Return the MCSchedClassDesc for this instruction. Some SchedClasses require
+/// evaluation of predicates that depend on instruction operands or flags.
+const MCSchedClassDesc *TargetSchedModel::
+resolveSchedClass(const MachineInstr *MI) const {
+
+ // Get the definition's scheduling class descriptor from this machine model.
+ unsigned SchedClass = MI->getDesc().getSchedClass();
+ const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+
+#ifndef NDEBUG
+ unsigned NIter = 0;
+#endif
+ while (SCDesc->isVariant()) {
+ assert(++NIter < 6 && "Variants are nested deeper than the magic number");
+
+ SchedClass = STI->resolveSchedClass(SchedClass, MI, this);
+ SCDesc = SchedModel.getSchedClassDesc(SchedClass);
+ }
+ return SCDesc;
+}
+
+/// Find the def index of this operand. This index maps to the machine model and
+/// is independent of use operands. Def operands may be reordered with uses or
+/// merged with uses without affecting the def index (e.g. before/after
+/// regalloc). However, an instruction's def operands must never be reordered
+/// with respect to each other.
+static unsigned findDefIdx(const MachineInstr *MI, unsigned DefOperIdx) {
+ unsigned DefIdx = 0;
+ for (unsigned i = 0; i != DefOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.isDef())
+ ++DefIdx;
+ }
+ return DefIdx;
+}
+
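A worked example of findDefIdx's counting (hypothetical operand layout):

// operands: 0:%d0 (def)  1:%u0 (use)  2:%d1 (def)  3:%u1 (use)
// findDefIdx(MI, /*DefOperIdx=*/2) counts the defs before operand 2, so
// DefIdx == 1: %d1 is the machine model's second write, independent of
// how the use operands are interleaved or later reordered around it.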
+/// Find the use index of this operand. This is independent of the instruction's
+/// def operands.
+///
+/// Note that uses are not determined by the operand's isUse property, which
+/// is simply the inverse of isDef. Here we consider any readsReg operand to be
+/// a "use". The machine model allows an operand to be both a Def and Use.
+static unsigned findUseIdx(const MachineInstr *MI, unsigned UseOperIdx) {
+ unsigned UseIdx = 0;
+ for (unsigned i = 0; i != UseOperIdx; ++i) {
+ const MachineOperand &MO = MI->getOperand(i);
+ if (MO.isReg() && MO.readsReg())
+ ++UseIdx;
+ }
+ return UseIdx;
+}
+
+// Top-level API for clients that know the operand indices.
+unsigned TargetSchedModel::computeOperandLatency(
+ const MachineInstr *DefMI, unsigned DefOperIdx,
+ const MachineInstr *UseMI, unsigned UseOperIdx,
+ bool FindMin) const {
+
+ int DefLatency = getDefLatency(DefMI, FindMin);
+ if (DefLatency >= 0)
+ return DefLatency;
+
+ if (!FindMin && EnableSchedModel && hasInstrSchedModel()) {
+ const MCSchedClassDesc *SCDesc = resolveSchedClass(DefMI);
+ unsigned DefIdx = findDefIdx(DefMI, DefOperIdx);
+ if (DefIdx < SCDesc->NumWriteLatencyEntries) {
+ // Lookup the definition's write latency in SubtargetInfo.
+ const MCWriteLatencyEntry *WLEntry =
+ STI->getWriteLatencyEntry(SCDesc, DefIdx);
+ unsigned WriteID = WLEntry->WriteResourceID;
+ unsigned Latency = WLEntry->Cycles;
+ if (!UseMI)
+ return Latency;
+
+ // Lookup the use's latency adjustment in SubtargetInfo.
+ const MCSchedClassDesc *UseDesc = resolveSchedClass(UseMI);
+ if (UseDesc->NumReadAdvanceEntries == 0)
+ return Latency;
+ unsigned UseIdx = findUseIdx(UseMI, UseOperIdx);
+ return Latency - STI->getReadAdvanceCycles(UseDesc, UseIdx, WriteID);
+ }
+ // If DefIdx does not exist in the model (e.g. implicit defs), then return
+ // unit latency (defaultDefLatency may be too conservative).
+#ifndef NDEBUG
+ if (SCDesc->isValid() && !DefMI->getOperand(DefOperIdx).isImplicit()
+ && !DefMI->getDesc().OpInfo[DefOperIdx].isOptionalDef()) {
+ std::string Err;
+ raw_string_ostream ss(Err);
+ ss << "DefIdx " << DefIdx << " exceeds machine model writes for "
+ << *DefMI;
+ report_fatal_error(ss.str());
+ }
+#endif
+ return 1;
+ }
+ assert(EnableSchedItins && hasInstrItineraries() &&
+ "operand latency requires itinerary");
+
+ int OperLatency = 0;
+ if (UseMI) {
+ OperLatency =
+ TII->getOperandLatency(&InstrItins, DefMI, DefOperIdx, UseMI, UseOperIdx);
+ }
+ else {
+ unsigned DefClass = DefMI->getDesc().getSchedClass();
+ OperLatency = InstrItins.getOperandCycle(DefClass, DefOperIdx);
+ }
+ if (OperLatency >= 0)
+ return OperLatency;
+
+ // No operand latency was found.
+ unsigned InstrLatency = TII->getInstrLatency(&InstrItins, DefMI);
+
+ // Expected latency is the max of the stage latency and itinerary props.
+ if (!FindMin)
+ InstrLatency = std::max(InstrLatency,
+ TII->defaultDefLatency(&SchedModel, DefMI));
+ return InstrLatency;
+}
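For the sched-model path above, the operand latency is just the write's cycle count minus any ReadAdvance the use declares; with hypothetical numbers:

// MCWriteLatencyEntry::Cycles for the def's write:   4
// getReadAdvanceCycles(UseDesc, UseIdx, WriteID):    2
// computeOperandLatency result:                      4 - 2 = 2 cycles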
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index bd12f92132..df33a94ca7 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1758,10 +1758,6 @@ bool TwoAddressInstructionPass::EliminateRegSequences() {
if (MO.isReg() && MO.isDef() && MO.getReg() == DstReg)
MO.setIsUndef();
}
- // Make sure there is a full non-subreg imp-def operand on the
- // instruction. This shouldn't be necessary, but it seems that at least
- // RAFast requires it.
- Def->addRegisterDefined(DstReg, TRI);
DEBUG(dbgs() << "First def: " << *Def);
}
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index bd10a4b8d0..a69a8169d3 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -20,6 +20,7 @@
#include "VirtRegMap.h"
#include "LiveDebugVariables.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveStackAnalysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -126,7 +127,7 @@ void VirtRegMap::print(raw_ostream &OS, const Module*) const {
OS << '\n';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VirtRegMap::dump() const {
print(dbgs());
}
@@ -171,6 +172,7 @@ INITIALIZE_PASS_BEGIN(VirtRegRewriter, "virtregrewriter",
INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
INITIALIZE_PASS_DEPENDENCY(LiveDebugVariables)
+INITIALIZE_PASS_DEPENDENCY(LiveStacks)
INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
INITIALIZE_PASS_END(VirtRegRewriter, "virtregrewriter",
"Virtual Register Rewriter", false, false)
@@ -183,6 +185,8 @@ void VirtRegRewriter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<SlotIndexes>();
AU.addPreserved<SlotIndexes>();
AU.addRequired<LiveDebugVariables>();
+ AU.addRequired<LiveStacks>();
+ AU.addPreserved<LiveStacks>();
AU.addRequired<VirtRegMap>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/lib/CodeGen/VirtRegMap.h b/lib/CodeGen/VirtRegMap.h
index c3209854a4..7974dda66a 100644
--- a/lib/CodeGen/VirtRegMap.h
+++ b/lib/CodeGen/VirtRegMap.h
@@ -63,8 +63,8 @@ namespace llvm {
/// createSpillSlot - Allocate a spill slot for RC from MFI.
unsigned createSpillSlot(const TargetRegisterClass *RC);
- VirtRegMap(const VirtRegMap&); // DO NOT IMPLEMENT
- void operator=(const VirtRegMap&); // DO NOT IMPLEMENT
+ VirtRegMap(const VirtRegMap&) LLVM_DELETED_FUNCTION;
+ void operator=(const VirtRegMap&) LLVM_DELETED_FUNCTION;
public:
static char ID;
diff --git a/lib/DebugInfo/DWARFContext.h b/lib/DebugInfo/DWARFContext.h
index 76339979dd..d10e850870 100644
--- a/lib/DebugInfo/DWARFContext.h
+++ b/lib/DebugInfo/DWARFContext.h
@@ -32,8 +32,8 @@ class DWARFContext : public DIContext {
OwningPtr<DWARFDebugAranges> Aranges;
OwningPtr<DWARFDebugLine> Line;
- DWARFContext(DWARFContext &); // = delete
- DWARFContext &operator=(DWARFContext &); // = delete
+ DWARFContext(DWARFContext &) LLVM_DELETED_FUNCTION;
+ DWARFContext &operator=(DWARFContext &) LLVM_DELETED_FUNCTION;
/// Read compile units from the debug_info section and store them in CUs.
void parseCompileUnits();
diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.h b/lib/ExecutionEngine/MCJIT/MCJIT.h
index 138a7b64b2..d5c5d77574 100644
--- a/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -71,7 +71,8 @@ public:
/// Map the address of a JIT section as returned from the memory manager
/// to the address in the target process as the running code will see it.
/// This is the address which will be used for relocation resolution.
- virtual void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress) {
+ virtual void mapSectionAddress(const void *LocalAddress,
+ uint64_t TargetAddress) {
Dyld.mapSectionAddress(LocalAddress, TargetAddress);
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
index c3e3572f3b..4fa5c1cfea 100644
--- a/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
+++ b/lib/ExecutionEngine/RuntimeDyld/ObjectImage.h
@@ -19,8 +19,8 @@
namespace llvm {
class ObjectImage {
- ObjectImage(); // = delete
- ObjectImage(const ObjectImage &other); // = delete
+ ObjectImage() LLVM_DELETED_FUNCTION;
+ ObjectImage(const ObjectImage &other) LLVM_DELETED_FUNCTION;
protected:
object::ObjectFile *ObjFile;
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index d47287b878..d06a1fc984 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -48,7 +48,7 @@ void RuntimeDyldImpl::resolveRelocations() {
}
}
-void RuntimeDyldImpl::mapSectionAddress(void *LocalAddress,
+void RuntimeDyldImpl::mapSectionAddress(const void *LocalAddress,
uint64_t TargetAddress) {
for (unsigned i = 0, e = Sections.size(); i != e; ++i) {
if (Sections[i].Address == LocalAddress) {
@@ -492,7 +492,7 @@ void RuntimeDyld::reassignSectionAddress(unsigned SectionID,
Dyld->reassignSectionAddress(SectionID, Addr);
}
-void RuntimeDyld::mapSectionAddress(void *LocalAddress,
+void RuntimeDyld::mapSectionAddress(const void *LocalAddress,
uint64_t TargetAddress) {
Dyld->mapSectionAddress(LocalAddress, TargetAddress);
}
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index d5df732b91..d56cab293c 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -287,7 +287,7 @@ public:
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
- void mapSectionAddress(void *LocalAddress, uint64_t TargetAddress);
+ void mapSectionAddress(const void *LocalAddress, uint64_t TargetAddress);
// Is the linker in an error state?
bool hasError() { return HasError; }
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
index 0e3a9d4af5..465e85d7d9 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldMachO.cpp
@@ -57,7 +57,7 @@ void RuntimeDyldMachO::resolveRelocation(uint8_t *LocalAddress,
FinalAddress,
(uintptr_t)Value,
isPCRel,
- Type,
+ MachoType,
Size,
Addend);
break;
@@ -246,7 +246,12 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
}
assert(si != se && "No section containing relocation!");
Value.SectionID = findOrEmitSection(Obj, *si, true, ObjSectionToID);
- Value.Addend = *(const intptr_t *)Target;
+ Value.Addend = 0;
+ // FIXME: The size and type of the relocation determines if we can
+ // encode an Addend in the target location itself, and if so, how many
+ // bytes we should read in order to get it. We don't yet support doing
+ // that, and just assuming it's sizeof(intptr_t) is blatantly wrong.
+ //Value.Addend = *(const intptr_t *)Target;
if (Value.Addend) {
// The MachO addend is an offset from the current section. We need it
// to be an offset from the destination section
@@ -254,7 +259,7 @@ void RuntimeDyldMachO::processRelocationRef(const ObjRelocationInfo &Rel,
}
}
- if (Arch == Triple::arm && RelType == macho::RIT_ARM_Branch24Bit) {
+ if (Arch == Triple::arm && (RelType & 0xf) == macho::RIT_ARM_Branch24Bit) {
// This is an ARM branch relocation, need to use a stub function.
// Look up for existing stub.
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 02bd6deb62..80bf4f9309 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -330,8 +330,7 @@ MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_,
MCCodeEmitter &Emitter_, MCObjectWriter &Writer_,
raw_ostream &OS_)
: Context(Context_), Backend(Backend_), Emitter(Emitter_), Writer(Writer_),
- OS(OS_), RelaxAll(false), NoExecStack(false), SubsectionsViaSymbols(false)
-{
+ OS(OS_), RelaxAll(false), NoExecStack(false), SubsectionsViaSymbols(false) {
}
MCAssembler::~MCAssembler() {
@@ -631,7 +630,7 @@ static void WriteBundlePadding(const MCAssembler &Asm,
}
// @LOCALMOD-END
-/// WriteFragmentData - Write the \arg F data to the output file.
+/// WriteFragmentData - Write the \p F data to the output file.
static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout,
const MCFragment &F) {
MCObjectWriter *OW = &Asm.getWriter();
@@ -810,7 +809,7 @@ void MCAssembler::writeSectionData(const MCSectionData *SD,
}
uint64_t Start = getWriter().getStream().tell();
- (void) Start;
+ (void)Start;
for (MCSectionData::const_iterator it = SD->begin(),
ie = SD->end(); it != ie; ++it)
@@ -1107,7 +1106,7 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCFragment::dump() {
raw_ostream &OS = llvm::errs();
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index b5b14b95f6..477bd17c0d 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -153,6 +153,12 @@ MCSymbol *MCContext::LookupSymbol(StringRef Name) const {
return Symbols.lookup(Name);
}
+MCSymbol *MCContext::LookupSymbol(const Twine &Name) const {
+ SmallString<128> NameSV;
+ Name.toVector(NameSV);
+ return LookupSymbol(NameSV.str());
+}
+
//===----------------------------------------------------------------------===//
// Section Management
//===----------------------------------------------------------------------===//
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index ee597cfa21..a1643b2da5 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -425,7 +425,7 @@ void MCDwarfFile::print(raw_ostream &OS) const {
OS << '"' << getName() << '"';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCDwarfFile::dump() const {
print(dbgs());
}
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index bf35ab7fc3..d1aeaf36c7 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -247,7 +247,6 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
switch (Attribute) {
case MCSA_LazyReference:
case MCSA_Reference:
- case MCSA_NoDeadStrip:
case MCSA_SymbolResolver:
case MCSA_PrivateExtern:
case MCSA_WeakDefinition:
@@ -256,6 +255,7 @@ void MCELFStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
case MCSA_IndirectSymbol:
llvm_unreachable("Invalid symbol attribute for ELF!");
+ case MCSA_NoDeadStrip:
case MCSA_ELF_TypeGnuUniqueObject:
// Ignore for now.
break;
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index b19665949d..8fed48cef2 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -136,7 +136,7 @@ void MCExpr::print(raw_ostream &OS) const {
llvm_unreachable("Invalid expression kind!");
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCExpr::dump() const {
print(dbgs());
dbgs() << '\n';
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index e96010bd5c..124cc149be 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -32,7 +32,7 @@ void MCOperand::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << ">";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCOperand::dump() const {
print(dbgs(), 0);
dbgs() << "\n";
@@ -64,7 +64,7 @@ void MCInst::dump_pretty(raw_ostream &OS, const MCAsmInfo *MAI,
OS << ">";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCInst::dump() const {
print(dbgs(), 0);
dbgs() << "\n";
diff --git a/lib/MC/MCLabel.cpp b/lib/MC/MCLabel.cpp
index 95d7d16a19..1d3022a93e 100644
--- a/lib/MC/MCLabel.cpp
+++ b/lib/MC/MCLabel.cpp
@@ -16,7 +16,7 @@ void MCLabel::print(raw_ostream &OS) const {
OS << '"' << getInstance() << '"';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCLabel::dump() const {
print(dbgs());
}
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 29b4a94653..8053624831 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -430,12 +430,20 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) {
}
- StaticDtorSection =
- Ctx->getCOFFSection(".dtors",
- COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
- COFF::IMAGE_SCN_MEM_READ |
- COFF::IMAGE_SCN_MEM_WRITE,
- SectionKind::getDataRel());
+ if (T.getOS() == Triple::Win32) {
+ StaticDtorSection =
+ Ctx->getCOFFSection(".CRT$XTX",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getReadOnly());
+ } else {
+ StaticDtorSection =
+ Ctx->getCOFFSection(".dtors",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ |
+ COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getDataRel());
+ }
// FIXME: We're emitting LSDA info into a readonly section on COFF, even
// though it contains relocatable pointers. In PIC mode, this is probably a
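The Win32 branch above retargets static destructors from .dtors to the MSVC CRT's terminator table. A hedged illustration of why a read-only section suffices; the pragma pattern below follows common MSVC usage and is an assumption, not something taken from this patch:

#ifdef _MSC_VER
static void __cdecl myTerminator(void) { /* runs during CRT shutdown */ }
// The pointer is emitted at compile/link time and the CRT only reads the
// .CRT$XT* range while walking terminators at exit, so no write access is
// ever needed.
#pragma section(".CRT$XTX", read)
__declspec(allocate(".CRT$XTX"))
static void (__cdecl *pTerminator)(void) = &myTerminator;
#endif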
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index c76052d66e..f93f685bf5 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -396,8 +396,17 @@ AsmToken AsmLexer::LexToken() {
case 0:
case ' ':
case '\t':
- // Ignore whitespace.
- return LexToken();
+ if (SkipSpace) {
+ // Ignore whitespace.
+ return LexToken();
+ } else {
+ int len = 1;
+ while (*CurPtr == ' ' || *CurPtr == '\t') {
+ CurPtr++;
+ len++;
+ }
+ return AsmToken(AsmToken::Space, StringRef(TokStart, len));
+ }
case '\n': // FALL THROUGH.
case '\r':
isAtStartOfLine = true;
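A self-contained sketch of the SkipSpace switch added above: the same scanner either swallows a run of blanks or reports the whole run as one token. Simplified types, not the real lexer:

#include <string>

struct MiniLexer {
  const char *Cur;
  bool SkipSpace;
  MiniLexer(const char *P) : Cur(P), SkipSpace(true) {}
  // With SkipSpace off, a run of blanks comes back as a single token,
  // mirroring AsmToken::Space above.
  std::string next() {
    if (*Cur == ' ' || *Cur == '\t') {
      const char *Start = Cur;
      while (*Cur == ' ' || *Cur == '\t')
        ++Cur;
      if (!SkipSpace)
        return std::string(Start, Cur);
    }
    return *Cur ? std::string(1, *Cur++) : std::string();
  }
};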
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index 40e6b92174..fc0bc35347 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -47,7 +47,7 @@ namespace {
/// \brief Helper class for tracking macro definitions.
typedef std::vector<AsmToken> MacroArgument;
typedef std::vector<MacroArgument> MacroArguments;
-typedef StringRef MacroParameter;
+typedef std::pair<StringRef, MacroArgument> MacroParameter;
typedef std::vector<MacroParameter> MacroParameters;
struct Macro {
@@ -84,8 +84,8 @@ public:
class AsmParser : public MCAsmParser {
friend class GenericAsmParser;
- AsmParser(const AsmParser &); // DO NOT IMPLEMENT
- void operator=(const AsmParser &); // DO NOT IMPLEMENT
+ AsmParser(const AsmParser &) LLVM_DELETED_FUNCTION;
+ void operator=(const AsmParser &) LLVM_DELETED_FUNCTION;
private:
AsmLexer Lexer;
MCContext &Ctx;
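LLVM_DELETED_FUNCTION, substituted above for the old "DO NOT IMPLEMENT" comments, is a portability macro from Support/Compiler.h. A sketch of the idea, assuming that era's definitions:

#include "llvm/Support/Compiler.h"

// Under C++11 the macro expands to "= delete", so misuse is a compile-time
// error at the call site; under C++03 it expands to nothing and the members
// stay declared-but-undefined, failing at link time instead.
class NonCopyable {
public:
  NonCopyable() {}
private:
  NonCopyable(const NonCopyable &) LLVM_DELETED_FUNCTION;
  void operator=(const NonCopyable &) LLVM_DELETED_FUNCTION;
};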
@@ -130,6 +130,9 @@ private:
/// AssemblerDialect. ~0U means unset; use the value provided by MAI.
unsigned AssemblerDialect;
+ /// IsDarwin - is Darwin compatibility enabled?
+ bool IsDarwin;
+
public:
AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
const MCAsmInfo &MAI);
@@ -202,14 +205,15 @@ private:
/// This returns true on failure.
bool ProcessIncbinFile(const std::string &Filename);
- /// \brief Reset the current lexer position to that given by \arg Loc. The
+ /// \brief Reset the current lexer position to that given by \p Loc. The
/// current token is not set; clients should ensure Lex() is called
/// subsequently.
void JumpToLoc(SMLoc Loc);
virtual void EatToEndOfStatement();
- bool ParseMacroArgument(MacroArgument &MA);
+ bool ParseMacroArgument(MacroArgument &MA,
+ AsmToken::TokenKind &ArgumentDelimiter);
bool ParseMacroArguments(const Macro *M, MacroArguments &A);
/// \brief Parse up to the end of statement and return the contents from the
@@ -221,7 +225,8 @@ private:
/// return the contents from the current token up to the end or comma.
StringRef ParseStringToComma();
- bool ParseAssignment(StringRef Name, bool allow_redef);
+ bool ParseAssignment(StringRef Name, bool allow_redef,
+ bool NoDeadStrip = false);
bool ParsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc);
bool ParseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
@@ -229,7 +234,7 @@ private:
bool ParseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
/// ParseIdentifier - Parse an identifier or string (as a quoted identifier)
- /// and set \arg Res to the identifier contents.
+ /// and set \p Res to the identifier contents.
virtual bool ParseIdentifier(StringRef &Res);
// Directive Parsing.
@@ -413,8 +418,8 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
MCStreamer &_Out, const MCAsmInfo &_MAI)
: Lexer(_MAI), Ctx(_Ctx), Out(_Out), MAI(_MAI), SrcMgr(_SM),
GenericParser(new GenericAsmParser), PlatformParser(0),
- CurBuffer(0), MacrosEnabled(true), CppHashLineNumber(0),
- AssemblerDialect(~0U) {
+ CurBuffer(0), MacrosEnabled(true), CppHashLineNumber(0),
+ AssemblerDialect(~0U), IsDarwin(false) {
// Save the old handler.
SavedDiagHandler = SrcMgr.getDiagHandler();
SavedDiagContext = SrcMgr.getDiagContext();
@@ -435,6 +440,7 @@ AsmParser::AsmParser(SourceMgr &_SM, MCContext &_Ctx,
} else if (_MAI.hasSubsectionsViaSymbols()) {
PlatformParser = createDarwinAsmParser();
PlatformParser->Initialize(*this);
+ IsDarwin = true;
} else {
PlatformParser = createELFAsmParser();
PlatformParser->Initialize(*this);
@@ -1495,6 +1501,8 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
if (NParameters != 0 && NParameters != A.size())
return Error(L, "Wrong number of arguments");
+ // A macro without parameters is handled differently on Darwin:
+ // gas accepts no arguments there and performs no substitutions.
while (!Body.empty()) {
// Scan for the next substitution.
std::size_t End = Body.size(), Pos = 0;
@@ -1558,18 +1566,26 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
StringRef Argument(Begin, I - (Pos +1));
unsigned Index = 0;
for (; Index < NParameters; ++Index)
- if (Parameters[Index] == Argument)
+ if (Parameters[Index].first == Argument)
break;
- // FIXME: We should error at the macro definition.
- if (Index == NParameters)
- return Error(L, "Parameter not found");
-
- for (MacroArgument::const_iterator it = A[Index].begin(),
- ie = A[Index].end(); it != ie; ++it)
- OS << it->getString();
+ if (Index == NParameters) {
+ if (Body[Pos+1] == '(' && Body[Pos+2] == ')')
+ Pos += 3;
+ else {
+ OS << '\\' << Argument;
+ Pos = I;
+ }
+ } else {
+ for (MacroArgument::const_iterator it = A[Index].begin(),
+ ie = A[Index].end(); it != ie; ++it)
+ if (it->getKind() == AsmToken::String)
+ OS << it->getStringContents();
+ else
+ OS << it->getString();
- Pos += 1 + Argument.size();
+ Pos += 1 + Argument.size();
+ }
}
// Update the scan point.
Body = Body.substr(Pos);
@@ -1584,22 +1600,97 @@ MacroInstantiation::MacroInstantiation(const Macro *M, SMLoc IL, SMLoc EL,
{
}
+static bool IsOperator(AsmToken::TokenKind kind) {
+ switch (kind) {
+ default:
+ return false;
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Tilde:
+ case AsmToken::Slash:
+ case AsmToken::Star:
+ case AsmToken::Dot:
+ case AsmToken::Equal:
+ case AsmToken::EqualEqual:
+ case AsmToken::Pipe:
+ case AsmToken::PipePipe:
+ case AsmToken::Caret:
+ case AsmToken::Amp:
+ case AsmToken::AmpAmp:
+ case AsmToken::Exclaim:
+ case AsmToken::ExclaimEqual:
+ case AsmToken::Percent:
+ case AsmToken::Less:
+ case AsmToken::LessEqual:
+ case AsmToken::LessLess:
+ case AsmToken::LessGreater:
+ case AsmToken::Greater:
+ case AsmToken::GreaterEqual:
+ case AsmToken::GreaterGreater:
+ return true;
+ }
+}
+
/// ParseMacroArgument - Extract AsmTokens for a macro argument.
/// This is used for both default macro parameter values and the
/// arguments in macro invocations
-bool AsmParser::ParseMacroArgument(MacroArgument &MA) {
+bool AsmParser::ParseMacroArgument(MacroArgument &MA,
+ AsmToken::TokenKind &ArgumentDelimiter) {
unsigned ParenLevel = 0;
+ unsigned AddTokens = 0;
+
+ // gas accepts arguments separated by whitespace, except on Darwin
+ if (!IsDarwin)
+ Lexer.setSkipSpace(false);
for (;;) {
- if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
+ if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal)) {
+ Lexer.setSkipSpace(true);
return TokError("unexpected token in macro instantiation");
+ }
+
+ if (ParenLevel == 0 && Lexer.is(AsmToken::Comma)) {
+ // Spaces and commas cannot be mixed to delimit parameters
+ if (ArgumentDelimiter == AsmToken::Eof)
+ ArgumentDelimiter = AsmToken::Comma;
+ else if (ArgumentDelimiter != AsmToken::Comma) {
+ Lexer.setSkipSpace(true);
+ return TokError("expected ' ' for macro argument separator");
+ }
+ break;
+ }
+
+ if (Lexer.is(AsmToken::Space)) {
+ Lex(); // Eat spaces
+
+ // Spaces can delimit parameters, but could also be part of an expression.
+ // If the token after a space is an operator, add the token and the next
+ // one into this argument
+ if (ArgumentDelimiter == AsmToken::Space ||
+ ArgumentDelimiter == AsmToken::Eof) {
+ if (IsOperator(Lexer.getKind())) {
+ // Check to see whether the token is used as an operator,
+ // or part of an identifier
+ const char *NextChar = getTok().getEndLoc().getPointer() + 1;
+ if (*NextChar == ' ')
+ AddTokens = 2;
+ }
+
+ if (!AddTokens && ParenLevel == 0) {
+ if (ArgumentDelimiter == AsmToken::Eof &&
+ !IsOperator(Lexer.getKind()))
+ ArgumentDelimiter = AsmToken::Space;
+ break;
+ }
+ }
+ }
// HandleMacroEntry relies on not advancing the lexer here
// to be able to fill in the remaining default parameter values
if (Lexer.is(AsmToken::EndOfStatement))
break;
- if (ParenLevel == 0 && Lexer.is(AsmToken::Comma))
- break;
// Adjust the current parentheses level.
if (Lexer.is(AsmToken::LParen))
@@ -1609,8 +1700,12 @@ bool AsmParser::ParseMacroArgument(MacroArgument &MA) {
// Append the token to the current argument list.
MA.push_back(getTok());
+ if (AddTokens)
+ AddTokens--;
Lex();
}
+
+ Lexer.setSkipSpace(true);
if (ParenLevel != 0)
return TokError("unbalanced parentheses in macro argument");
return false;
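A hedged worked example of the delimiter rules this function now implements, as gas-style input; the behaviour is inferred from the code above rather than quoted from tests:

// .macro double x
// add \x, \x, \x
// .endm
//
// double r0        -> x = r0
// double 1 + 2     -> '+' flanked by spaces stays in one argument: x = 1 + 2
//
// .macro pair a, b
// .long \a, \b
// .endm
//
// pair 1, 2        -> comma-delimited: a = 1, b = 2
// pair 1 2         -> space-delimited: a = 1, b = 2
// pair 1 2, 3      -> error: spaces and commas cannot be mixed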
@@ -1619,6 +1714,9 @@ bool AsmParser::ParseMacroArgument(MacroArgument &MA) {
// Parse the macro instantiation arguments.
bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) {
const unsigned NParameters = M ? M->Parameters.size() : 0;
+ // Argument delimiter is initially unknown. It will be set by
+ // ParseMacroArgument()
+ AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
// Parse two kinds of macro invocations:
// - macros defined without any parameters accept an arbitrary number of them
@@ -1627,13 +1725,30 @@ bool AsmParser::ParseMacroArguments(const Macro *M, MacroArguments &A) {
++Parameter) {
MacroArgument MA;
- if (ParseMacroArgument(MA))
+ if (ParseMacroArgument(MA, ArgumentDelimiter))
return true;
- A.push_back(MA);
+ if (!MA.empty() || !NParameters)
+ A.push_back(MA);
+ else if (NParameters) {
+ if (!M->Parameters[Parameter].second.empty())
+ A.push_back(M->Parameters[Parameter].second);
+ }
- if (Lexer.is(AsmToken::EndOfStatement))
+ // At the end of the statement, fill in remaining arguments that have
+ // default values. If there aren't any, then the next argument is
+ // required but missing
+ if (Lexer.is(AsmToken::EndOfStatement)) {
+ if (NParameters && Parameter < NParameters - 1) {
+ if (M->Parameters[Parameter + 1].second.empty())
+ return TokError("macro argument '" +
+ Twine(M->Parameters[Parameter + 1].first) +
+ "' is missing");
+ else
+ continue;
+ }
return false;
+ }
if (Lexer.is(AsmToken::Comma))
Lex();
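The default-value plumbing above reduces to a small fill-in loop. A standalone sketch over simplified types; the names are hypothetical, not the parser's:

#include <string>
#include <utility>
#include <vector>

typedef std::pair<std::string, std::string> Param; // (name, default value)

// Append defaults for every parameter beyond the supplied arguments.
// Returns true on error: a missing argument with no default to fall back on.
static bool fillRemainingDefaults(const std::vector<Param> &Params,
                                  std::vector<std::string> &Args) {
  for (size_t I = Args.size(), E = Params.size(); I != E; ++I) {
    if (Params[I].second.empty())
      return true;
    Args.push_back(Params[I].second);
  }
  return false;
}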
@@ -1722,7 +1837,8 @@ static bool IsUsedIn(const MCSymbol *Sym, const MCExpr *Value) {
llvm_unreachable("Unknown expr kind!");
}
-bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) {
+bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef,
+ bool NoDeadStrip) {
// FIXME: Use better location, we should use proper tokens.
SMLoc EqualLoc = Lexer.getLoc();
@@ -1777,6 +1893,9 @@ bool AsmParser::ParseAssignment(StringRef Name, bool allow_redef) {
// Do the assignment.
Out.EmitAssignment(Sym, Value);
+ if (NoDeadStrip)
+ Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
+
return false;
}
@@ -1834,7 +1953,7 @@ bool AsmParser::ParseDirectiveSet(StringRef IDVal, bool allow_redef) {
return TokError("unexpected token in '" + Twine(IDVal) + "'");
Lex();
- return ParseAssignment(Name, allow_redef);
+ return ParseAssignment(Name, allow_redef, true);
}
bool AsmParser::ParseEscapedString(std::string &Data) {
@@ -3152,22 +3271,30 @@ bool GenericAsmParser::ParseDirectiveMacro(StringRef Directive,
return TokError("expected identifier in '.macro' directive");
MacroParameters Parameters;
+ // Argument delimiter is initially unknown. It will be set by
+ // ParseMacroArgument()
+ AsmToken::TokenKind ArgumentDelimiter = AsmToken::Eof;
if (getLexer().isNot(AsmToken::EndOfStatement)) {
for (;;) {
MacroParameter Parameter;
- if (getParser().ParseIdentifier(Parameter))
+ if (getParser().ParseIdentifier(Parameter.first))
return TokError("expected identifier in '.macro' directive");
+
+ if (getLexer().is(AsmToken::Equal)) {
+ Lex();
+ if (getParser().ParseMacroArgument(Parameter.second, ArgumentDelimiter))
+ return true;
+ }
+
Parameters.push_back(Parameter);
- if (getLexer().isNot(AsmToken::Comma))
+ if (getLexer().is(AsmToken::Comma))
+ Lex();
+ else if (getLexer().is(AsmToken::EndOfStatement))
break;
- Lex();
}
}
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in '.macro' directive");
-
// Eat the end of statement.
Lex();
@@ -3372,7 +3499,7 @@ bool AsmParser::ParseDirectiveIrp(SMLoc DirectiveLoc) {
MacroParameters Parameters;
MacroParameter Parameter;
- if (ParseIdentifier(Parameter))
+ if (ParseIdentifier(Parameter.first))
return TokError("expected identifier in '.irp' directive");
Parameters.push_back(Parameter);
@@ -3418,7 +3545,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
MacroParameters Parameters;
MacroParameter Parameter;
- if (ParseIdentifier(Parameter))
+ if (ParseIdentifier(Parameter.first))
return TokError("expected identifier in '.irpc' directive");
Parameters.push_back(Parameter);
@@ -3468,7 +3595,7 @@ bool AsmParser::ParseDirectiveIrpc(SMLoc DirectiveLoc) {
bool AsmParser::ParseDirectiveEndr(SMLoc DirectiveLoc) {
if (ActiveMacros.empty())
- return TokError("unexpected '.endr' directive, no current .rept");
+ return TokError("unmatched '.endr' directive");
// The only .repl that should get here are the ones created by
// InstantiateMacroLikeBody.
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 3a3ff14711..384b341bc7 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -12,7 +12,8 @@
using namespace llvm;
-MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()), TokStart(0) {
+MCAsmLexer::MCAsmLexer() : CurTok(AsmToken::Error, StringRef()),
+ TokStart(0), SkipSpace(true) {
}
MCAsmLexer::~MCAsmLexer() {
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 93ee2dd0c0..6967feef24 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -44,7 +44,7 @@ bool MCAsmParser::ParseExpression(const MCExpr *&Res) {
}
void MCParsedAsmOperand::dump() const {
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
dbgs() << " " << *this;
#endif
}
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index cbf853cd8e..8be07eed82 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -19,11 +19,28 @@ using namespace llvm;
MCSchedModel MCSchedModel::DefaultSchedModel; // For unknown processors.
+/// InitMCProcessorInfo - Set or change the CPU (optionally supplemented
+/// with a feature string). Recompute feature bits and the scheduling model.
+void
+MCSubtargetInfo::InitMCProcessorInfo(StringRef CPU, StringRef FS) {
+ SubtargetFeatures Features(FS);
+ FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
+ ProcFeatures, NumFeatures);
+
+ if (!CPU.empty())
+ CPUSchedModel = getSchedModelForCPU(CPU);
+ else
+ CPUSchedModel = &MCSchedModel::DefaultSchedModel;
+}
+
void
MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
const SubtargetFeatureKV *PF,
const SubtargetFeatureKV *PD,
const SubtargetInfoKV *ProcSched,
+ const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA,
const InstrStage *IS,
const unsigned *OC,
const unsigned *FP,
@@ -31,26 +48,18 @@ MCSubtargetInfo::InitMCSubtargetInfo(StringRef TT, StringRef CPU, StringRef FS,
TargetTriple = TT;
ProcFeatures = PF;
ProcDesc = PD;
- ProcSchedModel = ProcSched;
+ ProcSchedModels = ProcSched;
+ WriteProcResTable = WPR;
+ WriteLatencyTable = WL;
+ ReadAdvanceTable = RA;
+
Stages = IS;
OperandCycles = OC;
ForwardingPaths = FP;
NumFeatures = NF;
NumProcs = NP;
- SubtargetFeatures Features(FS);
- FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
- ProcFeatures, NumFeatures);
-}
-
-
-/// ReInitMCSubtargetInfo - Change CPU (and optionally supplemented with
-/// feature string) and recompute feature bits.
-uint64_t MCSubtargetInfo::ReInitMCSubtargetInfo(StringRef CPU, StringRef FS) {
- SubtargetFeatures Features(FS);
- FeatureBits = Features.getFeatureBits(CPU, ProcDesc, NumProcs,
- ProcFeatures, NumFeatures);
- return FeatureBits;
+ InitMCProcessorInfo(CPU, FS);
}
/// ToggleFeature - Toggle a feature and returns the re-computed feature
@@ -72,11 +81,11 @@ uint64_t MCSubtargetInfo::ToggleFeature(StringRef FS) {
const MCSchedModel *
MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
- assert(ProcSchedModel && "Processor machine model not available!");
+ assert(ProcSchedModels && "Processor machine model not available!");
#ifndef NDEBUG
for (size_t i = 1; i < NumProcs; i++) {
- assert(strcmp(ProcSchedModel[i - 1].Key, ProcSchedModel[i].Key) < 0 &&
+ assert(strcmp(ProcSchedModels[i - 1].Key, ProcSchedModels[i].Key) < 0 &&
"Processor machine model table is not sorted");
}
#endif
@@ -85,8 +94,8 @@ MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
SubtargetInfoKV KV;
KV.Key = CPU.data();
const SubtargetInfoKV *Found =
- std::lower_bound(ProcSchedModel, ProcSchedModel+NumProcs, KV);
- if (Found == ProcSchedModel+NumProcs || StringRef(Found->Key) != CPU) {
+ std::lower_bound(ProcSchedModels, ProcSchedModels+NumProcs, KV);
+ if (Found == ProcSchedModels+NumProcs || StringRef(Found->Key) != CPU) {
errs() << "'" << CPU
<< "' is not a recognized processor for this target"
<< " (ignoring processor)\n";
@@ -101,3 +110,9 @@ MCSubtargetInfo::getInstrItineraryForCPU(StringRef CPU) const {
const MCSchedModel *SchedModel = getSchedModelForCPU(CPU);
return InstrItineraryData(SchedModel, Stages, OperandCycles, ForwardingPaths);
}
+
+/// Initialize an InstrItineraryData instance.
+void MCSubtargetInfo::initInstrItins(InstrItineraryData &InstrItins) const {
+ InstrItins =
+ InstrItineraryData(CPUSchedModel, Stages, OperandCycles, ForwardingPaths);
+}
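getSchedModelForCPU above pairs a table sorted by key with std::lower_bound. A reduced sketch of that lookup shape; KV and its comparator stand in for the real SubtargetInfoKV:

#include <algorithm>
#include <cstring>

struct KV { const char *Key; const void *Value; };

static bool operator<(const KV &A, const KV &B) {
  return std::strcmp(A.Key, B.Key) < 0;
}

// Binary-search a sorted table; unknown keys get a caller-supplied fallback,
// mirroring DefaultSchedModel above.
static const void *lookup(const KV *Table, size_t N, const char *Key,
                          const void *Fallback) {
  KV Probe = { Key, 0 };
  const KV *I = std::lower_bound(Table, Table + N, Probe);
  if (I == Table + N || std::strcmp(I->Key, Key) != 0)
    return Fallback;
  return I->Value;
}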
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index f60126b8fa..b973c57f7b 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -26,7 +26,7 @@ static bool isAcceptableChar(char C) {
return true;
}
-/// NameNeedsQuoting - Return true if the identifier \arg Str needs quotes to be
+/// NameNeedsQuoting - Return true if the identifier \p Str needs quotes to be
/// syntactically correct.
static bool NameNeedsQuoting(StringRef Str) {
assert(!Str.empty() && "Cannot create an empty MCSymbol");
@@ -76,7 +76,7 @@ void MCSymbol::print(raw_ostream &OS) const {
OS << '"' << getName() << '"';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCSymbol::dump() const {
print(dbgs());
}
diff --git a/lib/MC/MCValue.cpp b/lib/MC/MCValue.cpp
index a37149d788..4393777211 100644
--- a/lib/MC/MCValue.cpp
+++ b/lib/MC/MCValue.cpp
@@ -31,7 +31,7 @@ void MCValue::print(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << " + " << getConstant();
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void MCValue::dump() const {
print(dbgs(), 0);
}
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index c57b0d65c1..a94b214022 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -68,6 +68,11 @@ uint64_t MachObjectWriter::getSymbolAddress(const MCSymbolData* SD,
// If this is a variable, then recursively evaluate now.
if (S.isVariable()) {
+ if (const MCConstantExpr *C =
+ dyn_cast<const MCConstantExpr>(S.getVariableValue()))
+ return C->getValue();
+
MCValue Target;
if (!S.getVariableValue()->EvaluateAsRelocatable(Target, Layout))
report_fatal_error("unable to evaluate offset for variable '" +
@@ -140,8 +145,8 @@ void MachObjectWriter::WriteHeader(unsigned NumLoadCommands,
/// WriteSegmentLoadCommand - Write a segment load command.
///
-/// \arg NumSections - The number of sections in this segment.
-/// \arg SectionDataSize - The total size of the sections.
+/// \param NumSections The number of sections in this segment.
+/// \param SectionDataSize The total size of the sections.
void MachObjectWriter::WriteSegmentLoadCommand(unsigned NumSections,
uint64_t VMSize,
uint64_t SectionDataStartOffset,
@@ -315,11 +320,7 @@ void MachObjectWriter::WriteNlist(MachSymbolData &MSD,
// Compute the symbol address.
if (Symbol.isDefined()) {
- if (Symbol.isAbsolute()) {
- Address = cast<MCConstantExpr>(Symbol.getVariableValue())->getValue();
- } else {
- Address = getSymbolAddress(&Data, Layout);
- }
+ Address = getSymbolAddress(&Data, Layout);
} else if (Data.isCommon()) {
// Common symbols are encoded with the size in the address
// field, and their alignment in the flags.
@@ -557,6 +558,26 @@ void MachObjectWriter::computeSectionAddresses(const MCAssembler &Asm,
}
}
+void MachObjectWriter::markAbsoluteVariableSymbols(MCAssembler &Asm,
+ const MCAsmLayout &Layout) {
+ for (MCAssembler::symbol_iterator i = Asm.symbol_begin(),
+ e = Asm.symbol_end();
+ i != e; ++i) {
+ MCSymbolData &SD = *i;
+ if (!SD.getSymbol().isVariable())
+ continue;
+
+ // If the variable is a symbol difference (SA - SB + C) expression
+ // and neither symbol is external, mark the variable as absolute.
+ const MCExpr *Expr = SD.getSymbol().getVariableValue();
+ MCValue Value;
+ if (Expr->EvaluateAsRelocatable(Value, Layout)) {
+ if (Value.getSymA() && Value.getSymB())
+ const_cast<MCSymbol*>(&SD.getSymbol())->setAbsolute();
+ }
+ }
+}
+
void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
computeSectionAddresses(Asm, Layout);
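A hedged illustration of the input markAbsoluteVariableSymbols targets, as commented assembly:

// L_start: nop
// L_end:   nop
// .set delta, L_end - L_start
//   -> SA - SB with neither symbol external: 'delta' folds to a constant,
//      so it can be marked absolute and its value written directly into
//      the nlist entry by WriteNlist above.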
@@ -564,6 +585,10 @@ void MachObjectWriter::ExecutePostLayoutBinding(MCAssembler &Asm,
// Create symbol data for any indirect symbols.
BindIndirectSymbols(Asm);
+ // Mark symbol difference expressions in variables (from .set or = directives)
+ // as absolute.
+ markAbsoluteVariableSymbols(Asm, Layout);
+
// Compute symbol table information and bind symbol indices.
ComputeSymbolTable(Asm, StringTable, LocalSymbolData, ExternalSymbolData,
UndefinedSymbolData);
@@ -795,8 +820,12 @@ void MachObjectWriter::WriteObject(MCAssembler &Asm,
it = Asm.data_region_begin(), ie = Asm.data_region_end();
it != ie; ++it) {
const DataRegionData *Data = &(*it);
- uint64_t Start = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start), Layout);
- uint64_t End = getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End), Layout);
+ uint64_t Start =
+ getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->Start),
+ Layout);
+ uint64_t End =
+ getSymbolAddress(&Layout.getAssembler().getSymbolData(*Data->End),
+ Layout);
DEBUG(dbgs() << "data in code region-- kind: " << Data->Kind
<< " start: " << Start << "(" << Data->Start->getName() << ")"
<< " end: " << End << "(" << Data->End->getName() << ")"
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index bd398ceec2..e3a9d3fa52 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -368,7 +368,7 @@ void SubtargetFeatures::print(raw_ostream &OS) const {
OS << "\n";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// dump - Dump feature info.
///
void SubtargetFeatures::dump() const {
diff --git a/lib/Support/DAGDeltaAlgorithm.cpp b/lib/Support/DAGDeltaAlgorithm.cpp
index 1e89c6ad2f..34e82cf441 100644
--- a/lib/Support/DAGDeltaAlgorithm.cpp
+++ b/lib/Support/DAGDeltaAlgorithm.cpp
@@ -122,7 +122,7 @@ private:
DDA.UpdatedSearchState(Changes, Sets, Required);
}
- /// ExecuteOneTest - Execute a single test predicate on the change set \arg S.
+ /// ExecuteOneTest - Execute a single test predicate on the change set \p S.
bool ExecuteOneTest(const changeset_ty &S) {
// Check dependencies invariant.
DEBUG({
@@ -143,8 +143,8 @@ public:
changeset_ty Run();
- /// GetTestResult - Get the test result for the active set \arg Changes with
- /// \arg Required changes from the cache, executing the test if necessary.
+ /// GetTestResult - Get the test result for the active set \p Changes with
+ /// \p Required changes from the cache, executing the test if necessary.
///
/// \param Changes - The set of active changes being minimized, which should
/// have their pred closure included in the test.
@@ -163,11 +163,11 @@ class DeltaActiveSetHelper : public DeltaAlgorithm {
protected:
/// UpdatedSearchState - Callback used when the search state changes.
virtual void UpdatedSearchState(const changeset_ty &Changes,
- const changesetlist_ty &Sets) {
+ const changesetlist_ty &Sets) LLVM_OVERRIDE {
DDAI.UpdatedSearchState(Changes, Sets, Required);
}
- virtual bool ExecuteOneTest(const changeset_ty &S) {
+ virtual bool ExecuteOneTest(const changeset_ty &S) LLVM_OVERRIDE {
return DDAI.GetTestResult(S, Required);
}
diff --git a/lib/Support/DataStream.cpp b/lib/Support/DataStream.cpp
index 94d14a5e36..3a38e2a66b 100644
--- a/lib/Support/DataStream.cpp
+++ b/lib/Support/DataStream.cpp
@@ -58,7 +58,7 @@ public:
virtual ~DataFileStreamer() {
close(Fd);
}
- virtual size_t GetBytes(unsigned char *buf, size_t len) {
+ virtual size_t GetBytes(unsigned char *buf, size_t len) LLVM_OVERRIDE {
NumStreamFetches++;
return read(Fd, buf, len);
}
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index 9a2c39d72e..a13b9e2f87 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -303,6 +303,7 @@ std::string sys::getHostCPUName() {
case 8: return "k6-2";
case 9:
case 13: return "k6-3";
+ case 10: return "geode";
default: return "pentium";
}
case 6:
diff --git a/lib/Support/Memory.cpp b/lib/Support/Memory.cpp
index 3cc8f5ee7b..12f083822f 100644
--- a/lib/Support/Memory.cpp
+++ b/lib/Support/Memory.cpp
@@ -16,10 +16,6 @@
#include "llvm/Support/Valgrind.h"
#include "llvm/Config/config.h"
-namespace llvm {
-using namespace sys;
-}
-
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Memory.inc"
@@ -27,51 +23,3 @@ using namespace sys;
#ifdef LLVM_ON_WIN32
#include "Windows/Memory.inc"
#endif
-
-extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
-
-/// InvalidateInstructionCache - Before the JIT can run a block of code
-/// that has been emitted it must invalidate the instruction cache on some
-/// platforms.
-void llvm::sys::Memory::InvalidateInstructionCache(const void *Addr,
- size_t Len) {
-
-// icache invalidation for PPC and ARM.
-#if defined(__APPLE__)
-
-# if (defined(__POWERPC__) || defined (__ppc__) || \
- defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
- sys_icache_invalidate(const_cast<void *>(Addr), Len);
-# endif
-
-#else
-
-# if (defined(__POWERPC__) || defined (__ppc__) || \
- defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
- const size_t LineSize = 32;
-
- const intptr_t Mask = ~(LineSize - 1);
- const intptr_t StartLine = ((intptr_t) Addr) & Mask;
- const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
-
- for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
- asm volatile("dcbf 0, %0" : : "r"(Line));
- asm volatile("sync");
-
- for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
- asm volatile("icbi 0, %0" : : "r"(Line));
- asm volatile("isync");
-# elif defined(__arm__) && defined(__GNUC__)
- // FIXME: Can we safely always call this for __GNUC__ everywhere?
- const char *Start = static_cast<const char *>(Addr);
- const char *End = Start + Len;
- __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
-# elif defined(__mips__)
- const char *Start = static_cast<const char *>(Addr);
- cacheflush(const_cast<char *>(Start), Len, BCACHE);
-# endif
-
-#endif // end apple
-
- ValgrindDiscardTranslations(Addr, Len);
-}
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index 61d51b6b48..8b009b84a5 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -81,12 +81,12 @@ public:
init(InputData.begin(), InputData.end(), RequiresNullTerminator);
}
- virtual const char *getBufferIdentifier() const {
+ virtual const char *getBufferIdentifier() const LLVM_OVERRIDE {
// The name is stored after the class itself.
return reinterpret_cast<const char*>(this + 1);
}
-
- virtual BufferKind getBufferKind() const {
+
+ virtual BufferKind getBufferKind() const LLVM_OVERRIDE {
return MemoryBuffer_Malloc;
}
};
@@ -194,8 +194,8 @@ public:
sys::Path::UnMapFilePages(reinterpret_cast<const char*>(RealStart),
RealSize);
}
-
- virtual BufferKind getBufferKind() const {
+
+ virtual BufferKind getBufferKind() const LLVM_OVERRIDE {
return MemoryBuffer_MMap;
}
};
diff --git a/lib/Support/StreamableMemoryObject.cpp b/lib/Support/StreamableMemoryObject.cpp
index fe3752a77a..59e27a263e 100644
--- a/lib/Support/StreamableMemoryObject.cpp
+++ b/lib/Support/StreamableMemoryObject.cpp
@@ -8,6 +8,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/StreamableMemoryObject.h"
+#include "llvm/Support/Compiler.h"
#include <cassert>
#include <cstring>
@@ -23,18 +24,23 @@ public:
assert(LastChar >= FirstChar && "Invalid start/end range");
}
- virtual uint64_t getBase() const { return 0; }
- virtual uint64_t getExtent() const { return LastChar - FirstChar; }
- virtual int readByte(uint64_t address, uint8_t* ptr) const;
+ virtual uint64_t getBase() const LLVM_OVERRIDE { return 0; }
+ virtual uint64_t getExtent() const LLVM_OVERRIDE {
+ return LastChar - FirstChar;
+ }
+ virtual int readByte(uint64_t address, uint8_t* ptr) const LLVM_OVERRIDE;
virtual int readBytes(uint64_t address,
uint64_t size,
uint8_t* buf,
- uint64_t* copied) const;
- virtual const uint8_t *getPointer(uint64_t address, uint64_t size) const;
- virtual bool isValidAddress(uint64_t address) const {
+ uint64_t* copied) const LLVM_OVERRIDE;
+ virtual const uint8_t *getPointer(uint64_t address,
+ uint64_t size) const LLVM_OVERRIDE;
+ virtual bool isValidAddress(uint64_t address) const LLVM_OVERRIDE {
return validAddress(address);
}
- virtual bool isObjectEnd(uint64_t address) const {return objectEnd(address);}
+ virtual bool isObjectEnd(uint64_t address) const LLVM_OVERRIDE {
+ return objectEnd(address);
+ }
private:
const uint8_t* const FirstChar;
@@ -49,8 +55,8 @@ private:
return static_cast<ptrdiff_t>(address) == LastChar - FirstChar;
}
- RawMemoryObject(const RawMemoryObject&); // DO NOT IMPLEMENT
- void operator=(const RawMemoryObject&); // DO NOT IMPLEMENT
+ RawMemoryObject(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
+ void operator=(const RawMemoryObject&) LLVM_DELETED_FUNCTION;
};
int RawMemoryObject::readByte(uint64_t address, uint8_t* ptr) const {
diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc
index 7136acf6cf..f4cfbc65cf 100644
--- a/lib/Support/Unix/Memory.inc
+++ b/lib/Support/Unix/Memory.inc
@@ -14,6 +14,7 @@
#include "Unix.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
#include "llvm/Support/Debug.h"
@@ -33,14 +34,142 @@
# endif
#endif
+extern "C" void sys_icache_invalidate(const void *Addr, size_t len);
+
+namespace {
+
+int getPosixProtectionFlags(unsigned Flags) {
+ switch (Flags) {
+ case llvm::sys::Memory::MF_READ:
+ return PROT_READ;
+ case llvm::sys::Memory::MF_WRITE:
+ return PROT_WRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_WRITE:
+ return PROT_READ | PROT_WRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC:
+ return PROT_READ | PROT_EXEC;
+ case llvm::sys::Memory::MF_READ |
+ llvm::sys::Memory::MF_WRITE |
+ llvm::sys::Memory::MF_EXEC:
+ return PROT_READ | PROT_WRITE | PROT_EXEC;
+ case llvm::sys::Memory::MF_EXEC:
+ return PROT_EXEC;
+ default:
+ llvm_unreachable("Illegal memory protection flag specified!");
+ }
+ // Provide a default return value as required by some compilers.
+ return PROT_NONE;
+}
+
+} // namespace
+
+namespace llvm {
+namespace sys {
+
+MemoryBlock
+Memory::allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned PFlags,
+ error_code &EC) {
+ EC = error_code::success();
+ if (NumBytes == 0)
+ return MemoryBlock();
+
+ static const size_t PageSize = Process::GetPageSize();
+ const size_t NumPages = (NumBytes+PageSize-1)/PageSize;
+
+ int fd = -1;
+#ifdef NEED_DEV_ZERO_FOR_MMAP
+ static int zero_fd = open("/dev/zero", O_RDWR);
+ if (zero_fd == -1) {
+ EC = error_code(errno, system_category());
+ return MemoryBlock();
+ }
+ fd = zero_fd;
+#endif
+
+ int MMFlags = MAP_PRIVATE |
+#ifdef HAVE_MMAP_ANONYMOUS
+ MAP_ANONYMOUS
+#else
+ MAP_ANON
+#endif
+ ; // Ends statement above
+
+ int Protect = getPosixProtectionFlags(PFlags);
+
+ // Use any near hint and the page size to set a page-aligned starting address
+ uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
+ NearBlock->size() : 0;
+ if (Start && Start % PageSize)
+ Start += PageSize - Start % PageSize;
+
+ void *Addr = ::mmap(reinterpret_cast<void*>(Start), PageSize*NumPages,
+ Protect, MMFlags, fd, 0);
+ if (Addr == MAP_FAILED) {
+ if (NearBlock) // Try again without a near hint
+ return allocateMappedMemory(NumBytes, 0, PFlags, EC);
+
+ EC = error_code(errno, system_category());
+ return MemoryBlock();
+ }
+
+ MemoryBlock Result;
+ Result.Address = Addr;
+ Result.Size = NumPages*PageSize;
+
+ if (PFlags & MF_EXEC)
+ Memory::InvalidateInstructionCache(Result.Address, Result.Size);
+
+ return Result;
+}
+
+error_code
+Memory::releaseMappedMemory(MemoryBlock &M) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
+ if (0 != ::munmap(M.Address, M.Size))
+ return error_code(errno, system_category());
+
+ M.Address = 0;
+ M.Size = 0;
+
+ return error_code::success();
+}
+
+error_code
+Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
+ if (!Flags)
+ return error_code(EINVAL, generic_category());
+
+ int Protect = getPosixProtectionFlags(Flags);
+#ifndef __native_client__
+ int Result = ::mprotect(M.Address, M.Size, Protect);
+#else
+ int Result = -1;
+ llvm_unreachable("Native client does not support mprotect");
+#endif
+ if (Result != 0)
+ return error_code(errno, system_category());
+
+ if (Flags & MF_EXEC)
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
+
+ return error_code::success();
+}
+
/// AllocateRWX - Allocate a slab of memory with read/write/execute
/// permissions. This is typically used for JIT applications where we want
/// to emit code to the memory then jump to it. Getting this type of memory
/// is very OS specific.
///
-llvm::sys::MemoryBlock
-llvm::sys::Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
- std::string *ErrMsg) {
+MemoryBlock
+Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock,
+ std::string *ErrMsg) {
if (NumBytes == 0) return MemoryBlock();
size_t pageSize = Process::GetPageSize();
@@ -90,7 +219,7 @@ dbgs() << "calling mmap, start " << start << "\n";
VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
if (KERN_SUCCESS != kr) {
MakeErrMsg(ErrMsg, "vm_protect max RX failed");
- return sys::MemoryBlock();
+ return MemoryBlock();
}
kr = vm_protect(mach_task_self(), (vm_address_t)pa,
@@ -98,7 +227,7 @@ dbgs() << "calling mmap, start " << start << "\n";
VM_PROT_READ | VM_PROT_WRITE);
if (KERN_SUCCESS != kr) {
MakeErrMsg(ErrMsg, "vm_protect RW failed");
- return sys::MemoryBlock();
+ return MemoryBlock();
}
#endif
@@ -109,17 +238,17 @@ dbgs() << "calling mmap, start " << start << "\n";
return result;
}
-bool llvm::sys::Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
if (M.Address == 0 || M.Size == 0) return false;
if (0 != ::munmap(M.Address, M.Size))
return MakeErrMsg(ErrMsg, "Can't release RWX Memory");
return false;
}
-bool llvm::sys::Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
+bool Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
#if defined(__APPLE__) && defined(__arm__)
if (M.Address == 0 || M.Size == 0) return false;
- sys::Memory::InvalidateInstructionCache(M.Address, M.Size);
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
(vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_WRITE);
return KERN_SUCCESS == kr;
@@ -128,10 +257,10 @@ bool llvm::sys::Memory::setWritable (MemoryBlock &M, std::string *ErrMsg) {
#endif
}
-bool llvm::sys::Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
+bool Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
#if defined(__APPLE__) && defined(__arm__)
if (M.Address == 0 || M.Size == 0) return false;
- sys::Memory::InvalidateInstructionCache(M.Address, M.Size);
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)M.Address,
(vm_size_t)M.Size, 0, VM_PROT_READ | VM_PROT_EXECUTE | VM_PROT_COPY);
return KERN_SUCCESS == kr;
@@ -140,7 +269,7 @@ bool llvm::sys::Memory::setExecutable (MemoryBlock &M, std::string *ErrMsg) {
#endif
}
-bool llvm::sys::Memory::setRangeWritable(const void *Addr, size_t Size) {
+bool Memory::setRangeWritable(const void *Addr, size_t Size) {
#if defined(__APPLE__) && defined(__arm__)
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
(vm_size_t)Size, 0,
@@ -151,7 +280,7 @@ bool llvm::sys::Memory::setRangeWritable(const void *Addr, size_t Size) {
#endif
}
-bool llvm::sys::Memory::setRangeExecutable(const void *Addr, size_t Size) {
+bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
#if defined(__APPLE__) && defined(__arm__)
kern_return_t kr = vm_protect(mach_task_self(), (vm_address_t)Addr,
(vm_size_t)Size, 0,
@@ -161,3 +290,52 @@ bool llvm::sys::Memory::setRangeExecutable(const void *Addr, size_t Size) {
return true;
#endif
}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void Memory::InvalidateInstructionCache(const void *Addr,
+ size_t Len) {
+
+// icache invalidation for PPC and ARM.
+#if defined(__APPLE__)
+
+# if (defined(__POWERPC__) || defined (__ppc__) || \
+ defined(_POWER) || defined(_ARCH_PPC)) || defined(__arm__)
+ sys_icache_invalidate(const_cast<void *>(Addr), Len);
+# endif
+
+#else
+
+# if (defined(__POWERPC__) || defined (__ppc__) || \
+ defined(_POWER) || defined(_ARCH_PPC)) && defined(__GNUC__)
+ const size_t LineSize = 32;
+
+ const intptr_t Mask = ~(LineSize - 1);
+ const intptr_t StartLine = ((intptr_t) Addr) & Mask;
+ const intptr_t EndLine = ((intptr_t) Addr + Len + LineSize - 1) & Mask;
+
+ for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+ asm volatile("dcbf 0, %0" : : "r"(Line));
+ asm volatile("sync");
+
+ for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+ asm volatile("icbi 0, %0" : : "r"(Line));
+ asm volatile("isync");
+# elif defined(__arm__) && defined(__GNUC__)
+ // FIXME: Can we safely always call this for __GNUC__ everywhere?
+ const char *Start = static_cast<const char *>(Addr);
+ const char *End = Start + Len;
+ __clear_cache(const_cast<char *>(Start), const_cast<char *>(End));
+# elif defined(__mips__)
+ const char *Start = static_cast<const char *>(Addr);
+ cacheflush(const_cast<char *>(Start), Len, BCACHE);
+# endif
+
+#endif // end apple
+
+ ValgrindDiscardTranslations(Addr, Len);
+}
+
+} // namespace sys
+} // namespace llvm
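Taken together, the functions added above support the JIT-oriented map/protect/release workflow. A hedged usage sketch against the signatures introduced in this patch, with error handling abbreviated:

#include "llvm/Support/Memory.h"
#include "llvm/Support/system_error.h"
#include <cstring>
using namespace llvm;

void emitAndRun(const char *Code, size_t Len) {
  error_code EC;
  sys::MemoryBlock MB = sys::Memory::allocateMappedMemory(
      Len, 0, sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC);
  if (EC != error_code::success())
    return;
  std::memcpy(MB.base(), Code, Len); // emit code while the block is RW
  EC = sys::Memory::protectMappedMemory(
      MB, sys::Memory::MF_READ | sys::Memory::MF_EXEC);
  if (EC == error_code::success()) {
    // protectMappedMemory invalidates the instruction cache for MF_EXEC,
    // so the block can be called here, e.g.
    // reinterpret_cast<void (*)()>(MB.base())();
  }
  sys::Memory::releaseMappedMemory(MB);
}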
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 704f4681bc..e05e81acaf 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -249,7 +249,7 @@ void llvm::sys::AddSignalHandler(void (*FnPtr)(void *), void *Cookie) {
// On glibc systems we have the 'backtrace' function, which works nicely, but
// doesn't demangle symbols.
static void PrintStackTrace(void *) {
-#ifdef HAVE_BACKTRACE
+#if defined(HAVE_BACKTRACE) && defined(ENABLE_BACKTRACE)
static void* StackTrace[256];
// Use backtrace() to output a backtrace on Linux systems with glibc.
int depth = backtrace(StackTrace,
diff --git a/lib/Support/Windows/Memory.inc b/lib/Support/Windows/Memory.inc
index fcc72837c4..cb80f2817c 100644
--- a/lib/Support/Windows/Memory.inc
+++ b/lib/Support/Windows/Memory.inc
@@ -12,51 +12,163 @@
//
//===----------------------------------------------------------------------===//
-#include "Windows.h"
#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
+#include "Windows.h"
+
+namespace {
+
+DWORD getWindowsProtectionFlags(unsigned Flags) {
+ switch (Flags) {
+ // Contrary to what you might expect, the Windows page protection flags
+ // are not a bitwise combination of RWX values
+ case llvm::sys::Memory::MF_READ:
+ return PAGE_READONLY;
+ case llvm::sys::Memory::MF_WRITE:
+ // Note: PAGE_WRITE is not supported by VirtualProtect
+ return PAGE_READWRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_WRITE:
+ return PAGE_READWRITE;
+ case llvm::sys::Memory::MF_READ|llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE_READ;
+ case llvm::sys::Memory::MF_READ |
+ llvm::sys::Memory::MF_WRITE |
+ llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE_READWRITE;
+ case llvm::sys::Memory::MF_EXEC:
+ return PAGE_EXECUTE;
+ default:
+ llvm_unreachable("Illegal memory protection flag specified!");
+ }
+ // Provide a default return value as required by some compilers.
+ return PAGE_NOACCESS;
+}
+
+size_t getAllocationGranularity() {
+ SYSTEM_INFO Info;
+ ::GetSystemInfo(&Info);
+ if (Info.dwPageSize > Info.dwAllocationGranularity)
+ return Info.dwPageSize;
+ else
+ return Info.dwAllocationGranularity;
+}
+
+} // namespace
namespace llvm {
-using namespace sys;
+namespace sys {
//===----------------------------------------------------------------------===//
//=== WARNING: Implementation here must contain only Win32 specific code
//=== and must not be UNIX code
//===----------------------------------------------------------------------===//
-MemoryBlock Memory::AllocateRWX(size_t NumBytes,
- const MemoryBlock *NearBlock,
- std::string *ErrMsg) {
- if (NumBytes == 0) return MemoryBlock();
+MemoryBlock Memory::allocateMappedMemory(size_t NumBytes,
+ const MemoryBlock *const NearBlock,
+ unsigned Flags,
+ error_code &EC) {
+ EC = error_code::success();
+ if (NumBytes == 0)
+ return MemoryBlock();
+
+ // While we'd be happy to allocate single pages, the Windows allocation
+ // granularity may be larger than a single page (in practice, it is 64K)
+ // so mapping less than that will create an unreachable fragment of memory.
+ static const size_t Granularity = getAllocationGranularity();
+ const size_t NumBlocks = (NumBytes+Granularity-1)/Granularity;
- static const size_t pageSize = Process::GetPageSize();
- size_t NumPages = (NumBytes+pageSize-1)/pageSize;
+ uintptr_t Start = NearBlock ? reinterpret_cast<uintptr_t>(NearBlock->base()) +
+ NearBlock->size()
+ : NULL;
- PVOID start = NearBlock ? static_cast<unsigned char *>(NearBlock->base()) +
- NearBlock->size() : NULL;
+ // If the requested address is not aligned to the allocation granularity,
+ // round up to get beyond NearBlock. VirtualAlloc would have rounded down.
+ if (Start && Start % Granularity != 0)
+ Start += Granularity - Start % Granularity;
- void *pa = VirtualAlloc(start, NumPages*pageSize, MEM_RESERVE | MEM_COMMIT,
- PAGE_EXECUTE_READWRITE);
- if (pa == NULL) {
+ DWORD Protect = getWindowsProtectionFlags(Flags);
+
+ void *PA = ::VirtualAlloc(reinterpret_cast<void*>(Start),
+ NumBlocks*Granularity,
+ MEM_RESERVE | MEM_COMMIT, Protect);
+ if (PA == NULL) {
if (NearBlock) {
// Try again without the NearBlock hint
- return AllocateRWX(NumBytes, NULL, ErrMsg);
+ return allocateMappedMemory(NumBytes, NULL, Flags, EC);
}
- MakeErrMsg(ErrMsg, "Can't allocate RWX Memory: ");
+ EC = error_code(::GetLastError(), system_category());
return MemoryBlock();
}
- MemoryBlock result;
- result.Address = pa;
- result.Size = NumPages*pageSize;
- return result;
+ MemoryBlock Result;
+ Result.Address = PA;
+ Result.Size = NumBlocks*Granularity;
+ if (Flags & MF_EXEC)
+ Memory::InvalidateInstructionCache(Result.Address, Result.Size);
+
+ return Result;
}
-bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
- if (M.Address == 0 || M.Size == 0) return false;
+error_code Memory::releaseMappedMemory(MemoryBlock &M) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
if (!VirtualFree(M.Address, 0, MEM_RELEASE))
- return MakeErrMsg(ErrMsg, "Can't release RWX Memory: ");
- return false;
+ return error_code(::GetLastError(), system_category());
+
+ M.Address = 0;
+ M.Size = 0;
+
+ return error_code::success();
+}
+
+error_code Memory::protectMappedMemory(const MemoryBlock &M,
+ unsigned Flags) {
+ if (M.Address == 0 || M.Size == 0)
+ return error_code::success();
+
+ DWORD Protect = getWindowsProtectionFlags(Flags);
+
+ DWORD OldFlags;
+ if (!VirtualProtect(M.Address, M.Size, Protect, &OldFlags))
+ return error_code(::GetLastError(), system_category());
+
+ if (Flags & MF_EXEC)
+ Memory::InvalidateInstructionCache(M.Address, M.Size);
+
+ return error_code::success();
+}
+
+/// InvalidateInstructionCache - Before the JIT can run a block of code
+/// that has been emitted it must invalidate the instruction cache on some
+/// platforms.
+void Memory::InvalidateInstructionCache(const void *Addr, size_t Len) {
+ FlushInstructionCache(GetCurrentProcess(), Addr, Len);
+}
+
+MemoryBlock Memory::AllocateRWX(size_t NumBytes,
+ const MemoryBlock *NearBlock,
+ std::string *ErrMsg) {
+ MemoryBlock MB;
+ error_code EC;
+ MB = allocateMappedMemory(NumBytes, NearBlock,
+ MF_READ|MF_WRITE|MF_EXEC, EC);
+ if (EC != error_code::success() && ErrMsg) {
+ MakeErrMsg(ErrMsg, EC.message());
+ }
+ return MB;
+}
+
+bool Memory::ReleaseRWX(MemoryBlock &M, std::string *ErrMsg) {
+ error_code EC = releaseMappedMemory(M);
+ if (EC == error_code::success())
+ return false;
+ MakeErrMsg(ErrMsg, EC.message());
+ return true;
}
static DWORD getProtection(const void *addr) {
@@ -93,7 +205,7 @@ bool Memory::setRangeWritable(const void *Addr, size_t Size) {
}
DWORD oldProt;
- sys::Memory::InvalidateInstructionCache(Addr, Size);
+ Memory::InvalidateInstructionCache(Addr, Size);
return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
== TRUE;
}
@@ -112,9 +224,10 @@ bool Memory::setRangeExecutable(const void *Addr, size_t Size) {
}
DWORD oldProt;
- sys::Memory::InvalidateInstructionCache(Addr, Size);
+ Memory::InvalidateInstructionCache(Addr, Size);
return ::VirtualProtect(const_cast<LPVOID>(Addr), Size, prot, &oldProt)
== TRUE;
}
-}
+} // namespace sys
+} // namespace llvm
diff --git a/lib/Support/system_error.cpp b/lib/Support/system_error.cpp
index 56898de315..2df223ca71 100644
--- a/lib/Support/system_error.cpp
+++ b/lib/Support/system_error.cpp
@@ -48,8 +48,8 @@ _do_message::message(int ev) const {
class _generic_error_category : public _do_message {
public:
- virtual const char* name() const;
- virtual std::string message(int ev) const;
+ virtual const char* name() const LLVM_OVERRIDE;
+ virtual std::string message(int ev) const LLVM_OVERRIDE;
};
const char*
@@ -74,9 +74,9 @@ generic_category() {
class _system_error_category : public _do_message {
public:
- virtual const char* name() const;
- virtual std::string message(int ev) const;
- virtual error_condition default_error_condition(int ev) const;
+ virtual const char* name() const LLVM_OVERRIDE;
+ virtual std::string message(int ev) const LLVM_OVERRIDE;
+ virtual error_condition default_error_condition(int ev) const LLVM_OVERRIDE;
};
const char*
diff --git a/lib/TableGen/Record.cpp b/lib/TableGen/Record.cpp
index b2a7b628e4..9955617aa8 100644
--- a/lib/TableGen/Record.cpp
+++ b/lib/TableGen/Record.cpp
@@ -344,57 +344,53 @@ bool RecordRecTy::baseClassOf(const RecordRecTy *RHS) const {
return false;
}
-
/// resolveTypes - Find a common type that T1 and T2 convert to.
/// Return 0 if no such type exists.
///
RecTy *llvm::resolveTypes(RecTy *T1, RecTy *T2) {
- if (!T1->typeIsConvertibleTo(T2)) {
- if (!T2->typeIsConvertibleTo(T1)) {
- // If one is a Record type, check superclasses
- RecordRecTy *RecTy1 = dynamic_cast<RecordRecTy*>(T1);
- if (RecTy1) {
- // See if T2 inherits from a type T1 also inherits from
- const std::vector<Record *> &T1SuperClasses =
- RecTy1->getRecord()->getSuperClasses();
- for(std::vector<Record *>::const_iterator i = T1SuperClasses.begin(),
- iend = T1SuperClasses.end();
- i != iend;
- ++i) {
- RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i);
- RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
- if (NewType1 != 0) {
- if (NewType1 != SuperRecTy1) {
- delete SuperRecTy1;
- }
- return NewType1;
- }
+ if (T1->typeIsConvertibleTo(T2))
+ return T2;
+ if (T2->typeIsConvertibleTo(T1))
+ return T1;
+
+ // If one is a Record type, check superclasses
+ if (RecordRecTy *RecTy1 = dynamic_cast<RecordRecTy*>(T1)) {
+ // See if T2 inherits from a type T1 also inherits from
+ const std::vector<Record *> &T1SuperClasses =
+ RecTy1->getRecord()->getSuperClasses();
+ for(std::vector<Record *>::const_iterator i = T1SuperClasses.begin(),
+ iend = T1SuperClasses.end();
+ i != iend;
+ ++i) {
+ RecordRecTy *SuperRecTy1 = RecordRecTy::get(*i);
+ RecTy *NewType1 = resolveTypes(SuperRecTy1, T2);
+ if (NewType1 != 0) {
+ if (NewType1 != SuperRecTy1) {
+ delete SuperRecTy1;
}
+ return NewType1;
}
- RecordRecTy *RecTy2 = dynamic_cast<RecordRecTy*>(T2);
- if (RecTy2) {
- // See if T1 inherits from a type T2 also inherits from
- const std::vector<Record *> &T2SuperClasses =
- RecTy2->getRecord()->getSuperClasses();
- for (std::vector<Record *>::const_iterator i = T2SuperClasses.begin(),
- iend = T2SuperClasses.end();
- i != iend;
- ++i) {
- RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i);
- RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
- if (NewType2 != 0) {
- if (NewType2 != SuperRecTy2) {
- delete SuperRecTy2;
- }
- return NewType2;
- }
+ }
+ }
+ if (RecordRecTy *RecTy2 = dynamic_cast<RecordRecTy*>(T2)) {
+ // See if T1 inherits from a type T2 also inherits from
+ const std::vector<Record *> &T2SuperClasses =
+ RecTy2->getRecord()->getSuperClasses();
+ for (std::vector<Record *>::const_iterator i = T2SuperClasses.begin(),
+ iend = T2SuperClasses.end();
+ i != iend;
+ ++i) {
+ RecordRecTy *SuperRecTy2 = RecordRecTy::get(*i);
+ RecTy *NewType2 = resolveTypes(T1, SuperRecTy2);
+ if (NewType2 != 0) {
+ if (NewType2 != SuperRecTy2) {
+ delete SuperRecTy2;
}
+ return NewType2;
}
- return 0;
}
- return T2;
}
- return T1;
+ return 0;
}
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 06ed89395f..38509a3400 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -139,6 +139,11 @@ def ProcA9 : SubtargetFeature<"a9", "ARMProcFamily", "CortexA9",
[FeatureVMLxForwarding,
FeatureT2XtPk, FeatureFP16,
FeatureAvoidPartialCPSR]>;
+// FIXME: It has not been determined whether A15 has these features.
+def ProcA15 : SubtargetFeature<"a15", "ARMProcFamily", "CortexA15",
+ "Cortex-A15 ARM processors",
+ [FeatureT2XtPk, FeatureFP16,
+ FeatureAvoidPartialCPSR]>;
class ProcNoItin<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
@@ -218,6 +223,10 @@ def : ProcessorModel<"cortex-a9-mp", CortexA9Model,
[ProcA9, HasV7Ops, FeatureNEON, FeatureDB,
FeatureDSPThumb2, FeatureMP,
FeatureHasRAS]>;
+// FIXME: A15 currently uses the same ProcessorModel as A9.
+def : ProcessorModel<"cortex-a15", CortexA9Model,
+ [ProcA15, HasV7Ops, FeatureNEON, FeatureDB,
+ FeatureDSPThumb2, FeatureHasRAS]>;
// V7M Processors.
def : ProcNoItin<"cortex-m3", [HasV7Ops,
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index e1a2d3649a..c08294918e 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -2344,6 +2344,37 @@ bool ARMBaseInstrInfo::FoldImmediate(MachineInstr *UseMI,
return true;
}
+// Return the number of 32-bit words loaded by LDM or stored by STM. If this
+// can't be easily determined return 0 (missing MachineMemOperand).
+//
+// FIXME: The current MachineInstr design does not support relying on machine
+// mem operands to determine the width of a memory access. Instead, we expect
+// the target to provide this information based on the instruction opcode and
+// operands. However, using MachineMemOperand is a the best solution now for
+// two reasons:
+//
+// 1) getNumMicroOps tries to infer LDM memory width from the total number of MI
+// operands. This is much more dangerous than using the MachineMemOperand
+// sizes because CodeGen passes can insert/remove optional machine operands. In
+// fact, it's totally incorrect for preRA passes and appears to be wrong for
+// postRA passes as well.
+//
+// 2) getNumLDMAddresses is only used by the scheduling machine model and any
+// machine model that calls this should handle the unknown (zero size) case.
+//
+// Long term, we should require a target hook that verifies MachineMemOperand
+// sizes during MC lowering. That target hook should be local to MC lowering
+// because we can't ensure that it is aware of other MI forms. Doing this will
+// ensure that MachineMemOperands are correctly propagated through all passes.
+unsigned ARMBaseInstrInfo::getNumLDMAddresses(const MachineInstr *MI) const {
+ unsigned Size = 0;
+ for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
+ E = MI->memoperands_end(); I != E; ++I) {
+ Size += (*I)->getSize();
+ }
+ return Size / 4;
+}
+
unsigned
ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
const MachineInstr *MI) const {
@@ -2432,7 +2463,7 @@ ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
if (NumRegs % 2)
++A8UOps;
return A8UOps;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
int A9UOps = (NumRegs / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2465,7 +2496,7 @@ ARMBaseInstrInfo::getVLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = RegNo / 2 + 1;
if (RegNo % 2)
++DefCycle;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
DefCycle = RegNo;
bool isSLoad = false;
@@ -2509,7 +2540,7 @@ ARMBaseInstrInfo::getLDMDefCycle(const InstrItineraryData *ItinData,
DefCycle = 1;
// Result latency is issue cycle + 2: E2.
DefCycle += 2;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
DefCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2540,7 +2571,7 @@ ARMBaseInstrInfo::getVSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = RegNo / 2 + 1;
if (RegNo % 2)
++UseCycle;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
UseCycle = RegNo;
bool isSStore = false;
@@ -2581,7 +2612,7 @@ ARMBaseInstrInfo::getSTMUseCycle(const InstrItineraryData *ItinData,
UseCycle = 2;
// Read in E3.
UseCycle += 2;
- } else if (Subtarget.isCortexA9()) {
+ } else if (Subtarget.isLikeA9()) {
UseCycle = (RegNo / 2);
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
@@ -2766,7 +2797,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
const MachineInstr *DefMI,
const MCInstrDesc *DefMCID, unsigned DefAlign) {
int Adjust = 0;
- if (Subtarget.isCortexA8() || Subtarget.isCortexA9()) {
+ if (Subtarget.isCortexA8() || Subtarget.isLikeA9()) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID->getOpcode()) {
@@ -2793,7 +2824,7 @@ static int adjustDefLatency(const ARMSubtarget &Subtarget,
}
}
- if (DefAlign < 8 && Subtarget.isCortexA9()) {
+ if (DefAlign < 8 && Subtarget.isLikeA9()) {
switch (DefMCID->getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -2951,7 +2982,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (Reg == ARM::CPSR) {
if (DefMI->getOpcode() == ARM::FMSTAT) {
// fpscr -> cpsr stalls over 20 cycles on A8 (and earlier?)
- return Subtarget.isCortexA9() ? 1 : 20;
+ return Subtarget.isLikeA9() ? 1 : 20;
}
// CPSR set and branch can be paired in the same cycle.
@@ -3017,7 +3048,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
if (!UseNode->isMachineOpcode()) {
int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(), DefIdx);
- if (Subtarget.isCortexA9())
+ if (Subtarget.isLikeA9())
return Latency <= 2 ? 1 : Latency - 1;
else
return Latency <= 3 ? 1 : Latency - 2;
@@ -3034,7 +3065,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
UseMCID, UseIdx, UseAlign);
if (Latency > 1 &&
- (Subtarget.isCortexA8() || Subtarget.isCortexA9())) {
+ (Subtarget.isCortexA8() || Subtarget.isLikeA9())) {
// FIXME: Shifter op hack: no shift (i.e. [r +/- r]) or [r + r << 2]
// variants are one cycle cheaper.
switch (DefMCID.getOpcode()) {
@@ -3063,7 +3094,7 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
}
}
- if (DefAlign < 8 && Subtarget.isCortexA9())
+ if (DefAlign < 8 && Subtarget.isLikeA9())
switch (DefMCID.getOpcode()) {
default: break;
case ARM::VLD1q8:
@@ -3356,9 +3387,9 @@ ARMBaseInstrInfo::getExecutionDomain(const MachineInstr *MI) const {
if (MI->getOpcode() == ARM::VMOVD && !isPredicated(MI))
return std::make_pair(ExeVFP, (1<<ExeVFP) | (1<<ExeNEON));
- // Cortex-A9 is particularly picky about mixing the two and wants these
+ // A9-like cores are particularly picky about mixing the two and want these
// converted.
- if (Subtarget.isCortexA9() && !isPredicated(MI) &&
+ if (Subtarget.isLikeA9() && !isPredicated(MI) &&
(MI->getOpcode() == ARM::VMOVRS ||
MI->getOpcode() == ARM::VMOVSR ||
MI->getOpcode() == ARM::VMOVS))
@@ -3396,6 +3427,48 @@ static unsigned getCorrespondingDRegAndLane(const TargetRegisterInfo *TRI,
return DReg;
}
+/// getImplicitSPRUseForDPRUse - Given a use of a DPR register and lane,
+/// set ImplicitSReg to the register number that must be marked as implicit-use,
+/// or zero if no register needs to be marked as implicit-use.
+///
+/// If the function cannot determine if an SPR should be marked implicit use or
+/// not, it returns false.
+///
+/// This function handles cases where an instruction is being modified from taking
+/// an SPR to a DPR[Lane]. A use of the DPR is being added, which may conflict
+/// with an earlier def of an SPR corresponding to DPR[Lane^1] (i.e. the other
+/// lane of the DPR).
+///
+/// If the other SPR is defined, an implicit-use of it should be added.
+/// Otherwise (including the case where the DPR itself is defined), it should
+/// not.
+///
+static bool getImplicitSPRUseForDPRUse(const TargetRegisterInfo *TRI,
+ MachineInstr *MI,
+ unsigned DReg, unsigned Lane,
+ unsigned &ImplicitSReg) {
+ // If the DPR is defined or used already, the other SPR lane will be chained
+ // correctly, so there is nothing to be done.
+ if (MI->definesRegister(DReg, TRI) || MI->readsRegister(DReg, TRI)) {
+ ImplicitSReg = 0;
+ return true;
+ }
+
+ // Otherwise we need to go searching to see if the SPR is set explicitly.
+ ImplicitSReg = TRI->getSubReg(DReg,
+ (Lane & 1) ? ARM::ssub_0 : ARM::ssub_1);
+ MachineBasicBlock::LivenessQueryResult LQR =
+ MI->getParent()->computeRegisterLiveness(TRI, ImplicitSReg, MI);
+
+ if (LQR == MachineBasicBlock::LQR_Live)
+ return true;
+ else if (LQR == MachineBasicBlock::LQR_Unknown)
+ return false;
+
+ // If the register is known not to be live, there is no need to add an
+ // implicit-use.
+ ImplicitSReg = 0;
+ return true;
+}
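The intended calling pattern, mirroring the VMOVSR/VMOVS rewrites later in
this patch (a sketch; the MIB rewrite in between is elided):

  unsigned ImplicitSReg;
  if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
    break;  // Liveness is unknown: leave the instruction in the VFP domain.
  // ... rebuild MI as a NEON lane operation via MIB ...
  if (ImplicitSReg != 0)
    MIB.addReg(ImplicitSReg, RegState::Implicit);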
void
ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
@@ -3453,7 +3526,7 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// was dead before here.
MIB.addReg(SrcReg, RegState::Implicit);
break;
- case ARM::VMOVSR:
+ case ARM::VMOVSR: {
if (Domain != ExeNEON)
break;
assert(!isPredicated(MI) && "Cannot predicate a VSETLN");
@@ -3464,12 +3537,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
DReg = getCorrespondingDRegAndLane(TRI, DstReg, Lane);
- // If we insert both a novel <def> and an <undef> on the DReg, we break
- // any existing dependency chain on the unused lane. Either already being
- // present means this instruction is in that chain anyway so we can make
- // the transformation.
- if (!MI->definesRegister(DReg, TRI) && !MI->readsRegister(DReg, TRI))
- break;
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DReg, Lane, ImplicitSReg))
+ break;
for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
MI->RemoveOperand(i-1);
@@ -3486,7 +3556,10 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// The narrower destination must be marked as set to keep previous chains
// in place.
MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
+ }
case ARM::VMOVS: {
if (Domain != ExeNEON)
break;
@@ -3499,12 +3572,9 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
DDst = getCorrespondingDRegAndLane(TRI, DstReg, DstLane);
DSrc = getCorrespondingDRegAndLane(TRI, SrcReg, SrcLane);
- // If we insert both a novel <def> and an <undef> on the DReg, we break
- // any existing dependency chain on the unused lane. Either already being
- // present means this instruction is in that chain anyway so we can make
- // the transformation.
- if (!MI->definesRegister(DDst, TRI) && !MI->readsRegister(DDst, TRI))
- break;
+ unsigned ImplicitSReg;
+ if (!getImplicitSPRUseForDPRUse(TRI, MI, DSrc, SrcLane, ImplicitSReg))
+ break;
for (unsigned i = MI->getDesc().getNumOperands(); i; --i)
MI->RemoveOperand(i-1);
@@ -3522,6 +3592,8 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// more, so add them in manually.
MIB.addReg(DstReg, RegState::Implicit | RegState::Define);
MIB.addReg(SrcReg, RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
}
@@ -3580,6 +3652,8 @@ ARMBaseInstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const {
// As before, the original destination is no longer represented, add it
// implicitly.
MIB.addReg(DstReg, RegState::Define | RegState::Implicit);
+ if (ImplicitSReg != 0)
+ MIB.addReg(ImplicitSReg, RegState::Implicit);
break;
}
}
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index 92e5ee8dcb..304ccc087c 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -235,6 +235,9 @@ public:
getExecutionDomain(const MachineInstr *MI) const;
void setExecutionDomain(MachineInstr *MI, unsigned Domain) const;
+ /// Get the number of addresses loaded by an LDM or VLDM instruction,
+ /// or zero if unknown.
+ unsigned getNumLDMAddresses(const MachineInstr *MI) const;
+
private:
unsigned getInstBundleLength(const MachineInstr *MI) const;
diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index b2ba9703b6..277dd57ef2 100644
--- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -480,7 +480,7 @@ ARMBaseRegisterInfo::UpdateRegAllocHint(unsigned Reg, unsigned NewReg,
bool
ARMBaseRegisterInfo::avoidWriteAfterWrite(const TargetRegisterClass *RC) const {
// CortexA9 has a Write-after-write hazard for NEON registers.
- if (!STI.isCortexA9())
+ if (!STI.isLikeA9())
return false;
switch (RC->getID()) {
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index c292821e79..56cfcface4 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -115,9 +115,9 @@ namespace {
bool IsLoad;
bool isUpdating;
bool hasWritebackOperand;
- NEONRegSpacing RegSpacing;
- unsigned char NumRegs; // D registers loaded or stored
- unsigned char RegElts; // elements per D register; used for lane ops
+ uint8_t RegSpacing; // Holds one of the NEONRegSpacing values
+ uint8_t NumRegs; // D registers loaded or stored
+ uint8_t RegElts; // elements per D register; used for lane ops
// FIXME: Temporary flag to denote whether the real instruction takes
// a single register (like the encoding) or all of the registers in
// the list (like the asm syntax and the isel DAG). When all definitions
@@ -389,7 +389,7 @@ void ARMExpandPseudo::ExpandVLD(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && TableEntry->IsLoad && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -454,7 +454,7 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && !TableEntry->IsLoad && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
@@ -538,7 +538,7 @@ void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) {
const NEONLdStTableEntry *TableEntry = LookupNEONLdSt(MI.getOpcode());
assert(TableEntry && "NEONLdStTable lookup failed");
- NEONRegSpacing RegSpc = TableEntry->RegSpacing;
+ NEONRegSpacing RegSpc = (NEONRegSpacing)TableEntry->RegSpacing;
unsigned NumRegs = TableEntry->NumRegs;
unsigned RegElts = TableEntry->RegElts;
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 873404effd..d6ef3f333b 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -1026,6 +1026,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
RC = &ARM::GPRRegClass;
break;
case MVT::i16:
+ if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
Opc = isZExt ? ARM::t2LDRHi8 : ARM::t2LDRSHi8;
@@ -1038,6 +1041,9 @@ bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr,
RC = &ARM::GPRRegClass;
break;
case MVT::i32:
+ if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
Opc = ARM::t2LDRi8;
@@ -1144,6 +1150,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::i16:
+ if (Alignment && Alignment < 2 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
StrOpc = ARM::t2STRHi8;
@@ -1155,6 +1164,9 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr,
}
break;
case MVT::i32:
+ if (Alignment && Alignment < 4 && !Subtarget->allowsUnalignedMem())
+ return false;
+
if (isThumb2) {
if (Addr.Offset < 0 && Addr.Offset > -256 && Subtarget->hasV6T2Ops())
StrOpc = ARM::t2STRi8;
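All four guards added above share one shape; schematically (AccessBytes is
an illustrative stand-in for the hard-coded 2 or 4):

  // Reject the FastISel path when the known alignment is smaller than the
  // access size and the core traps on unaligned accesses (SCTLR.A = 1).
  // Returning false falls back to SelectionDAG, which can legalize it.
  if (Alignment && Alignment < AccessBytes && !Subtarget->allowsUnalignedMem())
    return false;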
diff --git a/lib/Target/ARM/ARMHazardRecognizer.cpp b/lib/Target/ARM/ARMHazardRecognizer.cpp
index a5fd15b6bb..1240169e84 100644
--- a/lib/Target/ARM/ARMHazardRecognizer.cpp
+++ b/lib/Target/ARM/ARMHazardRecognizer.cpp
@@ -47,7 +47,7 @@ ARMHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
// Skip over one non-VFP / NEON instruction.
if (!LastMI->isBarrier() &&
// On A9, AGU and NEON/FPU are muxed.
- !(STI.isCortexA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
+ !(STI.isLikeA9() && (LastMI->mayLoad() || LastMI->mayStore())) &&
(LastMCID.TSFlags & ARMII::DomainMask) == ARMII::DomainGeneral) {
MachineBasicBlock::iterator I = LastMI;
if (I != LastMI->getParent()->begin()) {
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 3042b07920..a44e2a220a 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -317,7 +317,7 @@ static bool isOpcWithIntImmediate(SDNode *N, unsigned Opc, unsigned& Imm) {
}
/// \brief Check whether a particular node is a constant value representable as
-/// (N * Scale) where (N in [\arg RangeMin, \arg RangeMax).
+/// (N * Scale) where (N in [\p RangeMin, \p RangeMax).
///
/// \param ScaledConstant [out] - On success, the pre-scaled constant value.
static bool isScaledConstantInRange(SDValue Node, int Scale,
@@ -347,8 +347,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
if (!CheckVMLxHazard)
return true;
-
- if (!Subtarget->isCortexA8() && !Subtarget->isCortexA9())
+ if (!Subtarget->isCortexA8() && !Subtarget->isLikeA9())
return true;
if (!N->hasOneUse())
@@ -386,7 +385,7 @@ bool ARMDAGToDAGISel::hasNoVMLxHazardUse(SDNode *N) const {
bool ARMDAGToDAGISel::isShifterOpProfitable(const SDValue &Shift,
ARM_AM::ShiftOpc ShOpcVal,
unsigned ShAmt) {
- if (!Subtarget->isCortexA9())
+ if (!Subtarget->isLikeA9())
return true;
if (Shift.hasOneUse())
return true;
@@ -519,7 +518,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
return false;
// @LOCALMOD-END
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isCortexA9() || N.hasOneUse())) {
+ (!Subtarget->isLikeA9() || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -583,7 +582,7 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
@@ -631,7 +630,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op,
// @LOCALMOD-END
if (N.getOpcode() == ISD::MUL &&
- (!Subtarget->isCortexA9() || N.hasOneUse())) {
+ (!Subtarget->isLikeA9() || N.hasOneUse())) {
if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
// X * [3,5,9] -> X + X * [2,4,8] etc.
int RHSC = (int)RHS->getZExtValue();
@@ -698,7 +697,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op,
}
}
- if (Subtarget->isCortexA9() && !N.hasOneUse()) {
+ if (Subtarget->isLikeA9() && !N.hasOneUse()) {
// Compute R +/- (R << N) and reuse it.
Base = N;
Offset = CurDAG->getRegister(0, MVT::i32);
@@ -754,7 +753,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op,
// Try matching (R shl C) + (R).
if (N.getOpcode() != ISD::SUB && ShOpcVal == ARM_AM::no_shift &&
- !(Subtarget->isCortexA9() || N.getOperand(0).hasOneUse())) {
+ !(Subtarget->isLikeA9() || N.getOperand(0).hasOneUse())) {
ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOperand(0).getOpcode());
if (ShOpcVal != ARM_AM::no_shift) {
// Check to see if the RHS of the shift is a constant, if not, we can't
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index bc15dcf4fc..2e7588b29f 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -860,7 +860,7 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM)
benefitFromCodePlacementOpt = true;
// Prefer likely predicted branches to selects on out-of-order cores.
- predictableSelectIsExpensive = Subtarget->isCortexA9();
+ predictableSelectIsExpensive = Subtarget->isLikeA9();
setMinFunctionAlignment(Subtarget->isThumb() ? 1 : 2);
}
@@ -9308,8 +9308,8 @@ bool ARMTargetLowering::isDesirableToTransformToIntegerOp(unsigned Opc,
}
bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
- if (!Subtarget->allowsUnalignedMem())
- return false;
+ // The AllowsUnaligned flag models the SCTLR.A setting in ARM CPUs.
+ bool AllowsUnaligned = Subtarget->allowsUnalignedMem();
switch (VT.getSimpleVT().SimpleTy) {
default:
@@ -9317,10 +9317,14 @@ bool ARMTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
case MVT::i8:
case MVT::i16:
case MVT::i32:
- return true;
+ // Unaligned accesses can use (for example) LDRB, LDRH, LDR.
+ return AllowsUnaligned;
case MVT::f64:
- return Subtarget->hasNEON();
- // FIXME: VLD1 etc with standard alignment is legal.
+ case MVT::v2f64:
+ // For any little-endian targets with NEON, we can support unaligned ld/st
+ // of D and Q (e.g. {D0,D1}) registers by using vld1.i8/vst1.i8.
+ // A big-endian target may also explicitly support unaligned accesses.
+ return Subtarget->hasNEON() && (AllowsUnaligned || isLittleEndian());
}
}
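The net effect for the interesting configuration, summarized (a reading of
the code above, not part of the patch):

  // Little-endian core with NEON but SCTLR.A set (AllowsUnaligned == false):
  //   i8/i16/i32 -> false (the integer forms simply reflect SCTLR.A)
  //   f64/v2f64  -> true  (lowered via vld1.8/vst1.8, whose byte-sized
  //                        elements are unaffected by SCTLR.A)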
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index 697b5b0111..2060bb9374 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -3239,9 +3239,11 @@ def : ARMPat<(ARMaddc GPR:$src, so_imm_neg:$imm),
(SUBSri GPR:$src, so_imm_neg:$imm)>;
def : ARMPat<(add GPR:$src, imm0_65535_neg:$imm),
- (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (SUBrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[IsARM, HasV6T2]>;
def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm),
- (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>;
+ (SUBSrr GPR:$src, (MOVi16 (imm_neg_XFORM imm:$imm)))>,
+ Requires<[IsARM, HasV6T2]>;
// The with-carry-in form matches bitwise not instead of the negation.
// Effectively, the inverse interpretation of the carry flag already accounts
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index 8158a11f83..1bcb48776e 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -398,6 +398,20 @@ def VecListFourQWordIndexed : Operand<i32> {
let MIOperandInfo = (ops DPR:$Vd, i32imm:$idx);
}
+def dword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() >= 8;
+}]>;
+def dword_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() >= 8;
+}]>;
+def word_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return cast<LoadSDNode>(N)->getAlignment() == 4;
+}]>;
+def word_alignedstore : PatFrag<(ops node:$val, node:$ptr),
+ (store node:$val, node:$ptr), [{
+ return cast<StoreSDNode>(N)->getAlignment() == 4;
+}]>;
def hword_alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
return cast<LoadSDNode>(N)->getAlignment() == 2;
}]>;
@@ -2273,6 +2287,25 @@ def : Pat<(f64 (non_word_alignedload addrmode6:$addr)),
def : Pat<(non_word_alignedstore (f64 DPR:$value), addrmode6:$addr),
(VST1d64 addrmode6:$addr, DPR:$value)>, Requires<[IsBE]>;
+// Use vld1/vst1 for Q and QQ. Also use them for unaligned v2f64
+// load / store if it's legal.
+def : Pat<(v2f64 (dword_alignedload addrmode6:$addr)),
+ (VLD1q64 addrmode6:$addr)>;
+def : Pat<(dword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q64 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (word_alignedload addrmode6:$addr)),
+ (VLD1q32 addrmode6:$addr)>;
+def : Pat<(word_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q32 addrmode6:$addr, QPR:$value)>;
+def : Pat<(v2f64 (hword_alignedload addrmode6:$addr)),
+ (VLD1q16 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(hword_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q16 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
+ (VLD1q8 addrmode6:$addr)>, Requires<[IsLE]>;
+def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
+ (VST1q8 addrmode6:$addr, QPR:$value)>, Requires<[IsLE]>;
+
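One concrete reading of these patterns (illustrative):

  // For a v2f64 load:
  //   align >= 8 -> dword_alignedload -> VLD1q64
  //   align == 4 -> word_alignedload  -> VLD1q32
  //   align == 2 -> hword_alignedload -> VLD1q16 (little-endian only)
  //   align == 1 -> byte_alignedload  -> VLD1q8  (little-endian only)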
//===----------------------------------------------------------------------===//
// NEON pattern fragments
//===----------------------------------------------------------------------===//
@@ -4455,10 +4488,23 @@ def VBSLd : N3VX<1, 0, 0b01, 0b0001, 0, 1, (outs DPR:$Vd),
"vbsl", "$Vd, $Vn, $Vm", "$src1 = $Vd",
[(set DPR:$Vd,
(v2i32 (NEONvbsl DPR:$src1, DPR:$Vn, DPR:$Vm)))]>;
+def : Pat<(v8i8 (int_arm_neon_vbsl (v8i8 DPR:$src1),
+ (v8i8 DPR:$Vn), (v8i8 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i16 (int_arm_neon_vbsl (v4i16 DPR:$src1),
+ (v4i16 DPR:$Vn), (v4i16 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v2i32 (int_arm_neon_vbsl (v2i32 DPR:$src1),
+ (v2i32 DPR:$Vn), (v2i32 DPR:$Vm))),
+ (VBSLd DPR:$src1, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
def : Pat<(v2i32 (or (and DPR:$Vn, DPR:$Vd),
(and DPR:$Vm, (vnotd DPR:$Vd)))),
- (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>;
+ (VBSLd DPR:$Vd, DPR:$Vn, DPR:$Vm)>,
+ Requires<[HasNEON]>;
def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
(ins QPR:$src1, QPR:$Vn, QPR:$Vm),
@@ -4467,9 +4513,23 @@ def VBSLq : N3VX<1, 0, 0b01, 0b0001, 1, 1, (outs QPR:$Vd),
[(set QPR:$Vd,
(v4i32 (NEONvbsl QPR:$src1, QPR:$Vn, QPR:$Vm)))]>;
+def : Pat<(v16i8 (int_arm_neon_vbsl (v16i8 QPR:$src1),
+ (v16i8 QPR:$Vn), (v16i8 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v8i16 (int_arm_neon_vbsl (v8i16 QPR:$src1),
+ (v8i16 QPR:$Vn), (v8i16 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+def : Pat<(v4i32 (int_arm_neon_vbsl (v4i32 QPR:$src1),
+ (v4i32 QPR:$Vn), (v4i32 QPR:$Vm))),
+ (VBSLq QPR:$src1, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
+
def : Pat<(v4i32 (or (and QPR:$Vn, QPR:$Vd),
(and QPR:$Vm, (vnotq QPR:$Vd)))),
- (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>;
+ (VBSLq QPR:$Vd, QPR:$Vn, QPR:$Vm)>,
+ Requires<[HasNEON]>;
// VBIF : Vector Bitwise Insert if False
// like VBSL but with: "vbif $dst, $src3, $src1", "$src2 = $dst",
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index 7bc590f947..404634fee9 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1876,8 +1876,9 @@ def CortexA9Itineraries : ProcessorItineraries<
]>;
// ===---------------------------------------------------------------------===//
-// This following definitions describe the simple machine model which
-// will replace itineraries.
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
// Cortex-A9 machine model for scheduling and other instruction cost heuristics.
def CortexA9Model : SchedMachineModel {
@@ -1891,5 +1892,595 @@ def CortexA9Model : SchedMachineModel {
let Itineraries = CortexA9Itineraries;
}
-// TODO: Add Cortex-A9 processor and scheduler resources.
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+def A9UnitALU : ProcResource<2>;
+def A9UnitMul : ProcResource<1> { let Super = A9UnitALU; }
+def A9UnitAGU : ProcResource<1>;
+def A9UnitLS : ProcResource<1>;
+def A9UnitFP : ProcResource<1> { let Buffered = 0; }
+def A9UnitB : ProcResource<1>;
+
+//===----------------------------------------------------------------------===//
+// Define scheduler read/write types with their resources and latency on A9.
+
+// Consume an issue slot, but no processor resources. This is useful when all
+// other writes associated with the operand have NumMicroOps = 0.
+def A9WriteIssue : SchedWriteRes<[]> { let Latency = 0; }
+
+// Write an integer register.
+def A9WriteI : SchedWriteRes<[A9UnitALU]>;
+// Write an integer register, with the operand shifted by a register.
+def A9WriteIsr : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+
+// Basic ALU.
+def A9WriteA : SchedWriteRes<[A9UnitALU]>;
+// ALU with operand shifted by immediate.
+def A9WriteAsi : SchedWriteRes<[A9UnitALU]> { let Latency = 2; }
+// ALU with operand shifted by register.
+def A9WriteAsr : SchedWriteRes<[A9UnitALU]> { let Latency = 3; }
+
+// Multiplication
+def A9WriteM : SchedWriteRes<[A9UnitMul, A9UnitMul]> { let Latency = 4; }
+def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
+ let NumMicroOps = 0; }
+def A9WriteM16 : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
+def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
+ let NumMicroOps = 0; }
+
+// Floating-point
+// Only one FP or AGU instruction may issue per cycle. We model this
+// by having FP instructions consume the AGU resource.
+def A9WriteF : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteFMov : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteFMulS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteFMulD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteFMAS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+def A9WriteFMAD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteFDivS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
+def A9WriteFDivD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
+def A9WriteFSqrtS : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 17; }
+def A9WriteFSqrtD : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 32; }
+
+// NEON has an odd mix of latencies. Simply name the write types by latency.
+def A9WriteV1 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
+def A9WriteV2 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 2; }
+def A9WriteV3 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 3; }
+def A9WriteV4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 4; }
+def A9WriteV5 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
+def A9WriteV6 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
+def A9WriteV7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 7; }
+def A9WriteV9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
+def A9WriteV10 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 10; }
+
+// Reserve A9UnitFP for 2 consecutive cycles.
+def A9Write2V4 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 4;
+ let ResourceCycles = [2];
+}
+def A9Write2V7 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 7;
+ let ResourceCycles = [2];
+}
+def A9Write2V9 : SchedWriteRes<[A9UnitFP, A9UnitAGU]> {
+ let Latency = 9;
+ let ResourceCycles = [2];
+}
+
+// Branches don't have a def operand but still consume resources.
+def A9WriteB : SchedWriteRes<[A9UnitB]>;
+
+// Address generation.
+def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
+
+// Load Integer.
+def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+// Load the upper 32 bits using the same micro-op.
+def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
+ let NumMicroOps = 0; }
+// Offset shifted by register.
+def A9WriteLsi : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+// Load (and zero extend) a byte.
+def A9WriteLb : SchedWriteRes<[A9UnitLS]> { let Latency = 4; }
+def A9WriteLbsi : SchedWriteRes<[A9UnitLS]> { let Latency = 5; }
+
+// Load or Store Float, aligned.
+def A9WriteLSfp : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 1; }
+
+// Store Integer.
+def A9WriteS : SchedWriteRes<[A9UnitLS]>;
+
+//===----------------------------------------------------------------------===//
+// Define resources dynamically for load multiple variants.
+
+// Define helpers for extra latency without consuming resources.
+def A9WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; }
+foreach NumCycles = 2-8 in {
+def A9WriteCycle#NumCycles : WriteSequence<[A9WriteCycle1], NumCycles>;
+} // foreach NumCycles
+
+// Define TII for use in SchedVariant Predicates.
+def : PredicateProlog<[{
+ const ARMBaseInstrInfo *TII =
+ static_cast<const ARMBaseInstrInfo*>(SchedModel->getInstrInfo());
+ (void)TII;
+}]>;
+
+// Define address generation sequences and predicates for 8 flavors of LDMs.
+foreach NumAddr = 1-8 in {
+
+// Define A9WriteAdr1-8 as a sequence of A9WriteAdr with additive
+// latency for instructions that generate multiple loads or stores.
+def A9WriteAdr#NumAddr : WriteSequence<[A9WriteAdr], NumAddr>;
+
+// Define a predicate to select the LDM based on number of memory addresses.
+def A9LMAdr#NumAddr#Pred :
+ SchedPredicate<"TII->getNumLDMAddresses(MI) == "#NumAddr>;
+
+} // foreach NumAddr
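For reference, the NumAddr = 2 iteration of this loop expands to roughly:

  def A9WriteAdr2 : WriteSequence<[A9WriteAdr], 2>;
  def A9LMAdr2Pred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 2">;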
+
+// Fall-back for unknown LDMs.
+def A9LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(MI) == 0">;
+
+// LDM/VLDM/VLDn address generation latency & resources.
+// Dynamically select the A9WriteAdrN sequence using a predicate.
+def A9WriteLMAdr : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteAdr1]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteAdr2]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteAdr3]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteAdr4]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteAdr5]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteAdr6]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteAdr7]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteAdr8]>,
+ // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers.
+ SchedVar<A9LMUnknownPred, [A9WriteAdr2]>]>;
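How the variant resolves in practice (an illustrative case):

  // An LDMIA loading four 32-bit registers carries 16 bytes of
  // MachineMemOperands, so getNumLDMAddresses(MI) == 4, A9LMAdr4Pred
  // fires, and the A9WriteAdr4 sequence is used.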
+
+// Define LDM Resources.
+// These take no issue resource, so they can be combined with other
+// writes like WriteB.
+// A9WriteLMLo takes a single LS resource and 2 cycles.
+def A9WriteLMLo : SchedWriteRes<[A9UnitLS]> { let Latency = 2;
+ let NumMicroOps = 0; }
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMHi : SchedWriteRes<[]> { let Latency = 2;
+ let NumMicroOps = 0; }
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+foreach NumAddr = 1-8 in {
+def A9WriteL#NumAddr : WriteSequence<
+ [A9WriteLMLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+def A9WriteL#NumAddr#Hi : WriteSequence<
+ [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+}
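For instance, after !cast resolution the NumAddr = 2 iteration defines
roughly:

  def A9WriteL2   : WriteSequence<[A9WriteLMLo, A9WriteCycle2]>; // latency 2+2
  def A9WriteL2Hi : WriteSequence<[A9WriteLMHi, A9WriteCycle2]>; // same, no uops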
+
+//===----------------------------------------------------------------------===//
+// LDM: Load multiple into 32-bit integer registers.
+
+// A9WriteLM variants expand into a pair of writes for each 64-bit
+// value loaded. When the number of registers is odd, the last
+// A9WriteLnHi is naturally ignored because the instruction has no
+// following def operands. These variants take no issue resource, so
+// they may need to be part of a WriteSequence that includes A9WriteIssue.
+def A9WriteLM : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteL1, A9WriteL1Hi]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi,
+ A9WriteL7, A9WriteL7Hi]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3, A9WriteL3Hi,
+ A9WriteL4, A9WriteL4Hi,
+ A9WriteL5, A9WriteL5Hi,
+ A9WriteL6, A9WriteL6Hi,
+ A9WriteL7, A9WriteL7Hi,
+ A9WriteL8, A9WriteL8Hi]>,
+ // For unknown LDMs, define the maximum number of writes, but only
+ // make the first two consume resources.
+ SchedVar<A9LMUnknownPred, [A9WriteL1, A9WriteL1Hi,
+ A9WriteL2, A9WriteL2Hi,
+ A9WriteL3Hi, A9WriteL3Hi,
+ A9WriteL4Hi, A9WriteL4Hi,
+ A9WriteL5Hi, A9WriteL5Hi,
+ A9WriteL6Hi, A9WriteL6Hi,
+ A9WriteL7Hi, A9WriteL7Hi,
+ A9WriteL8Hi, A9WriteL8Hi]>]> {
+ let Variadic = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support.
+
+// A9WriteLfpOp is the same as A9WriteLSfp but takes no issue resources
+// so it can be used in WriteSequences for single-issue instructions that
+// encapsulate multiple loads.
+def A9WriteLfpOp : SchedWriteRes<[A9UnitLS, A9UnitFP]> {
+ let Latency = 1;
+ let NumMicroOps = 0;
+}
+
+foreach NumAddr = 1-8 in {
+
+// Helper for A9WriteLfp1-8: A sequence of fp loads with no micro-ops.
+def A9WriteLfp#NumAddr#Seq : WriteSequence<[A9WriteLfpOp], NumAddr>;
+
+// A9WriteLfp1-8 definitions are statically expanded into a sequence of
+// A9WriteLfpOps with additive latency that takes a single issue slot.
+// Used directly to describe NEON VLDn.
+def A9WriteLfp#NumAddr : WriteSequence<
+ [A9WriteIssue, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+// A9WriteLfp1-8Mov adds a cycle of latency and FP resource for
+// permuting loaded values.
+def A9WriteLfp#NumAddr#Mov : WriteSequence<
+ [A9WriteF, !cast<SchedWrite>("A9WriteLfp"#NumAddr#Seq)]>;
+
+} // foreach NumAddr
+
+// Define VLDM/VSTM PreRA resources.
+// A9WriteLMfpPreRA are dynamically expanded into the correct
+// A9WriteLfp1-8 sequence based on a predicate. This supports the
+// preRA VLDM variants in which all 64-bit loads are written to the
+// same tuple of either single or double precision registers.
+def A9WriteLMfpPreRA : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteLfp1]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteLfp2]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteLfp3]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteLfp4]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteLfp5]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteLfp6]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteLfp7]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteLfp8]>,
+ // For unknown VLDM/VSTM PreRA, assume 2xS registers.
+ SchedVar<A9LMUnknownPred, [A9WriteLfp2]>]>;
+
+// Define VLDM/VSTM PostRA Resources.
+// A9WriteLMfpLo takes an LS and an FP resource and one issue slot but no latency.
+def A9WriteLMfpLo : SchedWriteRes<[A9UnitLS, A9UnitFP]> { let Latency = 0; }
+
+foreach NumAddr = 1-8 in {
+
+// Each A9WriteL#N variant adds N cycles of latency without consuming
+// additional resources.
+def A9WriteLMfp#NumAddr : WriteSequence<
+ [A9WriteLMfpLo, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+// Assuming aligned access, the upper half of each pair is free with
+// the same latency.
+def A9WriteLMfp#NumAddr#Hi : WriteSequence<
+ [A9WriteLMHi, !cast<SchedWrite>("A9WriteCycle"#NumAddr)]>;
+
+} // foreach NumAddr
+
+// VLDM PostRA Variants. These variants expand A9WriteLMfpPostRA into a
+// pair of writes for each 64-bit data loaded. When the number of
+// registers is odd, the last WriteLMfpnHi is naturally ignored because
+// the instruction has no following def operands.
+def A9WriteLMfpPostRA : SchedWriteVariant<[
+ SchedVar<A9LMAdr1Pred, [A9WriteLMfp1, A9WriteLMfp1Hi]>,
+ SchedVar<A9LMAdr2Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi]>,
+ SchedVar<A9LMAdr3Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi]>,
+ SchedVar<A9LMAdr4Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi]>,
+ SchedVar<A9LMAdr5Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi]>,
+ SchedVar<A9LMAdr6Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi,
+ A9WriteLMfp6, A9WriteLMfp6Hi]>,
+ SchedVar<A9LMAdr7Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi,
+ A9WriteLMfp6, A9WriteLMfp6Hi,
+ A9WriteLMfp7, A9WriteLMfp7Hi]>,
+ SchedVar<A9LMAdr8Pred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3, A9WriteLMfp3Hi,
+ A9WriteLMfp4, A9WriteLMfp4Hi,
+ A9WriteLMfp5, A9WriteLMfp5Hi,
+ A9WriteLMfp6, A9WriteLMfp6Hi,
+ A9WriteLMfp7, A9WriteLMfp7Hi,
+ A9WriteLMfp8, A9WriteLMfp8Hi]>,
+ // For unknown LDMs, define the maximum number of writes, but only
+ // make the first two consume resources.
+ SchedVar<A9LMUnknownPred, [A9WriteLMfp1, A9WriteLMfp1Hi,
+ A9WriteLMfp2, A9WriteLMfp2Hi,
+ A9WriteLMfp3Hi, A9WriteLMfp3Hi,
+ A9WriteLMfp4Hi, A9WriteLMfp4Hi,
+ A9WriteLMfp5Hi, A9WriteLMfp5Hi,
+ A9WriteLMfp6Hi, A9WriteLMfp6Hi,
+ A9WriteLMfp7Hi, A9WriteLMfp7Hi,
+ A9WriteLMfp8Hi, A9WriteLMfp8Hi]>]> {
+ let Variadic = 1;
+}
+
+// Distinguish between our multiple MI-level forms of the same
+// VLDM/VSTM instructions.
+def A9PreRA : SchedPredicate<
+ "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">;
+def A9PostRA : SchedPredicate<
+ "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">;
+
+// VLDM represents all destination registers as a single register
+// tuple, unlike LDM. So the number of write operands is not variadic.
+def A9WriteLMfp : SchedWriteVariant<[
+ SchedVar<A9PreRA, [A9WriteLMfpPreRA]>,
+ SchedVar<A9PostRA, [A9WriteLMfpPostRA]>]>;
+
+//===----------------------------------------------------------------------===//
+// Resources for other (non LDM/VLDM) Variants.
+
+// These mov immediate writers are unconditionally expanded with
+// additive latency.
+def A9WriteI2 : WriteSequence<[A9WriteI, A9WriteI]>;
+def A9WriteI2pc : WriteSequence<[A9WriteI, A9WriteI, A9WriteA]>;
+def A9WriteI2ld : WriteSequence<[A9WriteI, A9WriteI, A9WriteL]>;
+
+// Some ALU operations can read loaded integer values one cycle early.
+def A9ReadA : SchedReadAdvance<1,
+ [A9WriteL, A9WriteLHi, A9WriteLsi, A9WriteLb, A9WriteLbsi,
+ A9WriteL1, A9WriteL2, A9WriteL3, A9WriteL4,
+ A9WriteL5, A9WriteL6, A9WriteL7, A9WriteL8,
+ A9WriteL1Hi, A9WriteL2Hi, A9WriteL3Hi, A9WriteL4Hi,
+ A9WriteL5Hi, A9WriteL6Hi, A9WriteL7Hi, A9WriteL8Hi]>;
+
+// Read types for operands that are unconditionally read in cycle N
+// after the instruction issues; they decrease producer latency by N-1.
+def A9Read2 : SchedReadAdvance<1>;
+def A9Read3 : SchedReadAdvance<2>;
+def A9Read4 : SchedReadAdvance<3>;
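A worked example of the latency arithmetic (illustrative):

  // A producer with Latency = 3 feeding an operand tagged A9Read3
  // (SchedReadAdvance<2>) yields an effective operand latency of
  // 3 - 2 = 1 cycle.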
+
+//===----------------------------------------------------------------------===//
+// Map itinerary classes to scheduler read/write resources per operand.
+//
+// For ARM, we piggyback scheduler resources on the Itinerary classes
+// to avoid perturbing the existing instruction definitions.
+
+// This table follows the ARM Cortex-A9 Technical Reference Manual,
+// mostly in order.
+let SchedModel = CortexA9Model in {
+
+def :ItinRW<[A9WriteI], [IIC_iMOVi,IIC_iMOVr,IIC_iMOVsi,
+ IIC_iMVNi,IIC_iMVNsi,
+ IIC_iCMOVi,IIC_iCMOVr,IIC_iCMOVsi]>;
+def :ItinRW<[A9WriteI,A9ReadA],[IIC_iMVNr]>;
+def :ItinRW<[A9WriteIsr], [IIC_iMOVsr,IIC_iMVNsr,IIC_iCMOVsr]>;
+
+def :ItinRW<[A9WriteI2], [IIC_iMOVix2,IIC_iCMOVix2]>;
+def :ItinRW<[A9WriteI2pc], [IIC_iMOVix2addpc]>;
+def :ItinRW<[A9WriteI2ld], [IIC_iMOVix2ld]>;
+
+def :ItinRW<[A9WriteA], [IIC_iBITi,IIC_iBITr,IIC_iUNAr,IIC_iTSTi,IIC_iTSTr]>;
+def :ItinRW<[A9WriteA, A9ReadA], [IIC_iALUi, IIC_iCMPi, IIC_iCMPsi]>;
+def :ItinRW<[A9WriteA, A9ReadA, A9ReadA],[IIC_iALUr,IIC_iCMPr]>;
+def :ItinRW<[A9WriteAsi], [IIC_iBITsi,IIC_iUNAsi,IIC_iEXTr,IIC_iTSTsi]>;
+def :ItinRW<[A9WriteAsi, A9ReadA], [IIC_iALUsi]>;
+def :ItinRW<[A9WriteAsi, ReadDefault, A9ReadA], [IIC_iALUsir]>; // RSB
+def :ItinRW<[A9WriteAsr], [IIC_iBITsr,IIC_iTSTsr,IIC_iEXTAr,IIC_iEXTAsr]>;
+def :ItinRW<[A9WriteAsr, A9ReadA], [IIC_iALUsr,IIC_iCMPsr]>;
+
+// A9WriteMHi is ignored for MUL32.
+def :ItinRW<[A9WriteM, A9WriteMHi], [IIC_iMUL32,IIC_iMAC32,
+ IIC_iMUL64,IIC_iMAC64]>;
+// FIXME: SMLALxx needs itin classes
+def :ItinRW<[A9WriteM16, A9WriteM16Hi], [IIC_iMUL16,IIC_iMAC16]>;
+
+// TODO: For floating-point ops, we model the pipeline forwarding
+// latencies here. WAW latencies are sometimes longer.
+
+def :ItinRW<[A9WriteFMov], [IIC_fpSTAT, IIC_fpMOVIS, IIC_fpMOVID, IIC_fpMOVSI,
+ IIC_fpUNA32, IIC_fpUNA64,
+ IIC_fpCMP32, IIC_fpCMP64]>;
+def :ItinRW<[A9WriteFMov, A9WriteFMov], [IIC_fpMOVDI]>;
+def :ItinRW<[A9WriteF], [IIC_fpCVTSD, IIC_fpCVTDS, IIC_fpCVTSH, IIC_fpCVTHS,
+ IIC_fpCVTIS, IIC_fpCVTID, IIC_fpCVTSI, IIC_fpCVTDI,
+ IIC_fpALU32, IIC_fpALU64]>;
+def :ItinRW<[A9WriteFMulS], [IIC_fpMUL32]>;
+def :ItinRW<[A9WriteFMulD], [IIC_fpMUL64]>;
+def :ItinRW<[A9WriteFMAS], [IIC_fpMAC32]>;
+def :ItinRW<[A9WriteFMAD], [IIC_fpMAC64]>;
+def :ItinRW<[A9WriteFDivS], [IIC_fpDIV32]>;
+def :ItinRW<[A9WriteFDivD], [IIC_fpDIV64]>;
+def :ItinRW<[A9WriteFSqrtS], [IIC_fpSQRT32]>;
+def :ItinRW<[A9WriteFSqrtD], [IIC_fpSQRT64]>;
+
+def :ItinRW<[A9WriteB], [IIC_Br]>;
+
+// A9 PLD is processed in a dedicated unit.
+def :ItinRW<[], [IIC_Preload]>;
+
+// Note: We must assume that loads are aligned, since the machine
+// model cannot know this statically and A9 ignores alignment hints.
+
+// A9WriteAdr consumes the AGU regardless of address writeback, but its
+// latency is only relevant to users of an updated address.
+def :ItinRW<[A9WriteL, A9WriteAdr], [IIC_iLoad_i,IIC_iLoad_r,
+ IIC_iLoad_iu,IIC_iLoad_ru]>;
+def :ItinRW<[A9WriteLsi, A9WriteAdr], [IIC_iLoad_si,IIC_iLoad_siu]>;
+def :ItinRW<[A9WriteLb, A9WriteAdr2], [IIC_iLoad_bh_i,IIC_iLoad_bh_r,
+ IIC_iLoad_bh_iu,IIC_iLoad_bh_ru]>;
+def :ItinRW<[A9WriteLbsi, A9WriteAdr2], [IIC_iLoad_bh_si,IIC_iLoad_bh_siu]>;
+def :ItinRW<[A9WriteL, A9WriteLHi, A9WriteAdr], [IIC_iLoad_d_i,IIC_iLoad_d_r,
+ IIC_iLoad_d_ru]>;
+// Store either has no def operands, or the one def for address writeback.
+def :ItinRW<[A9WriteAdr, A9WriteS], [IIC_iStore_i, IIC_iStore_r,
+ IIC_iStore_iu, IIC_iStore_ru,
+ IIC_iStore_d_i, IIC_iStore_d_r,
+ IIC_iStore_d_ru]>;
+def :ItinRW<[A9WriteAdr2, A9WriteS], [IIC_iStore_si, IIC_iStore_siu,
+ IIC_iStore_bh_i, IIC_iStore_bh_r,
+ IIC_iStore_bh_iu, IIC_iStore_bh_ru]>;
+def :ItinRW<[A9WriteAdr3, A9WriteS], [IIC_iStore_bh_si, IIC_iStore_bh_siu]>;
+
+// A9WriteLM will be expanded into a separate write for each def
+// operand. Address generation consumes resources, but A9WriteLMAdr
+// is listed after all def operands, so it has no effective latency.
+//
+// Note: A9WriteLM expands into an even number of def operands. The
+// actual number of def operands may be less by one.
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteIssue], [IIC_iLoad_m, IIC_iPop]>;
+
+// Load multiple with address writeback has an extra def operand in
+// front of the loaded registers.
+//
+// Reuse the load-multiple variants for store-multiple because the
+// resources are identical. For stores, only the address writeback
+// has a def operand, so the WriteL latencies are unused.
+def :ItinRW<[A9WriteLMAdr, A9WriteLM, A9WriteIssue], [IIC_iLoad_mu,
+ IIC_iStore_m,
+ IIC_iStore_mu]>;
+def :ItinRW<[A9WriteLM, A9WriteLMAdr, A9WriteB], [IIC_iLoad_mBr, IIC_iPop_Br]>;
+def :ItinRW<[A9WriteL, A9WriteAdr, A9WriteA], [IIC_iLoadiALU]>;
+
+def :ItinRW<[A9WriteLSfp, A9WriteAdr], [IIC_fpLoad32, IIC_fpLoad64]>;
+
+def :ItinRW<[A9WriteLMfp, A9WriteLMAdr], [IIC_fpLoad_m]>;
+def :ItinRW<[A9WriteLMAdr, A9WriteLMfp], [IIC_fpLoad_mu]>;
+def :ItinRW<[A9WriteAdr, A9WriteLSfp], [IIC_fpStore32, IIC_fpStore64,
+ IIC_fpStore_m, IIC_fpStore_mu]>;
+
+// Note: Unlike VLDM, VLD1 expects the writeback operand after the
+// normal writes.
+def :ItinRW<[A9WriteLfp1, A9WriteAdr1], [IIC_VLD1, IIC_VLD1u,
+ IIC_VLD1x2, IIC_VLD1x2u]>;
+def :ItinRW<[A9WriteLfp2, A9WriteAdr2], [IIC_VLD1x3, IIC_VLD1x3u,
+ IIC_VLD1x4, IIC_VLD1x4u,
+ IIC_VLD4dup, IIC_VLD4dupu]>;
+def :ItinRW<[A9WriteLfp1Mov, A9WriteAdr1], [IIC_VLD1dup, IIC_VLD1dupu,
+ IIC_VLD2, IIC_VLD2u,
+ IIC_VLD2dup, IIC_VLD2dupu]>;
+def :ItinRW<[A9WriteLfp2Mov, A9WriteAdr1], [IIC_VLD1ln, IIC_VLD1lnu,
+ IIC_VLD2x2, IIC_VLD2x2u,
+ IIC_VLD2ln, IIC_VLD2lnu]>;
+def :ItinRW<[A9WriteLfp3Mov, A9WriteAdr3], [IIC_VLD3, IIC_VLD3u,
+ IIC_VLD3dup, IIC_VLD3dupu]>;
+def :ItinRW<[A9WriteLfp4Mov, A9WriteAdr4], [IIC_VLD4, IIC_VLD4u,
+ IIC_VLD4ln, IIC_VLD4lnu]>;
+def :ItinRW<[A9WriteLfp5Mov, A9WriteAdr5], [IIC_VLD3ln, IIC_VLD3lnu]>;
+
+// Vector stores use similar resources to vector loads, so use the
+// same write types. The address write must be first for stores with
+// address writeback.
+def :ItinRW<[A9WriteAdr1, A9WriteLfp1], [IIC_VST1, IIC_VST1u,
+ IIC_VST1x2, IIC_VST1x2u,
+ IIC_VST1ln, IIC_VST1lnu,
+ IIC_VST2, IIC_VST2u,
+ IIC_VST2x2, IIC_VST2x2u,
+ IIC_VST2ln, IIC_VST2lnu]>;
+def :ItinRW<[A9WriteAdr2, A9WriteLfp2], [IIC_VST1x3, IIC_VST1x3u,
+ IIC_VST1x4, IIC_VST1x4u,
+ IIC_VST3, IIC_VST3u,
+ IIC_VST3ln, IIC_VST3lnu,
+ IIC_VST4, IIC_VST4u,
+ IIC_VST4ln, IIC_VST4lnu]>;
+
+// NEON moves.
+def :ItinRW<[A9WriteV2], [IIC_VMOVSI, IIC_VMOVDI, IIC_VMOVD, IIC_VMOVQ]>;
+def :ItinRW<[A9WriteV1], [IIC_VMOV, IIC_VMOVIS, IIC_VMOVID]>;
+def :ItinRW<[A9WriteV3], [IIC_VMOVISL, IIC_VMOVN]>;
+
+// NEON integer arithmetic
+//
+// VADD/VAND/VORR/VEOR/VBIC/VORN/VBIT/VBIF/VBSL
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VBINiD, IIC_VBINiQ]>;
+// VSUB/VMVN/VCLSD/VCLZD/VCNTD
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VSUBiD, IIC_VSUBiQ, IIC_VCNTiD]>;
+// VADDL/VSUBL/VNEG are mapped later under IIC_SHLi.
+// ...
+// VHADD/VRHADD/VQADD/VTST/VADH/VRADH
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2], [IIC_VBINi4D, IIC_VBINi4Q]>;
+// VSBH/VRSBH/VHSUB/VQSUB/VABD/VCEQ/VCGE/VCGT/VMAX/VMIN/VPMAX/VPMIN/VABDL
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VSUBi4D, IIC_VSUBi4Q]>;
+// VQNEG/VQABS
+def :ItinRW<[A9WriteV4], [IIC_VQUNAiD, IIC_VQUNAiQ]>;
+// VABS
+def :ItinRW<[A9WriteV4, A9Read2], [IIC_VUNAiD, IIC_VUNAiQ]>;
+// VPADD/VPADDL are mapped later under IIC_SHLi.
+// ...
+// VCLSQ/VCLZQ/VCNTQ take two cycles.
+def :ItinRW<[A9Write2V4, A9Read3], [IIC_VCNTiQ]>;
+// VMOVimm/VMVNimm/VORRimm/VBICimm
+def :ItinRW<[A9WriteV3], [IIC_VMOVImm]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2], [IIC_VABAD, IIC_VABAQ]>;
+def :ItinRW<[A9WriteV6, A9Read3], [IIC_VPALiD, IIC_VPALiQ]>;
+
+// NEON integer multiply
+//
+// Note: these don't quite match the timing docs, but they do match
+// the original A9 itinerary.
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VMULi16D]>;
+def :ItinRW<[A9WriteV7, A9Read2, A9Read2], [IIC_VMULi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read2], [IIC_VMULi32D]>;
+def :ItinRW<[A9Write2V9, A9Read2], [IIC_VMULi32Q]>;
+def :ItinRW<[A9WriteV6, A9Read3, A9Read2, A9Read2], [IIC_VMACi16D]>;
+def :ItinRW<[A9WriteV7, A9Read3, A9Read2, A9Read2], [IIC_VMACi16Q]>;
+def :ItinRW<[A9Write2V7, A9Read3, A9Read2], [IIC_VMACi32D]>;
+def :ItinRW<[A9Write2V9, A9Read3, A9Read2], [IIC_VMACi32Q]>;
+
+// NEON integer shift
+// TODO: Q,Q,Q shifts should actually reserve FP for 2 cycles.
+def :ItinRW<[A9WriteV3], [IIC_VSHLiD, IIC_VSHLiQ]>;
+def :ItinRW<[A9WriteV4], [IIC_VSHLi4D, IIC_VSHLi4Q]>;
+
+// NEON permute
+def :ItinRW<[A9WriteV2], [IIC_VPERMD, IIC_VPERMQ, IIC_VEXTD]>;
+def :ItinRW<[A9WriteV3, A9WriteV4, ReadDefault, A9Read2],
+ [IIC_VPERMQ3, IIC_VEXTQ]>;
+def :ItinRW<[A9WriteV3, A9Read2], [IIC_VTB1]>;
+def :ItinRW<[A9WriteV3, A9Read2, A9Read2], [IIC_VTB2]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3], [IIC_VTB3]>;
+def :ItinRW<[A9WriteV4, A9Read2, A9Read2, A9Read3, A9Read3], [IIC_VTB4]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2], [IIC_VTBX1]>;
+def :ItinRW<[A9WriteV3, ReadDefault, A9Read2, A9Read2], [IIC_VTBX2]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3], [IIC_VTBX3]>;
+def :ItinRW<[A9WriteV4, ReadDefault, A9Read2, A9Read2, A9Read3, A9Read3],
+ [IIC_VTBX4]>;
+// NEON floating-point
+def :ItinRW<[A9WriteV5, A9Read2, A9Read2], [IIC_VBIND]>;
+def :ItinRW<[A9WriteV6, A9Read2, A9Read2], [IIC_VBINQ]>;
+def :ItinRW<[A9WriteV5, A9Read2], [IIC_VUNAD, IIC_VFMULD]>;
+def :ItinRW<[A9WriteV6, A9Read2], [IIC_VUNAQ, IIC_VFMULQ]>;
+def :ItinRW<[A9WriteV9, A9Read3, A9Read2], [IIC_VMACD, IIC_VFMACD]>;
+def :ItinRW<[A9WriteV10, A9Read3, A9Read2], [IIC_VMACQ, IIC_VFMACQ]>;
+def :ItinRW<[A9WriteV9, A9Read2, A9Read2], [IIC_VRECSD]>;
+def :ItinRW<[A9WriteV10, A9Read2, A9Read2], [IIC_VRECSQ]>;
+} // SchedModel = CortexA9Model
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 7b0cc7477e..c8aa0779bc 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -13,8 +13,9 @@
#include "ARMSubtarget.h"
#include "ARMBaseRegisterInfo.h"
+#include "ARMBaseInstrInfo.h"
#include "llvm/GlobalValue.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Support/CommandLine.h"
#define GET_SUBTARGETINFO_TARGET_DESC
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index 65b41ee8b2..0a5744e5c1 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -38,7 +38,7 @@ class StringRef;
class ARMSubtarget : public ARMGenSubtargetInfo {
protected:
enum ARMProcFamilyEnum {
- Others, CortexA8, CortexA9
+ Others, CortexA8, CortexA9, CortexA15
};
/// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
@@ -213,7 +213,9 @@ protected:
bool isCortexA8() const { return ARMProcFamily == CortexA8; }
bool isCortexA9() const { return ARMProcFamily == CortexA9; }
+ bool isCortexA15() const { return ARMProcFamily == CortexA15; }
bool isCortexM3() const { return CPUString == "cortex-m3"; }
+ bool isLikeA9() const { return isCortexA9() || isCortexA15(); }
bool hasARMOps() const { return !NoARM; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index b12607b206..4675c98f0d 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -157,7 +157,7 @@ bool ARMPassConfig::addPreRegAlloc() {
// FIXME: temporarily disabling load / store optimization pass for Thumb1.
if (getOptLevel() != CodeGenOpt::None && !getARMSubtarget().isThumb1Only())
addPass(createARMLoadStoreOptimizationPass(true));
- if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isCortexA9())
+ if (getOptLevel() != CodeGenOpt::None && getARMSubtarget().isLikeA9())
addPass(createMLxExpansionPass());
return true;
}
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index e1e2f6ea73..bc711dc35f 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -257,6 +257,10 @@ public:
SmallVectorImpl<MCParsedAsmOperand*> &Operands);
bool ParseDirective(AsmToken DirectiveID);
+ bool mnemonicIsValid(StringRef Mnemonic) {
+ return mnemonicIsValidImpl(Mnemonic);
+ }
+
unsigned checkTargetMatchPredicate(MCInst &Inst);
bool MatchAndEmitInstruction(SMLoc IDLoc,
@@ -487,7 +491,8 @@ public:
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const { return EndLoc; }
-
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
ARMCC::CondCodes getCondCode() const {
@@ -863,7 +868,7 @@ public:
bool isSPRRegList() const { return Kind == k_SPRRegisterList; }
bool isToken() const { return Kind == k_Token; }
bool isMemBarrierOpt() const { return Kind == k_MemBarrierOpt; }
- bool isMemory() const { return Kind == k_Memory; }
+ bool isMem() const { return Kind == k_Memory; }
bool isShifterImm() const { return Kind == k_ShifterImmediate; }
bool isRegShiftedReg() const { return Kind == k_ShiftedRegister; }
bool isRegShiftedImm() const { return Kind == k_ShiftedImmediate; }
@@ -874,14 +879,14 @@ public:
return Kind == k_PostIndexRegister && PostIdxReg.ShiftTy ==ARM_AM::no_shift;
}
bool isMemNoOffset(bool alignOK = false) const {
- if (!isMemory())
+ if (!isMem())
return false;
// No offset of any kind.
return Memory.OffsetRegNum == 0 && Memory.OffsetImm == 0 &&
(alignOK || Memory.Alignment == 0);
}
bool isMemPCRelImm12() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base register must be PC.
if (Memory.BaseRegNum != ARM::PC)
@@ -895,7 +900,7 @@ public:
return isMemNoOffset(true);
}
bool isAddrMode2() const {
- if (!isMemory() || Memory.Alignment != 0) return false;
+ if (!isMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return true;
// Immediate offset in range [-4095, 4095].
@@ -917,7 +922,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.Alignment != 0) return false;
+ if (!isMem() || Memory.Alignment != 0) return false;
// No shifts are legal for AM3.
if (Memory.ShiftType != ARM_AM::no_shift) return false;
// Check for register offset.
@@ -947,7 +952,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.Alignment != 0) return false;
+ if (!isMem() || Memory.Alignment != 0) return false;
// Check for register offset.
if (Memory.OffsetRegNum) return false;
// Immediate offset in range [-1020, 1020] and a multiple of 4.
@@ -957,25 +962,25 @@ public:
Val == INT32_MIN;
}
bool isMemTBB() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
return false;
return true;
}
bool isMemTBH() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::lsl || Memory.ShiftImm != 1 ||
Memory.Alignment != 0 )
return false;
return true;
}
bool isMemRegOffset() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.Alignment != 0)
+ if (!isMem() || !Memory.OffsetRegNum || Memory.Alignment != 0)
return false;
return true;
}
bool isT2MemRegOffset() const {
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.Alignment != 0)
return false;
// Only lsl #{0, 1, 2, 3} allowed.
@@ -988,14 +993,14 @@ public:
bool isMemThumbRR() const {
// Thumb reg+reg addressing is simple. Just two registers, a base and
// an offset. No shifts, negations or any other complicating factors.
- if (!isMemory() || !Memory.OffsetRegNum || Memory.isNegative ||
+ if (!isMem() || !Memory.OffsetRegNum || Memory.isNegative ||
Memory.ShiftType != ARM_AM::no_shift || Memory.Alignment != 0)
return false;
return isARMLowRegister(Memory.BaseRegNum) &&
(!Memory.OffsetRegNum || isARMLowRegister(Memory.OffsetRegNum));
}
bool isMemThumbRIs4() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 124].
@@ -1004,7 +1009,7 @@ public:
return Val >= 0 && Val <= 124 && (Val % 4) == 0;
}
bool isMemThumbRIs2() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 62].
@@ -1013,7 +1018,7 @@ public:
return Val >= 0 && Val <= 62 && (Val % 2) == 0;
}
bool isMemThumbRIs1() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
!isARMLowRegister(Memory.BaseRegNum) || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 31].
@@ -1022,7 +1027,7 @@ public:
return Val >= 0 && Val <= 31;
}
bool isMemThumbSPI() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 ||
+ if (!isMem() || Memory.OffsetRegNum != 0 ||
Memory.BaseRegNum != ARM::SP || Memory.Alignment != 0)
return false;
// Immediate offset, multiple of 4 in range [0, 1020].
@@ -1036,7 +1041,7 @@ public:
// and we reject it.
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [-1020, 1020].
if (!Memory.OffsetImm) return true;
@@ -1045,7 +1050,7 @@ public:
return (Val >= -1020 && Val <= 1020 && (Val & 3) == 0) || Val == INT32_MIN;
}
bool isMemImm0_1020s4Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset a multiple of 4 in range [0, 1020].
if (!Memory.OffsetImm) return true;
@@ -1053,7 +1058,7 @@ public:
return Val >= 0 && Val <= 1020 && (Val & 3) == 0;
}
bool isMemImm8Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1063,7 +1068,7 @@ public:
return (Val == INT32_MIN) || (Val > -256 && Val < 256);
}
bool isMemPosImm8Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 255].
if (!Memory.OffsetImm) return true;
@@ -1071,7 +1076,7 @@ public:
return Val >= 0 && Val < 256;
}
bool isMemNegImm8Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Base reg of PC isn't allowed for these encodings.
if (Memory.BaseRegNum == ARM::PC) return false;
@@ -1081,7 +1086,7 @@ public:
return (Val == INT32_MIN) || (Val > -256 && Val < 0);
}
bool isMemUImm12Offset() const {
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [0, 4095].
if (!Memory.OffsetImm) return true;
@@ -1095,7 +1100,7 @@ public:
if (isImm() && !isa<MCConstantExpr>(getImm()))
return true;
- if (!isMemory() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
+ if (!isMem() || Memory.OffsetRegNum != 0 || Memory.Alignment != 0)
return false;
// Immediate offset in range [-4095, 4095].
if (!Memory.OffsetImm) return true;
@@ -4439,6 +4444,12 @@ bool ARMAsmParser::parseMemRegOffsetShift(ARM_AM::ShiftOpc &St,
((St == ARM_AM::lsl || St == ARM_AM::ror) && Imm > 31) ||
((St == ARM_AM::lsr || St == ARM_AM::asr) && Imm > 32))
return Error(Loc, "immediate shift value out of range");
+ // If <ShiftTy> #0, turn it into a no_shift.
+ if (Imm == 0)
+ St = ARM_AM::lsl;
+ // For consistency, treat lsr #32 and asr #32 as having immediate value 0.
+ if (Imm == 32)
+ Imm = 0;
Amount = Imm;
}
@@ -4616,7 +4627,7 @@ bool ARMAsmParser::parseOperand(SmallVectorImpl<MCParsedAsmOperand*> &Operands,
return true;
const MCExpr *ExprVal = ARMMCExpr::Create(RefKind, SubExprVal,
- getContext());
+ getContext());
E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(ARMOperand::CreateImm(ExprVal, S, E));
return false;
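
The parser change above canonicalizes degenerate shift syntax: "<shift> #0" is folded into a plain lsl (i.e. no shift), and "lsr #32"/"asr #32" are stored with a zero immediate. A minimal standalone C++ sketch of that rule, with hypothetical names rather than the LLVM types:

#include <cassert>
#include <cstdio>

enum ShiftOpc { lsl, lsr, asr, ror };

// Returns true on an out-of-range shift; otherwise canonicalizes in place.
static bool canonicalizeShift(ShiftOpc &St, unsigned &Imm) {
  if (((St == lsl || St == ror) && Imm > 31) ||
      ((St == lsr || St == asr) && Imm > 32))
    return true;
  if (Imm == 0)
    St = lsl;  // "<shift> #0" is no shift at all
  if (Imm == 32)
    Imm = 0;   // lsr #32 and asr #32 are represented with a 0 immediate
  return false;
}

int main() {
  ShiftOpc St = asr;
  unsigned Imm = 32;
  assert(!canonicalizeShift(St, Imm) && St == asr && Imm == 0);
  St = ror; Imm = 0;
  assert(!canonicalizeShift(St, Imm) && St == lsl);
  std::puts("shift canonicalization ok");
  return 0;
}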
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 57642e1924..bf0dabb4a0 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -1523,6 +1523,8 @@ DecodeAddrMode2IdxInstruction(MCInst &Inst, unsigned Insn,
return MCDisassembler::Fail;
}
unsigned amt = fieldFromInstruction(Insn, 7, 5);
+ if (Opc == ARM_AM::ror && amt == 0)
+ Opc = ARM_AM::rrx;
unsigned imm = ARM_AM::getAM2Opc(Op, amt, Opc, idx_mode);
Inst.addOperand(MCOperand::CreateImm(imm));
@@ -1564,6 +1566,9 @@ static DecodeStatus DecodeSORegMemOperand(MCInst &Inst, unsigned Val,
break;
}
+ if (ShOp == ARM_AM::ror && imm == 0)
+ ShOp = ARM_AM::rrx;
+
if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder)))
return MCDisassembler::Fail;
if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder)))
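
The decoder side applies the inverse convention: in these addressing modes a rotate field of zero does not mean "ror #0" but encodes rrx, so the opcode is rewritten before the operand is built. A standalone sketch of that rule (hypothetical names, not the LLVM enums):

#include <cassert>

enum ShiftOpc { lsl, lsr, asr, ror, rrx };

// In ARM addressing-mode encodings, ror with a zero amount means rrx.
static ShiftOpc decodeShiftOpc(ShiftOpc Opc, unsigned Amt) {
  if (Opc == ror && Amt == 0)
    return rrx;
  return Opc;
}

int main() {
  assert(decodeShiftOpc(ror, 0) == rrx);
  assert(decodeShiftOpc(ror, 3) == ror);
  assert(decodeShiftOpc(lsl, 0) == lsl);
  return 0;
}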
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 9a794c34af..0e16027770 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -29,11 +29,27 @@ using namespace llvm;
///
/// getSORegOffset returns an integer from 0-31, representing '32' as 0.
static unsigned translateShiftImm(unsigned imm) {
+ // lsr #32 and asr #32 exist, but should be encoded as a 0.
+ assert((imm & ~0x1f) == 0 && "Invalid shift encoding");
+
if (imm == 0)
return 32;
return imm;
}
+/// Print the shift opcode and its immediate amount, e.g. ", lsl #4" or ", rrx".
+static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc,
+ unsigned ShImm) {
+ if (ShOpc == ARM_AM::no_shift || (ShOpc == ARM_AM::lsl && !ShImm))
+ return;
+ O << ", ";
+
+ assert(!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0");
+ O << getShiftOpcStr(ShOpc);
+
+ if (ShOpc != ARM_AM::rrx)
+ O << " #" << translateShiftImm(ShImm);
+}
ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI,
const MCInstrInfo &MII,
@@ -351,11 +367,8 @@ void ARMInstPrinter::printSORegImmOperand(const MCInst *MI, unsigned OpNum,
O << getRegisterName(MO1.getReg());
// Print the shift opc.
- ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
- O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
- if (ShOpc == ARM_AM::rrx)
- return;
- O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+ printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+ ARM_AM::getSORegOffset(MO2.getImm()));
}
@@ -384,38 +397,11 @@ void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
<< ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
<< getRegisterName(MO2.getReg());
- if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
- O << ", "
- << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
- << " #" << ShImm;
+ printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()),
+ ARM_AM::getAM2Offset(MO3.getImm()));
O << "]";
}
-void ARMInstPrinter::printAM2PostIndexOp(const MCInst *MI, unsigned Op,
- raw_ostream &O) {
- const MCOperand &MO1 = MI->getOperand(Op);
- const MCOperand &MO2 = MI->getOperand(Op+1);
- const MCOperand &MO3 = MI->getOperand(Op+2);
-
- O << "[" << getRegisterName(MO1.getReg()) << "], ";
-
- if (!MO2.getReg()) {
- unsigned ImmOffs = ARM_AM::getAM2Offset(MO3.getImm());
- O << '#'
- << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
- << ImmOffs;
- return;
- }
-
- O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm()))
- << getRegisterName(MO2.getReg());
-
- if (unsigned ShImm = ARM_AM::getAM2Offset(MO3.getImm()))
- O << ", "
- << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO3.getImm()))
- << " #" << ShImm;
-}
-
void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op,
raw_ostream &O) {
const MCOperand &MO1 = MI->getOperand(Op);
@@ -441,13 +427,13 @@ void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op,
return;
}
+#ifndef NDEBUG
const MCOperand &MO3 = MI->getOperand(Op+2);
unsigned IdxMode = ARM_AM::getAM2IdxMode(MO3.getImm());
+ assert(IdxMode != ARMII::IndexModePost &&
+ "Should be pre or offset index op");
+#endif
- if (IdxMode == ARMII::IndexModePost) {
- printAM2PostIndexOp(MI, Op, O);
- return;
- }
printAM2PreOrOffsetIndexOp(MI, Op, O);
}
@@ -468,10 +454,8 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI,
O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm()))
<< getRegisterName(MO1.getReg());
- if (unsigned ShImm = ARM_AM::getAM2Offset(MO2.getImm()))
- O << ", "
- << ARM_AM::getShiftOpcStr(ARM_AM::getAM2ShiftOpc(MO2.getImm()))
- << " #" << ShImm;
+ printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO2.getImm()),
+ ARM_AM::getAM2Offset(MO2.getImm()));
}
//===--------------------------------------------------------------------===//
@@ -512,10 +496,10 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op,
return;
}
- //If the op is sub we have to print the immediate even if it is 0
+ //If the op is sub we have to print the immediate even if it is 0
unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm());
ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm());
-
+
if (ImmOffs || (op == ARM_AM::sub))
O << ", #"
<< ARM_AM::getAddrOpcStr(op)
@@ -975,10 +959,8 @@ void ARMInstPrinter::printT2SOOperand(const MCInst *MI, unsigned OpNum,
// Print the shift opc.
assert(MO2.isImm() && "Not a valid t2_so_reg value!");
- ARM_AM::ShiftOpc ShOpc = ARM_AM::getSORegShOp(MO2.getImm());
- O << ", " << ARM_AM::getShiftOpcStr(ShOpc);
- if (ShOpc != ARM_AM::rrx)
- O << " #" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm()));
+ printRegImmShift(O, ARM_AM::getSORegShOp(MO2.getImm()),
+ ARM_AM::getSORegOffset(MO2.getImm()));
}
void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum,
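
The new printRegImmShift helper centralizes how a register-immediate shift suffix is rendered: nothing for "no shift"/"lsl #0", ", rrx" with no immediate, and ", <opc> #N" otherwise, where a zero immediate prints as #32 for lsr/asr. A self-contained sketch of that behavior, assuming hypothetical names and returning a string instead of writing to a raw_ostream:

#include <cassert>
#include <string>

enum ShiftOpc { no_shift, lsl, lsr, asr, ror, rrx };

static const char *opcStr(ShiftOpc S) {
  switch (S) {
  case lsl: return "lsl"; case lsr: return "lsr";
  case asr: return "asr"; case ror: return "ror";
  case rrx: return "rrx"; default:  return "";
  }
}

static std::string regImmShift(ShiftOpc ShOpc, unsigned ShImm) {
  if (ShOpc == no_shift || (ShOpc == lsl && !ShImm))
    return "";                       // lsl #0 is not printed
  assert(!(ShOpc == ror && !ShImm) && "Cannot have ror #0");
  std::string Out = ", ";
  Out += opcStr(ShOpc);
  if (ShOpc != rrx)                  // rrx takes no immediate
    Out += " #" + std::to_string(ShImm == 0 ? 32u : ShImm);
  return Out;
}

int main() {
  assert(regImmShift(lsl, 0).empty());
  assert(regImmShift(rrx, 0) == ", rrx");
  assert(regImmShift(lsr, 0) == ", lsr #32");
  assert(regImmShift(asr, 7) == ", asr #7");
  return 0;
}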
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index d8d8d53a57..34c79f945f 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -701,7 +701,7 @@ void DarwinARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
} // end anonymous namespace
-MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin()) {
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 1917564904..d0e127a8f3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -35,8 +35,8 @@ STATISTIC(MCNumCPRelocations, "Number of constant pool relocations created.");
namespace {
class ARMMCCodeEmitter : public MCCodeEmitter {
- ARMMCCodeEmitter(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const ARMMCCodeEmitter &); // DO NOT IMPLEMENT
+ ARMMCCodeEmitter(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const ARMMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
const MCSubtargetInfo &STI;
const MCContext &CTX;
@@ -934,6 +934,10 @@ getLdStSORegOpValue(const MCInst &MI, unsigned OpIdx,
ARM_AM::ShiftOpc ShOp = ARM_AM::getAM2ShiftOpc(MO2.getImm());
unsigned SBits = getShiftOp(ShOp);
+ // While "lsr #32" and "asr #32" exist, they are encoded with a 0 in the shift
+ // amount. However, it would be an easy mistake to make so check here.
+ assert((ShImm & ~0x1f) == 0 && "Out of range shift amount");
+
// {16-13} = Rn
// {12} = isAdd
// {11-0} = shifter
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
index a727e087d2..6bddc42b37 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCExpr.h
@@ -28,7 +28,7 @@ private:
explicit ARMMCExpr(VariantKind _Kind, const MCExpr *_Expr)
: Kind(_Kind), Expr(_Expr) {}
-
+
public:
/// @name Construction
/// @{
@@ -67,7 +67,7 @@ public:
static bool classof(const MCExpr *E) {
return E->getKind() == MCExpr::Target;
}
-
+
static bool classof(const ARMMCExpr *) { return true; }
};
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 510302d4b3..a89981e4f0 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -46,7 +46,7 @@ MCCodeEmitter *createARMMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createARMAsmBackend(const Target &T, StringRef TT, StringRef CPU);
/// createARMELFObjectWriter - Construct an ELF object writer.
MCObjectWriter *createARMELFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/ARM/MLxExpansionPass.cpp b/lib/Target/ARM/MLxExpansionPass.cpp
index ad60e3282e..4ebba0e4d3 100644
--- a/lib/Target/ARM/MLxExpansionPass.cpp
+++ b/lib/Target/ARM/MLxExpansionPass.cpp
@@ -51,7 +51,7 @@ namespace {
const TargetRegisterInfo *TRI;
MachineRegisterInfo *MRI;
- bool isA9;
+ bool isLikeA9;
unsigned MIIdx;
MachineInstr* LastMIs[4];
SmallPtrSet<MachineInstr*, 4> IgnoreStall;
@@ -179,8 +179,8 @@ bool MLxExpansion::FindMLxHazard(MachineInstr *MI) {
// preserves the in-order retirement of the instructions.
// Look at the next few instructions, if *most* of them can cause hazards,
// then the scheduler can't *fix* this, we'd better break up the VMLA.
- unsigned Limit1 = isA9 ? 1 : 4;
- unsigned Limit2 = isA9 ? 1 : 4;
+ unsigned Limit1 = isLikeA9 ? 1 : 4;
+ unsigned Limit2 = isLikeA9 ? 1 : 4;
for (unsigned i = 1; i <= 4; ++i) {
int Idx = ((int)MIIdx - i + 4) % 4;
MachineInstr *NextMI = LastMIs[Idx];
@@ -316,7 +316,7 @@ bool MLxExpansion::runOnMachineFunction(MachineFunction &Fn) {
TRI = Fn.getTarget().getRegisterInfo();
MRI = &Fn.getRegInfo();
const ARMSubtarget *STI = &Fn.getTarget().getSubtarget<ARMSubtarget>();
- isA9 = STI->isCortexA9();
+ isLikeA9 = STI->isLikeA9();
bool Modified = false;
for (MachineFunction::iterator MFI = Fn.begin(), E = Fn.end(); MFI != E;
diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
index 425371d3e1..5d5061054b 100644
--- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
+++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp
@@ -67,8 +67,8 @@ namespace {
//! ConstantSDNode predicate for signed 16-bit values
/*!
- \arg CN The constant SelectionDAG node holding the value
- \arg Imm The returned 16-bit value, if returning true
+ \param CN The constant SelectionDAG node holding the value
+ \param Imm The returned 16-bit value, if returning true
This predicate tests the value in \a CN to see whether it can be
represented as a 16-bit, sign-extended quantity. Returns true if
@@ -305,10 +305,10 @@ namespace {
}
/*!
- \arg Op The ISD instruction operand
- \arg N The address to be tested
- \arg Base The base address
- \arg Index The base address index
+ \param Op The ISD instruction operand
+ \param N The address to be tested
+ \param Base The base address
+ \param Index The base address index
*/
bool
SPUDAGToDAGISel::SelectAFormAddr(SDNode *Op, SDValue N, SDValue &Base,
@@ -375,10 +375,10 @@ SPUDAGToDAGISel::SelectDForm2Addr(SDNode *Op, SDValue N, SDValue &Disp,
}
/*!
- \arg Op The ISD instruction (ignored)
- \arg N The address to be tested
- \arg Base Base address register/pointer
- \arg Index Base address index
+ \param Op The ISD instruction (ignored)
+ \param N The address to be tested
+ \param Base Base address register/pointer
+ \param Index Base address index
Examine the input address by a base register plus a signed 10-bit
displacement, [r+I10] (D-form address).
@@ -541,10 +541,10 @@ SPUDAGToDAGISel::DFormAddressPredicate(SDNode *Op, SDValue N, SDValue &Base,
}
/*!
- \arg Op The ISD instruction operand
- \arg N The address operand
- \arg Base The base pointer operand
- \arg Index The offset/index operand
+ \param Op The ISD instruction operand
+ \param N The address operand
+ \param Base The base pointer operand
+ \param Index The offset/index operand
If the address \a N can be expressed as an A-form or D-form address, returns
false. Otherwise, creates two operands, Base and Index that will become the
@@ -569,7 +569,7 @@ SPUDAGToDAGISel::SelectXFormAddr(SDNode *Op, SDValue N, SDValue &Base,
Utility function to use with COPY_TO_REGCLASS instructions. Returns a SDValue
to be used as the last parameter of a
CurDAG->getMachineNode(COPY_TO_REGCLASS,..., ) function call
- \arg VT the value type for which we want a register class
+ \param VT the value type for which we want a register class
*/
SDValue SPUDAGToDAGISel::getRC( MVT VT ) {
switch( VT.SimpleTy ) {
diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp
index 4ddcd38aca..444da2bb21 100644
--- a/lib/Target/CppBackend/CPPBackend.cpp
+++ b/lib/Target/CppBackend/CPPBackend.cpp
@@ -508,7 +508,7 @@ void CppWriter::printAttributes(const AttrListPtr &PAL,
#undef HANDLE_ATTR
if (attrs & Attribute::StackAlignment)
Out << " | Attribute::constructStackAlignmentFromInt("
- << Attribute::getStackAlignmentFromAttrs(attrs)
+ << attrs.getStackAlignment()
<< ")";
attrs &= ~Attribute::StackAlignment;
assert(attrs == 0 && "Unhandled attribute!");
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 838f7b5ed7..1db6f5ee61 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -20,6 +20,22 @@
using namespace llvm;
+/// Platform-specific modifications to the DAG.
+void VLIWMachineScheduler::postprocessDAG() {
+ SUnit* LastSequentialCall = NULL;
+ // Currently we only catch the situation when a compare gets scheduled
+ // before a preceding call.
+ for (unsigned su = 0, e = SUnits.size(); su != e; ++su) {
+ // Remember the call.
+ if (SUnits[su].getInstr()->isCall())
+ LastSequentialCall = &(SUnits[su]);
+ // Look for a compare that defines a predicate.
+ else if (SUnits[su].getInstr()->isCompare() && LastSequentialCall)
+ SUnits[su].addPred(SDep(LastSequentialCall, SDep::Order, 0, /*Reg=*/0,
+ false));
+ }
+}
+
/// Check if scheduling of this SU is possible
/// in the current packet.
/// It is _not_ precise (stateful), it is more like
@@ -67,6 +83,13 @@ bool VLIWResourceModel::isResourceAvailable(SUnit *SU) {
/// Keep track of available resources.
bool VLIWResourceModel::reserveResources(SUnit *SU) {
bool startNewCycle = false;
+ // Artificially reset state.
+ if (!SU) {
+ ResourcesModel->clearResources();
+ Packet.clear();
+ TotalPackets++;
+ return false;
+ }
// If this SU does not fit in the packet
// start a new one.
if (!isResourceAvailable(SU)) {
@@ -123,11 +146,14 @@ void VLIWMachineScheduler::schedule() {
<< "********** MI Converging Scheduling VLIW BB#" << BB->getNumber()
<< " " << BB->getName()
<< " in_func " << BB->getParent()->getFunction()->getName()
- << " at loop depth " << MLI->getLoopDepth(BB)
+ << " at loop depth " << MLI.getLoopDepth(BB)
<< " \n");
buildDAGWithRegPressure();
+ // Postprocess the DAG to add platform specific artificial dependencies.
+ postprocessDAG();
+
// To view Height/Depth correctly, they should be accessed at least once.
DEBUG(unsigned maxH = 0;
for (unsigned su = 0, e = SUnits.size(); su != e; ++su)
@@ -354,6 +380,7 @@ SUnit *ConvergingVLIWScheduler::SchedBoundary::pickOnlyChoice() {
for (unsigned i = 0; Available.empty(); ++i) {
assert(i <= (HazardRec->getMaxLookAhead() + MaxMinLatency) &&
"permanent hazard"); (void)i;
+ ResourceModel->reserveResources(0);
bumpCycle();
releasePending();
}
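
The null-SU convention introduced above turns reserveResources into a dual-purpose call: a real unit is packed normally, while a null pointer is an artificial reset that closes the current VLIW packet (used when pickOnlyChoice has to burn a stall cycle). A simplified standalone model of that contract, with assumed names and a fixed issue width standing in for the real resource tables:

#include <cassert>
#include <vector>

struct Unit { int Id; };

struct VLIWResourceModel {
  std::vector<const Unit *> Packet;
  unsigned TotalPackets = 0;
  unsigned IssueWidth = 4;  // assumed packet capacity

  // Returns true when scheduling SU started a new packet.
  bool reserveResources(const Unit *SU) {
    if (!SU) {              // artificial reset on a stall cycle
      Packet.clear();
      ++TotalPackets;
      return false;
    }
    bool StartNew = Packet.size() >= IssueWidth;
    if (StartNew) {
      Packet.clear();
      ++TotalPackets;
    }
    Packet.push_back(SU);
    return StartNew;
  }
};

int main() {
  VLIWResourceModel RM;
  Unit A{0}, B{1};
  RM.reserveResources(&A);
  RM.reserveResources(nullptr);           // stall: packet is flushed
  assert(RM.TotalPackets == 1 && RM.Packet.empty());
  RM.reserveResources(&B);                // next packet starts fresh
  assert(RM.Packet.size() == 1);
  return 0;
}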
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.h b/lib/Target/Hexagon/HexagonMachineScheduler.h
index 212cc9800b..5b6f226a00 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.h
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.h
@@ -107,14 +107,15 @@ public:
/// Extend the standard ScheduleDAGMI to provide more context and override the
/// top-level schedule() driver.
class VLIWMachineScheduler : public ScheduleDAGMI {
- const MachineLoopInfo *MLI;
public:
VLIWMachineScheduler(MachineSchedContext *C, MachineSchedStrategy *S):
- ScheduleDAGMI(C, S), MLI(C->MLI) {}
+ ScheduleDAGMI(C, S) {}
/// Schedule - This is called back from ScheduleDAGInstrs::Run() when it's
/// time to do some work.
virtual void schedule();
+ /// Perform platform-specific DAG postprocessing.
+ void postprocessDAG();
};
/// ConvergingVLIWScheduler shrinks the unscheduled zone using heuristics
@@ -223,6 +224,11 @@ public:
virtual void releaseBottomNode(SUnit *SU);
+ unsigned ReportPackets() {
+ return Top.ResourceModel->getTotalPackets() +
+ Bot.ResourceModel->getTotalPackets();
+ }
+
protected:
SUnit *pickNodeBidrectional(bool &IsTopNode);
diff --git a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
index 40594030e5..daceb88076 100644
--- a/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
+++ b/lib/Target/MBlaze/AsmParser/MBlazeAsmParser.cpp
@@ -44,6 +44,10 @@ class MBlazeAsmParser : public MCTargetAsmParser {
bool ParseDirectiveWord(unsigned Size, SMLoc L);
+ bool mnemonicIsValid(StringRef Mnemonic) {
+ return mnemonicIsValidImpl(Mnemonic);
+ }
+
bool MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out);
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
index f383fecdc2..44feeb49e7 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeAsmBackend.cpp
@@ -156,7 +156,8 @@ void ELFMBlazeAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
}
} // end anonymous namespace
-MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMBlazeAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin())
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
index bfd11a070b..2b71d9d3c8 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCCodeEmitter.cpp
@@ -29,8 +29,8 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
class MBlazeMCCodeEmitter : public MCCodeEmitter {
- MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const MBlazeMCCodeEmitter &); // DO NOT IMPLEMENT
+ MBlazeMCCodeEmitter(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MBlazeMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
public:
diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
index 7cc96c62c8..7bc7d8f207 100644
--- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
+++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.h
@@ -35,7 +35,8 @@ MCCodeEmitter *createMBlazeMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createMBlazeAsmBackend(const Target &T, StringRef TT,
+ StringRef CPU);
MCObjectWriter *createMBlazeELFObjectWriter(raw_ostream &OS, uint8_t OSABI);
} // End llvm namespace
diff --git a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 5430d433b6..5efc6a36b8 100644
--- a/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -274,8 +274,8 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
else if (AM.JT != -1)
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i16, 0/*AM.SymbolFlags*/);
else if (AM.BlockAddr)
- Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
- true, 0/*AM.SymbolFlags*/);
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, 0,
+ 0/*AM.SymbolFlags*/);
else
Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i16);
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index f8b7e149f0..bda688c31d 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -655,7 +655,7 @@ SDValue MSP430TargetLowering::LowerBlockAddress(SDValue Op,
SelectionDAG &DAG) const {
DebugLoc dl = Op.getDebugLoc();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), /*isTarget=*/true);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
return DAG.getNode(MSP430ISD::Wrapper, dl, getPointerTy(), Result);
}
diff --git a/lib/Target/Mangler.cpp b/lib/Target/Mangler.cpp
index 05f6fa64b9..7fe0d0eb22 100644
--- a/lib/Target/Mangler.cpp
+++ b/lib/Target/Mangler.cpp
@@ -44,7 +44,7 @@ static void MangleLetter(SmallVectorImpl<char> &OutName, unsigned char C) {
OutName.push_back('_');
}
-/// NameNeedsEscaping - Return true if the identifier \arg Str needs quotes
+/// NameNeedsEscaping - Return true if the identifier \p Str needs quotes
/// for this assembler.
static bool NameNeedsEscaping(StringRef Str, const MCAsmInfo &MAI) {
assert(!Str.empty() && "Cannot create an empty MCSymbol");
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index 8418b7542f..4cbd4c8e12 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -41,6 +41,10 @@ class MipsAsmParser : public MCTargetAsmParser {
#define GET_ASSEMBLER_HEADER
#include "MipsGenAsmMatcher.inc"
+ bool mnemonicIsValid(StringRef Mnemonic) {
+ return mnemonicIsValidImpl(Mnemonic);
+ }
+
bool MatchAndEmitInstruction(SMLoc IDLoc,
SmallVectorImpl<MCParsedAsmOperand*> &Operands,
MCStreamer &Out);
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 61e22d63cb..9a94c75e2f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -93,7 +93,7 @@ public:
MCELFObjectTargetWriter::getOSABI(OSType), IsLittle, Is64Bit);
}
- /// ApplyFixup - Apply the \arg Value for given \arg Fixup into the provided
+ /// ApplyFixup - Apply the \p Value for the given \p Fixup into the provided
/// data fragment, at the offset specified by the fixup and following the
/// fixup kind as appropriate.
void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
@@ -218,7 +218,7 @@ public:
///
/// \param Inst - The instruction to relax, which may be the same
/// as the output.
- /// \parm Res [output] - On return, the relaxed instruction.
+ /// \param [out] Res On return, the relaxed instruction.
void relaxInstruction(const MCInst &Inst, MCInst &Res) const {
}
@@ -259,22 +259,26 @@ public:
} // namespace
// MCAsmBackend
-MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/true, /*Is64Bit*/false);
}
-MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEB32(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/false);
}
-MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEL64(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/true, /*Is64Bit*/true);
}
-MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createMipsAsmBackendEB64(const Target &T, StringRef TT,
+ StringRef CPU) {
return new MipsAsmBackend(T, Triple(TT).getOS(),
/*IsLittle*/false, /*Is64Bit*/true);
}
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 1d7370a04f..f3a8d3fb0d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -29,17 +29,14 @@ using namespace llvm;
namespace {
class MipsMCCodeEmitter : public MCCodeEmitter {
- MipsMCCodeEmitter(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const MipsMCCodeEmitter &); // DO NOT IMPLEMENT
+ MipsMCCodeEmitter(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const MipsMCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
- const MCSubtargetInfo &STI;
- MCContext &Ctx;
bool IsLittleEndian;
public:
- MipsMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
- MCContext &ctx, bool IsLittle) :
- MCII(mcii), STI(sti) , Ctx(ctx), IsLittleEndian(IsLittle) {}
+ MipsMCCodeEmitter(const MCInstrInfo &mcii, bool IsLittle) :
+ MCII(mcii), IsLittleEndian(IsLittle) {}
~MipsMCCodeEmitter() {}
@@ -95,7 +92,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx)
{
- return new MipsMCCodeEmitter(MCII, STI, Ctx, false);
+ return new MipsMCCodeEmitter(MCII, false);
}
MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
@@ -103,7 +100,7 @@ MCCodeEmitter *llvm::createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx)
{
- return new MipsMCCodeEmitter(MCII, STI, Ctx, true);
+ return new MipsMCCodeEmitter(MCII, true);
}
/// EncodeInstruction - Emit the instruction.
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index bfcc2a2e4a..71954a4bd8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -42,10 +42,14 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT);
-MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT);
-MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT);
-MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT);
+MCAsmBackend *createMipsAsmBackendEB32(const Target &T, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL32(const Target &T, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEB64(const Target &T, StringRef TT,
+ StringRef CPU);
+MCAsmBackend *createMipsAsmBackendEL64(const Target &T, StringRef TT,
+ StringRef CPU);
MCObjectWriter *createMipsELFObjectWriter(raw_ostream &OS,
uint8_t OSABI,
diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td
index 7cec531f9a..90c01d5de0 100644
--- a/lib/Target/Mips/Mips.td
+++ b/lib/Target/Mips/Mips.td
@@ -77,6 +77,10 @@ def FeatureMips64r2 : SubtargetFeature<"mips64r2", "MipsArchVersion",
def FeatureMips16 : SubtargetFeature<"mips16", "InMips16Mode", "true",
"Mips16 mode">;
+def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", "Mips DSP ASE">;
+def FeatureDSPR2 : SubtargetFeature<"dspr2", "HasDSPR2", "true",
+ "Mips DSP-R2 ASE", [FeatureDSP]>;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Mips/Mips16FrameLowering.cpp b/lib/Target/Mips/Mips16FrameLowering.cpp
index 030042f2e8..761581fb2b 100644
--- a/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -66,7 +66,41 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
- // FIXME: implement.
+ MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *EntryBlock = MF->begin();
+
+ //
+ // Registers RA, S0 and S1 are the callee-saved registers and they
+ // will be saved with the "save" instruction
+ // during emitPrologue.
+ //
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ // Add the callee-saved register as live-in. Do not add if the register is
+ // RA and return address is taken, because it has already been added in
+ // method MipsTargetLowering::LowerRETURNADDR.
+ // It's killed at the spill, unless the register is RA and return address
+ // is taken.
+ unsigned Reg = CSI[i].getReg();
+ bool IsRAAndRetAddrIsTaken = (Reg == Mips::RA)
+ && MF->getFrameInfo()->isReturnAddressTaken();
+ if (!IsRAAndRetAddrIsTaken)
+ EntryBlock->addLiveIn(Reg);
+ }
+
+ return true;
+}
+
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+ //
+ // Registers RA, S0 and S1 are the callee-saved registers and they will be
+ // restored with the "restore" instruction during emitEpilogue.
+ // We need to override this virtual function, otherwise LLVM would try to
+ // restore the registers on its own from the stack.
+ //
+
return true;
}
@@ -79,6 +113,9 @@ Mips16FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void Mips16FrameLowering::
processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
RegScavenger *RS) const {
+ MF.getRegInfo().setPhysRegUsed(Mips::RA);
+ MF.getRegInfo().setPhysRegUsed(Mips::S0);
+ MF.getRegInfo().setPhysRegUsed(Mips::S1);
}
const MipsFrameLowering *
diff --git a/lib/Target/Mips/Mips16FrameLowering.h b/lib/Target/Mips/Mips16FrameLowering.h
index 25cc37b815..01db71e8de 100644
--- a/lib/Target/Mips/Mips16FrameLowering.h
+++ b/lib/Target/Mips/Mips16FrameLowering.h
@@ -32,6 +32,11 @@ public:
const std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const;
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
bool hasReservedCallFrame(const MachineFunction &MF) const;
void processFunctionBeforeCalleeSavedScan(MachineFunction &MF,
diff --git a/lib/Target/Mips/Mips16InstrInfo.cpp b/lib/Target/Mips/Mips16InstrInfo.cpp
index ec84ad81f5..9248032340 100644
--- a/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -62,7 +62,7 @@ void Mips16InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (Mips::CPURegsRegClass.contains(DestReg)) { // Copy to CPU Reg.
if (Mips::CPURegsRegClass.contains(SrcReg))
- Opc = Mips::Mov32R16;
+ Opc = Mips::Move32R16;
}
assert(Opc && "Cannot copy registers");
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 94cf984769..b0ab464a68 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -20,6 +20,13 @@ class FRRR16_ins<bits<2> _f, string asmstr, InstrItinClass itin> :
!strconcat(asmstr, "\t$rz, $rx, $ry"), [], itin>;
//
+// I8_MOVR32 instruction format (used only by the MOVR32 instruction)
+//
+class FI8_MOVR3216_ins<string asmstr, InstrItinClass itin>:
+ FI8_MOVR3216<(outs CPU16Regs:$rz), (ins CPURegs:$r32),
+ !strconcat(asmstr, "\t$rz, $r32"), [], itin>;
+
+//
// I8_MOV32R instruction format (used only by MOV32R instruction)
//
class FI8_MOV32R16_ins<string asmstr, InstrItinClass itin>:
@@ -204,7 +211,14 @@ def LwRxRyOffMemX16: FEXT_RRI16_mem_ins<0b10011, "lw", mem16, IIAlu>;
// Purpose: Move
// To move the contents of a GPR to a GPR.
//
-def Mov32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+def Move32R16: FI8_MOV32R16_ins<"move", IIAlu>;
+
+//
+// Format: MOVE ry, r32 MIPS16e
+// Purpose: Move
+// To move the contents of a GPR to a GPR.
+//
+def MoveR3216: FI8_MOVR3216_ins<"move", IIAlu>;
//
// Format: NEG rx, ry MIPS16e
@@ -240,10 +254,12 @@ def OrRxRxRy16: FRxRxRy16_ins<0b01101, "or", IIAlu>, ArithLogic16Defs<1>;
// for direct object emitter, encoding needs to be adjusted for the
// frame size
//
-let ra=1, s=0,s0=0,s1=0 in
+let ra=1, s=0,s0=1,s1=1 in
def RestoreRaF16:
FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "restore \t$$ra, $frame_size", [], IILoad >;
+ "restore \t$$ra, $$s0, $$s1, $frame_size", [], IILoad > {
+ let isCodeGenOnly = 1;
+}
//
// Format: SAVE {ra,}{s0/s1/s0-1,}{framesize} (All arguments are optional)
@@ -252,11 +268,12 @@ def RestoreRaF16:
// To set up a stack frame on entry to a subroutine,
// saving return address and static registers, and adjusting stack
//
-let ra=1, s=1,s0=0,s1=0 in
+let ra=1, s=1,s0=1,s1=1 in
def SaveRaF16:
FI8_SVRS16<0b1, (outs), (ins uimm16:$frame_size),
- "save \t$$ra, $frame_size", [], IILoad >;
-
+ "save \t$$ra, $$s0, $$s1, $frame_size", [], IILoad > {
+ let isCodeGenOnly = 1;
+}
//
// Format: SB ry, offset(rx) MIPS16e
// Purpose: Store Byte (Extended)
@@ -389,16 +406,16 @@ class LoadM16_pat<PatFrag OpNode, Instruction I> :
def: LoadM16_pat<sextloadi8, LbRxRyOffMemX16>;
def: LoadM16_pat<zextloadi8, LbuRxRyOffMemX16>;
-def: LoadM16_pat<sextloadi16_a, LhRxRyOffMemX16>;
-def: LoadM16_pat<zextloadi16_a, LhuRxRyOffMemX16>;
-def: LoadM16_pat<load_a, LwRxRyOffMemX16>;
+def: LoadM16_pat<sextloadi16, LhRxRyOffMemX16>;
+def: LoadM16_pat<zextloadi16, LhuRxRyOffMemX16>;
+def: LoadM16_pat<load, LwRxRyOffMemX16>;
class StoreM16_pat<PatFrag OpNode, Instruction I> :
Mips16Pat<(OpNode CPU16Regs:$r, addr:$addr), (I CPU16Regs:$r, addr:$addr)>;
def: StoreM16_pat<truncstorei8, SbRxRyOffMemX16>;
-def: StoreM16_pat<truncstorei16_a, ShRxRyOffMemX16>;
-def: StoreM16_pat<store_a, SwRxRyOffMemX16>;
+def: StoreM16_pat<truncstorei16, ShRxRyOffMemX16>;
+def: StoreM16_pat<store, SwRxRyOffMemX16>;
// Jump and Link (Call)
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 147be5db15..fe1e188303 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -127,24 +127,15 @@ let DecoderNamespace = "Mips64" in {
/// aligned
defm LB64 : LoadM64<0x20, "lb", sextloadi8>;
defm LBu64 : LoadM64<0x24, "lbu", zextloadi8>;
-defm LH64 : LoadM64<0x21, "lh", sextloadi16_a>;
-defm LHu64 : LoadM64<0x25, "lhu", zextloadi16_a>;
-defm LW64 : LoadM64<0x23, "lw", sextloadi32_a>;
-defm LWu64 : LoadM64<0x27, "lwu", zextloadi32_a>;
+defm LH64 : LoadM64<0x21, "lh", sextloadi16>;
+defm LHu64 : LoadM64<0x25, "lhu", zextloadi16>;
+defm LW64 : LoadM64<0x23, "lw", sextloadi32>;
+defm LWu64 : LoadM64<0x27, "lwu", zextloadi32>;
defm SB64 : StoreM64<0x28, "sb", truncstorei8>;
-defm SH64 : StoreM64<0x29, "sh", truncstorei16_a>;
-defm SW64 : StoreM64<0x2b, "sw", truncstorei32_a>;
-defm LD : LoadM64<0x37, "ld", load_a>;
-defm SD : StoreM64<0x3f, "sd", store_a>;
-
-/// unaligned
-defm ULH64 : LoadM64<0x21, "ulh", sextloadi16_u, 1>;
-defm ULHu64 : LoadM64<0x25, "ulhu", zextloadi16_u, 1>;
-defm ULW64 : LoadM64<0x23, "ulw", sextloadi32_u, 1>;
-defm USH64 : StoreM64<0x29, "ush", truncstorei16_u, 1>;
-defm USW64 : StoreM64<0x2b, "usw", truncstorei32_u, 1>;
-defm ULD : LoadM64<0x37, "uld", load_u, 1>;
-defm USD : StoreM64<0x3f, "usd", store_u, 1>;
+defm SH64 : StoreM64<0x29, "sh", truncstorei16>;
+defm SW64 : StoreM64<0x2b, "sw", truncstorei32>;
+defm LD : LoadM64<0x37, "ld", load>;
+defm SD : StoreM64<0x3f, "sd", store>;
/// load/store left/right
let isCodeGenOnly = 1 in {
@@ -244,21 +235,14 @@ let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
let Predicates = [NotN64, HasStandardEncoding] in {
def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64 addr:$src)>;
def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64 addr:$src)>;
- def : MipsPat<(zextloadi32_u addr:$a), (DSRL (DSLL (ULW64 addr:$a), 32), 32)>;
+ def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64 addr:$src)>;
}
let Predicates = [IsN64, HasStandardEncoding] in {
def : MipsPat<(i64 (extloadi1 addr:$src)), (LB64_P8 addr:$src)>;
def : MipsPat<(i64 (extloadi8 addr:$src)), (LB64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_a addr:$src)), (LH64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi16_u addr:$src)), (ULH64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_a addr:$src)), (LW64_P8 addr:$src)>;
- def : MipsPat<(i64 (extloadi32_u addr:$src)), (ULW64_P8 addr:$src)>;
- def : MipsPat<(zextloadi32_u addr:$a),
- (DSRL (DSLL (ULW64_P8 addr:$a), 32), 32)>;
+ def : MipsPat<(i64 (extloadi16 addr:$src)), (LH64_P8 addr:$src)>;
+ def : MipsPat<(i64 (extloadi32 addr:$src)), (LW64_P8 addr:$src)>;
}
// hi/lo relocs
diff --git a/lib/Target/Mips/MipsCodeEmitter.cpp b/lib/Target/Mips/MipsCodeEmitter.cpp
index 543329562a..8e8a2a4c0c 100644
--- a/lib/Target/Mips/MipsCodeEmitter.cpp
+++ b/lib/Target/Mips/MipsCodeEmitter.cpp
@@ -218,15 +218,9 @@ unsigned MipsCodeEmitter::getMachineOpValue(const MachineInstr &MI,
return getMipsRegisterNumbering(MO.getReg());
else if (MO.isImm())
return static_cast<unsigned>(MO.getImm());
- else if (MO.isGlobal()) {
- if (MI.getOpcode() == Mips::ULW || MI.getOpcode() == Mips::USW ||
- MI.getOpcode() == Mips::ULH || MI.getOpcode() == Mips::ULHu)
- emitGlobalAddressUnaligned(MO.getGlobal(), getRelocation(MI, MO), 4);
- else if (MI.getOpcode() == Mips::USH)
- emitGlobalAddressUnaligned(MO.getGlobal(), getRelocation(MI, MO), 8);
- else
- emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
- } else if (MO.isSymbol())
+ else if (MO.isGlobal())
+ emitGlobalAddress(MO.getGlobal(), getRelocation(MI, MO), true);
+ else if (MO.isSymbol())
emitExternalSymbolAddress(MO.getSymbolName(), getRelocation(MI, MO));
else if (MO.isCPI())
emitConstPoolAddress(MO.getIndex(), getRelocation(MI, MO));
@@ -383,29 +377,8 @@ void MipsCodeEmitter::emitInstruction(const MachineInstr &MI) {
if ((MI.getDesc().TSFlags & MipsII::FormMask) == MipsII::Pseudo)
return;
-
- switch (MI.getOpcode()) {
- case Mips::USW:
- NumEmitted += emitUSW(MI);
- break;
- case Mips::ULW:
- NumEmitted += emitULW(MI);
- break;
- case Mips::ULH:
- NumEmitted += emitULH(MI);
- break;
- case Mips::ULHu:
- NumEmitted += emitULHu(MI);
- break;
- case Mips::USH:
- NumEmitted += emitUSH(MI);
- break;
-
- default:
- emitWordLE(getBinaryCodeForInstr(MI));
- ++NumEmitted; // Keep track of the # of mi's emitted
- break;
- }
+ emitWordLE(getBinaryCodeForInstr(MI));
+ ++NumEmitted; // Keep track of the # of mi's emitted
MCE.processDebugLoc(MI.getDebugLoc(), false);
}
diff --git a/lib/Target/Mips/MipsDSPInstrFormats.td b/lib/Target/Mips/MipsDSPInstrFormats.td
new file mode 100644
index 0000000000..d9bcccc617
--- /dev/null
+++ b/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -0,0 +1,25 @@
+//===- MipsDSPInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def HasDSP : Predicate<"Subtarget.hasDSP()">,
+ AssemblerPredicate<"FeatureDSP">;
+def HasDSPR2 : Predicate<"Subtarget.hasDSPR2()">,
+ AssemblerPredicate<"FeatureDSPR2">;
+
+// Fields.
+class Field6<bits<6> val> {
+ bits<6> V = val;
+}
+
+def SPECIAL3_OPCODE : Field6<0b011111>;
+def REGIMM_OPCODE : Field6<0b000001>;
+
+class DSPInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther> {
+ let Predicates = [HasDSP];
+}
diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td
new file mode 100644
index 0000000000..1a4fd8733a
--- /dev/null
+++ b/lib/Target/Mips/MipsDSPInstrInfo.td
@@ -0,0 +1,20 @@
+//===- MipsDSPInstrInfo.td - DSP ASE instructions ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes Mips DSP ASE instructions.
+//
+//===----------------------------------------------------------------------===//
+
+// ImmLeaf
+def immZExt2 : ImmLeaf<i32, [{return isUInt<2>(Imm);}]>;
+def immZExt3 : ImmLeaf<i32, [{return isUInt<3>(Imm);}]>;
+def immZExt4 : ImmLeaf<i32, [{return isUInt<4>(Imm);}]>;
+def immZExt8 : ImmLeaf<i32, [{return isUInt<8>(Imm);}]>;
+def immZExt10 : ImmLeaf<i32, [{return isUInt<10>(Imm);}]>;
+def immSExt6 : ImmLeaf<i32, [{return isInt<6>(Imm);}]>;
diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp
index edcf8e531b..0d47303e92 100644
--- a/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -540,6 +540,15 @@ SDNode* MipsDAGToDAGISel::Select(SDNode *Node) {
return RegOpnd;
}
+#ifndef NDEBUG
+ case ISD::LOAD:
+ case ISD::STORE:
+ assert(cast<MemSDNode>(Node)->getMemoryVT().getSizeInBits() / 8 <=
+ cast<MemSDNode>(Node)->getAlignment() &&
+ "Unexpected unaligned loads/stores.");
+ break;
+#endif
+
case MipsISD::ThreadPointer: {
EVT PtrVT = TLI.getPointerTy();
unsigned RdhwrOpc, SrcReg, DestReg;
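
The new assert encodes the invariant that made the Mips unaligned pseudo-instructions removable: every load/store reaching instruction selection must be naturally aligned. A one-function sketch of the check, using plain integers in place of MemSDNode:

#include <cassert>

// A memory access is naturally aligned when its size in bytes does not
// exceed its alignment.
static bool isNaturallyAligned(unsigned SizeInBits, unsigned AlignInBytes) {
  return SizeInBits / 8 <= AlignInBytes;
}

int main() {
  assert(isNaturallyAligned(32, 4));  // aligned word access: selectable
  assert(!isNaturallyAligned(32, 2)); // previously needed ULW/USW pseudos
  return 0;
}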
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index 146dbea15c..b1220d6250 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -89,6 +89,20 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
case MipsISD::LDR: return "MipsISD::LDR";
case MipsISD::SDL: return "MipsISD::SDL";
case MipsISD::SDR: return "MipsISD::SDR";
+ case MipsISD::EXTP: return "MipsISD::EXTP";
+ case MipsISD::EXTPDP: return "MipsISD::EXTPDP";
+ case MipsISD::EXTR_S_H: return "MipsISD::EXTR_S_H";
+ case MipsISD::EXTR_W: return "MipsISD::EXTR_W";
+ case MipsISD::EXTR_R_W: return "MipsISD::EXTR_R_W";
+ case MipsISD::EXTR_RS_W: return "MipsISD::EXTR_RS_W";
+ case MipsISD::SHILO: return "MipsISD::SHILO";
+ case MipsISD::MTHLIP: return "MipsISD::MTHLIP";
+ case MipsISD::MULT: return "MipsISD::MULT";
+ case MipsISD::MULTU: return "MipsISD::MULTU";
+ case MipsISD::MADD_DSP: return "MipsISD::MADD_DSP";
+ case MipsISD::MADDU_DSP: return "MipsISD::MADDU_DSP";
+ case MipsISD::MSUB_DSP: return "MipsISD::MSUB_DSP";
+ case MipsISD::MSUBU_DSP: return "MipsISD::MSUBU_DSP";
default: return NULL;
}
}
@@ -113,7 +127,22 @@ MipsTargetLowering(MipsTargetMachine &TM)
if (Subtarget->inMips16Mode()) {
addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass);
- addRegisterClass(MVT::i32, &Mips::CPURARegRegClass);
+ }
+
+ if (Subtarget->hasDSP()) {
+ MVT::SimpleValueType VecTys[2] = {MVT::v2i16, MVT::v4i8};
+
+ for (unsigned i = 0; i < array_lengthof(VecTys); ++i) {
+ addRegisterClass(VecTys[i], &Mips::DSPRegsRegClass);
+
+ // Expand all builtin opcodes.
+ for (unsigned Opc = 0; Opc < ISD::BUILTIN_OP_END; ++Opc)
+ setOperationAction(Opc, VecTys[i], Expand);
+
+ setOperationAction(ISD::LOAD, VecTys[i], Legal);
+ setOperationAction(ISD::STORE, VecTys[i], Legal);
+ setOperationAction(ISD::BITCAST, VecTys[i], Legal);
+ }
}
if (!TM.Options.UseSoftFloat) {
@@ -162,8 +191,10 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f64, Custom);
setOperationAction(ISD::MEMBARRIER, MVT::Other, Custom);
setOperationAction(ISD::ATOMIC_FENCE, MVT::Other, Custom);
- setOperationAction(ISD::LOAD, MVT::i32, Custom);
- setOperationAction(ISD::STORE, MVT::i32, Custom);
+ if (!Subtarget->inMips16Mode()) {
+ setOperationAction(ISD::LOAD, MVT::i32, Custom);
+ setOperationAction(ISD::STORE, MVT::i32, Custom);
+ }
if (!TM.Options.NoNaNsFPMath) {
setOperationAction(ISD::FABS, MVT::f32, Custom);
@@ -254,6 +285,9 @@ MipsTargetLowering(MipsTargetMachine &TM)
setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i64, Custom);
+ setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::i64, Custom);
+
// Use the default for now
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -317,6 +351,9 @@ MipsTargetLowering(MipsTargetMachine &TM)
bool MipsTargetLowering::allowsUnalignedMemoryAccesses(EVT VT) const {
MVT::SimpleValueType SVT = VT.getSimpleVT().SimpleTy;
+ if (Subtarget->inMips16Mode())
+ return false;
+
switch (SVT) {
case MVT::i64:
case MVT::i32:
@@ -792,6 +829,26 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
return SDValue();
}
+void
+MipsTargetLowering::LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
+void
+MipsTargetLowering::ReplaceNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const {
+ SDValue Res = LowerOperation(SDValue(N, 0), DAG);
+
+ for (unsigned I = 0, E = Res->getNumValues(); I != E; ++I)
+ Results.push_back(Res.getValue(I));
+}
+
SDValue MipsTargetLowering::
LowerOperation(SDValue Op, SelectionDAG &DAG) const
{
@@ -1583,7 +1640,8 @@ SDValue MipsTargetLowering::LowerGlobalAddress(SDValue Op,
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
SDVTList VTs = DAG.getVTList(MVT::i32);
- const MipsTargetObjectFile &TLOF = (const MipsTargetObjectFile&)getObjFileLowering();
+ const MipsTargetObjectFile &TLOF =
+ (const MipsTargetObjectFile&)getObjFileLowering();
// %gp_rel relocation
if (TLOF.IsGlobalInSmallSection(GV, getTargetMachine())) {
@@ -1632,8 +1690,8 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) {
// %hi/%lo relocation
- SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true, MipsII::MO_ABS_HI);
- SDValue BALo = DAG.getBlockAddress(BA, MVT::i32, true, MipsII::MO_ABS_LO);
+ SDValue BAHi = DAG.getTargetBlockAddress(BA, MVT::i32, 0, MipsII::MO_ABS_HI);
+ SDValue BALo = DAG.getTargetBlockAddress(BA, MVT::i32, 0, MipsII::MO_ABS_LO);
SDValue Hi = DAG.getNode(MipsISD::Hi, dl, MVT::i32, BAHi);
SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALo);
return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo);
@@ -1642,10 +1700,10 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op,
EVT ValTy = Op.getValueType();
unsigned GOTFlag = HasMips64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT;
unsigned OFSTFlag = HasMips64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO;
- SDValue BAGOTOffset = DAG.getBlockAddress(BA, ValTy, true, GOTFlag);
+ SDValue BAGOTOffset = DAG.getTargetBlockAddress(BA, ValTy, 0, GOTFlag);
BAGOTOffset = DAG.getNode(MipsISD::Wrapper, dl, ValTy,
GetGlobalReg(DAG, ValTy), BAGOTOffset);
- SDValue BALOOffset = DAG.getBlockAddress(BA, ValTy, true, OFSTFlag);
+ SDValue BALOOffset = DAG.getTargetBlockAddress(BA, ValTy, 0, OFSTFlag);
SDValue Load = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), BAGOTOffset,
MachinePointerInfo(), false, false, false, 0);
SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, BALOOffset);
@@ -3383,8 +3441,11 @@ getRegForInlineAsmConstraint(const std::string &Constraint, EVT VT) const
case 'd': // Address register. Same as 'r' unless generating MIPS16 code.
case 'y': // Same as 'r'. Exists for compatibility.
case 'r':
- if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8)
+ if (VT == MVT::i32 || VT == MVT::i16 || VT == MVT::i8) {
+ if (Subtarget->inMips16Mode())
+ return std::make_pair(0U, &Mips::CPU16RegsRegClass);
return std::make_pair(0U, &Mips::CPURegsRegClass);
+ }
if (VT == MVT::i64 && !HasMips64)
return std::make_pair(0U, &Mips::CPURegsRegClass);
if (VT == MVT::i64 && HasMips64)
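
Both new hooks in this file share one pattern: lower the node once, then push every value the lowered node produces into Results so the legalizer can splice them in place of the original results. A generic sketch of that copy loop, with plain ints standing in for SDValue:

#include <cassert>
#include <cstddef>
#include <vector>

struct LoweredNode { std::vector<int> Values; };

static void replaceNodeResults(const LoweredNode &Res,
                               std::vector<int> &Results) {
  for (std::size_t I = 0, E = Res.Values.size(); I != E; ++I)
    Results.push_back(Res.Values[I]);
}

int main() {
  LoweredNode Res{{7, 9}};  // e.g. a node lowered into two values
  std::vector<int> Results;
  replaceNodeResults(Res, Results);
  assert(Results.size() == 2 && Results[1] == 9);
  return 0;
}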
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index d9f8ddc89b..4e9398430b 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -81,6 +81,47 @@ namespace llvm {
Ext,
Ins,
+ // EXTR.W intrinsic nodes.
+ EXTP,
+ EXTPDP,
+ EXTR_S_H,
+ EXTR_W,
+ EXTR_R_W,
+ EXTR_RS_W,
+ SHILO,
+ MTHLIP,
+
+ // DPA.W intrinsic nodes.
+ MULSAQ_S_W_PH,
+ MAQ_S_W_PHL,
+ MAQ_S_W_PHR,
+ MAQ_SA_W_PHL,
+ MAQ_SA_W_PHR,
+ DPAU_H_QBL,
+ DPAU_H_QBR,
+ DPSU_H_QBL,
+ DPSU_H_QBR,
+ DPAQ_S_W_PH,
+ DPSQ_S_W_PH,
+ DPAQ_SA_L_W,
+ DPSQ_SA_L_W,
+ DPA_W_PH,
+ DPS_W_PH,
+ DPAQX_S_W_PH,
+ DPAQX_SA_W_PH,
+ DPAX_W_PH,
+ DPSX_W_PH,
+ DPSQX_S_W_PH,
+ DPSQX_SA_W_PH,
+ MULSA_W_PH,
+
+ MULT,
+ MULTU,
+ MADD_DSP,
+ MADDU_DSP,
+ MSUB_DSP,
+ MSUBU_DSP,
+
// Load/Store Left/Right nodes.
LWL = ISD::FIRST_TARGET_MEMORY_OPCODE,
LWR,
@@ -105,9 +146,19 @@ namespace llvm {
virtual bool allowsUnalignedMemoryAccesses (EVT VT) const;
+ virtual void LowerOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+
/// LowerOperation - Provide custom lowering hooks for some operations.
virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+ /// ReplaceNodeResults - Replace the results of a node with an illegal
+ /// result type with new values built out of custom code.
+ ///
+ virtual void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+
/// getTargetNodeName - This method returns the name of a target specific
// DAG node.
virtual const char *getTargetNodeName(unsigned Opcode) const;
diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td
index e74d45ec43..fa6faf242d 100644
--- a/lib/Target/Mips/MipsInstrFPU.td
+++ b/lib/Target/Mips/MipsInstrFPU.td
@@ -90,13 +90,13 @@ def fpimm0neg : PatLeaf<(fpimm), [{
let DecoderMethod = "DecodeFMem" in {
class FPLoad<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>:
FMem<op, (outs RC:$ft), (ins MemOpnd:$addr),
- !strconcat(opstr, "\t$ft, $addr"), [(set RC:$ft, (load_a addr:$addr))],
+ !strconcat(opstr, "\t$ft, $addr"), [(set RC:$ft, (load addr:$addr))],
IILoad>;
// FP store.
class FPStore<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>:
FMem<op, (outs), (ins RC:$ft, MemOpnd:$addr),
- !strconcat(opstr, "\t$ft, $addr"), [(store_a RC:$ft, addr:$addr)],
+ !strconcat(opstr, "\t$ft, $addr"), [(store RC:$ft, addr:$addr)],
IIStore>;
}
// FP indexed load.
@@ -282,27 +282,27 @@ let Predicates = [NotN64, NotMips64, HasStandardEncoding] in {
// Indexed loads and stores.
let Predicates = [HasMips32r2Or64, NotNaCl/*@LOCALMOD*/] in {
- def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load_a>;
- def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store_a>;
+ def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load>;
+ def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store>;
}
let Predicates = [HasMips32r2, NotMips64, NotNaCl/*@LOCALMOD*/] in {
- def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load_a>;
- def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store_a>;
+ def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load>;
+ def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store>;
}
let Predicates = [HasMips64, NotN64, NotNaCl/*@LOCALMOD*/],
DecoderNamespace="Mips64" in {
- def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load_a>;
- def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store_a>;
+ def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load>;
+ def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store>;
}
// n64
let Predicates = [IsN64, NotNaCl/*@LOCALMOD*/], isCodeGenOnly=1 in {
- def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load_a>;
- def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load_a>;
- def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store_a>;
- def SDXC164_P8 : FPIdxStore<0x9, "sdxc1", FGR64, CPU64Regs, store_a>;
+ def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load>;
+ def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load>;
+ def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store>;
+ def SDXC164_P8 : FPIdxStore<0x9, "sdxc1", FGR64, CPU64Regs, store>;
}
// Load/store doubleword indexed unaligned.
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 8ade891ab5..ca80d43f36 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -95,6 +95,7 @@ bool MipsInstrInfo::AnalyzeBranch(MachineBasicBlock &MBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const
{
+
MachineBasicBlock::reverse_iterator I = MBB.rbegin(), REnd = MBB.rend();
// Skip all the debug instructions.
@@ -177,9 +178,14 @@ void MipsInstrInfo::BuildCondBr(MachineBasicBlock &MBB,
const MCInstrDesc &MCID = get(Opc);
MachineInstrBuilder MIB = BuildMI(&MBB, DL, MCID);
- for (unsigned i = 1; i < Cond.size(); ++i)
- MIB.addReg(Cond[i].getReg());
-
+ for (unsigned i = 1; i < Cond.size(); ++i) {
+ if (Cond[i].isReg())
+ MIB.addReg(Cond[i].getReg());
+ else if (Cond[i].isImm())
+ MIB.addImm(Cond[i].getImm());
+ else
+ assert(false && "Cannot copy operand");
+ }
MIB.addMBB(TBB);
}
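
With Mips16 in the mix, branch condition operands may now be immediates as well as registers, which is what the widened copy loop above handles. A standalone sketch of the same dispatch, with plain structs instead of MachineOperand and the assert written so it actually fires on an unknown kind:

#include <cassert>
#include <cstddef>
#include <vector>

struct Operand {
  enum Kind { Reg, Imm } K;
  int Val;
};

struct InstBuilder {
  std::vector<Operand> Ops;
  void addReg(int R) { Ops.push_back({Operand::Reg, R}); }
  void addImm(int I) { Ops.push_back({Operand::Imm, I}); }
};

// Copies condition operands (skipping the leading opcode entry) by kind.
static void appendCondOperands(InstBuilder &MIB,
                               const std::vector<Operand> &Cond) {
  for (std::size_t i = 1; i < Cond.size(); ++i) {
    if (Cond[i].K == Operand::Reg)
      MIB.addReg(Cond[i].Val);
    else if (Cond[i].K == Operand::Imm)
      MIB.addImm(Cond[i].Val);
    else
      assert(false && "Cannot copy operand");
  }
}

int main() {
  InstBuilder MIB;
  appendCondOperands(MIB, {{Operand::Reg, 0}, {Operand::Reg, 5},
                           {Operand::Imm, 2}});
  assert(MIB.Ops.size() == 2);
  return 0;
}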
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index f63b143bdd..3f98ae857b 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -295,55 +295,6 @@ def addr :
ComplexPattern<iPTR, 2, "SelectAddr", [frameindex], [SDNPWantParent]>;
//===----------------------------------------------------------------------===//
-// Pattern fragment for load/store
-//===----------------------------------------------------------------------===//
-class UnalignedLoad<PatFrag Node> :
- PatFrag<(ops node:$ptr), (Node node:$ptr), [{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- return LD->getMemoryVT().getSizeInBits()/8 > LD->getAlignment();
-}]>;
-
-class AlignedLoad<PatFrag Node> :
- PatFrag<(ops node:$ptr), (Node node:$ptr), [{
- LoadSDNode *LD = cast<LoadSDNode>(N);
- return LD->getMemoryVT().getSizeInBits()/8 <= LD->getAlignment();
-}]>;
-
-class UnalignedStore<PatFrag Node> :
- PatFrag<(ops node:$val, node:$ptr), (Node node:$val, node:$ptr), [{
- StoreSDNode *SD = cast<StoreSDNode>(N);
- return SD->getMemoryVT().getSizeInBits()/8 > SD->getAlignment();
-}]>;
-
-class AlignedStore<PatFrag Node> :
- PatFrag<(ops node:$val, node:$ptr), (Node node:$val, node:$ptr), [{
- StoreSDNode *SD = cast<StoreSDNode>(N);
- return SD->getMemoryVT().getSizeInBits()/8 <= SD->getAlignment();
-}]>;
-
-// Load/Store PatFrags.
-def sextloadi16_a : AlignedLoad<sextloadi16>;
-def zextloadi16_a : AlignedLoad<zextloadi16>;
-def extloadi16_a : AlignedLoad<extloadi16>;
-def load_a : AlignedLoad<load>;
-def sextloadi32_a : AlignedLoad<sextloadi32>;
-def zextloadi32_a : AlignedLoad<zextloadi32>;
-def extloadi32_a : AlignedLoad<extloadi32>;
-def truncstorei16_a : AlignedStore<truncstorei16>;
-def store_a : AlignedStore<store>;
-def truncstorei32_a : AlignedStore<truncstorei32>;
-def sextloadi16_u : UnalignedLoad<sextloadi16>;
-def zextloadi16_u : UnalignedLoad<zextloadi16>;
-def extloadi16_u : UnalignedLoad<extloadi16>;
-def load_u : UnalignedLoad<load>;
-def sextloadi32_u : UnalignedLoad<sextloadi32>;
-def zextloadi32_u : UnalignedLoad<zextloadi32>;
-def extloadi32_u : UnalignedLoad<extloadi32>;
-def truncstorei16_u : UnalignedStore<truncstorei16>;
-def store_u : UnalignedStore<store>;
-def truncstorei32_u : UnalignedStore<truncstorei32>;
-
-//===----------------------------------------------------------------------===//
// Instructions specific format
//===----------------------------------------------------------------------===//
@@ -990,19 +941,12 @@ let Predicates = [HasMips32r2, HasStandardEncoding] in {
/// aligned
defm LB : LoadM32<0x20, "lb", sextloadi8>;
defm LBu : LoadM32<0x24, "lbu", zextloadi8>;
-defm LH : LoadM32<0x21, "lh", sextloadi16_a>;
-defm LHu : LoadM32<0x25, "lhu", zextloadi16_a>;
-defm LW : LoadM32<0x23, "lw", load_a>;
+defm LH : LoadM32<0x21, "lh", sextloadi16>;
+defm LHu : LoadM32<0x25, "lhu", zextloadi16>;
+defm LW : LoadM32<0x23, "lw", load>;
defm SB : StoreM32<0x28, "sb", truncstorei8>;
-defm SH : StoreM32<0x29, "sh", truncstorei16_a>;
-defm SW : StoreM32<0x2b, "sw", store_a>;
-
-/// unaligned
-defm ULH : LoadM32<0x21, "ulh", sextloadi16_u, 1>;
-defm ULHu : LoadM32<0x25, "ulhu", zextloadi16_u, 1>;
-defm ULW : LoadM32<0x23, "ulw", load_u, 1>;
-defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>;
-defm USW : StoreM32<0x2b, "usw", store_u, 1>;
+defm SH : StoreM32<0x29, "sh", truncstorei16>;
+defm SW : StoreM32<0x2b, "sw", store>;
/// load/store left/right
defm LWL : LoadLeftRightM32<0x22, "lwl", MipsLWL>;
@@ -1214,24 +1158,20 @@ def : MipsPat<(not CPURegs:$in),
let Predicates = [NotN64, HasStandardEncoding] in {
def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu addr:$src)>;
def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu addr:$src)>;
- def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu addr:$src)>;
- def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu addr:$src)>;
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu addr:$src)>;
}
let Predicates = [IsN64, HasStandardEncoding] in {
def : MipsPat<(i32 (extloadi1 addr:$src)), (LBu_P8 addr:$src)>;
def : MipsPat<(i32 (extloadi8 addr:$src)), (LBu_P8 addr:$src)>;
- def : MipsPat<(i32 (extloadi16_a addr:$src)), (LHu_P8 addr:$src)>;
- def : MipsPat<(i32 (extloadi16_u addr:$src)), (ULHu_P8 addr:$src)>;
+ def : MipsPat<(i32 (extloadi16 addr:$src)), (LHu_P8 addr:$src)>;
}
// peepholes
let Predicates = [NotN64, HasStandardEncoding] in {
- def : MipsPat<(store_a (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
- def : MipsPat<(store_u (i32 0), addr:$dst), (USW ZERO, addr:$dst)>;
+ def : MipsPat<(store (i32 0), addr:$dst), (SW ZERO, addr:$dst)>;
}
let Predicates = [IsN64, HasStandardEncoding] in {
- def : MipsPat<(store_a (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
- def : MipsPat<(store_u (i32 0), addr:$dst), (USW_P8 ZERO, addr:$dst)>;
+ def : MipsPat<(store (i32 0), addr:$dst), (SW_P8 ZERO, addr:$dst)>;
}
// brcond patterns
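
The deleted PatFrags selected between aligned and unaligned load/store instructions with a single comparison: an access counts as aligned when its size in bytes does not exceed its known alignment. A minimal sketch of that predicate, with illustrative names:

```cpp
// Minimal sketch of the test the deleted Aligned*/Unaligned* PatFrags
// evaluated on each memory SDNode (names are illustrative only).
#include <cassert>

bool isAlignedAccess(unsigned SizeInBits, unsigned AlignBytes) {
  // Aligned: the access size in bytes fits within the known alignment.
  return SizeInBits / 8 <= AlignBytes;
}

int main() {
  assert(isAlignedAccess(32, 4));   // lw on a 4-byte-aligned address
  assert(!isAlignedAccess(32, 2));  // would have selected ulw before this patch
  return 0;
}
```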
diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp
index dafe14519a..5e0f930395 100644
--- a/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -124,6 +124,9 @@ getReservedRegs(const MachineFunction &MF) const {
Reserved.set(Mips::HWR29);
Reserved.set(Mips::HWR29_64);
+ // Reserve DSP control register.
+ Reserved.set(Mips::DSPCtrl);
+
// Reserve RA if in mips16 mode.
if (Subtarget.inMips16Mode()) {
Reserved.set(Mips::RA);
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 4015addca3..ae4813e128 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -245,13 +245,45 @@ let Namespace = "Mips" in {
// Hardware register $29
def HWR29 : Register<"29">;
def HWR29_64 : Register<"29">;
+
+ // Accum registers
+ def LO0 : Register<"ac0"> {
+ let Aliases = [LO];
+ }
+ def HI0 : Register<"hi0"> {
+ let Aliases = [HI];
+ }
+ def LO1 : Register<"ac1">;
+ def HI1 : Register<"hi1">;
+ def LO2 : Register<"ac2">;
+ def HI2 : Register<"hi2">;
+ def LO3 : Register<"ac3">;
+ def HI3 : Register<"hi3">;
+
+ let SubRegIndices = [sub_32] in {
+ def LO0_64 : RegisterWithSubRegs<"ac0", [LO0]> {
+ let Aliases = [LO64];
+ }
+ def HI0_64 : RegisterWithSubRegs<"hi0", [HI0]> {
+ let Aliases = [HI64];
+ }
+ def LO1_64 : RegisterWithSubRegs<"ac1", [LO1]>;
+ def HI1_64 : RegisterWithSubRegs<"hi1", [HI1]>;
+ def LO2_64 : RegisterWithSubRegs<"ac2", [LO2]>;
+ def HI2_64 : RegisterWithSubRegs<"hi2", [HI2]>;
+ def LO3_64 : RegisterWithSubRegs<"ac3", [LO3]>;
+ def HI3_64 : RegisterWithSubRegs<"hi3", [HI3]>;
+ }
+
+ def DSPCtrl : Register<"dspctrl">;
}
//===----------------------------------------------------------------------===//
// Register Classes
//===----------------------------------------------------------------------===//
-def CPURegs : RegisterClass<"Mips", [i32], 32, (add
+class CPURegsClass<list<ValueType> regTypes> :
+ RegisterClass<"Mips", regTypes, 32, (add
// Reserved
ZERO, AT,
// Return Values and Arguments
@@ -265,6 +297,9 @@ def CPURegs : RegisterClass<"Mips", [i32], 32, (add
// Reserved
K0, K1, GP, SP, FP, RA)>;
+def CPURegs : CPURegsClass<[i32]>;
+def DSPRegs : CPURegsClass<[v4i8, v2i16]>;
+
def CPU64Regs : RegisterClass<"Mips", [i64], 64, (add
// Reserved
ZERO_64, AT_64,
@@ -322,3 +357,9 @@ def HILO64 : RegisterClass<"Mips", [i64], 64, (add HI64, LO64)>;
def HWRegs : RegisterClass<"Mips", [i32], 32, (add HWR29)>;
def HWRegs64 : RegisterClass<"Mips", [i64], 32, (add HWR29_64)>;
+// Accum Registers
+def HIRegs : RegisterClass<"Mips", [i32], 32, (sequence "HI%u", 0, 3)>;
+def LORegs : RegisterClass<"Mips", [i32], 32, (sequence "LO%u", 0, 3)>;
+
+def HI64Regs : RegisterClass<"Mips", [i64], 64, (sequence "HI%u_64", 0, 3)>;
+def LO64Regs : RegisterClass<"Mips", [i64], 64, (sequence "LO%u_64", 0, 3)>;
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index cc03587f61..6eeab5c351 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -92,6 +92,9 @@ protected:
// InMips16 -- can process Mips16 instructions
bool InMips16Mode;
+ // HasDSP, HasDSPR2 -- supports DSP ASE.
+ bool HasDSP, HasDSPR2;
+
// IsAndroid -- target is android
bool IsAndroid;
@@ -136,6 +139,8 @@ public:
bool isNotSingleFloat() const { return !IsSingleFloat; }
bool hasVFPU() const { return HasVFPU; }
bool inMips16Mode() const { return InMips16Mode; }
+ bool hasDSP() const { return HasDSP; }
+ bool hasDSPR2() const { return HasDSPR2; }
bool isAndroid() const { return IsAndroid; }
bool isLinux() const { return IsLinux; }
bool useSmallSection() const { return UseSmallSection; }
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 48de583afd..1744738622 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -181,7 +181,7 @@ namespace {
-MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU) {
if (Triple(TT).isOSDarwin())
return new DarwinPPCAsmBackend(T);
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index b9ea8b5562..215aa40c4a 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -59,8 +59,7 @@ PPCLinuxMCAsmInfo::PPCLinuxMCAsmInfo(bool is64Bit) {
HasLEB128 = true; // Target asm supports leb128 directives (little-endian)
// Exceptions handling
- if (!is64Bit)
- ExceptionsType = ExceptionHandling::DwarfCFI;
+ ExceptionsType = ExceptionHandling::DwarfCFI;
ZeroDirective = "\t.space\t";
Data64bitsDirective = is64Bit ? "\t.quad\t" : 0;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index f6524222fd..c0248a6045 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -25,9 +25,9 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
namespace {
class PPCMCCodeEmitter : public MCCodeEmitter {
- PPCMCCodeEmitter(const PPCMCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const PPCMCCodeEmitter &); // DO NOT IMPLEMENT
-
+ PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION;
+
public:
PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti,
MCContext &ctx) {
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 7162e158f0..a0e4cf3005 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -36,7 +36,7 @@ MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createPPCAsmBackend(const Target &T, StringRef TT, StringRef CPU);
/// createPPCELFObjectWriter - Construct an PPC ELF object writer.
MCObjectWriter *createPPCELFObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 6e0e8bb8bc..a66677fa0f 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -368,9 +368,14 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else if (MO.isJTI())
MOSymbol = GetJTISymbol(MO.getIndex());
MCSymbol *&TOCEntry = TOC[MOSymbol];
- if (TOCEntry == 0)
- TOCEntry = GetTempSymbol("C", TOCLabelID++);
-
+      // To avoid a name clash, check that the candidate name is unused.
+      while (TOCEntry == 0) {
+        if (OutContext.LookupSymbol(Twine(MAI->getPrivateGlobalPrefix()) +
+                                    "C" + Twine(TOCLabelID)) == 0)
+          TOCEntry = GetTempSymbol("C", TOCLabelID);
+        ++TOCLabelID;
+      }
+
const MCExpr *Exp =
MCSymbolRefExpr::Create(TOCEntry, MCSymbolRefExpr::VK_PPC_TOC_ENTRY,
OutContext);
@@ -420,6 +425,8 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
8/*size*/, 0/*addrspace*/);
OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, OutContext),
8/*size*/, 0/*addrspace*/);
+ // Emit a null environment pointer.
+ OutStreamer.EmitIntValue(0, 8 /* size */, 0 /* addrspace */);
OutStreamer.SwitchSection(Current);
MCSymbol *RealFnSym = OutContext.GetOrCreateSymbol(
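
The numbering loop above must probe and create the same candidate name, or a clash can slip through on the name that is actually created. A standalone sketch of the clash-free scheme, modeling MCContext symbol lookup with a plain string set (the names and the ".LC" prefix are illustrative):

```cpp
// Standalone sketch of clash-free label numbering: probe exactly the
// name that will be created, advancing the counter on every attempt.
#include <cassert>
#include <set>
#include <string>

std::string getFreshLabel(std::set<std::string> &Known, unsigned &NextID) {
  std::string Label;
  do {
    Label = ".LC" + std::to_string(NextID++);
  } while (Known.count(Label));  // retry while the candidate is taken
  Known.insert(Label);
  return Label;
}

int main() {
  std::set<std::string> Known = {".LC0", ".LC2"};
  unsigned NextID = 0;
  assert(getFreshLabel(Known, NextID) == ".LC1");  // .LC0 was taken
  assert(getFreshLabel(Known, NextID) == ".LC3");  // .LC2 was taken
  return 0;
}
```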
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index c24afa908d..97d3600b1b 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -13,6 +13,7 @@
#include "PPCFrameLowering.h"
#include "PPCInstrInfo.h"
+#include "PPCInstrBuilder.h"
#include "PPCMachineFunctionInfo.h"
#include "llvm/Function.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -168,6 +169,11 @@ static void HandleVRSaveUpdate(MachineInstr *MI, const TargetInstrInfo &TII) {
MI->eraseFromParent();
}
+static bool spillsCR(const MachineFunction &MF) {
+ const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
+ return FuncInfo->isCRSpilled();
+}
+
/// determineFrameLayout - Determine the size of the frame and maximum call
/// frame size.
void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
@@ -184,13 +190,21 @@ void PPCFrameLowering::determineFrameLayout(MachineFunction &MF) const {
// If we are a leaf function, and use up to 224 bytes of stack space,
// don't have a frame pointer, calls, or dynamic alloca then we do not need
- // to adjust the stack pointer (we fit in the Red Zone).
+ // to adjust the stack pointer (we fit in the Red Zone). For 64-bit
+ // SVR4, we also require a stack frame if we need to spill the CR,
+ // since this spill area is addressed relative to the stack pointer.
bool DisableRedZone = MF.getFunction()->hasFnAttr(Attribute::NoRedZone);
- // FIXME SVR4 The 32-bit SVR4 ABI has no red zone.
+ // FIXME SVR4 The 32-bit SVR4 ABI has no red zone. However, it can
+ // still generate stackless code if all local vars are reg-allocated.
+ // Try: (FrameSize <= 224
+ // || (FrameSize == 0 && Subtarget.isPPC32 && Subtarget.isSVR4ABI()))
if (!DisableRedZone &&
FrameSize <= 224 && // Fits in red zone.
!MFI->hasVarSizedObjects() && // No dynamic alloca.
!MFI->adjustsStack() && // No calls.
+ !(Subtarget.isPPC64() && // No 64-bit SVR4 CRsave.
+ Subtarget.isSVR4ABI()
+ && spillsCR(MF)) &&
(!ALIGN_STACK || MaxAlign <= TargetAlign)) { // No special alignment.
// No need for frame
MFI->setStackSize(0);
@@ -488,7 +502,6 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
// Add callee saved registers to move list.
const std::vector<CalleeSavedInfo> &CSI = MFI->getCalleeSavedInfo();
for (unsigned I = 0, E = CSI.size(); I != E; ++I) {
- int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
unsigned Reg = CSI[I].getReg();
if (Reg == PPC::LR || Reg == PPC::LR8 || Reg == PPC::RM) continue;
@@ -497,6 +510,25 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF) const {
if (PPC::CRBITRCRegClass.contains(Reg))
continue;
+ // For SVR4, don't emit a move for the CR spill slot if we haven't
+ // spilled CRs.
+ if (Subtarget.isSVR4ABI()
+ && (PPC::CR2 <= Reg && Reg <= PPC::CR4)
+ && !spillsCR(MF))
+ continue;
+
+ // For 64-bit SVR4 when we have spilled CRs, the spill location
+ // is SP+8, not a frame-relative slot.
+ if (Subtarget.isSVR4ABI()
+ && Subtarget.isPPC64()
+ && (PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ MachineLocation CSDst(PPC::X1, 8);
+ MachineLocation CSSrc(PPC::CR2);
+ Moves.push_back(MachineMove(Label, CSDst, CSSrc));
+ continue;
+ }
+
+ int Offset = MFI->getObjectOffset(CSI[I].getFrameIdx());
MachineLocation CSDst(MachineLocation::VirtualFP, Offset);
MachineLocation CSSrc(Reg);
Moves.push_back(MachineMove(Label, CSDst, CSSrc));
@@ -714,11 +746,6 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
-static bool spillsCR(const MachineFunction &MF) {
- const PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- return FuncInfo->isCRSpilled();
-}
-
/// MustSaveLR - Return true if this function requires that we save the LR
/// register onto the stack in the prolog and restore it in the epilog of the
/// function.
@@ -808,7 +835,6 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
bool HasGPSaveArea = false;
bool HasG8SaveArea = false;
bool HasFPSaveArea = false;
- bool HasCRSaveArea = false;
bool HasVRSAVESaveArea = false;
bool HasVRSaveArea = false;
@@ -843,10 +869,9 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
if (Reg < MinFPR) {
MinFPR = Reg;
}
-// FIXME SVR4: Disable CR save area for now.
} else if (PPC::CRBITRCRegClass.contains(Reg) ||
PPC::CRRCRegClass.contains(Reg)) {
-// HasCRSaveArea = true;
+      // Do nothing: we already know whether CRs are spilled.
} else if (PPC::VRSAVERCRegClass.contains(Reg)) {
HasVRSAVESaveArea = true;
} else if (PPC::VRRCRegClass.contains(Reg)) {
@@ -926,16 +951,21 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
}
- // The CR save area is below the general register save area.
- if (HasCRSaveArea) {
- // FIXME SVR4: Is it actually possible to have multiple elements in CSI
- // which have the CR/CRBIT register class?
+ // For 32-bit only, the CR save area is below the general register
+ // save area. For 64-bit SVR4, the CR save area is addressed relative
+ // to the stack pointer and hence does not need an adjustment here.
+ // Only CR2 (the first nonvolatile spilled) has an associated frame
+ // index so that we have a single uniform save area.
+ if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
// Adjust the frame index of the CR spill slot.
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- if (PPC::CRBITRCRegClass.contains(Reg) ||
- PPC::CRRCRegClass.contains(Reg)) {
+ if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
+ // Leave Darwin logic as-is.
+ || (!Subtarget.isSVR4ABI() &&
+ (PPC::CRBITRCRegClass.contains(Reg) ||
+ PPC::CRRCRegClass.contains(Reg)))) {
int FI = CSI[i].getFrameIdx();
FFI->setObjectOffset(FI, LowerBound + FFI->getObjectOffset(FI));
@@ -973,3 +1003,184 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF)
}
}
}
+
+bool
+PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+
+ // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+ // Return false otherwise to maintain pre-existing behavior.
+ if (!Subtarget.isSVR4ABI())
+ return false;
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+ DebugLoc DL;
+ bool CRSpilled = false;
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+ // CR2 through CR4 are the nonvolatile CR fields.
+ bool IsCRField = PPC::CR2 <= Reg && Reg <= PPC::CR4;
+
+ if (CRSpilled && IsCRField)
+ continue;
+
+ // Add the callee-saved register as live-in; it's killed at the spill.
+ MBB.addLiveIn(Reg);
+
+ // Insert the spill to the stack frame.
+ if (IsCRField) {
+ CRSpilled = true;
+ // The first time we see a CR field, store the whole CR into the
+ // save slot via GPR12 (available in the prolog for 32- and 64-bit).
+ if (Subtarget.isPPC64()) {
+ // 64-bit: SP+8
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::X12));
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::STW))
+ .addReg(PPC::X12,
+ getKillRegState(true))
+ .addImm(8)
+ .addReg(PPC::X1));
+ } else {
+ // 32-bit: FP-relative. Note that we made sure CR2-CR4 all have
+ // the same frame index in PPCRegisterInfo::hasReservedSpillSlot.
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::MFCR), PPC::R12));
+ MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::STW))
+ .addReg(PPC::R12,
+ getKillRegState(true)),
+ CSI[i].getFrameIdx()));
+ }
+
+ // Record that we spill the CR in this function.
+ PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
+ FuncInfo->setSpillsCR();
+ } else {
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true,
+ CSI[i].getFrameIdx(), RC, TRI);
+ }
+ }
+ return true;
+}
+
+static void
+restoreCRs(bool isPPC64, bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+ DebugLoc DL;
+ unsigned RestoreOp, MoveReg;
+
+ if (isPPC64) {
+ // 64-bit: SP+8
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(PPC::LWZ), PPC::X12)
+ .addImm(8)
+ .addReg(PPC::X1));
+ RestoreOp = PPC::MTCRF8;
+ MoveReg = PPC::X12;
+ } else {
+ // 32-bit: FP-relative
+ MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
+ PPC::R12),
+ CSI[CSIIndex].getFrameIdx()));
+ RestoreOp = PPC::MTCRF;
+ MoveReg = PPC::R12;
+ }
+
+ if (CR2Spilled)
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
+ .addReg(MoveReg));
+
+ if (CR3Spilled)
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR3)
+ .addReg(MoveReg));
+
+ if (CR4Spilled)
+ MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR4)
+ .addReg(MoveReg));
+}
+
+bool
+PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const {
+
+ // Currently, this function only handles SVR4 32- and 64-bit ABIs.
+ // Return false otherwise to maintain pre-existing behavior.
+ if (!Subtarget.isSVR4ABI())
+ return false;
+
+ MachineFunction *MF = MBB.getParent();
+ const PPCInstrInfo &TII =
+ *static_cast<const PPCInstrInfo*>(MF->getTarget().getInstrInfo());
+ bool CR2Spilled = false;
+ bool CR3Spilled = false;
+ bool CR4Spilled = false;
+ unsigned CSIIndex = 0;
+
+ // Initialize insertion-point logic; we will be restoring in reverse
+ // order of spill.
+ MachineBasicBlock::iterator I = MI, BeforeI = I;
+ bool AtStart = I == MBB.begin();
+
+ if (!AtStart)
+ --BeforeI;
+
+ for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
+ unsigned Reg = CSI[i].getReg();
+
+ if (Reg == PPC::CR2) {
+ CR2Spilled = true;
+ // The spill slot is associated only with CR2, which is the
+ // first nonvolatile spilled. Save it here.
+ CSIIndex = i;
+ continue;
+ } else if (Reg == PPC::CR3) {
+ CR3Spilled = true;
+ continue;
+ } else if (Reg == PPC::CR4) {
+ CR4Spilled = true;
+ continue;
+ } else {
+ // When we first encounter a non-CR register after seeing at
+ // least one CR register, restore all spilled CRs together.
+ if ((CR2Spilled || CR3Spilled || CR4Spilled)
+ && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ restoreCRs(Subtarget.isPPC64(), CR2Spilled, CR3Spilled, CR4Spilled,
+ MBB, I, CSI, CSIIndex);
+ CR2Spilled = CR3Spilled = CR4Spilled = false;
+ }
+
+ // Default behavior for non-CR saves.
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(),
+ RC, TRI);
+ assert(I != MBB.begin() &&
+ "loadRegFromStackSlot didn't insert any code!");
+ }
+
+ // Insert in reverse order.
+ if (AtStart)
+ I = MBB.begin();
+ else {
+ I = BeforeI;
+ ++I;
+ }
+ }
+
+  // If we haven't yet restored the CRs, do so now.
+ if (CR2Spilled || CR3Spilled || CR4Spilled)
+ restoreCRs(Subtarget.isPPC64(), CR2Spilled, CR3Spilled, CR4Spilled,
+ MBB, I, CSI, CSIIndex);
+
+ return true;
+}
+
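
restoreCalleeSavedRegisters above re-derives its insertion point after every restore so that restores are emitted in reverse order of the spills. A standalone sketch of that bookkeeping on a std::list (illustrative types, not LLVM API):

```cpp
// Sketch of the insertion-point bookkeeping used above: each new element
// is inserted *before* the previous one, so emission order is reversed.
#include <cassert>
#include <list>

void insertReversed(std::list<int> &L, std::list<int>::iterator MI,
                    const int *Vals, unsigned N) {
  std::list<int>::iterator I = MI, BeforeI = I;
  bool AtStart = (I == L.begin());
  if (!AtStart)
    --BeforeI;
  for (unsigned i = 0; i != N; ++i) {
    L.insert(I, Vals[i]);  // insert before the current point
    if (AtStart)
      I = L.begin();       // re-anchor at the new front
    else {
      I = BeforeI;         // step back to just after the anchor...
      ++I;                 // ...which is the element just inserted
    }
  }
}

int main() {
  std::list<int> L = {100};
  int Vals[] = {1, 2, 3};
  insertReversed(L, L.begin(), Vals, 3);
  assert((L == std::list<int>{3, 2, 1, 100}));  // reverse of spill order
  return 0;
}
```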
diff --git a/lib/Target/PowerPC/PPCFrameLowering.h b/lib/Target/PowerPC/PPCFrameLowering.h
index d708541c66..4d957b91c7 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/lib/Target/PowerPC/PPCFrameLowering.h
@@ -45,6 +45,16 @@ public:
RegScavenger *RS = NULL) const;
void processFunctionBeforeFrameFinalized(MachineFunction &MF) const;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
+ bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ const std::vector<CalleeSavedInfo> &CSI,
+ const TargetRegisterInfo *TRI) const;
+
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
/// time).
@@ -170,23 +180,11 @@ public:
{PPC::R15, -68},
{PPC::R14, -72},
- // CR save area offset.
- // FIXME SVR4: Disable CR save area for now.
-// {PPC::CR2, -4},
-// {PPC::CR3, -4},
-// {PPC::CR4, -4},
-// {PPC::CR2LT, -4},
-// {PPC::CR2GT, -4},
-// {PPC::CR2EQ, -4},
-// {PPC::CR2UN, -4},
-// {PPC::CR3LT, -4},
-// {PPC::CR3GT, -4},
-// {PPC::CR3EQ, -4},
-// {PPC::CR3UN, -4},
-// {PPC::CR4LT, -4},
-// {PPC::CR4GT, -4},
-// {PPC::CR4EQ, -4},
-// {PPC::CR4UN, -4},
+ // CR save area offset. We map each of the nonvolatile CR fields
+ // to the slot for CR2, which is the first of the nonvolatile CR
+ // fields to be assigned, so that we only allocate one save slot.
+ // See PPCRegisterInfo::hasReservedSpillSlot() for more information.
+ {PPC::CR2, -4},
// VRSAVE save area offset.
{PPC::VRSAVE, -4},
@@ -228,27 +226,6 @@ public:
{PPC::F14, -144},
// General register save area offsets.
- // FIXME 64-bit SVR4: Are 32-bit registers actually allocated in 64-bit
- // mode?
- {PPC::R31, -4},
- {PPC::R30, -12},
- {PPC::R29, -20},
- {PPC::R28, -28},
- {PPC::R27, -36},
- {PPC::R26, -44},
- {PPC::R25, -52},
- {PPC::R24, -60},
- {PPC::R23, -68},
- {PPC::R22, -76},
- {PPC::R21, -84},
- {PPC::R20, -92},
- {PPC::R19, -100},
- {PPC::R18, -108},
- {PPC::R17, -116},
- {PPC::R16, -124},
- {PPC::R15, -132},
- {PPC::R14, -140},
-
{PPC::X31, -8},
{PPC::X30, -16},
{PPC::X29, -24},
@@ -268,24 +245,6 @@ public:
{PPC::X15, -136},
{PPC::X14, -144},
- // CR save area offset.
- // FIXME SVR4: Disable CR save area for now.
-// {PPC::CR2, -4},
-// {PPC::CR3, -4},
-// {PPC::CR4, -4},
-// {PPC::CR2LT, -4},
-// {PPC::CR2GT, -4},
-// {PPC::CR2EQ, -4},
-// {PPC::CR2UN, -4},
-// {PPC::CR3LT, -4},
-// {PPC::CR3GT, -4},
-// {PPC::CR3EQ, -4},
-// {PPC::CR3UN, -4},
-// {PPC::CR4LT, -4},
-// {PPC::CR4GT, -4},
-// {PPC::CR4EQ, -4},
-// {PPC::CR4UN, -4},
-
// VRSAVE save area offset.
{PPC::VRSAVE, -4},
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index dbb3b144a7..2e8fa1842a 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -1264,8 +1264,8 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
unsigned MOHiFlag, MOLoFlag;
bool isPIC = GetLabelAccessInfo(DAG.getTarget(), MOHiFlag, MOLoFlag);
- SDValue TgtBAHi = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOHiFlag);
- SDValue TgtBALo = DAG.getBlockAddress(BA, PtrVT, /*isTarget=*/true, MOLoFlag);
+ SDValue TgtBAHi = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOHiFlag);
+ SDValue TgtBALo = DAG.getTargetBlockAddress(BA, PtrVT, 0, MOLoFlag);
return LowerLabelRef(TgtBAHi, TgtBALo, isPIC, DAG);
}
@@ -1717,16 +1717,16 @@ PPCTargetLowering::LowerFormalArguments(SDValue Chain,
SmallVectorImpl<SDValue> &InVals)
const {
if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64()) {
- return LowerFormalArguments_SVR4(Chain, CallConv, isVarArg, Ins,
- dl, DAG, InVals);
- } else {
- return LowerFormalArguments_Darwin(Chain, CallConv, isVarArg, Ins,
+ return LowerFormalArguments_32SVR4(Chain, CallConv, isVarArg, Ins,
dl, DAG, InVals);
+ } else {
+ return LowerFormalArguments_Darwin_Or_64SVR4(Chain, CallConv, isVarArg, Ins,
+ dl, DAG, InVals);
}
}
SDValue
-PPCTargetLowering::LowerFormalArguments_SVR4(
+PPCTargetLowering::LowerFormalArguments_32SVR4(
SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
@@ -1944,7 +1944,7 @@ PPCTargetLowering::LowerFormalArguments_SVR4(
}
SDValue
-PPCTargetLowering::LowerFormalArguments_Darwin(
+PPCTargetLowering::LowerFormalArguments_Darwin_Or_64SVR4(
SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg>
@@ -1959,6 +1959,7 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
bool isPPC64 = PtrVT == MVT::i64;
+ bool isSVR4ABI = PPCSubTarget.isSVR4ABI();
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
@@ -2019,10 +2020,12 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
default: llvm_unreachable("Unhandled argument type!");
case MVT::i32:
case MVT::f32:
- VecArgOffset += isPPC64 ? 8 : 4;
+ VecArgOffset += 4;
break;
case MVT::i64: // PPC64
case MVT::f64:
+ // FIXME: We are guaranteed to be !isPPC64 at this point.
+ // Does MVT::i64 apply?
VecArgOffset += 8;
break;
case MVT::v4f32:
@@ -2076,8 +2079,11 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
// ObjSize is the true size, ArgSize rounded up to multiple of registers.
ObjSize = Flags.getByValSize();
ArgSize = ((ObjSize + PtrByteSize - 1)/PtrByteSize) * PtrByteSize;
- // Objects of size 1 and 2 are right justified, everything else is
- // left justified. This means the memory address is adjusted forwards.
+ // FOR DARWIN: Objects of size 1 and 2 are right justified, everything
+ // else is left justified. This means the memory address is adjusted
+ // forwards.
+ // FOR 64-BIT SVR4: All aggregates smaller than 8 bytes must be passed
+ // right-justified.
if (ObjSize==1 || ObjSize==2) {
CurArgOffset = CurArgOffset + (4 - ObjSize);
}
@@ -2085,7 +2091,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
InVals.push_back(FIN);
- if (ObjSize==1 || ObjSize==2) {
+ if (ObjSize==1 || ObjSize==2 ||
+ (ObjSize==4 && isSVR4ABI)) {
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
@@ -2093,10 +2100,11 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
else
VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::GPRCRegClass);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ EVT ObjType = (ObjSize == 1 ? MVT::i8 :
+ (ObjSize == 2 ? MVT::i16 : MVT::i32));
SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN,
MachinePointerInfo(),
- ObjSize==1 ? MVT::i8 : MVT::i16,
- false, false, 0);
+ ObjType, false, false, 0);
MemOps.push_back(Store);
++GPR_idx;
}
@@ -2107,8 +2115,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
}
for (unsigned j = 0; j < ArgSize; j += PtrByteSize) {
// Store whatever pieces of the object are in registers
- // to memory. ArgVal will be address of the beginning of
- // the object.
+        // to memory. ArgOffset is the stack offset of the beginning
+        // of the object.
if (GPR_idx != Num_GPR_Regs) {
unsigned VReg;
if (isPPC64)
@@ -2118,7 +2126,16 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
- SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN,
+ SDValue Shifted = Val;
+
+ // For 64-bit SVR4, small structs come in right-adjusted.
+ // Shift them left so the following logic works as expected.
+ if (ObjSize < 8 && isSVR4ABI) {
+ SDValue ShiftAmt = DAG.getConstant(64 - 8 * ObjSize, PtrVT);
+ Shifted = DAG.getNode(ISD::SHL, dl, PtrVT, Val, ShiftAmt);
+ }
+
+ SDValue Store = DAG.getStore(Val.getValue(1), dl, Shifted, FIN,
MachinePointerInfo(),
false, false, 0);
MemOps.push_back(Store);
@@ -2308,8 +2325,8 @@ PPCTargetLowering::LowerFormalArguments_Darwin(
return Chain;
}
-/// CalculateParameterAndLinkageAreaSize - Get the size of the paramter plus
-/// linkage area for the Darwin ABI.
+/// CalculateParameterAndLinkageAreaSize - Get the size of the parameter plus
+/// linkage area for the Darwin ABI, or the 64-bit SVR4 ABI.
static unsigned
CalculateParameterAndLinkageAreaSize(SelectionDAG &DAG,
bool isPPC64,
@@ -2718,7 +2735,7 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
// Thus for a call through a function pointer, the following actions need
// to be performed:
// 1. Save the TOC of the caller in the TOC save area of its stack
- // frame (this is done in LowerCall_Darwin()).
+ // frame (this is done in LowerCall_Darwin_Or_64SVR4()).
// 2. Load the address of the function entry point from the function
// descriptor.
// 3. Load the TOC of the callee from the function descriptor into r2.
@@ -2808,6 +2825,15 @@ unsigned PrepareCall(SelectionDAG &DAG, SDValue &Callee, SDValue &InFlag,
return CallOpc;
}
+static
+bool isLocalCall(const SDValue &Callee)
+{
+ if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
+ return !G->getGlobal()->isDeclaration() &&
+ !G->getGlobal()->isWeakForLinker();
+ return false;
+}
+
SDValue
PPCTargetLowering::LowerCallResult(SDValue Chain, SDValue InFlag,
CallingConv::ID CallConv, bool isVarArg,
@@ -2916,8 +2942,8 @@ PPCTargetLowering::FinishCall(CallingConv::ID CallConv, DebugLoc dl,
// from allocating it), resulting in an additional register being
// allocated and an unnecessary move instruction being generated.
needsTOCRestore = true;
- } else if (CallOpc == PPCISD::CALL_SVR4) {
- // Otherwise insert NOP.
+ } else if ((CallOpc == PPCISD::CALL_SVR4) && !isLocalCall(Callee)) {
+ // Otherwise insert NOP for non-local calls.
CallOpc = PPCISD::CALL_NOP_SVR4;
}
}
@@ -2960,25 +2986,25 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Ins, DAG);
if (PPCSubTarget.isSVR4ABI() && !PPCSubTarget.isPPC64())
- return LowerCall_SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, Outs, OutVals, Ins,
- dl, DAG, InVals);
+ return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, Outs, OutVals, Ins,
+ dl, DAG, InVals);
- return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
- isTailCall, Outs, OutVals, Ins,
- dl, DAG, InVals);
+ return LowerCall_Darwin_Or_64SVR4(Chain, Callee, CallConv, isVarArg,
+ isTailCall, Outs, OutVals, Ins,
+ dl, DAG, InVals);
}
SDValue
-PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
- // See PPCTargetLowering::LowerFormalArguments_SVR4() for a description
+PPCTargetLowering::LowerCall_32SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv, bool isVarArg,
+ bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const {
+ // See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
assert((CallConv == CallingConv::C ||
@@ -3183,7 +3209,7 @@ PPCTargetLowering::LowerCall_SVR4(SDValue Chain, SDValue Callee,
}
SDValue
-PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
+PPCTargetLowering::LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee,
CallingConv::ID CallConv, bool isVarArg,
bool isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -3192,6 +3218,8 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const {
+ bool isSVR4ABI = PPCSubTarget.isSVR4ABI();
+
unsigned NumOps = Outs.size();
EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy();
@@ -3299,12 +3327,22 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
}
// FIXME memcpy is used way more than necessary. Correctness first.
+ // Note: "by value" is code for passing a structure by value, not
+ // basic types.
if (Flags.isByVal()) {
+ // Note: Size includes alignment padding, so
+ // struct x { short a; char b; }
+ // will have Size = 4. With #pragma pack(1), it will have Size = 3.
+ // These are the proper values we need for right-justifying the
+ // aggregate in a parameter register for 64-bit SVR4.
unsigned Size = Flags.getByValSize();
- if (Size==1 || Size==2) {
- // Very small objects are passed right-justified.
- // Everything else is passed left-justified.
- EVT VT = (Size==1) ? MVT::i8 : MVT::i16;
+ // FOR DARWIN ONLY: Very small objects are passed right-justified.
+ // Everything else is passed left-justified.
+ // FOR 64-BIT SVR4: All aggregates smaller than 8 bytes must
+ // be passed right-justified.
+ if (Size==1 || Size==2 ||
+ (Size==4 && isSVR4ABI)) {
+ EVT VT = (Size==1) ? MVT::i8 : ((Size==2) ? MVT::i16 : MVT::i32);
if (GPR_idx != NumGPRs) {
SDValue Load = DAG.getExtLoad(ISD::EXTLOAD, dl, PtrVT, Chain, Arg,
MachinePointerInfo(), VT,
@@ -3332,15 +3370,67 @@ PPCTargetLowering::LowerCall_Darwin(SDValue Chain, SDValue Callee,
// Copy entire object into memory. There are cases where gcc-generated
// code assumes it is there, even if it could be put entirely into
// registers. (This is not what the doc says.)
- SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
- CallSeqStart.getNode()->getOperand(0),
- Flags, DAG, dl);
- // This must go outside the CALLSEQ_START..END.
- SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
- CallSeqStart.getNode()->getOperand(1));
- DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode());
- Chain = CallSeqStart = NewCallSeqStart;
- // And copy the pieces of it that fit into registers.
+
+ // FIXME: The above statement is likely due to a misunderstanding of the
+ // documents. At least for 64-bit SVR4, all arguments must be copied
+ // into the parameter area BY THE CALLEE in the event that the callee
+ // takes the address of any formal argument. That has not yet been
+ // implemented. However, it is reasonable to use the stack area as a
+ // staging area for the register load.
+
+ // Skip this for small aggregates under 64-bit SVR4, as we will use
+ // the same slot for a right-justified copy, below.
+ if (Size >= 8 || !isSVR4ABI) {
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, PtrOff,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, dl);
+ // This must go outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ Chain = CallSeqStart = NewCallSeqStart;
+ }
+
+ // FOR 64-BIT SVR4: When a register is available, pass the
+ // aggregate right-justified.
+ if (isSVR4ABI && Size < 8 && GPR_idx != NumGPRs) {
+ // The easiest way to get this right-justified in a register
+ // is to copy the structure into the rightmost portion of a
+ // local variable slot, then load the whole slot into the
+ // register.
+ // FIXME: The memcpy seems to produce pretty awful code for
+ // small aggregates, particularly for packed ones.
+ // FIXME: It would be preferable to use the slot in the
+ // parameter save area instead of a new local variable.
+ SDValue Const = DAG.getConstant(8 - Size, PtrOff.getValueType());
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, Const);
+ SDValue MemcpyCall = CreateCopyOfByValArgument(Arg, AddPtr,
+ CallSeqStart.getNode()->getOperand(0),
+ Flags, DAG, dl);
+
+ // Place the memcpy outside the CALLSEQ_START..END.
+ SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall,
+ CallSeqStart.getNode()->getOperand(1));
+ DAG.ReplaceAllUsesWith(CallSeqStart.getNode(),
+ NewCallSeqStart.getNode());
+ Chain = CallSeqStart = NewCallSeqStart;
+
+ // Load the slot into the register.
+ SDValue Load = DAG.getLoad(PtrVT, dl, Chain, PtrOff,
+ MachinePointerInfo(),
+ false, false, false, 0);
+ MemOpChains.push_back(Load.getValue(1));
+ RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Load));
+
+ // Done with this argument.
+ ArgOffset += PtrByteSize;
+ continue;
+ }
+
+ // For small aggregates (Darwin only) and aggregates >= PtrByteSize,
+ // copy the pieces of the object that fit into registers from the
+ // parameter save area.
for (unsigned j=0; j<Size; j+=PtrByteSize) {
SDValue Const = DAG.getConstant(j, PtrOff.getValueType());
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
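
For 64-bit SVR4 the patch right-justifies small aggregates in their parameter doubleword; equivalently, a value that arrives right-justified must be shifted left by 64 - 8*Size bits before it can be stored as the leading bytes of a big-endian memory image, as the formal-argument hunk above does. A small worked example of that shift:

```cpp
// Worked example of the right-justification above: on a big-endian
// 64-bit target, a 3-byte struct passed in a GPR occupies the
// doubleword's rightmost 3 bytes; shifting left by 64 - 8*Size bits
// recovers the image whose first bytes are the struct's bytes.
#include <cassert>
#include <cstdint>

int main() {
  const unsigned Size = 3;  // sizeof the aggregate
  const uint64_t RightJustified = 0x0000000000AABBCCull;
  uint64_t Shifted = RightJustified << (64 - 8 * Size);
  assert(Shifted == 0xAABBCC0000000000ull);  // struct bytes lead the image
  return 0;
}
```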
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index 902b188da7..40da4cc33c 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -467,20 +467,21 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG) const;
SDValue
- LowerFormalArguments_Darwin(SDValue Chain,
+ LowerFormalArguments_Darwin_Or_64SVR4(SDValue Chain,
CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
- LowerFormalArguments_SVR4(SDValue Chain,
- CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ LowerFormalArguments_32SVR4(SDValue Chain,
+ CallingConv::ID CallConv, bool isVarArg,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
SDValue
- LowerCall_Darwin(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+ LowerCall_Darwin_Or_64SVR4(SDValue Chain, SDValue Callee,
+ CallingConv::ID CallConv,
bool isVarArg, bool isTailCall,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
@@ -488,13 +489,13 @@ namespace llvm {
DebugLoc dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
SDValue
- LowerCall_SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
- bool isVarArg, bool isTailCall,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<SDValue> &OutVals,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- DebugLoc dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const;
+ LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallingConv::ID CallConv,
+ bool isVarArg, bool isTailCall,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<SDValue> &OutVals,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ DebugLoc dl, SelectionDAG &DAG,
+ SmallVectorImpl<SDValue> &InVals) const;
};
}
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp
index ab8bf1f93a..285e74a4c2 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -71,7 +71,7 @@ PPCRegisterInfo::PPCRegisterInfo(const PPCSubtarget &ST,
: PPCGenRegisterInfo(ST.isPPC64() ? PPC::LR8 : PPC::LR,
ST.isPPC64() ? 0 : 1,
ST.isPPC64() ? 0 : 1),
- Subtarget(ST), TII(tii) {
+ Subtarget(ST), TII(tii), CRSpillFrameIdx(0) {
ImmToIdxMap[PPC::LD] = PPC::LDX; ImmToIdxMap[PPC::STD] = PPC::STDX;
ImmToIdxMap[PPC::LBZ] = PPC::LBZX; ImmToIdxMap[PPC::STB] = PPC::STBX;
ImmToIdxMap[PPC::LHZ] = PPC::LHZX; ImmToIdxMap[PPC::LHA] = PPC::LHAX;
@@ -111,10 +111,15 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return Subtarget.isPPC64() ? CSR_Darwin64_SaveList :
CSR_Darwin32_SaveList;
+  // For 32-bit SVR4, reset the frame index associated with the CR
+  // spill slot; it is recreated lazily for each function.
+ if (!Subtarget.isPPC64())
+ CRSpillFrameIdx = 0;
+
return Subtarget.isPPC64() ? CSR_SVR464_SaveList : CSR_SVR432_SaveList;
}
-const unsigned*
+const uint32_t*
PPCRegisterInfo::getCallPreservedMask(CallingConv::ID CC) const {
if (Subtarget.isDarwinABI())
return Subtarget.isPPC64() ? CSR_Darwin64_RegMask :
@@ -477,6 +482,31 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
MBB.erase(II);
}
+bool
+PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ unsigned Reg, int &FrameIdx) const {
+
+ // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
+ // ABI, return true to prevent allocating an additional frame slot.
+ // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
+ // is arbitrary and will be subsequently ignored. For 32-bit, we must
+ // create exactly one stack slot and return its FrameIdx for all
+ // nonvolatiles.
+ if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+ if (Subtarget.isPPC64()) {
+ FrameIdx = 0;
+ } else if (CRSpillFrameIdx) {
+ FrameIdx = CRSpillFrameIdx;
+ } else {
+ MachineFrameInfo *MFI = ((MachineFunction &)MF).getFrameInfo();
+ FrameIdx = MFI->CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+ CRSpillFrameIdx = FrameIdx;
+ }
+ return true;
+ }
+ return false;
+}
+
void
PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, RegScavenger *RS) const {
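
hasReservedSpillSlot above hands CR2 through CR4 one shared 32-bit slot, creating the fixed object on first request and caching its index. A standalone sketch of that lazy shared-slot policy (FrameModel and its fields are illustrative):

```cpp
// Sketch of the one-shared-slot policy above, with the frame-index
// cache modeled as a plain integer (0 = "not created yet", as in the
// patch; fixed objects receive negative indices).
#include <cassert>

struct FrameModel {
  int NextFixedIdx = -1;    // next fixed-object index to hand out
  int CRSpillFrameIdx = 0;  // cached shared slot, 0 means unset

  int getCRSpillSlot() {
    if (!CRSpillFrameIdx)
      CRSpillFrameIdx = NextFixedIdx--;  // create the slot exactly once
    return CRSpillFrameIdx;
  }
};

int main() {
  FrameModel MF;
  int CR2 = MF.getCRSpillSlot();
  int CR3 = MF.getCRSpillSlot();
  int CR4 = MF.getCRSpillSlot();
  assert(CR2 == CR3 && CR3 == CR4);  // CR2..CR4 share a single save slot
  return 0;
}
```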
diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h
index 152c36d699..a8fd796d9e 100644
--- a/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -30,6 +30,7 @@ class PPCRegisterInfo : public PPCGenRegisterInfo {
std::map<unsigned, unsigned> ImmToIdxMap;
const PPCSubtarget &Subtarget;
const TargetInstrInfo &TII;
+ mutable int CRSpillFrameIdx;
public:
PPCRegisterInfo(const PPCSubtarget &SubTarget, const TargetInstrInfo &tii);
@@ -43,7 +44,7 @@ public:
/// Code Generation virtual methods...
const uint16_t *getCalleeSavedRegs(const MachineFunction* MF = 0) const;
- const unsigned *getCallPreservedMask(CallingConv::ID CC) const;
+ const uint32_t *getCallPreservedMask(CallingConv::ID CC) const;
BitVector getReservedRegs(const MachineFunction &MF) const;
@@ -65,6 +66,8 @@ public:
int SPAdj, RegScavenger *RS) const;
void lowerCRRestore(MachineBasicBlock::iterator II, unsigned FrameIndex,
int SPAdj, RegScavenger *RS) const;
+ bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ int &FrameIdx) const;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, RegScavenger *RS = NULL) const;
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c89e738a8b..77961e53ae 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -60,6 +60,10 @@ private:
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+ bool mnemonicIsValid(StringRef Mnemonic) {
+ return mnemonicIsValidImpl(Mnemonic);
+ }
+
bool processInstruction(MCInst &Inst,
const SmallVectorImpl<MCParsedAsmOperand*> &Ops);
@@ -202,7 +206,8 @@ struct X86Operand : public MCParsedAsmOperand {
SMLoc getStartLoc() const { return StartLoc; }
/// getEndLoc - Get the location of the last token of this operand.
SMLoc getEndLoc() const { return EndLoc; }
-
+ /// getLocRange - Get the range between the first and last token of this
+ /// operand.
SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
virtual void print(raw_ostream &OS) const {}
@@ -633,14 +638,15 @@ X86Operand *X86AsmParser::ParseOperand() {
/// getIntelMemOperandSize - Return intel memory operand size.
static unsigned getIntelMemOperandSize(StringRef OpStr) {
- unsigned Size = 0;
- if (OpStr == "BYTE") Size = 8;
- if (OpStr == "WORD") Size = 16;
- if (OpStr == "DWORD") Size = 32;
- if (OpStr == "QWORD") Size = 64;
- if (OpStr == "XWORD") Size = 80;
- if (OpStr == "XMMWORD") Size = 128;
- if (OpStr == "YMMWORD") Size = 256;
+ unsigned Size = StringSwitch<unsigned>(OpStr)
+ .Cases("BYTE", "byte", 8)
+ .Cases("WORD", "word", 16)
+ .Cases("DWORD", "dword", 32)
+ .Cases("QWORD", "qword", 64)
+ .Cases("XWORD", "xword", 80)
+ .Cases("XMMWORD", "xmmword", 128)
+ .Cases("YMMWORD", "ymmword", 256)
+ .Default(0);
return Size;
}
@@ -743,7 +749,8 @@ X86Operand *X86AsmParser::ParseIntelMemOperand() {
unsigned Size = getIntelMemOperandSize(Tok.getString());
if (Size) {
Parser.Lex();
- assert (Tok.getString() == "PTR" && "Unexpected token!");
+ assert ((Tok.getString() == "PTR" || Tok.getString() == "ptr") &&
+ "Unexpected token!");
Parser.Lex();
}
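
The operand-size change above replaces an if-chain with llvm::StringSwitch, which also makes the lowercase Intel spellings cheap to accept. A usage sketch of the same API outside the parser; it assumes LLVM's ADT headers are available, and memOperandSizeInBits is an illustrative name:

```cpp
// Usage sketch of llvm::StringSwitch as in the hunk above.
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"

unsigned memOperandSizeInBits(llvm::StringRef OpStr) {
  return llvm::StringSwitch<unsigned>(OpStr)
      .Cases("BYTE", "byte", 8)    // each .Cases maps several spellings
      .Cases("WORD", "word", 16)
      .Cases("DWORD", "dword", 32)
      .Cases("QWORD", "qword", 64)
      .Default(0);                 // unknown operand size
}
```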
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
index af444d196e..b24f517209 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.c
@@ -138,6 +138,10 @@ static InstrUID decode(OpcodeType type,
if (modFromModRM(modRM) == 0x3)
return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
+ case MODRM_SPLITMISC:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
+ return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
case MODRM_FULL:
return modRMTable[dec->instructionIDs+modRM];
}
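
From the MODRM_SPLITMISC decode above, such a table holds eight entries keyed by the reg field for memory forms, followed by sixty-four entries for register forms (mod == 0b11). A sketch of the index computation (splitMiscIndex is an illustrative name):

```cpp
// Sketch of the MODRM_SPLITMISC table indexing added above: memory forms
// (mod != 0b11) are keyed by the 3-bit reg field; register forms get one
// entry per low-6-bit ModR/M value, offset past the 8 memory entries.
#include <cassert>
#include <cstdint>

unsigned splitMiscIndex(uint8_t modRM) {
  bool isRegForm = (modRM >> 6) == 0x3;  // mod field == 0b11
  if (isRegForm)
    return (modRM & 0x3f) + 8;           // 64 register-form entries
  return (modRM & 0x38) >> 3;            // 8 reg-field entries
}

int main() {
  assert(splitMiscIndex(0x10) == 2);  // mod=00, reg=2 -> memory entry 2
  assert(splitMiscIndex(0xC1) == 9);  // mod=11 -> entry 8 + 0x01
  return 0;
}
```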
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index b0a0e1e78e..23dfe4b5b5 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -160,6 +160,10 @@ typedef uint16_t InstrUID;
* MODRM_SPLITRM - If the ModR/M byte is between 0x00 and 0xbf, the opcode
* corresponds to one instruction; otherwise, it corresponds to
* a different instruction.
+ * MODRM_SPLITMISC- If the ModR/M byte is between 0x00 and 0xbf, the ModR/M
+ *                  byte divided by 8 is used to select the instruction;
+ *                  otherwise, each value of the ModR/M byte could correspond
+ *                  to a different instruction.
* MODRM_SPLITREG - ModR/M byte divided by 8 is used to select instruction. This
corresponds to instructions that use reg field as opcode
* MODRM_FULL - Potentially, each value of the ModR/M byte could correspond
@@ -169,6 +173,7 @@ typedef uint16_t InstrUID;
#define MODRMTYPES \
ENUM_ENTRY(MODRM_ONEENTRY) \
ENUM_ENTRY(MODRM_SPLITRM) \
+ ENUM_ENTRY(MODRM_SPLITMISC) \
ENUM_ENTRY(MODRM_SPLITREG) \
ENUM_ENTRY(MODRM_FULL)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index fb1ba12a52..2696190d23 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -67,9 +67,10 @@ public:
};
class X86AsmBackend : public MCAsmBackend {
+ StringRef CPU;
public:
- X86AsmBackend(const Target &T)
- : MCAsmBackend() {}
+ X86AsmBackend(const Target &T, StringRef _CPU)
+ : MCAsmBackend(), CPU(_CPU) {}
unsigned getNumFixupKinds() const {
return X86::NumTargetFixupKinds;
@@ -279,9 +280,9 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const {
Res.setOpcode(RelaxedOp);
}
-/// writeNopData - Write optimal nops to the output file for the \arg Count
+/// writeNopData - Write optimal nops to the output file for the \p Count
/// bytes. This returns the number of bytes written. It may return 0 if
-/// the \arg Count is more than the maximum optimal nops.
+/// the \p Count is more than the maximum optimal nops.
bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
static const uint8_t Nops[10][10] = {
// nop
@@ -306,6 +307,13 @@ bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
{0x66, 0x2e, 0x0f, 0x1f, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00},
};
+  // This CPU doesn't support long nops. If needed, add more CPUs here.
+ if (CPU == "geode") {
+ for (uint64_t i = 0; i < Count; ++i)
+ OW->Write8(0x90);
+ return true;
+ }
+
// Write an optimal sequence for the first 15 bytes.
const uint64_t OptimalCount = (Count < 16) ? Count : 15;
const uint64_t Prefixes = OptimalCount <= 10 ? 0 : OptimalCount - 10;
@@ -329,9 +337,9 @@ class ELFX86AsmBackend : public X86AsmBackend {
public:
uint8_t OSABI;
Triple::OSType OSType; // @LOCALMOD: kept OSTYPE vs upstream. FIXME: remove.
- ELFX86AsmBackend(const Target &T, uint8_t _OSABI,
+ ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU,
Triple::OSType _OSType)
- : X86AsmBackend(T), OSABI(_OSABI), OSType(_OSType) {
+ : X86AsmBackend(T, CPU), OSABI(_OSABI), OSType(_OSType) {
HasReliableSymbolDifference = true;
}
@@ -358,9 +366,9 @@ public:
class ELFX86_32AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_32AsmBackend(const Target &T, uint8_t OSABI,
+ ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU,
Triple::OSType OSType) // @LOCALMOD: kept OSType
- : ELFX86AsmBackend(T, OSABI, OSType) {}
+ : ELFX86AsmBackend(T, OSABI, CPU, OSType) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createX86ELFObjectWriter(OS, /*Is64Bit*/ false, OSABI);
@@ -369,9 +377,9 @@ public:
class ELFX86_64AsmBackend : public ELFX86AsmBackend {
public:
- ELFX86_64AsmBackend(const Target &T, uint8_t OSABI,
+ ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU,
Triple::OSType OSType) // @LOCALMOD: kept OSType
- : ELFX86AsmBackend(T, OSABI, OSType) {}
+ : ELFX86AsmBackend(T, OSABI, CPU, OSType) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createX86ELFObjectWriter(OS, /*Is64Bit*/ true, OSABI);
@@ -382,8 +390,8 @@ class WindowsX86AsmBackend : public X86AsmBackend {
bool Is64Bit;
public:
- WindowsX86AsmBackend(const Target &T, bool is64Bit)
- : X86AsmBackend(T)
+ WindowsX86AsmBackend(const Target &T, bool is64Bit, StringRef CPU)
+ : X86AsmBackend(T, CPU)
, Is64Bit(is64Bit) {
}
@@ -394,14 +402,14 @@ public:
class DarwinX86AsmBackend : public X86AsmBackend {
public:
- DarwinX86AsmBackend(const Target &T)
- : X86AsmBackend(T) { }
+ DarwinX86AsmBackend(const Target &T, StringRef CPU)
+ : X86AsmBackend(T, CPU) { }
};
class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
public:
- DarwinX86_32AsmBackend(const Target &T)
- : DarwinX86AsmBackend(T) {}
+ DarwinX86_32AsmBackend(const Target &T, StringRef CPU)
+ : DarwinX86AsmBackend(T, CPU) {}
MCObjectWriter *createObjectWriter(raw_ostream &OS) const {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
@@ -412,8 +420,8 @@ public:
class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
public:
- DarwinX86_64AsmBackend(const Target &T)
- : DarwinX86AsmBackend(T) {
+ DarwinX86_64AsmBackend(const Target &T, StringRef CPU)
+ : DarwinX86AsmBackend(T, CPU) {
HasReliableSymbolDifference = true;
}
@@ -459,28 +467,28 @@ public:
} // end anonymous namespace
-MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return new DarwinX86_32AsmBackend(T);
+ return new DarwinX86_32AsmBackend(T, CPU);
if (TheTriple.isOSWindows())
- return new WindowsX86AsmBackend(T, false);
+ return new WindowsX86AsmBackend(T, false, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ELFX86_32AsmBackend(T, OSABI, TheTriple.getOS());
+ return new ELFX86_32AsmBackend(T, OSABI, CPU, TheTriple.getOS());
}
-MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT) {
+MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) {
Triple TheTriple(TT);
if (TheTriple.isOSDarwin() || TheTriple.getEnvironment() == Triple::MachO)
- return new DarwinX86_64AsmBackend(T);
+ return new DarwinX86_64AsmBackend(T, CPU);
if (TheTriple.isOSWindows())
- return new WindowsX86AsmBackend(T, true);
+ return new WindowsX86AsmBackend(T, true, CPU);
uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS());
- return new ELFX86_64AsmBackend(T, OSABI, TheTriple.getOS());
+ return new ELFX86_64AsmBackend(T, OSABI, CPU, TheTriple.getOS());
}
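
writeNopData above falls back to one-byte nops for CPUs such as geode that lack long-nop support. A trivial sketch of that fallback, with a byte vector standing in for the MCObjectWriter:

```cpp
// Sketch of the nop-padding fallback above: CPUs without multi-byte nop
// support get Count copies of the 1-byte 0x90 nop.
#include <cassert>
#include <cstdint>
#include <vector>

void writeShortNops(std::vector<uint8_t> &Out, uint64_t Count) {
  for (uint64_t i = 0; i < Count; ++i)
    Out.push_back(0x90);  // one-byte nop, safe on every x86 CPU
}

int main() {
  std::vector<uint8_t> Out;
  writeShortNops(Out, 3);
  assert(Out.size() == 3 && Out[0] == 0x90);
  return 0;
}
```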
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 65b2444da7..06138bf335 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -28,8 +28,8 @@ using namespace llvm;
namespace {
class X86MCCodeEmitter : public MCCodeEmitter {
- X86MCCodeEmitter(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
- void operator=(const X86MCCodeEmitter &); // DO NOT IMPLEMENT
+ X86MCCodeEmitter(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
+ void operator=(const X86MCCodeEmitter &) LLVM_DELETED_FUNCTION;
const MCInstrInfo &MCII;
const MCSubtargetInfo &STI;
MCContext &Ctx;
@@ -560,15 +560,6 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
}
- // Set the vector length to 256-bit if YMM0-YMM15 is used
- for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
- if (!MI.getOperand(i).isReg())
- continue;
- unsigned SrcReg = MI.getOperand(i).getReg();
- if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
- VEX_L = 1;
- }
-
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = 0;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 46500699eb..4b0cacecfa 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -80,8 +80,8 @@ MCCodeEmitter *createX86MCCodeEmitter(const MCInstrInfo &MCII,
const MCSubtargetInfo &STI,
MCContext &Ctx);
-MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT);
-MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT);
+MCAsmBackend *createX86_32AsmBackend(const Target &T, StringRef TT, StringRef CPU);
+MCAsmBackend *createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU);
/// createX86MachObjectWriter - Construct an X86 Mach-O object writer.
MCObjectWriter *createX86MachObjectWriter(raw_ostream &OS,
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index d078a7b5df..def0f16e96 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -231,6 +231,7 @@ def : Proc<"bdver2", [FeatureXOP, FeatureFMA4, FeatureCMPXCHG16B,
FeatureAES, FeaturePCLMUL,
FeatureF16C, FeatureLZCNT,
FeaturePOPCNT, FeatureBMI, FeatureFMA]>;
+def : Proc<"geode", [Feature3DNowA]>;
def : Proc<"winchip-c6", [FeatureMMX]>;
def : Proc<"winchip2", [Feature3DNow]>;
diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp
index 8448556720..f89e399658 100644
--- a/lib/Target/X86/X86CodeEmitter.cpp
+++ b/lib/Target/X86/X86CodeEmitter.cpp
@@ -922,17 +922,6 @@ void Emitter<CodeEmitter>::emitVEXOpcodePrefix(uint64_t TSFlags,
}
- // Set the vector length to 256-bit if YMM0-YMM15 is used
- for (unsigned i = 0; i != MI.getNumOperands(); ++i) {
- if (!MI.getOperand(i).isReg())
- continue;
- if (MI.getOperand(i).isImplicit())
- continue;
- unsigned SrcReg = MI.getOperand(i).getReg();
- if (SrcReg >= X86::YMM0 && SrcReg <= X86::YMM15)
- VEX_L = 1;
- }
-
// Classify VEX_B, VEX_4V, VEX_R, VEX_X
unsigned NumOps = Desc->getNumOperands();
unsigned CurOp = 0;
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 9d5de814be..791f5982af 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -171,7 +171,7 @@ namespace {
// Shuffle live registers to match the expectations of successor blocks.
void finishBlockStack();
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dumpStack() const {
dbgs() << "Stack contents:";
for (unsigned i = 0; i != StackTop; ++i) {
@@ -577,8 +577,8 @@ namespace {
friend bool operator<(const TableEntry &TE, unsigned V) {
return TE.from < V;
}
- friend bool LLVM_ATTRIBUTE_USED operator<(unsigned V,
- const TableEntry &TE) {
+ friend bool LLVM_ATTRIBUTE_UNUSED operator<(unsigned V,
+ const TableEntry &TE) {
return V < TE.from;
}
};
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 6c0369f70a..b409e88148 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -101,7 +101,7 @@ namespace {
Base_Reg = Reg;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void dump() {
dbgs() << "X86ISelAddressMode " << this << '\n';
dbgs() << "Base_Reg ";
@@ -192,7 +192,6 @@ namespace {
SDNode *Select(SDNode *N);
SDNode *SelectGather(SDNode *N, unsigned Opc);
SDNode *SelectAtomic64(SDNode *Node, unsigned Opc);
- SDNode *SelectAtomicLoadAdd(SDNode *Node, EVT NVT);
SDNode *SelectAtomicLoadArith(SDNode *Node, EVT NVT);
bool FoldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
@@ -252,13 +251,15 @@ namespace {
else if (AM.CP)
Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
AM.Align, AM.Disp, AM.SymbolFlags);
- else if (AM.ES)
+ else if (AM.ES) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
- else if (AM.JT != -1)
+ } else if (AM.JT != -1) {
+ assert(!AM.Disp && "Non-zero displacement is ignored with JT.");
Disp = CurDAG->getTargetJumpTable(AM.JT, MVT::i32, AM.SymbolFlags);
- else if (AM.BlockAddr)
- Disp = CurDAG->getBlockAddress(AM.BlockAddr, MVT::i32,
- true, AM.SymbolFlags);
+ } else if (AM.BlockAddr)
+ Disp = CurDAG->getTargetBlockAddress(AM.BlockAddr, MVT::i32, AM.Disp,
+ AM.SymbolFlags);
else
Disp = CurDAG->getTargetConstant(AM.Disp, MVT::i32);
@@ -678,10 +679,16 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
AM.JT = J->getIndex();
AM.SymbolFlags = J->getTargetFlags();
- } else {
- AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
- AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
- }
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ X86ISelAddressMode Backup = AM;
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.SymbolFlags = BA->getTargetFlags();
+ if (FoldOffsetIntoAddress(BA->getOffset(), AM)) {
+ AM = Backup;
+ return true;
+ }
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
if (N.getOpcode() == X86ISD::WrapperRIP)
AM.setBaseReg(CurDAG->getRegister(X86::RIP, MVT::i64));
@@ -710,10 +717,12 @@ bool X86DAGToDAGISel::MatchWrapper(SDValue N, X86ISelAddressMode &AM) {
} else if (JumpTableSDNode *J = dyn_cast<JumpTableSDNode>(N0)) {
AM.JT = J->getIndex();
AM.SymbolFlags = J->getTargetFlags();
- } else {
- AM.BlockAddr = cast<BlockAddressSDNode>(N0)->getBlockAddress();
- AM.SymbolFlags = cast<BlockAddressSDNode>(N0)->getTargetFlags();
- }
+ } else if (BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(N0)) {
+ AM.BlockAddr = BA->getBlockAddress();
+ AM.Disp += BA->getOffset();
+ AM.SymbolFlags = BA->getTargetFlags();
+ } else
+ llvm_unreachable("Unhandled symbol reference node.");
return false;
}
@@ -1683,6 +1692,7 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
SDValue In1 = Node->getOperand(1);
SDValue In2L = Node->getOperand(2);
SDValue In2H = Node->getOperand(3);
+
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (!SelectAddr(Node, In1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
return NULL;
@@ -1696,159 +1706,13 @@ SDNode *X86DAGToDAGISel::SelectAtomic64(SDNode *Node, unsigned Opc) {
return ResNode;
}
-// FIXME: Figure out some way to unify this with the 'or' and other code
-// below.
-SDNode *X86DAGToDAGISel::SelectAtomicLoadAdd(SDNode *Node, EVT NVT) {
- if (Node->hasAnyUseOfValue(0))
- return 0;
-
- // Optimize common patterns for __sync_add_and_fetch and
- // __sync_sub_and_fetch where the result is not used. This allows us
- // to use "lock" version of add, sub, inc, dec instructions.
- // FIXME: Do not use special instructions but instead add the "lock"
- // prefix to the target node somehow. The extra information will then be
- // transferred to machine instruction and it denotes the prefix.
- SDValue Chain = Node->getOperand(0);
- SDValue Ptr = Node->getOperand(1);
- SDValue Val = Node->getOperand(2);
- SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
- if (!SelectAddr(Node, Ptr, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4))
- return 0;
-
- bool isInc = false, isDec = false, isSub = false, isCN = false;
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
- if (CN && CN->getSExtValue() == (int32_t)CN->getSExtValue()) {
- isCN = true;
- int64_t CNVal = CN->getSExtValue();
- if (CNVal == 1)
- isInc = true;
- else if (CNVal == -1)
- isDec = true;
- else if (CNVal >= 0)
- Val = CurDAG->getTargetConstant(CNVal, NVT);
- else {
- isSub = true;
- Val = CurDAG->getTargetConstant(-CNVal, NVT);
- }
- } else if (Val.hasOneUse() &&
- Val.getOpcode() == ISD::SUB &&
- X86::isZeroNode(Val.getOperand(0))) {
- isSub = true;
- Val = Val.getOperand(1);
- }
-
- DebugLoc dl = Node->getDebugLoc();
- unsigned Opc = 0;
- switch (NVT.getSimpleVT().SimpleTy) {
- default: return 0;
- case MVT::i8:
- if (isInc)
- Opc = X86::LOCK_INC8m;
- else if (isDec)
- Opc = X86::LOCK_DEC8m;
- else if (isSub) {
- if (isCN)
- Opc = X86::LOCK_SUB8mi;
- else
- Opc = X86::LOCK_SUB8mr;
- } else {
- if (isCN)
- Opc = X86::LOCK_ADD8mi;
- else
- Opc = X86::LOCK_ADD8mr;
- }
- break;
- case MVT::i16:
- if (isInc)
- Opc = X86::LOCK_INC16m;
- else if (isDec)
- Opc = X86::LOCK_DEC16m;
- else if (isSub) {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_SUB16mi8;
- else
- Opc = X86::LOCK_SUB16mi;
- } else
- Opc = X86::LOCK_SUB16mr;
- } else {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_ADD16mi8;
- else
- Opc = X86::LOCK_ADD16mi;
- } else
- Opc = X86::LOCK_ADD16mr;
- }
- break;
- case MVT::i32:
- if (isInc)
- Opc = X86::LOCK_INC32m;
- else if (isDec)
- Opc = X86::LOCK_DEC32m;
- else if (isSub) {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_SUB32mi8;
- else
- Opc = X86::LOCK_SUB32mi;
- } else
- Opc = X86::LOCK_SUB32mr;
- } else {
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_ADD32mi8;
- else
- Opc = X86::LOCK_ADD32mi;
- } else
- Opc = X86::LOCK_ADD32mr;
- }
- break;
- case MVT::i64:
- if (isInc)
- Opc = X86::LOCK_INC64m;
- else if (isDec)
- Opc = X86::LOCK_DEC64m;
- else if (isSub) {
- Opc = X86::LOCK_SUB64mr;
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_SUB64mi8;
- else if (i64immSExt32(Val.getNode()))
- Opc = X86::LOCK_SUB64mi32;
- }
- } else {
- Opc = X86::LOCK_ADD64mr;
- if (isCN) {
- if (immSext8(Val.getNode()))
- Opc = X86::LOCK_ADD64mi8;
- else if (i64immSExt32(Val.getNode()))
- Opc = X86::LOCK_ADD64mi32;
- }
- }
- break;
- }
-
- SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
- dl, NVT), 0);
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- if (isInc || isDec) {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
- SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 6), 0);
- cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
- SDValue RetVals[] = { Undef, Ret };
- return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
- } else {
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
- SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
- cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
- SDValue RetVals[] = { Undef, Ret };
- return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
- }
-}
-
+/// Atomic opcode table
+///
enum AtomicOpc {
+ ADD,
+ SUB,
+ INC,
+ DEC,
OR,
AND,
XOR,
@@ -1872,6 +1736,58 @@ enum AtomicSz {
static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
{
+ X86::LOCK_ADD8mi,
+ X86::LOCK_ADD8mr,
+ X86::LOCK_ADD16mi8,
+ X86::LOCK_ADD16mi,
+ X86::LOCK_ADD16mr,
+ X86::LOCK_ADD32mi8,
+ X86::LOCK_ADD32mi,
+ X86::LOCK_ADD32mr,
+ X86::LOCK_ADD64mi8,
+ X86::LOCK_ADD64mi32,
+ X86::LOCK_ADD64mr,
+ },
+ {
+ X86::LOCK_SUB8mi,
+ X86::LOCK_SUB8mr,
+ X86::LOCK_SUB16mi8,
+ X86::LOCK_SUB16mi,
+ X86::LOCK_SUB16mr,
+ X86::LOCK_SUB32mi8,
+ X86::LOCK_SUB32mi,
+ X86::LOCK_SUB32mr,
+ X86::LOCK_SUB64mi8,
+ X86::LOCK_SUB64mi32,
+ X86::LOCK_SUB64mr,
+ },
+ {
+ 0,
+ X86::LOCK_INC8m,
+ 0,
+ 0,
+ X86::LOCK_INC16m,
+ 0,
+ 0,
+ X86::LOCK_INC32m,
+ 0,
+ 0,
+ X86::LOCK_INC64m,
+ },
+ {
+ 0,
+ X86::LOCK_DEC8m,
+ 0,
+ 0,
+ X86::LOCK_DEC16m,
+ 0,
+ 0,
+ X86::LOCK_DEC32m,
+ 0,
+ 0,
+ X86::LOCK_DEC64m,
+ },
+ {
X86::LOCK_OR8mi,
X86::LOCK_OR8mr,
X86::LOCK_OR16mi8,
@@ -1882,7 +1798,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
X86::LOCK_OR32mr,
X86::LOCK_OR64mi8,
X86::LOCK_OR64mi32,
- X86::LOCK_OR64mr
+ X86::LOCK_OR64mr,
},
{
X86::LOCK_AND8mi,
@@ -1895,7 +1811,7 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
X86::LOCK_AND32mr,
X86::LOCK_AND64mi8,
X86::LOCK_AND64mi32,
- X86::LOCK_AND64mr
+ X86::LOCK_AND64mr,
},
{
X86::LOCK_XOR8mi,
@@ -1908,18 +1824,74 @@ static const uint16_t AtomicOpcTbl[AtomicOpcEnd][AtomicSzEnd] = {
X86::LOCK_XOR32mr,
X86::LOCK_XOR64mi8,
X86::LOCK_XOR64mi32,
- X86::LOCK_XOR64mr
+ X86::LOCK_XOR64mr,
}
};
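
The table is consumed by indexing first on AtomicOpc and then on a size/immediate-form index. A minimal sketch of that lookup follows, with placeholder enum names, since the real AtomicSz enumerators are only partially visible in this hunk:

    #include <cstdint>

    // Placeholder mirror of AtomicOpcTbl (values elided, zero-initialized).
    // Column order per row, as above: 8mi, 8mr, 16mi8, 16mi, 16mr, 32mi8,
    // 32mi, 32mr, 64mi8, 64mi32, 64mr.
    enum Op { ADD, SUB, INC, DEC, OR, AND, XOR, OpEnd };
    enum Sz { I8Imm, I8Reg, I16Imm8, I16Imm, I16Reg,
              I32Imm8, I32Imm, I32Reg, I64Imm8, I64Imm32, I64Reg, SzEnd };

    static const uint16_t Tbl[OpEnd][SzEnd] = {};

    uint16_t pickLockOpcode(Op O, Sz S) {
      // INC/DEC rows hold 0 in the columns that have no such form.
      return Tbl[O][S];
    }
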
+// Return the target constant operand for atomic-load-op and apply simple
+// translations, such as from atomic-load-add to lock-sub. The return value
+// is one of the following 3 cases:
+// + target-constant: the operand can be encoded as a target constant.
+// + empty: the operand is no longer needed once the new op is selected.
+// + non-empty: otherwise.
+static SDValue getAtomicLoadArithTargetConstant(SelectionDAG *CurDAG,
+ DebugLoc dl,
+ enum AtomicOpc &Op, EVT NVT,
+ SDValue Val) {
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val)) {
+ int64_t CNVal = CN->getSExtValue();
+ // Quit if not 32-bit imm.
+ if ((int32_t)CNVal != CNVal)
+ return Val;
+ // For atomic-load-add, we could do some optimizations.
+ if (Op == ADD) {
+ // Translate to INC/DEC if ADD by 1 or -1.
+ if ((CNVal == 1) || (CNVal == -1)) {
+ Op = (CNVal == 1) ? INC : DEC;
+ // No more constant operand after being translated into INC/DEC.
+ return SDValue();
+ }
+ // Translate to SUB if ADD by negative value.
+ if (CNVal < 0) {
+ Op = SUB;
+ CNVal = -CNVal;
+ }
+ }
+ return CurDAG->getTargetConstant(CNVal, NVT);
+ }
+
+ // If the value operand has a single use, try to optimize it.
+ if (Op == ADD && Val.hasOneUse()) {
+ // Translate (atomic-load-add ptr (sub 0 x)) back to (lock-sub x).
+ if (Val.getOpcode() == ISD::SUB && X86::isZeroNode(Val.getOperand(0))) {
+ Op = SUB;
+ return Val.getOperand(1);
+ }
+ // A special case for i16, which needs truncating since, in most cases, it
+ // is promoted to i32. We will translate
+ // (atomic-load-add (truncate (sub 0 x))) to (lock-sub (EXTRACT_SUBREG x)).
+ if (Val.getOpcode() == ISD::TRUNCATE && NVT == MVT::i16 &&
+ Val.getOperand(0).getOpcode() == ISD::SUB &&
+ X86::isZeroNode(Val.getOperand(0).getOperand(0))) {
+ Op = SUB;
+ Val = Val.getOperand(0);
+ return CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl, NVT,
+ Val.getOperand(1));
+ }
+ }
+
+ return Val;
+}
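
To see the effect of these translations at the source level, here is a hedged sketch using the GCC/Clang __sync builtins the surrounding comments mention; the exact instructions chosen can vary with target settings:

    #include <cstdint>

    // Patterns that can select down to a single lock-prefixed RMW
    // instruction when the fetched result is unused (illustrative only).
    void examples(int64_t *p, int64_t x) {
      __sync_fetch_and_add(p, 1);   // ADD by 1  -> lock inc
      __sync_fetch_and_add(p, -1);  // ADD by -1 -> lock dec
      __sync_fetch_and_add(p, -x);  // add of (0 - x) -> lock sub
      __sync_fetch_and_or(p, 8);    // -> lock or with an immediate
    }
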
+
SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
if (Node->hasAnyUseOfValue(0))
return 0;
+ DebugLoc dl = Node->getDebugLoc();
+
// Optimize common patterns for __sync_or_and_fetch and similar arith
// operations where the result is not used. This allows us to use the "lock"
// version of the arithmetic instruction.
- // FIXME: Same as for 'add' and 'sub', try to merge those down here.
SDValue Chain = Node->getOperand(0);
SDValue Ptr = Node->getOperand(1);
SDValue Val = Node->getOperand(2);
@@ -1930,6 +1902,8 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
// Which index into the table.
enum AtomicOpc Op;
switch (Node->getOpcode()) {
+ default:
+ return 0;
case ISD::ATOMIC_LOAD_OR:
Op = OR;
break;
@@ -1939,16 +1913,14 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
case ISD::ATOMIC_LOAD_XOR:
Op = XOR;
break;
- default:
- return 0;
- }
-
- bool isCN = false;
- ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Val);
- if (CN && (int32_t)CN->getSExtValue() == CN->getSExtValue()) {
- isCN = true;
- Val = CurDAG->getTargetConstant(CN->getSExtValue(), NVT);
+ case ISD::ATOMIC_LOAD_ADD:
+ Op = ADD;
+ break;
}
+
+ Val = getAtomicLoadArithTargetConstant(CurDAG, dl, Op, NVT, Val);
+ bool isUnOp = !Val.getNode();
+ bool isCN = Val.getNode() && (Val.getOpcode() == ISD::TargetConstant);
unsigned Opc = 0;
switch (NVT.getSimpleVT().SimpleTy) {
@@ -1990,13 +1962,20 @@ SDNode *X86DAGToDAGISel::SelectAtomicLoadArith(SDNode *Node, EVT NVT) {
assert(Opc != 0 && "Invalid arith lock transform!");
- DebugLoc dl = Node->getDebugLoc();
+ SDValue Ret;
SDValue Undef = SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
dl, NVT), 0);
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
MemOp[0] = cast<MemSDNode>(Node)->getMemOperand();
- SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
- SDValue Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops, 7), 0);
+ if (isUnOp) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ } else {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Val, Chain };
+ Ret = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops,
+ array_lengthof(Ops)), 0);
+ }
cast<MachineSDNode>(Ret)->setMemRefs(MemOp, MemOp + 1);
SDValue RetVals[] = { Undef, Ret };
return CurDAG->getMergeValues(RetVals, 2, dl).getNode();
@@ -2292,15 +2271,10 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
- case ISD::ATOMIC_LOAD_ADD: {
- SDNode *RetVal = SelectAtomicLoadAdd(Node, NVT);
- if (RetVal)
- return RetVal;
- break;
- }
case ISD::ATOMIC_LOAD_XOR:
case ISD::ATOMIC_LOAD_AND:
- case ISD::ATOMIC_LOAD_OR: {
+ case ISD::ATOMIC_LOAD_OR:
+ case ISD::ATOMIC_LOAD_ADD: {
SDNode *RetVal = SelectAtomicLoadArith(Node, NVT);
if (RetVal)
return RetVal;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 440e1727c8..bdfe245027 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -658,7 +658,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM)
setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
if (!TM.Options.UnsafeFPMath) {
+ setOperationAction(ISD::FSIN , MVT::f32 , Expand);
setOperationAction(ISD::FSIN , MVT::f64 , Expand);
+ setOperationAction(ISD::FCOS , MVT::f32 , Expand);
setOperationAction(ISD::FCOS , MVT::f64 , Expand);
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
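
Marking FSIN/FCOS as Expand for f32 as well means those nodes are lowered to libcalls when unsafe FP math is off. A sketch of the user-visible effect (an assumption about intent, not code from this patch):

    #include <cmath>

    // With ISD::FSIN marked Expand for MVT::f32, this lowers to a sinf
    // libcall rather than being selected to the x87 fsin instruction.
    float sine(float x) { return std::sin(x); }
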
@@ -6968,9 +6970,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -6985,9 +6987,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op,
Op.getOperand(0)),
Op.getOperand(1)));
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -7071,9 +7073,9 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Transform it so it matches pextrw, which produces a 32-bit result.
EVT EltVT = MVT::i32;
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, EltVT,
- Op.getOperand(0), Op.getOperand(1));
+ Op.getOperand(0), Op.getOperand(1));
SDValue Assert = DAG.getNode(ISD::AssertZext, dl, EltVT, Extract,
- DAG.getValueType(VT));
+ DAG.getValueType(VT));
return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
}
@@ -7412,9 +7414,10 @@ X86TargetLowering::LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const {
Subtarget->ClassifyBlockAddressReference();
CodeModel::Model M = getTargetMachine().getCodeModel();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
+ int64_t Offset = cast<BlockAddressSDNode>(Op)->getOffset();
DebugLoc dl = Op.getDebugLoc();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(),
- /*isTarget=*/true, OpFlags);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy(), Offset,
+ OpFlags);
if (Subtarget->isPICStyleRIPRel() &&
(M == CodeModel::Small || M == CodeModel::Kernel))
@@ -7520,8 +7523,8 @@ LowerToTLSGeneralDynamicModel32(GlobalAddressSDNode *GA, SelectionDAG &DAG,
SDValue InFlag;
DebugLoc dl = GA->getDebugLoc(); // ? function entry point might be better
SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX,
- DAG.getNode(X86ISD::GlobalBaseReg,
- DebugLoc(), PtrVT), InFlag);
+ DAG.getNode(X86ISD::GlobalBaseReg,
+ DebugLoc(), PtrVT), InFlag);
InFlag = Chain.getValue(1);
return GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, X86II::MO_TLSGD);
@@ -8447,6 +8450,98 @@ static SDValue LowerFGETSIGN(SDValue Op, SelectionDAG &DAG) {
return DAG.getNode(ISD::AND, dl, VT, xFGETSIGN, DAG.getConstant(1, VT));
}
+// LowerVectorAllZeroTest - Check whether an OR'd tree is PTEST-able.
+//
+SDValue X86TargetLowering::LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+
+ if (!Subtarget->hasSSE41())
+ return SDValue();
+
+ if (!Op->hasOneUse())
+ return SDValue();
+
+ SDNode *N = Op.getNode();
+ DebugLoc DL = N->getDebugLoc();
+
+ SmallVector<SDValue, 8> Opnds;
+ DenseMap<SDValue, unsigned> VecInMap;
+ EVT VT = MVT::Other;
+
+ // Recognize a special case where a vector is cast into a wide integer to
+ // test all 0s.
+ Opnds.push_back(N->getOperand(0));
+ Opnds.push_back(N->getOperand(1));
+
+ for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
+ SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
+ // BFS traverse all OR'd operands.
+ if (I->getOpcode() == ISD::OR) {
+ Opnds.push_back(I->getOperand(0));
+ Opnds.push_back(I->getOperand(1));
+ // Re-evaluate the number of nodes to be traversed.
+ e += 2; // 2 more nodes (LHS and RHS) are pushed.
+ continue;
+ }
+
+ // Quit if this is not an EXTRACT_VECTOR_ELT.
+ if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // Quit if the index is not a constant.
+ SDValue Idx = I->getOperand(1);
+ if (!isa<ConstantSDNode>(Idx))
+ return SDValue();
+
+ SDValue ExtractedFromVec = I->getOperand(0);
+ DenseMap<SDValue, unsigned>::iterator M = VecInMap.find(ExtractedFromVec);
+ if (M == VecInMap.end()) {
+ VT = ExtractedFromVec.getValueType();
+ // Quit if not 128/256-bit vector.
+ if (!VT.is128BitVector() && !VT.is256BitVector())
+ return SDValue();
+ // Quit if not the same type.
+ if (VecInMap.begin() != VecInMap.end() &&
+ VT != VecInMap.begin()->first.getValueType())
+ return SDValue();
+ M = VecInMap.insert(std::make_pair(ExtractedFromVec, 0)).first;
+ }
+ M->second |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
+ }
+
+ assert((VT.is128BitVector() || VT.is256BitVector()) &&
+ "Not extracted from 128-/256-bit vector.");
+
+ unsigned FullMask = (1U << VT.getVectorNumElements()) - 1U;
+ SmallVector<SDValue, 8> VecIns;
+
+ for (DenseMap<SDValue, unsigned>::const_iterator
+ I = VecInMap.begin(), E = VecInMap.end(); I != E; ++I) {
+ // Quit if not all elements are used.
+ if (I->second != FullMask)
+ return SDValue();
+ VecIns.push_back(I->first);
+ }
+
+ EVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+
+ // Cast all vectors into TestVT for PTEST.
+ for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
+ VecIns[i] = DAG.getNode(ISD::BITCAST, DL, TestVT, VecIns[i]);
+
+ // If more than one full vector is evaluated, OR them first before PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is only
+ // 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ }
+
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
+ VecIns.back(), VecIns.back());
+}
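
A hedged sketch of the idiom this recognizes, as a scalar analogue rather than the actual DAG: a vector is bitcast to a wide integer and compared against zero, legalization splits that compare into an OR'd tree of element extracts, and the code above folds the tree into a single PTEST on SSE4.1:

    #include <cstdint>

    // Scalar analogue of the pattern: after legalizing an i128 == 0
    // compare of a bitcast v2i64, the DAG contains
    //   (or (extractelt v, 0), (extractelt v, 1)) == 0
    // which is exactly the OR'd tree walked above.
    bool allLanesZero(const uint64_t (&v)[2]) {
      return (v[0] | v[1]) == 0;  // becomes a single PTEST with SSE4.1
    }
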
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
@@ -8604,9 +8699,17 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC,
switch (ArithOp.getOpcode()) {
default: llvm_unreachable("unexpected operator!");
case ISD::SUB: Opcode = X86ISD::SUB; break;
- case ISD::OR: Opcode = X86ISD::OR; break;
case ISD::XOR: Opcode = X86ISD::XOR; break;
case ISD::AND: Opcode = X86ISD::AND; break;
+ case ISD::OR: {
+ if (!NeedTruncation && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDValue EFLAGS = LowerVectorAllZeroTest(Op, DAG);
+ if (EFLAGS.getNode())
+ return EFLAGS;
+ }
+ Opcode = X86ISD::OR;
+ break;
+ }
}
NumOperands = 2;
@@ -11994,385 +12097,534 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
//===----------------------------------------------------------------------===//
// private utility function
+
+// Get CMPXCHG opcode for the specified data type.
+static unsigned getCmpXChgOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::LCMPXCHG8;
+ case MVT::i16: return X86::LCMPXCHG16;
+ case MVT::i32: return X86::LCMPXCHG32;
+ case MVT::i64: return X86::LCMPXCHG64;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid operand size!");
+}
+
+// Get LOAD opcode for the specified data type.
+static unsigned getLoadOpcode(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::MOV8rm;
+ case MVT::i16: return X86::MOV16rm;
+ case MVT::i32: return X86::MOV32rm;
+ case MVT::i64: return X86::MOV64rm;
+ default:
+ break;
+ }
+ llvm_unreachable("Invalid operand size!");
+}
+
+// Get the non-atomic opcode corresponding to the specified atomic instruction.
+static unsigned getNonAtomicOpcode(unsigned Opc) {
+ switch (Opc) {
+ case X86::ATOMAND8: return X86::AND8rr;
+ case X86::ATOMAND16: return X86::AND16rr;
+ case X86::ATOMAND32: return X86::AND32rr;
+ case X86::ATOMAND64: return X86::AND64rr;
+ case X86::ATOMOR8: return X86::OR8rr;
+ case X86::ATOMOR16: return X86::OR16rr;
+ case X86::ATOMOR32: return X86::OR32rr;
+ case X86::ATOMOR64: return X86::OR64rr;
+ case X86::ATOMXOR8: return X86::XOR8rr;
+ case X86::ATOMXOR16: return X86::XOR16rr;
+ case X86::ATOMXOR32: return X86::XOR32rr;
+ case X86::ATOMXOR64: return X86::XOR64rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the non-atomic opcode and an extra opcode corresponding to the
+// specified atomic instruction.
+static unsigned getNonAtomicOpcodeWithExtraOpc(unsigned Opc,
+ unsigned &ExtraOpc) {
+ switch (Opc) {
+ case X86::ATOMNAND8: ExtraOpc = X86::NOT8r; return X86::AND8rr;
+ case X86::ATOMNAND16: ExtraOpc = X86::NOT16r; return X86::AND16rr;
+ case X86::ATOMNAND32: ExtraOpc = X86::NOT32r; return X86::AND32rr;
+ case X86::ATOMNAND64: ExtraOpc = X86::NOT64r; return X86::AND64rr;
+ case X86::ATOMMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVL32rr;
+ case X86::ATOMMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVL16rr;
+ case X86::ATOMMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVL32rr;
+ case X86::ATOMMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVL64rr;
+ case X86::ATOMMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVG32rr;
+ case X86::ATOMMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVG16rr;
+ case X86::ATOMMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVG32rr;
+ case X86::ATOMMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVG64rr;
+ case X86::ATOMUMAX8: ExtraOpc = X86::CMP8rr; return X86::CMOVB32rr;
+ case X86::ATOMUMAX16: ExtraOpc = X86::CMP16rr; return X86::CMOVB16rr;
+ case X86::ATOMUMAX32: ExtraOpc = X86::CMP32rr; return X86::CMOVB32rr;
+ case X86::ATOMUMAX64: ExtraOpc = X86::CMP64rr; return X86::CMOVB64rr;
+ case X86::ATOMUMIN8: ExtraOpc = X86::CMP8rr; return X86::CMOVA32rr;
+ case X86::ATOMUMIN16: ExtraOpc = X86::CMP16rr; return X86::CMOVA16rr;
+ case X86::ATOMUMIN32: ExtraOpc = X86::CMP32rr; return X86::CMOVA32rr;
+ case X86::ATOMUMIN64: ExtraOpc = X86::CMP64rr; return X86::CMOVA64rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the non-atomic opcodes corresponding to the specified atomic
+// instruction for a 64-bit data type on a 32-bit target.
+static unsigned getNonAtomic6432Opcode(unsigned Opc, unsigned &HiOpc) {
+ switch (Opc) {
+ case X86::ATOMAND6432: HiOpc = X86::AND32rr; return X86::AND32rr;
+ case X86::ATOMOR6432: HiOpc = X86::OR32rr; return X86::OR32rr;
+ case X86::ATOMXOR6432: HiOpc = X86::XOR32rr; return X86::XOR32rr;
+ case X86::ATOMADD6432: HiOpc = X86::ADC32rr; return X86::ADD32rr;
+ case X86::ATOMSUB6432: HiOpc = X86::SBB32rr; return X86::SUB32rr;
+ case X86::ATOMSWAP6432: HiOpc = X86::MOV32rr; return X86::MOV32rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the non-atomic opcodes and an extra opcode corresponding to the
+// specified atomic instruction for a 64-bit data type on a 32-bit target.
+static unsigned getNonAtomic6432OpcodeWithExtraOpc(unsigned Opc,
+ unsigned &HiOpc,
+ unsigned &ExtraOpc) {
+ switch (Opc) {
+ case X86::ATOMNAND6432:
+ ExtraOpc = X86::NOT32r;
+ HiOpc = X86::AND32rr;
+ return X86::AND32rr;
+ }
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+}
+
+// Get the pseudo CMOV opcode for the specified data type.
+static unsigned getPseudoCMOVOpc(EVT VT) {
+ switch (VT.getSimpleVT().SimpleTy) {
+ case MVT::i8: return X86::CMOV_GR8;
+ case MVT::i16: return X86::CMOV_GR16;
+ case MVT::i32: return X86::CMOV_GR32;
+ default:
+ break;
+ }
+ llvm_unreachable("Unknown CMOV opcode!");
+}
+
+// EmitAtomicLoadArith - Emit the code sequence for pseudo atomic instructions.
+// They will be translated into a spin-loop or compare-exchange loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// EAX = LOAD MI.addr
+// loop:
+// t1 = OP MI.val, EAX
+// LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+// JNE loop
+// sink:
+// dst = EAX
+// ...
MachineBasicBlock *
-X86TargetLowering::EmitAtomicBitwiseWithCustomInserter(MachineInstr *bInstr,
- MachineBasicBlock *MBB,
- unsigned regOpc,
- unsigned immOpc,
- unsigned LoadOpc,
- unsigned CXchgOpc,
- unsigned notOpc,
- unsigned EAXreg,
- const TargetRegisterClass *RC,
- bool Invert) const {
- // For the atomic bitwise operator, we generate
- // thisMBB:
- // newMBB:
- // ld t1 = [bitinstr.addr]
- // op t2 = t1, [bitinstr.val]
- // not t3 = t2 (if Invert)
- // mov EAX = t1
- // lcs dest = [bitinstr.addr], t3 [EAX is implicit]
- // bz newMBB
- // fallthrough -->nextMBB
+X86TargetLowering::EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ DebugLoc DL = MI->getDebugLoc();
+
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
+
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 2 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
+
+ unsigned DstReg, SrcReg;
+ unsigned MemOpndSlot;
+
+ unsigned CurOp = 0;
+
+ DstReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcReg = MI->getOperand(CurOp++).getReg();
+
+ const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
+ EVT VT = *RC->vt_begin();
+ unsigned AccPhyReg = getX86SubSuperRegister(X86::EAX, VT);
+
+ unsigned LCMPXCHGOpc = getCmpXChgOpcode(VT);
+ unsigned LOADOpc = getLoadOpcode(VT);
+
+ // For the atomic load-arith operator, we generate
+ //
+ // thisMBB:
+ // EAX = LOAD [MI.addr]
+ // mainMBB:
+ // t1 = OP MI.val, EAX
+ // LCMPXCHG [MI.addr], t1, [EAX is implicitly used & defined]
+ // JNE mainMBB
+ // sinkMBB:
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(bInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to itself and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- // Insert instructions into newMBB based on incoming instruction
- assert(bInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
- "unexpected number of operands");
- DebugLoc dl = bInstr->getDebugLoc();
- MachineOperand& destOper = bInstr->getOperand(0);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- int numArgs = bInstr->getNumOperands() - 1;
- for (int i=0; i < numArgs; ++i)
- argOpers[i] = &bInstr->getOperand(i+1);
-
- // x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
- int valArgIndx = lastAddrIndx + 1;
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
- MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(LoadOpc), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
-
- unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpc), t2);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpc), t2);
- MIB.addReg(t1);
- (*MIB).addOperand(*argOpers[valArgIndx]);
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
+
+ MachineInstrBuilder MIB;
+
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), AccPhyReg);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ thisMBB->addSuccessor(mainMBB);
- unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
- if (Invert) {
- MIB = BuildMI(newMBB, dl, TII->get(notOpc), t3).addReg(t2);
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+ mainMBB->addLiveIn(AccPhyReg);
+
+ // Copy AccPhyReg as it is used more than once.
+ unsigned AccReg = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccReg)
+ .addReg(AccPhyReg);
+
+ unsigned t1 = MRI.createVirtualRegister(RC);
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op opcode!");
+ case X86::ATOMAND8:
+ case X86::ATOMAND16:
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ case X86::ATOMOR8:
+ case X86::ATOMOR16:
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ case X86::ATOMXOR8:
+ case X86::ATOMXOR16:
+ case X86::ATOMXOR32:
+ case X86::ATOMXOR64: {
+ unsigned ARITHOpc = getNonAtomicOpcode(Opc);
+ BuildMI(mainMBB, DL, TII->get(ARITHOpc), t1).addReg(SrcReg)
+ .addReg(AccReg);
+ break;
}
- else
- t3 = t2;
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
+ case X86::ATOMNAND64: {
+ unsigned t2 = MRI.createVirtualRegister(RC);
+ unsigned NOTOpc;
+ unsigned ANDOpc = getNonAtomicOpcodeWithExtraOpc(Opc, NOTOpc);
+ BuildMI(mainMBB, DL, TII->get(ANDOpc), t2).addReg(SrcReg)
+ .addReg(AccReg);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1).addReg(t2);
+ break;
+ }
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
+ case X86::ATOMMAX64:
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
+ case X86::ATOMUMAX64:
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64: {
+ unsigned CMPOpc;
+ unsigned CMOVOpc = getNonAtomicOpcodeWithExtraOpc(Opc, CMPOpc);
+
+ BuildMI(mainMBB, DL, TII->get(CMPOpc))
+ .addReg(SrcReg)
+ .addReg(AccReg);
+
+ if (Subtarget->hasCMov()) {
+ if (VT != MVT::i8) {
+ // Native support
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t1)
+ .addReg(SrcReg)
+ .addReg(AccReg);
+ } else {
+ // Promote i8 to i32 to use CMOV32
+ const TargetRegisterClass *RC32 = getRegClassFor(MVT::i32);
+ unsigned SrcReg32 = MRI.createVirtualRegister(RC32);
+ unsigned AccReg32 = MRI.createVirtualRegister(RC32);
+ unsigned t2 = MRI.createVirtualRegister(RC32);
+
+ unsigned Undef = MRI.createVirtualRegister(RC32);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::IMPLICIT_DEF), Undef);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), SrcReg32)
+ .addReg(Undef)
+ .addReg(SrcReg)
+ .addImm(X86::sub_8bit);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::INSERT_SUBREG), AccReg32)
+ .addReg(Undef)
+ .addReg(AccReg)
+ .addImm(X86::sub_8bit);
+
+ BuildMI(mainMBB, DL, TII->get(CMOVOpc), t2)
+ .addReg(SrcReg32)
+ .addReg(AccReg32);
+
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), t1)
+ .addReg(t2, 0, X86::sub_8bit);
+ }
+ } else {
+ // Use a pseudo select and lower it afterwards.
+ assert((VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32) &&
+ "Invalid atomic-load-op transformation!");
+ unsigned SelOpc = getPseudoCMOVOpc(VT);
+ X86::CondCode CC = X86::getCondFromCMovOpc(CMOVOpc);
+ assert(CC != X86::COND_INVALID && "Invalid atomic-load-op transformation!");
+ MIB = BuildMI(mainMBB, DL, TII->get(SelOpc), t1)
+ .addReg(SrcReg).addReg(AccReg)
+ .addImm(CC);
+ mainMBB = EmitLoweredSelect(MIB, mainMBB);
+ }
+ break;
+ }
+ }
+
+ // Copy AccPhyReg back from virtual register.
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), AccPhyReg)
+ .addReg(AccReg);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), EAXreg);
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
MIB.addReg(t1);
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
- MIB = BuildMI(newMBB, dl, TII->get(CXchgOpc));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MIB.addReg(t3);
- assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(bInstr->memoperands_begin(),
- bInstr->memoperands_end());
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
- MIB.addReg(EAXreg);
+ // sinkMBB:
+ sinkMBB->addLiveIn(AccPhyReg);
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstReg)
+ .addReg(AccPhyReg);
- bInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+ MI->eraseFromParent();
+ return sinkMBB;
}
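
In C++ terms, the loop this inserter builds is the classic compare-exchange retry loop. A hedged sketch for the NAND case, illustrative only and not LLVM code:

    #include <atomic>
    #include <cstdint>

    // Sketch of the expansion for an ATOMNAND32-style pseudo: load once,
    // compute the new value, then retry CMPXCHG until no other thread has
    // modified the location between the load and the exchange.
    uint32_t atomicFetchNand(std::atomic<uint32_t> &Mem, uint32_t Val) {
      uint32_t Old = Mem.load();                      // EAX = LOAD [MI.addr]
      uint32_t New;
      do {
        New = ~(Old & Val);                           // t1 = NOT (AND Val, EAX)
      } while (!Mem.compare_exchange_weak(Old, New)); // LCMPXCHG + JNE loop
      return Old;                                     // dst = EAX
    }
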
-// private utility function: 64 bit atomics on 32 bit host.
+// EmitAtomicLoadArith6432 - Emit the code sequence for pseudo atomic
+// instructions. They will be translated into a spin-loop or compare-exchange
+// loop from
+//
+// ...
+// dst = atomic-fetch-op MI.addr, MI.val
+// ...
+//
+// to
+//
+// ...
+// EAX = LOAD [MI.addr + 0]
+// EDX = LOAD [MI.addr + 4]
+// loop:
+// EBX = OP MI.val.lo, EAX
+// ECX = OP MI.val.hi, EDX
+// LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+// JNE loop
+// sink:
+// dst = EDX:EAX
+// ...
MachineBasicBlock *
-X86TargetLowering::EmitAtomicBit6432WithCustomInserter(MachineInstr *bInstr,
- MachineBasicBlock *MBB,
- unsigned regOpcL,
- unsigned regOpcH,
- unsigned immOpcL,
- unsigned immOpcH,
- bool Invert) const {
- // For the atomic bitwise operator, we generate
- // thisMBB (instructions are in pairs, except cmpxchg8b)
- // ld t1,t2 = [bitinstr.addr]
- // newMBB:
- // out1, out2 = phi (thisMBB, t1/t2) (newMBB, t3/t4)
- // op t5, t6 <- out1, out2, [bitinstr.val]
- // (for SWAP, substitute: mov t5, t6 <- [bitinstr.val])
- // neg t7, t8 < t5, t6 (if Invert)
- // mov ECX, EBX <- t5, t6
- // mov EAX, EDX <- t1, t2
- // cmpxchg8b [bitinstr.addr] [EAX, EDX, EBX, ECX implicit]
- // mov t3, t4 <- EAX, EDX
- // bz newMBB
- // result in out1, out2
- // fallthrough -->nextMBB
-
- const TargetRegisterClass *RC = &X86::GR32RegClass;
- const unsigned LoadOpc = X86::MOV32rm;
- const unsigned NotOpc = X86::NOT32r;
+X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const {
const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ DebugLoc DL = MI->getDebugLoc();
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
- MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(bInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to itself and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- DebugLoc dl = bInstr->getDebugLoc();
- // Insert instructions into newMBB based on incoming instruction
- // There are 8 "real" operands plus 9 implicit def/uses, ignored here.
- assert(bInstr->getNumOperands() < X86::AddrNumOperands + 14 &&
- "unexpected number of operands");
- MachineOperand& dest1Oper = bInstr->getOperand(0);
- MachineOperand& dest2Oper = bInstr->getOperand(1);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- for (int i=0; i < 2 + X86::AddrNumOperands; ++i) {
- argOpers[i] = &bInstr->getOperand(i+2);
-
- // We use some of the operands multiple times, so conservatively just
- // clear any kill flags that might be present.
- if (argOpers[i]->isReg() && argOpers[i]->isUse())
- argOpers[i]->setIsKill(false);
- }
-
- // x86 address has 5 operands: base, index, scale, displacement, and segment.
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(RC);
- MachineInstrBuilder MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- unsigned t2 = F->getRegInfo().createVirtualRegister(RC);
- MIB = BuildMI(thisMBB, dl, TII->get(LoadOpc), t2);
- // add 4 to displacement.
- for (int i=0; i <= lastAddrIndx-2; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MachineOperand newOp3 = *(argOpers[3]);
- if (newOp3.isImm())
- newOp3.setImm(newOp3.getImm()+4);
- else
- newOp3.setOffset(newOp3.getOffset()+4);
- (*MIB).addOperand(newOp3);
- (*MIB).addOperand(*argOpers[lastAddrIndx]);
-
- // t3/4 are defined later, at the bottom of the loop
- unsigned t3 = F->getRegInfo().createVirtualRegister(RC);
- unsigned t4 = F->getRegInfo().createVirtualRegister(RC);
- BuildMI(newMBB, dl, TII->get(X86::PHI), dest1Oper.getReg())
- .addReg(t1).addMBB(thisMBB).addReg(t3).addMBB(newMBB);
- BuildMI(newMBB, dl, TII->get(X86::PHI), dest2Oper.getReg())
- .addReg(t2).addMBB(thisMBB).addReg(t4).addMBB(newMBB);
-
- // The subsequent operations should be using the destination registers of
- // the PHI instructions.
- t1 = dest1Oper.getReg();
- t2 = dest2Oper.getReg();
-
- int valArgIndx = lastAddrIndx + 1;
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
- unsigned t5 = F->getRegInfo().createVirtualRegister(RC);
- unsigned t6 = F->getRegInfo().createVirtualRegister(RC);
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpcL), t5);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpcL), t5);
- if (regOpcL != X86::MOV32rr)
- MIB.addReg(t1);
- (*MIB).addOperand(*argOpers[valArgIndx]);
- assert(argOpers[valArgIndx + 1]->isReg() ==
- argOpers[valArgIndx]->isReg());
- assert(argOpers[valArgIndx + 1]->isImm() ==
- argOpers[valArgIndx]->isImm());
- if (argOpers[valArgIndx + 1]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(regOpcH), t6);
- else
- MIB = BuildMI(newMBB, dl, TII->get(immOpcH), t6);
- if (regOpcH != X86::MOV32rr)
- MIB.addReg(t2);
- (*MIB).addOperand(*argOpers[valArgIndx + 1]);
-
- unsigned t7, t8;
- if (Invert) {
- t7 = F->getRegInfo().createVirtualRegister(RC);
- t8 = F->getRegInfo().createVirtualRegister(RC);
- MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t7).addReg(t5);
- MIB = BuildMI(newMBB, dl, TII->get(NotOpc), t8).addReg(t6);
- } else {
- t7 = t5;
- t8 = t6;
- }
+ MachineFunction *MF = MBB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
- MIB.addReg(t1);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EDX);
- MIB.addReg(t2);
+ const BasicBlock *BB = MBB->getBasicBlock();
+ MachineFunction::iterator I = MBB;
+ ++I;
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EBX);
- MIB.addReg(t7);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::ECX);
- MIB.addReg(t8);
+ assert(MI->getNumOperands() <= X86::AddrNumOperands + 4 &&
+ "Unexpected number of operands");
+
+ assert(MI->hasOneMemOperand() &&
+ "Expected atomic-load-op32 to have one memoperand");
+
+ // Memory Reference
+ MachineInstr::mmo_iterator MMOBegin = MI->memoperands_begin();
+ MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end();
- MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG8B));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
+ unsigned DstLoReg, DstHiReg;
+ unsigned SrcLoReg, SrcHiReg;
+ unsigned MemOpndSlot;
- assert(bInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(bInstr->memoperands_begin(),
- bInstr->memoperands_end());
+ unsigned CurOp = 0;
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t3);
- MIB.addReg(X86::EAX);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t4);
- MIB.addReg(X86::EDX);
+ DstLoReg = MI->getOperand(CurOp++).getReg();
+ DstHiReg = MI->getOperand(CurOp++).getReg();
+ MemOpndSlot = CurOp;
+ CurOp += X86::AddrNumOperands;
+ SrcLoReg = MI->getOperand(CurOp++).getReg();
+ SrcHiReg = MI->getOperand(CurOp++).getReg();
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ const TargetRegisterClass *RC = &X86::GR32RegClass;
- bInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
-}
+ unsigned LCMPXCHGOpc = X86::LCMPXCHG8B;
+ unsigned LOADOpc = X86::MOV32rm;
-// private utility function
-MachineBasicBlock *
-X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr,
- MachineBasicBlock *MBB,
- unsigned cmovOpc) const {
- // For the atomic min/max operator, we generate
- // thisMBB:
- // newMBB:
- // ld t1 = [min/max.addr]
- // mov t2 = [min/max.val]
- // cmp t1, t2
- // cmov[cond] t2 = t1
- // mov EAX = t1
- // lcs dest = [bitinstr.addr], t2 [EAX is implicit]
- // bz newMBB
- // fallthrough -->nextMBB
+ // For the atomic load-arith operator, we generate
//
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
- const BasicBlock *LLVM_BB = MBB->getBasicBlock();
- MachineFunction::iterator MBBIter = MBB;
- ++MBBIter;
+ // thisMBB:
+ // EAX = LOAD [MI.addr + 0]
+ // EDX = LOAD [MI.addr + 4]
+ // mainMBB:
+ // EBX = OP MI.vallo, EAX
+ // ECX = OP MI.valhi, EDX
+ // LCMPXCHG8B [MI.addr], [ECX:EBX & EDX:EAX are implicitly used and EDX:EAX is implicitly defined]
+ // JNE mainMBB
+ // sinkMBB:
- /// First build the CFG
- MachineFunction *F = MBB->getParent();
MachineBasicBlock *thisMBB = MBB;
- MachineBasicBlock *newMBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *nextMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(MBBIter, newMBB);
- F->insert(MBBIter, nextMBB);
-
- // Transfer the remainder of thisMBB and its successor edges to nextMBB.
- nextMBB->splice(nextMBB->begin(), thisMBB,
- llvm::next(MachineBasicBlock::iterator(mInstr)),
- thisMBB->end());
- nextMBB->transferSuccessorsAndUpdatePHIs(thisMBB);
-
- // Update thisMBB to fall through to newMBB
- thisMBB->addSuccessor(newMBB);
-
- // newMBB jumps to newMBB and fall through to nextMBB
- newMBB->addSuccessor(nextMBB);
- newMBB->addSuccessor(newMBB);
-
- DebugLoc dl = mInstr->getDebugLoc();
- // Insert instructions into newMBB based on incoming instruction
- assert(mInstr->getNumOperands() < X86::AddrNumOperands + 4 &&
- "unexpected number of operands");
- MachineOperand& destOper = mInstr->getOperand(0);
- MachineOperand* argOpers[2 + X86::AddrNumOperands];
- int numArgs = mInstr->getNumOperands() - 1;
- for (int i=0; i < numArgs; ++i)
- argOpers[i] = &mInstr->getOperand(i+1);
-
- // x86 address has 4 operands: base, index, scale, and displacement
- int lastAddrIndx = X86::AddrNumOperands - 1; // [0,3]
- int valArgIndx = lastAddrIndx + 1;
-
- unsigned t1 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- MachineInstrBuilder MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rm), t1);
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
-
- // We only support register and immediate values
- assert((argOpers[valArgIndx]->isReg() ||
- argOpers[valArgIndx]->isImm()) &&
- "invalid operand");
-
- unsigned t2 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- if (argOpers[valArgIndx]->isReg())
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), t2);
- else
- MIB = BuildMI(newMBB, dl, TII->get(X86::MOV32rr), t2);
- (*MIB).addOperand(*argOpers[valArgIndx]);
+ MachineBasicBlock *mainMBB = MF->CreateMachineBasicBlock(BB);
+ MachineBasicBlock *sinkMBB = MF->CreateMachineBasicBlock(BB);
+ MF->insert(I, mainMBB);
+ MF->insert(I, sinkMBB);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), X86::EAX);
- MIB.addReg(t1);
+ MachineInstrBuilder MIB;
- MIB = BuildMI(newMBB, dl, TII->get(X86::CMP32rr));
- MIB.addReg(t1);
- MIB.addReg(t2);
+ // Transfer the remainder of BB and its successor edges to sinkMBB.
+ sinkMBB->splice(sinkMBB->begin(), MBB,
+ llvm::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ sinkMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // thisMBB:
+ // Lo
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EAX);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
+ // Hi
+ MIB = BuildMI(thisMBB, DL, TII->get(LOADOpc), X86::EDX);
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i) {
+ if (i == X86::AddrDisp)
+ MIB.addDisp(MI->getOperand(MemOpndSlot + i), 4); // 4 == sizeof(i32)
+ else
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ }
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- // Generate movc
- unsigned t3 = F->getRegInfo().createVirtualRegister(&X86::GR32RegClass);
- MIB = BuildMI(newMBB, dl, TII->get(cmovOpc),t3);
- MIB.addReg(t2);
- MIB.addReg(t1);
+ thisMBB->addSuccessor(mainMBB);
+
+ // mainMBB:
+ MachineBasicBlock *origMainMBB = mainMBB;
+ mainMBB->addLiveIn(X86::EAX);
+ mainMBB->addLiveIn(X86::EDX);
+
+ // Copy EDX:EAX as they are used more than once.
+ unsigned LoReg = MRI.createVirtualRegister(RC);
+ unsigned HiReg = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), LoReg).addReg(X86::EAX);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), HiReg).addReg(X86::EDX);
+
+ unsigned t1L = MRI.createVirtualRegister(RC);
+ unsigned t1H = MRI.createVirtualRegister(RC);
+
+ unsigned Opc = MI->getOpcode();
+ switch (Opc) {
+ default:
+ llvm_unreachable("Unhandled atomic-load-op6432 opcode!");
+ case X86::ATOMAND6432:
+ case X86::ATOMOR6432:
+ case X86::ATOMXOR6432:
+ case X86::ATOMADD6432:
+ case X86::ATOMSUB6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg).addReg(HiReg);
+ break;
+ }
+ case X86::ATOMNAND6432: {
+ unsigned HiOpc, NOTOpc;
+ unsigned LoOpc = getNonAtomic6432OpcodeWithExtraOpc(Opc, HiOpc, NOTOpc);
+ unsigned t2L = MRI.createVirtualRegister(RC);
+ unsigned t2H = MRI.createVirtualRegister(RC);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t2L).addReg(SrcLoReg).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t2H).addReg(SrcHiReg).addReg(HiReg);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1L).addReg(t2L);
+ BuildMI(mainMBB, DL, TII->get(NOTOpc), t1H).addReg(t2H);
+ break;
+ }
+ case X86::ATOMSWAP6432: {
+ unsigned HiOpc;
+ unsigned LoOpc = getNonAtomic6432Opcode(Opc, HiOpc);
+ BuildMI(mainMBB, DL, TII->get(LoOpc), t1L).addReg(SrcLoReg);
+ BuildMI(mainMBB, DL, TII->get(HiOpc), t1H).addReg(SrcHiReg);
+ break;
+ }
+ }
+
+ // Copy EDX:EAX back from HiReg:LoReg
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EAX).addReg(LoReg);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EDX).addReg(HiReg);
+ // Copy ECX:EBX from t1H:t1L
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::EBX).addReg(t1L);
+ BuildMI(mainMBB, DL, TII->get(TargetOpcode::COPY), X86::ECX).addReg(t1H);
- // Cmp and exchange if none has modified the memory location
- MIB = BuildMI(newMBB, dl, TII->get(X86::LCMPXCHG32));
- for (int i=0; i <= lastAddrIndx; ++i)
- (*MIB).addOperand(*argOpers[i]);
- MIB.addReg(t3);
- assert(mInstr->hasOneMemOperand() && "Unexpected number of memoperand");
- (*MIB).setMemRefs(mInstr->memoperands_begin(),
- mInstr->memoperands_end());
+ MIB = BuildMI(mainMBB, DL, TII->get(LCMPXCHGOpc));
+ for (unsigned i = 0; i < X86::AddrNumOperands; ++i)
+ MIB.addOperand(MI->getOperand(MemOpndSlot + i));
+ MIB.setMemRefs(MMOBegin, MMOEnd);
- MIB = BuildMI(newMBB, dl, TII->get(TargetOpcode::COPY), destOper.getReg());
- MIB.addReg(X86::EAX);
+ BuildMI(mainMBB, DL, TII->get(X86::JNE_4)).addMBB(origMainMBB);
- // insert branch
- BuildMI(newMBB, dl, TII->get(X86::JNE_4)).addMBB(newMBB);
+ mainMBB->addSuccessor(origMainMBB);
+ mainMBB->addSuccessor(sinkMBB);
+
+ // sinkMBB:
+ sinkMBB->addLiveIn(X86::EAX);
+ sinkMBB->addLiveIn(X86::EDX);
+
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstLoReg)
+ .addReg(X86::EAX);
+ BuildMI(*sinkMBB, sinkMBB->begin(), DL,
+ TII->get(TargetOpcode::COPY), DstHiReg)
+ .addReg(X86::EDX);
- mInstr->eraseFromParent(); // The pseudo instruction is gone now.
- return nextMBB;
+ MI->eraseFromParent();
+ return sinkMBB;
}
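
The 64-on-32 variant follows the same shape but works on the EDX:EAX and ECX:EBX register pairs. A hedged scalar sketch of an ATOMADD6432 expansion, again illustrative rather than LLVM code:

    #include <atomic>
    #include <cstdint>

    // On a 32-bit host the 64-bit add is split: ADD32rr on the low word
    // and ADC32rr on the high word (carry propagation), then CMPXCHG8B
    // retries until the full 8 bytes are exchanged atomically.
    uint64_t atomicAdd64On32(std::atomic<uint64_t> &Mem, uint64_t Val) {
      uint64_t Old = Mem.load();        // EDX:EAX = LOAD [addr+4 : addr+0]
      while (!Mem.compare_exchange_weak(Old, Old + Val)) {
        // Old is refreshed by compare_exchange_weak; recompute and retry.
      }
      return Old;
    }
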
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
@@ -13276,130 +13528,46 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
return EmitMonitor(MI, BB);
// Atomic Lowering.
- case X86::ATOMMIN32:
- case X86::ATOMMAX32:
- case X86::ATOMUMIN32:
- case X86::ATOMUMAX32:
- case X86::ATOMMIN16:
- case X86::ATOMMAX16:
- case X86::ATOMUMIN16:
- case X86::ATOMUMAX16:
- case X86::ATOMMIN64:
- case X86::ATOMMAX64:
- case X86::ATOMUMIN64:
- case X86::ATOMUMAX64: {
- unsigned Opc;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::ATOMMIN32: Opc = X86::CMOVL32rr; break;
- case X86::ATOMMAX32: Opc = X86::CMOVG32rr; break;
- case X86::ATOMUMIN32: Opc = X86::CMOVB32rr; break;
- case X86::ATOMUMAX32: Opc = X86::CMOVA32rr; break;
- case X86::ATOMMIN16: Opc = X86::CMOVL16rr; break;
- case X86::ATOMMAX16: Opc = X86::CMOVG16rr; break;
- case X86::ATOMUMIN16: Opc = X86::CMOVB16rr; break;
- case X86::ATOMUMAX16: Opc = X86::CMOVA16rr; break;
- case X86::ATOMMIN64: Opc = X86::CMOVL64rr; break;
- case X86::ATOMMAX64: Opc = X86::CMOVG64rr; break;
- case X86::ATOMUMIN64: Opc = X86::CMOVB64rr; break;
- case X86::ATOMUMAX64: Opc = X86::CMOVA64rr; break;
- // FIXME: There are no CMOV8 instructions; MIN/MAX need some other way.
- }
- return EmitAtomicMinMaxWithCustomInserter(MI, BB, Opc);
- }
-
- case X86::ATOMAND32:
- case X86::ATOMOR32:
- case X86::ATOMXOR32:
- case X86::ATOMNAND32: {
- bool Invert = false;
- unsigned RegOpc, ImmOpc;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::ATOMAND32:
- RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; break;
- case X86::ATOMOR32:
- RegOpc = X86::OR32rr; ImmOpc = X86::OR32ri; break;
- case X86::ATOMXOR32:
- RegOpc = X86::XOR32rr; ImmOpc = X86::XOR32ri; break;
- case X86::ATOMNAND32:
- RegOpc = X86::AND32rr; ImmOpc = X86::AND32ri; Invert = true; break;
- }
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
- X86::MOV32rm, X86::LCMPXCHG32,
- X86::NOT32r, X86::EAX,
- &X86::GR32RegClass, Invert);
- }
-
+ case X86::ATOMAND8:
case X86::ATOMAND16:
+ case X86::ATOMAND32:
+ case X86::ATOMAND64:
+ // Fall through
+ case X86::ATOMOR8:
case X86::ATOMOR16:
+ case X86::ATOMOR32:
+ case X86::ATOMOR64:
+ // Fall through
case X86::ATOMXOR16:
- case X86::ATOMNAND16: {
- bool Invert = false;
- unsigned RegOpc, ImmOpc;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::ATOMAND16:
- RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; break;
- case X86::ATOMOR16:
- RegOpc = X86::OR16rr; ImmOpc = X86::OR16ri; break;
- case X86::ATOMXOR16:
- RegOpc = X86::XOR16rr; ImmOpc = X86::XOR16ri; break;
- case X86::ATOMNAND16:
- RegOpc = X86::AND16rr; ImmOpc = X86::AND16ri; Invert = true; break;
- }
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
- X86::MOV16rm, X86::LCMPXCHG16,
- X86::NOT16r, X86::AX,
- &X86::GR16RegClass, Invert);
- }
-
- case X86::ATOMAND8:
- case X86::ATOMOR8:
case X86::ATOMXOR8:
- case X86::ATOMNAND8: {
- bool Invert = false;
- unsigned RegOpc, ImmOpc;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::ATOMAND8:
- RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; break;
- case X86::ATOMOR8:
- RegOpc = X86::OR8rr; ImmOpc = X86::OR8ri; break;
- case X86::ATOMXOR8:
- RegOpc = X86::XOR8rr; ImmOpc = X86::XOR8ri; break;
- case X86::ATOMNAND8:
- RegOpc = X86::AND8rr; ImmOpc = X86::AND8ri; Invert = true; break;
- }
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
- X86::MOV8rm, X86::LCMPXCHG8,
- X86::NOT8r, X86::AL,
- &X86::GR8RegClass, Invert);
- }
-
- // This group is for 64-bit host.
- case X86::ATOMAND64:
- case X86::ATOMOR64:
+ case X86::ATOMXOR32:
case X86::ATOMXOR64:
- case X86::ATOMNAND64: {
- bool Invert = false;
- unsigned RegOpc, ImmOpc;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::ATOMAND64:
- RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; break;
- case X86::ATOMOR64:
- RegOpc = X86::OR64rr; ImmOpc = X86::OR64ri32; break;
- case X86::ATOMXOR64:
- RegOpc = X86::XOR64rr; ImmOpc = X86::XOR64ri32; break;
- case X86::ATOMNAND64:
- RegOpc = X86::AND64rr; ImmOpc = X86::AND64ri32; Invert = true; break;
- }
- return EmitAtomicBitwiseWithCustomInserter(MI, BB, RegOpc, ImmOpc,
- X86::MOV64rm, X86::LCMPXCHG64,
- X86::NOT64r, X86::RAX,
- &X86::GR64RegClass, Invert);
- }
+ // Fall through
+ case X86::ATOMNAND8:
+ case X86::ATOMNAND16:
+ case X86::ATOMNAND32:
+ case X86::ATOMNAND64:
+ // Fall through
+ case X86::ATOMMAX8:
+ case X86::ATOMMAX16:
+ case X86::ATOMMAX32:
+ case X86::ATOMMAX64:
+ // Fall through
+ case X86::ATOMMIN8:
+ case X86::ATOMMIN16:
+ case X86::ATOMMIN32:
+ case X86::ATOMMIN64:
+ // Fall through
+ case X86::ATOMUMAX8:
+ case X86::ATOMUMAX16:
+ case X86::ATOMUMAX32:
+ case X86::ATOMUMAX64:
+ // Fall through
+ case X86::ATOMUMIN8:
+ case X86::ATOMUMIN16:
+ case X86::ATOMUMIN32:
+ case X86::ATOMUMIN64:
+ return EmitAtomicLoadArith(MI, BB);
// This group does 64-bit operations on a 32-bit host.
case X86::ATOMAND6432:
@@ -13408,44 +13576,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::ATOMNAND6432:
case X86::ATOMADD6432:
case X86::ATOMSUB6432:
- case X86::ATOMSWAP6432: {
- bool Invert = false;
- unsigned RegOpcL, RegOpcH, ImmOpcL, ImmOpcH;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::ATOMAND6432:
- RegOpcL = RegOpcH = X86::AND32rr;
- ImmOpcL = ImmOpcH = X86::AND32ri;
- break;
- case X86::ATOMOR6432:
- RegOpcL = RegOpcH = X86::OR32rr;
- ImmOpcL = ImmOpcH = X86::OR32ri;
- break;
- case X86::ATOMXOR6432:
- RegOpcL = RegOpcH = X86::XOR32rr;
- ImmOpcL = ImmOpcH = X86::XOR32ri;
- break;
- case X86::ATOMNAND6432:
- RegOpcL = RegOpcH = X86::AND32rr;
- ImmOpcL = ImmOpcH = X86::AND32ri;
- Invert = true;
- break;
- case X86::ATOMADD6432:
- RegOpcL = X86::ADD32rr; RegOpcH = X86::ADC32rr;
- ImmOpcL = X86::ADD32ri; ImmOpcH = X86::ADC32ri;
- break;
- case X86::ATOMSUB6432:
- RegOpcL = X86::SUB32rr; RegOpcH = X86::SBB32rr;
- ImmOpcL = X86::SUB32ri; ImmOpcH = X86::SBB32ri;
- break;
- case X86::ATOMSWAP6432:
- RegOpcL = RegOpcH = X86::MOV32rr;
- ImmOpcL = ImmOpcH = X86::MOV32ri;
- break;
- }
- return EmitAtomicBit6432WithCustomInserter(MI, BB, RegOpcL, RegOpcH,
- ImmOpcL, ImmOpcH, Invert);
- }
+ case X86::ATOMSWAP6432:
+ return EmitAtomicLoadArith6432(MI, BB);
case X86::VASTART_SAVE_XMM_REGS:
return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB);
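
// A sketch of the loop EmitAtomicLoadArith has to build, inferred from the
// per-width MOV/op/LCMPXCHG inserters it replaces above; the exact MI
// sequence and block names here are assumptions, not copied from the new
// implementation:
//
//   EAX = MOV32rm [ptr]       ; load the current value
// loop:
//   t = AND32rr EAX, val      ; or OR/XOR, NOT32r for NAND, a CMOV for
//                             ; the min/max/umin/umax flavors
//   LCMPXCHG32 [ptr], t       ; implicitly uses and redefines EAX
//   JNE loop                  ; retry if another writer intervened
// sink:
//   dst = EAX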
@@ -14408,84 +14540,6 @@ static SDValue checkBoolTestSetCCCombine(SDValue Cmp, X86::CondCode &CC) {
return SDValue();
}
-/// checkFlaggedOrCombine - DAG combination on X86ISD::OR, i.e. with EFLAGS
-/// updated. If only flag result is used and the result is evaluated from a
-/// series of element extraction, try to combine it into a PTEST.
-static SDValue checkFlaggedOrCombine(SDValue Or, X86::CondCode &CC,
- SelectionDAG &DAG,
- const X86Subtarget *Subtarget) {
- SDNode *N = Or.getNode();
- DebugLoc DL = N->getDebugLoc();
-
- // Only SSE4.1 and beyond supports PTEST or like.
- if (!Subtarget->hasSSE41())
- return SDValue();
-
- if (N->getOpcode() != X86ISD::OR)
- return SDValue();
-
- // Quit if the value result of OR is used.
- if (N->hasAnyUseOfValue(0))
- return SDValue();
-
- // Quit if not used as a boolean value.
- if (CC != X86::COND_E && CC != X86::COND_NE)
- return SDValue();
-
- SmallVector<SDValue, 8> Opnds;
- SDValue VecIn;
- EVT VT = MVT::Other;
- unsigned Mask = 0;
-
- // Recognize a special case where a vector is casted into wide integer to
- // test all 0s.
- Opnds.push_back(N->getOperand(0));
- Opnds.push_back(N->getOperand(1));
-
- for (unsigned Slot = 0, e = Opnds.size(); Slot < e; ++Slot) {
- SmallVector<SDValue, 8>::const_iterator I = Opnds.begin() + Slot;
- // BFS traverse all OR'd operands.
- if (I->getOpcode() == ISD::OR) {
- Opnds.push_back(I->getOperand(0));
- Opnds.push_back(I->getOperand(1));
- // Re-evaluate the number of nodes to be traversed.
- e += 2; // 2 more nodes (LHS and RHS) are pushed.
- continue;
- }
-
- // Quit if a non-EXTRACT_VECTOR_ELT
- if (I->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
- return SDValue();
-
- // Quit if without a constant index.
- SDValue Idx = I->getOperand(1);
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
-
- // Check if all elements are extracted from the same vector.
- SDValue ExtractedFromVec = I->getOperand(0);
- if (VecIn.getNode() == 0) {
- VT = ExtractedFromVec.getValueType();
- // FIXME: only 128-bit vector is supported so far.
- if (!VT.is128BitVector())
- return SDValue();
- VecIn = ExtractedFromVec;
- } else if (VecIn != ExtractedFromVec)
- return SDValue();
-
- // Record the constant index.
- Mask |= 1U << cast<ConstantSDNode>(Idx)->getZExtValue();
- }
-
- assert(VT.is128BitVector() && "Only 128-bit vector PTEST is supported so far.");
-
- // Quit if not all elements are used.
- if (Mask != (1U << VT.getVectorNumElements()) - 1U)
- return SDValue();
-
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIn, VecIn);
-}
-
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -14524,14 +14578,6 @@ static SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG,
Ops, array_lengthof(Ops));
}
- Flags = checkFlaggedOrCombine(Cond, CC, DAG, Subtarget);
- if (Flags.getNode()) {
- SDValue Ops[] = { FalseOp, TrueOp,
- DAG.getConstant(CC, MVT::i8), Flags };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(),
- Ops, array_lengthof(Ops));
- }
-
// If this is a select between two integer constants, try to do some
// optimizations. Note that the operands are ordered the opposite of SELECT
// operands.
@@ -16069,12 +16115,6 @@ static SDValue PerformSETCCCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
}
- Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
- if (Flags.getNode()) {
- SDValue Cond = DAG.getConstant(CC, MVT::i8);
- return DAG.getNode(X86ISD::SETCC, DL, N->getVTList(), Cond, Flags);
- }
-
return SDValue();
}
@@ -16098,13 +16138,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG,
Flags);
}
- Flags = checkFlaggedOrCombine(EFLAGS, CC, DAG, Subtarget);
- if (Flags.getNode()) {
- SDValue Cond = DAG.getConstant(CC, MVT::i8);
- return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), Chain, Dest, Cond,
- Flags);
- }
-
return SDValue();
}
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 436be7fd98..d3545b0e9f 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -825,6 +825,8 @@ namespace llvm {
SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const;
SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const;
+
SDValue LowerVectorFpExtend(SDValue &Op, SelectionDAG &DAG) const;
virtual SDValue
@@ -873,36 +875,17 @@ namespace llvm {
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const;
- /// Utility function to emit atomic bitwise operations (and, or, xor).
- /// It takes the bitwise instruction to expand, the associated machine basic
- /// block, and the associated X86 opcodes for reg/reg and reg/imm.
- MachineBasicBlock *EmitAtomicBitwiseWithCustomInserter(
- MachineInstr *BInstr,
- MachineBasicBlock *BB,
- unsigned regOpc,
- unsigned immOpc,
- unsigned loadOpc,
- unsigned cxchgOpc,
- unsigned notOpc,
- unsigned EAXreg,
- const TargetRegisterClass *RC,
- bool Invert = false) const;
-
- MachineBasicBlock *EmitAtomicBit6432WithCustomInserter(
- MachineInstr *BInstr,
- MachineBasicBlock *BB,
- unsigned regOpcL,
- unsigned regOpcH,
- unsigned immOpcL,
- unsigned immOpcH,
- bool Invert = false) const;
-
- /// Utility function to emit atomic min and max. It takes the min/max
- /// instruction to expand, the associated basic block, and the associated
- /// cmov opcode for moving the min or max value.
- MachineBasicBlock *EmitAtomicMinMaxWithCustomInserter(MachineInstr *BInstr,
- MachineBasicBlock *BB,
- unsigned cmovOpc) const;
+  /// Utility function to emit atomic-load-arith operations (and, or, xor,
+  /// nand, max, min, umax, umin). It takes the corresponding instruction to
+  /// expand and the associated machine basic block.
+ MachineBasicBlock *EmitAtomicLoadArith(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
+
+ /// Utility function to emit atomic-load-arith operations (and, or, xor,
+  /// nand, add, sub, swap) for 64-bit operands on a 32-bit target.
+ MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
+ MachineBasicBlock *MBB) const;
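
  // A sketch, assumed for illustration rather than quoted from the emitter,
  // of what the 6432 expansion must look like given the LCMPXCHG8B operand
  // constraints in X86InstrCompiler.td (Defs = [EAX, EDX], Uses = [EAX, EBX,
  // ECX, EDX]):
  //
  //   EDX:EAX = load [ptr]                  ; current 64-bit value in halves
  // loop:
  //   ECX:EBX = OP val.hi:val.lo, EDX:EAX   ; ADD/ADC, SUB/SBB, MOV for swap
  //   LCMPXCHG8B [ptr]                      ; compares EDX:EAX with memory,
  //                                         ; stores ECX:EBX on success
  //   JNE loop
  // sink:
  //   dst.hi:dst.lo = EDX:EAX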
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *EmitVAARG64WithCustomInserter(
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 43b7c2596d..1296bcbe89 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -482,130 +482,74 @@ def CMOV_RFP80 : I<0, Pseudo,
// Atomic Instruction Pseudo Instructions
//===----------------------------------------------------------------------===//
-// Atomic exchange, and, or, xor
-let Constraints = "$val = $dst", Defs = [EFLAGS],
- usesCustomInserter = 1 in {
-
-def ATOMAND8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMAND8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_and_8 addr:$ptr, GR8:$val))]>;
-def ATOMOR8 : I<0, Pseudo, (outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMOR8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_or_8 addr:$ptr, GR8:$val))]>;
-def ATOMXOR8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMXOR8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_xor_8 addr:$ptr, GR8:$val))]>;
-def ATOMNAND8 : I<0, Pseudo,(outs GR8:$dst),(ins i8mem:$ptr, GR8:$val),
- "#ATOMNAND8 PSEUDO!",
- [(set GR8:$dst, (atomic_load_nand_8 addr:$ptr, GR8:$val))]>;
-
-def ATOMAND16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMAND16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_and_16 addr:$ptr, GR16:$val))]>;
-def ATOMOR16 : I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMOR16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_or_16 addr:$ptr, GR16:$val))]>;
-def ATOMXOR16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMXOR16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_xor_16 addr:$ptr, GR16:$val))]>;
-def ATOMNAND16 : I<0, Pseudo,(outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMNAND16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_nand_16 addr:$ptr, GR16:$val))]>;
-def ATOMMIN16: I<0, Pseudo, (outs GR16:$dst), (ins i16mem:$ptr, GR16:$val),
- "#ATOMMIN16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_min_16 addr:$ptr, GR16:$val))]>;
-def ATOMMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMMAX16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_max_16 addr:$ptr, GR16:$val))]>;
-def ATOMUMIN16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMUMIN16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_umin_16 addr:$ptr, GR16:$val))]>;
-def ATOMUMAX16: I<0, Pseudo, (outs GR16:$dst),(ins i16mem:$ptr, GR16:$val),
- "#ATOMUMAX16 PSEUDO!",
- [(set GR16:$dst, (atomic_load_umax_16 addr:$ptr, GR16:$val))]>;
-
-
-def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMAND32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_and_32 addr:$ptr, GR32:$val))]>;
-def ATOMOR32 : I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMOR32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_or_32 addr:$ptr, GR32:$val))]>;
-def ATOMXOR32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMXOR32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_xor_32 addr:$ptr, GR32:$val))]>;
-def ATOMNAND32 : I<0, Pseudo,(outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMNAND32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_nand_32 addr:$ptr, GR32:$val))]>;
-def ATOMMIN32: I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$ptr, GR32:$val),
- "#ATOMMIN32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_min_32 addr:$ptr, GR32:$val))]>;
-def ATOMMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMMAX32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_max_32 addr:$ptr, GR32:$val))]>;
-def ATOMUMIN32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMUMIN32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_umin_32 addr:$ptr, GR32:$val))]>;
-def ATOMUMAX32: I<0, Pseudo, (outs GR32:$dst),(ins i32mem:$ptr, GR32:$val),
- "#ATOMUMAX32 PSEUDO!",
- [(set GR32:$dst, (atomic_load_umax_32 addr:$ptr, GR32:$val))]>;
-
-
-
-def ATOMAND64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMAND64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_and_64 addr:$ptr, GR64:$val))]>;
-def ATOMOR64 : I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMOR64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_or_64 addr:$ptr, GR64:$val))]>;
-def ATOMXOR64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMXOR64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_xor_64 addr:$ptr, GR64:$val))]>;
-def ATOMNAND64 : I<0, Pseudo,(outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMNAND64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_nand_64 addr:$ptr, GR64:$val))]>;
-def ATOMMIN64: I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$ptr, GR64:$val),
- "#ATOMMIN64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_min_64 addr:$ptr, GR64:$val))]>;
-def ATOMMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMMAX64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_max_64 addr:$ptr, GR64:$val))]>;
-def ATOMUMIN64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMUMIN64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_umin_64 addr:$ptr, GR64:$val))]>;
-def ATOMUMAX64: I<0, Pseudo, (outs GR64:$dst),(ins i64mem:$ptr, GR64:$val),
- "#ATOMUMAX64 PSEUDO!",
- [(set GR64:$dst, (atomic_load_umax_64 addr:$ptr, GR64:$val))]>;
+// Pseudo atomic instructions
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP<string mnemonic> {
+ let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in {
+ def #NAME#8 : I<0, Pseudo, (outs GR8:$dst),
+ (ins i8mem:$ptr, GR8:$val),
+ !strconcat(mnemonic, "8 PSEUDO!"), []>;
+ def #NAME#16 : I<0, Pseudo,(outs GR16:$dst),
+ (ins i16mem:$ptr, GR16:$val),
+ !strconcat(mnemonic, "16 PSEUDO!"), []>;
+ def #NAME#32 : I<0, Pseudo, (outs GR32:$dst),
+ (ins i32mem:$ptr, GR32:$val),
+ !strconcat(mnemonic, "32 PSEUDO!"), []>;
+ def #NAME#64 : I<0, Pseudo, (outs GR64:$dst),
+ (ins i64mem:$ptr, GR64:$val),
+ !strconcat(mnemonic, "64 PSEUDO!"), []>;
+ }
+}
+
+multiclass PSEUDO_ATOMIC_LOAD_BINOP_PATS<string name, string frag> {
+ def : Pat<(!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val),
+ (!cast<Instruction>(name # "8") addr:$ptr, GR8:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val),
+ (!cast<Instruction>(name # "16") addr:$ptr, GR16:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val),
+ (!cast<Instruction>(name # "32") addr:$ptr, GR32:$val)>;
+ def : Pat<(!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val),
+ (!cast<Instruction>(name # "64") addr:$ptr, GR64:$val)>;
}
-let Constraints = "$val1 = $dst1, $val2 = $dst2",
- Defs = [EFLAGS, EAX, EBX, ECX, EDX],
- Uses = [EAX, EBX, ECX, EDX],
- mayLoad = 1, mayStore = 1,
- usesCustomInserter = 1 in {
-def ATOMAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMAND6432 PSEUDO!", []>;
-def ATOMOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMOR6432 PSEUDO!", []>;
-def ATOMXOR6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMXOR6432 PSEUDO!", []>;
-def ATOMNAND6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMNAND6432 PSEUDO!", []>;
-def ATOMADD6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMADD6432 PSEUDO!", []>;
-def ATOMSUB6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMSUB6432 PSEUDO!", []>;
-def ATOMSWAP6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
- (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
- "#ATOMSWAP6432 PSEUDO!", []>;
+// Atomic exchange, and, or, xor
+defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMAND">;
+defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMOR">;
+defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMNAND">;
+defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMAX">;
+defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP<"#ATOMUMIN">;
+
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMAND", "atomic_load_and">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMOR", "atomic_load_or">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMXOR", "atomic_load_xor">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMNAND", "atomic_load_nand">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMAX", "atomic_load_max">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMMIN", "atomic_load_min">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMAX", "atomic_load_umax">;
+defm : PSEUDO_ATOMIC_LOAD_BINOP_PATS<"ATOMUMIN", "atomic_load_umin">;
+
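// For illustration, one defm plus its PATS line above expands, at the 32-bit
// width, to roughly the hand-written form it replaces (modifiers such as
// usesCustomInserter elided):
//   def ATOMAND32 : I<0, Pseudo, (outs GR32:$dst),
//                     (ins i32mem:$ptr, GR32:$val),
//                     "#ATOMAND32 PSEUDO!", []>;
//   def : Pat<(atomic_load_and_32 addr:$ptr, GR32:$val),
//             (ATOMAND32 addr:$ptr, GR32:$val)>;
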
+multiclass PSEUDO_ATOMIC_LOAD_BINOP6432<string mnemonic> {
+ let usesCustomInserter = 1, mayLoad = 1, mayStore = 1 in
+ def #NAME#6432 : I<0, Pseudo, (outs GR32:$dst1, GR32:$dst2),
+ (ins i64mem:$ptr, GR32:$val1, GR32:$val2),
+ !strconcat(mnemonic, "6432 PSEUDO!"), []>;
}
+defm ATOMAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMAND">;
+defm ATOMOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMOR">;
+defm ATOMXOR : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMXOR">;
+defm ATOMNAND : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMNAND">;
+defm ATOMADD : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMADD">;
+defm ATOMSUB : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSUB">;
+defm ATOMMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMAX">;
+defm ATOMMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMMIN">;
+defm ATOMUMAX : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMAX">;
+defm ATOMUMIN : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMUMIN">;
+defm ATOMSWAP : PSEUDO_ATOMIC_LOAD_BINOP6432<"#ATOMSWAP">;
+
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
//===----------------------------------------------------------------------===//
@@ -671,7 +615,7 @@ def #NAME#16mi : Ii16<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmMod, (outs), (ins i16mem :$dst, i16imm :$src2),
!strconcat("lock\n\t", mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [], IIC_ALU_MEM>, OpSize, LOCK;
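// (OpSize here adds the 0x66 operand-size prefix that the 16-bit
// locked-immediate forms were missing; the imm8 variant below receives the
// same fix.)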
def #NAME#32mi : Ii32<{ImmOpc{7}, ImmOpc{6}, ImmOpc{5}, ImmOpc{4},
ImmOpc{3}, ImmOpc{2}, ImmOpc{1}, 1 },
@@ -692,7 +636,7 @@ def #NAME#16mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmMod, (outs), (ins i16mem :$dst, i16i8imm :$src2),
!strconcat("lock\n\t", mnemonic, "{w}\t",
"{$src2, $dst|$dst, $src2}"),
- [], IIC_ALU_MEM>, LOCK;
+ [], IIC_ALU_MEM>, OpSize, LOCK;
def #NAME#32mi8 : Ii8<{ImmOpc8{7}, ImmOpc8{6}, ImmOpc8{5}, ImmOpc8{4},
ImmOpc8{3}, ImmOpc8{2}, ImmOpc8{1}, 1 },
ImmMod, (outs), (ins i32mem :$dst, i32i8imm :$src2),
@@ -717,107 +661,125 @@ defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, "and">;
defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, "xor">;
// Optimized codegen when the non-memory output is not used.
+multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1 in {
-def LOCK_INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst),
- "lock\n\t"
- "inc{b}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_INC16m : I<0xFF, MRM0m, (outs), (ins i16mem:$dst),
- "lock\n\t"
- "inc{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK;
-def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
- "lock\n\t"
- "inc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst),
- "lock\n\t"
- "inc{q}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-
-def LOCK_DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst),
- "lock\n\t"
- "dec{b}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_DEC16m : I<0xFF, MRM1m, (outs), (ins i16mem:$dst),
- "lock\n\t"
- "dec{w}\t$dst", [], IIC_UNARY_MEM>, OpSize, LOCK;
-def LOCK_DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst),
- "lock\n\t"
- "dec{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;
-def LOCK_DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst),
- "lock\n\t"
- "dec{q}\t$dst", [], IIC_UNARY_MEM>, LOCK;
+def #NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
+ !strconcat("lock\n\t", mnemonic, "{b}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def #NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
+ !strconcat("lock\n\t", mnemonic, "{w}\t$dst"),
+ [], IIC_UNARY_MEM>, OpSize, LOCK;
+def #NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
+ !strconcat("lock\n\t", mnemonic, "{l}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+def #NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
+ !strconcat("lock\n\t", mnemonic, "{q}\t$dst"),
+ [], IIC_UNARY_MEM>, LOCK;
+}
}
-// Atomic compare and swap.
-let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX],
- isCodeGenOnly = 1 in
-def LCMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$ptr),
- "lock\n\t"
- "cmpxchg8b\t$ptr",
- [(X86cas8 addr:$ptr)], IIC_CMPX_LOCK_8B>, TB, LOCK;
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "dec">;
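
// As an illustration, the two defms above regenerate the hand-written defs
// removed further up; at the 32-bit width LOCK_INC expands to:
//   def LOCK_INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst),
//                       "lock\n\tinc{l}\t$dst", [], IIC_UNARY_MEM>, LOCK;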
-let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
- isCodeGenOnly = 1 in
-def LCMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$ptr),
- "lock\n\t"
- "cmpxchg16b\t$ptr",
- [(X86cas16 addr:$ptr)], IIC_CMPX_LOCK_16B>, TB, LOCK,
- Requires<[HasCmpxchg16b]>;
-
-let Defs = [AL, EFLAGS], Uses = [AL], isCodeGenOnly = 1 in {
-def LCMPXCHG8 : I<0xB0, MRMDestMem, (outs), (ins i8mem:$ptr, GR8:$swap),
- "lock\n\t"
- "cmpxchg{b}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR8:$swap, 1)], IIC_CMPX_LOCK_8>, TB, LOCK;
+// Atomic compare and swap.
+multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
+ SDPatternOperator frag, X86MemOperand x86memop,
+ InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+ def #NAME# : I<Opc, Form, (outs), (ins x86memop:$ptr),
+ !strconcat("lock\n\t", mnemonic, "\t$ptr"),
+ [(frag addr:$ptr)], itin>, TB, LOCK;
+}
}
-let Defs = [AX, EFLAGS], Uses = [AX], isCodeGenOnly = 1 in {
-def LCMPXCHG16 : I<0xB1, MRMDestMem, (outs), (ins i16mem:$ptr, GR16:$swap),
- "lock\n\t"
- "cmpxchg{w}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR16:$swap, 2)], IIC_CMPX_LOCK>, TB, OpSize, LOCK;
+multiclass LCMPXCHG_BinOp<bits<8> Opc8, bits<8> Opc, Format Form,
+ string mnemonic, SDPatternOperator frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+let isCodeGenOnly = 1 in {
+ let Defs = [AL, EFLAGS], Uses = [AL] in
+ def #NAME#8 : I<Opc8, Form, (outs), (ins i8mem:$ptr, GR8:$swap),
+ !strconcat("lock\n\t", mnemonic,
+ "{b}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR8:$swap, 1)], itin8>, TB, LOCK;
+ let Defs = [AX, EFLAGS], Uses = [AX] in
+ def #NAME#16 : I<Opc, Form, (outs), (ins i16mem:$ptr, GR16:$swap),
+ !strconcat("lock\n\t", mnemonic,
+ "{w}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR16:$swap, 2)], itin>, TB, OpSize, LOCK;
+ let Defs = [EAX, EFLAGS], Uses = [EAX] in
+ def #NAME#32 : I<Opc, Form, (outs), (ins i32mem:$ptr, GR32:$swap),
+ !strconcat("lock\n\t", mnemonic,
+ "{l}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR32:$swap, 4)], itin>, TB, LOCK;
+ let Defs = [RAX, EFLAGS], Uses = [RAX] in
+ def #NAME#64 : RI<Opc, Form, (outs), (ins i64mem:$ptr, GR64:$swap),
+ !strconcat("lock\n\t", mnemonic,
+ "{q}\t{$swap, $ptr|$ptr, $swap}"),
+ [(frag addr:$ptr, GR64:$swap, 8)], itin>, TB, LOCK;
+}
}
-let Defs = [EAX, EFLAGS], Uses = [EAX], isCodeGenOnly = 1 in {
-def LCMPXCHG32 : I<0xB1, MRMDestMem, (outs), (ins i32mem:$ptr, GR32:$swap),
- "lock\n\t"
- "cmpxchg{l}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR32:$swap, 4)], IIC_CMPX_LOCK>, TB, LOCK;
+let Defs = [EAX, EDX, EFLAGS], Uses = [EAX, EBX, ECX, EDX] in {
+defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
+ X86cas8, i64mem,
+ IIC_CMPX_LOCK_8B>;
}
-let Defs = [RAX, EFLAGS], Uses = [RAX], isCodeGenOnly = 1 in {
-def LCMPXCHG64 : RI<0xB1, MRMDestMem, (outs), (ins i64mem:$ptr, GR64:$swap),
- "lock\n\t"
- "cmpxchg{q}\t{$swap, $ptr|$ptr, $swap}",
- [(X86cas addr:$ptr, GR64:$swap, 8)], IIC_CMPX_LOCK>, TB, LOCK;
+let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX],
+ Predicates = [HasCmpxchg16b] in {
+defm LCMPXCHG16B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg16b",
+ X86cas16, i128mem,
+ IIC_CMPX_LOCK_16B>, REX_W;
}
+defm LCMPXCHG : LCMPXCHG_BinOp<0xB0, 0xB1, MRMDestMem, "cmpxchg",
+ X86cas, IIC_CMPX_LOCK_8, IIC_CMPX_LOCK>;
+
// Atomic exchange and add
-let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
-def LXADD8 : I<0xC0, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
- "lock\n\t"
- "xadd{b}\t{$val, $ptr|$ptr, $val}",
- [(set GR8:$dst, (atomic_load_add_8 addr:$ptr, GR8:$val))],
- IIC_XADD_LOCK_MEM8>,
- TB, LOCK;
-def LXADD16 : I<0xC1, MRMSrcMem, (outs GR16:$dst), (ins GR16:$val, i16mem:$ptr),
- "lock\n\t"
- "xadd{w}\t{$val, $ptr|$ptr, $val}",
- [(set GR16:$dst, (atomic_load_add_16 addr:$ptr, GR16:$val))],
- IIC_XADD_LOCK_MEM>,
- TB, OpSize, LOCK;
-def LXADD32 : I<0xC1, MRMSrcMem, (outs GR32:$dst), (ins GR32:$val, i32mem:$ptr),
- "lock\n\t"
- "xadd{l}\t{$val, $ptr|$ptr, $val}",
- [(set GR32:$dst, (atomic_load_add_32 addr:$ptr, GR32:$val))],
- IIC_XADD_LOCK_MEM>,
- TB, LOCK;
-def LXADD64 : RI<0xC1, MRMSrcMem, (outs GR64:$dst), (ins GR64:$val,i64mem:$ptr),
- "lock\n\t"
- "xadd{q}\t{$val, $ptr|$ptr, $val}",
- [(set GR64:$dst, (atomic_load_add_64 addr:$ptr, GR64:$val))],
- IIC_XADD_LOCK_MEM>,
- TB, LOCK;
+multiclass ATOMIC_LOAD_BINOP<bits<8> opc8, bits<8> opc, string mnemonic,
+ string frag,
+ InstrItinClass itin8, InstrItinClass itin> {
+ let Constraints = "$val = $dst", Defs = [EFLAGS], isCodeGenOnly = 1 in {
+ def #NAME#8 : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat("lock\n\t", mnemonic,
+ "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin8>;
+ def #NAME#16 : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat("lock\n\t", mnemonic,
+ "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def #NAME#32 : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat("lock\n\t", mnemonic,
+ "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def #NAME#64 : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat("lock\n\t", mnemonic,
+ "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
}
+defm LXADD : ATOMIC_LOAD_BINOP<0xc0, 0xc1, "xadd", "atomic_load_add",
+ IIC_XADD_LOCK_MEM8, IIC_XADD_LOCK_MEM>,
+ TB, LOCK;
+
def ACQUIRE_MOV8rm : I<0, Pseudo, (outs GR8 :$dst), (ins i8mem :$src),
"#ACQUIRE_MOV PSEUDO!",
[(set GR8:$dst, (atomic_load_8 addr:$src))]>;
@@ -1017,7 +979,24 @@ def : Pat<(X86call (i64 tglobaladdr:$dst)),
def : Pat<(X86call (i64 texternalsym:$dst)),
(CALL64pcrel32 texternalsym:$dst)>, Requires<[NotNaCl]>;
-// tailcall stuff
+// Tail calls. The TCRETURN instructions execute after the epilog, so they
+// can never use callee-saved registers. That is the purpose of the GR64_TC
+// register classes.
+//
+// The only volatile register that is never used by the calling convention is
+// %r11: in the worst case, a vararg call with six register arguments uses
+// every other volatile register.
+//
+// Match an X86tcret that uses fewer than 7 volatile registers.
+def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
+ (X86tcret node:$ptr, node:$off), [{
+ // X86tcret args: (*chain, ptr, imm, regs..., glue)
+ unsigned NumRegs = 0;
+ for (unsigned i = 3, e = N->getNumOperands(); i != e; ++i)
+ if (isa<RegisterSDNode>(N->getOperand(i)) && ++NumRegs > 6)
+ return false;
+ return true;
+}]>;
+
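// For reference (SysV AMD64; this is the worst case the comment above is
// counting): RDI, RSI, RDX, RCX, R8 and R9 carry six integer arguments, and
// a vararg call additionally fixes AL/RAX, leaving only %r11 free.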
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In32BitMode]>;
@@ -1041,7 +1020,9 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
Requires<[In64BitMode, NotNaCl]>;
-def : Pat<(X86tcret (load addr:$dst), imm:$off),
+// Don't fold loads into X86tcret requiring more than 6 regs.
+// There wouldn't be enough scratch registers for base+index.
+def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
(TCRETURNmi64 addr:$dst, imm:$off)>,
Requires<[In64BitMode, NotNaCl]>;
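
// Hypothetical illustration, not taken from this patch: a folded base+index
// form such as "jmpq *(%r11,%rax,8)" needs two free volatile registers, and
// once six register arguments plus the vararg RAX convention are in play
// only %r11 is left.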
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 56638002d8..959d91a9ab 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -42,7 +42,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
- VR256:$src3)))]>;
+ VR256:$src3)))]>, VEX_L;
let mayLoad = 1 in
def mY : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
@@ -51,7 +51,7 @@ multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
[(set VR256:$dst,
(OpVT256 (Op VR256:$src2, VR256:$src1,
- (MemFrag256 addr:$src3))))]>;
+ (MemFrag256 addr:$src3))))]>, VEX_L;
}
} // Constraints = "$src1 = $dst"
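
// (VEX_L sets the VEX.L vector-length bit that marks these as 256-bit YMM
// forms; the same one-flag fix repeats for the FMA4 and SSE/AVX 256-bit
// patterns below.)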
@@ -280,19 +280,19 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_W, MemOp4;
+ VEX_W, MemOp4, VEX_L;
def rmY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
- (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4;
+ (ld_frag256 addr:$src3)))]>, VEX_W, MemOp4, VEX_L;
def mrY : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR256:$dst,
- (OpNode VR256:$src1, (ld_frag256 addr:$src2), VR256:$src3))]>;
+ [(set VR256:$dst, (OpNode VR256:$src1,
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
// For disassembler
let isCodeGenOnly = 1 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
@@ -302,7 +302,8 @@ let isCodeGenOnly = 1 in {
def rrY_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>;
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
+ VEX_L;
} // isCodeGenOnly = 1
}
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index bad694a04e..e595876dcf 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -2271,7 +2271,7 @@ static X86::CondCode getCondFromSETOpc(unsigned Opc) {
}
/// getCondFromCmovOpc - return condition code of a CMov opcode.
-static X86::CondCode getCondFromCMovOpc(unsigned Opc) {
+X86::CondCode X86::getCondFromCMovOpc(unsigned Opc) {
switch (Opc) {
default: return X86::COND_INVALID;
case X86::CMOVA16rm: case X86::CMOVA16rr: case X86::CMOVA32rm:
@@ -3133,11 +3133,19 @@ inline static bool isDefConvertible(MachineInstr *MI) {
case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr:
case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm:
case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm:
+ case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r:
+ case X86::DEC64m: case X86::DEC32m: case X86::DEC16m: case X86::DEC8m:
+ case X86::DEC64_32r: case X86::DEC64_16r:
+ case X86::DEC64_32m: case X86::DEC64_16m:
case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri:
case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8:
case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr:
case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm:
case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm:
+ case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r:
+ case X86::INC64m: case X86::INC32m: case X86::INC16m: case X86::INC8m:
+ case X86::INC64_32r: case X86::INC64_16r:
+ case X86::INC64_32m: case X86::INC64_16m:
case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri:
case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8:
case X86::AND8ri: case X86::AND64rr: case X86::AND32rr:
@@ -3312,7 +3320,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
if (OldCC != X86::COND_INVALID)
OpcIsSET = true;
else
- OldCC = getCondFromCMovOpc(Instr.getOpcode());
+ OldCC = X86::getCondFromCMovOpc(Instr.getOpcode());
}
if (OldCC == X86::COND_INVALID) return false;
}
@@ -3377,12 +3385,14 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2,
Sub->getParent()->insert(MachineBasicBlock::iterator(Sub), Movr0Inst);
}
- // Make sure Sub instruction defines EFLAGS.
+  // Make sure the Sub instruction defines EFLAGS, and mark the def live.
+ unsigned LastOperand = Sub->getNumOperands() - 1;
assert(Sub->getNumOperands() >= 2 &&
- Sub->getOperand(Sub->getNumOperands()-1).isReg() &&
- Sub->getOperand(Sub->getNumOperands()-1).getReg() == X86::EFLAGS &&
+ Sub->getOperand(LastOperand).isReg() &&
+ Sub->getOperand(LastOperand).getReg() == X86::EFLAGS &&
"EFLAGS should be the last operand of SUB, ADD, OR, XOR, AND");
- Sub->getOperand(Sub->getNumOperands()-1).setIsDef(true);
+ Sub->getOperand(LastOperand).setIsDef(true);
+ Sub->getOperand(LastOperand).setIsDead(false);
CmpInstr->eraseFromParent();
// Modify the condition code of instructions in OpsToUpdate.
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index b6f69af037..260f054d69 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -61,6 +61,9 @@ namespace X86 {
// Turn condition code into conditional branch opcode.
unsigned GetCondBranchFromCond(CondCode CC);
+ // Turn CMov opcode into condition code.
+ CondCode getCondFromCMovOpc(unsigned Opc);
+
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(X86::CondCode CC);
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index d203937fe0..4fce5acc23 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -1282,28 +1282,46 @@ def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
// Atomic support
//
-
// Atomic swap. These are just normal xchg instructions. But since a memory
// operand is referenced, the atomicity is ensured.
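// (xchg with a memory operand asserts the bus lock implicitly, so unlike the
// LOCK-prefixed instructions above no explicit prefix is required.)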
+multiclass ATOMIC_SWAP<bits<8> opc8, bits<8> opc, string mnemonic, string frag,
+ InstrItinClass itin> {
+ let Constraints = "$val = $dst" in {
+ def #NAME#8rm : I<opc8, MRMSrcMem, (outs GR8:$dst),
+ (ins GR8:$val, i8mem:$ptr),
+ !strconcat(mnemonic, "{b}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR8:$dst,
+ (!cast<PatFrag>(frag # "_8") addr:$ptr, GR8:$val))],
+ itin>;
+ def #NAME#16rm : I<opc, MRMSrcMem, (outs GR16:$dst),
+ (ins GR16:$val, i16mem:$ptr),
+ !strconcat(mnemonic, "{w}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR16:$dst,
+ (!cast<PatFrag>(frag # "_16") addr:$ptr, GR16:$val))],
+ itin>, OpSize;
+ def #NAME#32rm : I<opc, MRMSrcMem, (outs GR32:$dst),
+ (ins GR32:$val, i32mem:$ptr),
+ !strconcat(mnemonic, "{l}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR32:$dst,
+ (!cast<PatFrag>(frag # "_32") addr:$ptr, GR32:$val))],
+ itin>;
+ def #NAME#64rm : RI<opc, MRMSrcMem, (outs GR64:$dst),
+ (ins GR64:$val, i64mem:$ptr),
+ !strconcat(mnemonic, "{q}\t{$val, $ptr|$ptr, $val}"),
+ [(set
+ GR64:$dst,
+ (!cast<PatFrag>(frag # "_64") addr:$ptr, GR64:$val))],
+ itin>;
+ }
+}
+
+defm XCHG : ATOMIC_SWAP<0x86, 0x87, "xchg", "atomic_swap", IIC_XCHG_MEM>;
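// For illustration, the defm above regenerates the removed hand-written defs;
// at the 32-bit width it expands to roughly:
//   def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),
//                    (ins GR32:$val, i32mem:$ptr),
//                    "xchg{l}\t{$val, $ptr|$ptr, $val}",
//                    [(set GR32:$dst,
//                          (atomic_swap_32 addr:$ptr, GR32:$val))],
//                    IIC_XCHG_MEM>;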
+
+// Swap between registers.
let Constraints = "$val = $dst" in {
-def XCHG8rm : I<0x86, MRMSrcMem, (outs GR8:$dst), (ins GR8:$val, i8mem:$ptr),
- "xchg{b}\t{$val, $ptr|$ptr, $val}",
- [(set GR8:$dst, (atomic_swap_8 addr:$ptr, GR8:$val))],
- IIC_XCHG_MEM>;
-def XCHG16rm : I<0x87, MRMSrcMem, (outs GR16:$dst),(ins GR16:$val, i16mem:$ptr),
- "xchg{w}\t{$val, $ptr|$ptr, $val}",
- [(set GR16:$dst, (atomic_swap_16 addr:$ptr, GR16:$val))],
- IIC_XCHG_MEM>,
- OpSize;
-def XCHG32rm : I<0x87, MRMSrcMem, (outs GR32:$dst),(ins GR32:$val, i32mem:$ptr),
- "xchg{l}\t{$val, $ptr|$ptr, $val}",
- [(set GR32:$dst, (atomic_swap_32 addr:$ptr, GR32:$val))],
- IIC_XCHG_MEM>;
-def XCHG64rm : RI<0x87, MRMSrcMem, (outs GR64:$dst),(ins GR64:$val,i64mem:$ptr),
- "xchg{q}\t{$val, $ptr|$ptr, $val}",
- [(set GR64:$dst, (atomic_swap_64 addr:$ptr, GR64:$val))],
- IIC_XCHG_MEM>;
-
def XCHG8rr : I<0x86, MRMSrcReg, (outs GR8:$dst), (ins GR8:$val, GR8:$src),
"xchg{b}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
def XCHG16rr : I<0x87, MRMSrcReg, (outs GR16:$dst), (ins GR16:$val, GR16:$src),
@@ -1314,6 +1332,7 @@ def XCHG64rr : RI<0x87, MRMSrcReg, (outs GR64:$dst), (ins GR64:$val,GR64:$src),
"xchg{q}\t{$val, $src|$src, $val}", [], IIC_XCHG_REG>;
}
+// Swap between the accumulator (AX/EAX/RAX) and other registers.
def XCHG16ar : I<0x90, AddRegFrm, (outs), (ins GR16:$src),
"xchg{w}\t{$src, %ax|AX, $src}", [], IIC_XCHG_REG>, OpSize;
def XCHG32ar : I<0x90, AddRegFrm, (outs), (ins GR32:$src),
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 17e91a6efb..2aa4f3f4db 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -813,16 +813,16 @@ defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
- TB, VEX;
+ TB, VEX, VEX_L;
defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
"movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
- TB, OpSize, VEX;
+ TB, OpSize, VEX, VEX_L;
defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
"movups", SSEPackedSingle, SSE_MOVU_ITINS>,
- TB, VEX;
+ TB, VEX, VEX_L;
defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
"movupd", SSEPackedDouble, SSE_MOVU_ITINS, 0>,
- TB, OpSize, VEX;
+ TB, OpSize, VEX, VEX_L;
defm MOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
"movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
TB;
@@ -855,19 +855,19 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
[(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVA_P_MR>, VEX;
+ IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
[(store (v8f32 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movupd\t{$src, $dst|$dst, $src}",
[(store (v4f64 VR256:$src), addr:$dst)],
- IIC_SSE_MOVU_P_MR>, VEX;
+ IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
// For disassembler
let isCodeGenOnly = 1 in {
@@ -890,19 +890,19 @@ let isCodeGenOnly = 1 in {
def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movaps\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movapd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>, VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movups\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
(ins VR256:$src),
"movupd\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>, VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -1659,7 +1659,7 @@ defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, VR256, i256mem,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, SSE_CVT_PS>,
- TB, VEX, Requires<[HasAVX]>;
+ TB, VEX, VEX_L, Requires<[HasAVX]>;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, VR128, i128mem,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
@@ -1814,12 +1814,12 @@ def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2dq_256 (memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
@@ -1853,7 +1853,7 @@ def VCVTPD2DQXrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX;
+ (int_x86_avx_cvt_pd2dq_256 VR256:$src))]>, VEX, VEX_L;
def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -1889,12 +1889,12 @@ def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtt_ps2dq_256 VR256:$src))],
- IIC_SSE_CVT_PS_RR>, VEX;
+ IIC_SSE_CVT_PS_RR>, VEX, VEX_L;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx_cvtt_ps2dq_256
(memopv8f32 addr:$src)))],
- IIC_SSE_CVT_PS_RM>, VEX;
+ IIC_SSE_CVT_PS_RM>, VEX, VEX_L;
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
@@ -1974,7 +1974,7 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvtt_pd2dq_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2015,12 +2015,12 @@ def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 VR128:$src))],
- IIC_SSE_CVT_PD_RR>, TB, VEX;
+ IIC_SSE_CVT_PD_RR>, TB, VEX, VEX_L;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvt_ps2_pd_256 (memopv4f32 addr:$src)))],
- IIC_SSE_CVT_PD_RM>, TB, VEX;
+ IIC_SSE_CVT_PD_RM>, TB, VEX, VEX_L;
}
let Predicates = [UseSSE2] in {
@@ -2048,11 +2048,11 @@ def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(int_x86_avx_cvtdq2_pd_256
- (bitconvert (memopv2i64 addr:$src))))]>, VEX;
+ (bitconvert (memopv2i64 addr:$src))))]>, VEX, VEX_L;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX;
+ (int_x86_avx_cvtdq2_pd_256 VR128:$src))]>, VEX, VEX_L;
}
let neverHasSideEffects = 1, mayLoad = 1 in
@@ -2095,7 +2095,7 @@ def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(int_x86_avx_cvt_pd2_ps_256 VR256:$src))],
- IIC_SSE_CVT_PD_RR>, VEX;
+ IIC_SSE_CVT_PD_RR>, VEX, VEX_L;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
@@ -2329,11 +2329,11 @@ defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
@@ -2403,13 +2403,13 @@ defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
memopv4f32, SSEPackedSingle>, TB, VEX_4V;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- memopv8f32, SSEPackedSingle>, TB, VEX_4V;
+ memopv8f32, SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
memopv2f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src2, $src2, $src3}",
- memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V;
+ memopv4f64, SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2503,16 +2503,16 @@ defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, memopv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, memopv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, memopv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, TB, VEX_4V;
+ SSEPackedSingle>, TB, VEX_4V, VEX_L;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, memopv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, TB, OpSize, VEX_4V;
+ SSEPackedDouble>, TB, OpSize, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2589,10 +2589,11 @@ let Predicates = [HasAVX] in {
"movmskpd", SSEPackedDouble>, TB,
OpSize, VEX;
defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_ps_256,
- "movmskps", SSEPackedSingle>, TB, VEX;
+ "movmskps", SSEPackedSingle>, TB,
+ VEX, VEX_L;
defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, int_x86_avx_movmsk_pd_256,
"movmskpd", SSEPackedDouble>, TB,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
def : Pat<(i32 (X86fgetsign FR32:$src)),
(VMOVMSKPSrr32 (COPY_TO_REGCLASS FR32:$src, VR128))>;
@@ -2613,11 +2614,11 @@ let Predicates = [HasAVX] in {
OpSize, VEX;
def VMOVMSKPSYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
"movmskps\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
- SSEPackedSingle>, TB, VEX;
+ SSEPackedSingle>, TB, VEX, VEX_L;
def VMOVMSKPDYr64r : PI<0x50, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
"movmskpd\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVMSK,
SSEPackedDouble>, TB,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
}
defm MOVMSKPS : sse12_extr_sign_mask<VR128, int_x86_sse_movmsk_ps, "movmskps",
@@ -2695,13 +2696,13 @@ defm PANDN : PDI_binop_rm<0xDF, "pandn", X86andnp, v2i64, VR128, memopv2i64,
let Predicates = [HasAVX2] in {
defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPANDNY : PDI_binop_rm<0xDF, "vpandn", X86andnp, v4i64, VR256, memopv4i64,
- i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_BIT_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
//===----------------------------------------------------------------------===//
@@ -2787,7 +2788,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "ps"), f256mem,
[(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V;
+ (memopv4i64 addr:$src2)))], 0>, TB, VEX_4V, VEX_L;
defm PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
@@ -2795,7 +2796,7 @@ multiclass sse12_fp_packed_logical_y<bits<8> opc, string OpcodeStr,
(bc_v4i64 (v4f64 VR256:$src2))))],
[(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
(memopv4i64 addr:$src2)))], 0>,
- TB, OpSize, VEX_4V;
+ TB, OpSize, VEX_4V, VEX_L;
}
// AVX 256-bit packed logical ops forms
@@ -2854,10 +2855,10 @@ multiclass basic_sse12_fp_binop_p_y<bits<8> opc, string OpcodeStr,
SizeItins itins> {
defm PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, VR256,
v8f32, f256mem, memopv8f32, SSEPackedSingle, itins.s, 0>,
- TB;
+ TB, VEX_L;
defm PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, VR256,
v4f64, f256mem, memopv4f64, SSEPackedDouble, itins.d, 0>,
- TB, OpSize;
+ TB, OpSize, VEX_L;
}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
@@ -2889,11 +2890,11 @@ multiclass basic_sse12_fp_binop_p_y_int<bits<8> opc, string OpcodeStr,
SizeItins itins> {
defm PSY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
!strconcat(OpcodeStr, "ps"), "avx", "_ps_256", f256mem, memopv8f32,
- SSEPackedSingle, itins.s, 0>, TB;
+ SSEPackedSingle, itins.s, 0>, TB, VEX_L;
defm PDY : sse12_fp_packed_int<opc, OpcodeStr, VR256,
!strconcat(OpcodeStr, "pd"), "avx", "_pd_256", f256mem, memopv4f64,
- SSEPackedDouble, itins.d, 0>, TB, OpSize;
+ SSEPackedDouble, itins.d, 0>, TB, OpSize, VEX_L;
}
// Binary Arithmetic instructions
@@ -3063,11 +3064,11 @@ multiclass sse1_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
def PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (memopv8f32 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
/// sse1_fp_unop_p_int - SSE1 intrinsics unops in packed forms.
@@ -3089,11 +3090,11 @@ multiclass sse1_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
def PSYr_Int : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V4F32Int VR256:$src))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PSYm_Int : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "ps\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V4F32Int (memopv8f32 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
/// sse2_fp_unop_s - SSE2 unops in scalar form.
@@ -3149,11 +3150,11 @@ multiclass sse2_fp_unop_p_y<bits<8> opc, string OpcodeStr, SDNode OpNode,
def PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (OpNode (memopv4f64 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
/// sse2_fp_unop_p_int - SSE2 intrinsic unops in vector forms.
@@ -3175,11 +3176,11 @@ multiclass sse2_fp_unop_p_y_int<bits<8> opc, string OpcodeStr,
def PDYr_Int : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V2F64Int VR256:$src))],
- itins.rr>;
+ itins.rr>, VEX_L;
def PDYm_Int : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "pd\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst, (V2F64Int (memopv4f64 addr:$src)))],
- itins.rm>;
+ itins.rm>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -3331,20 +3332,20 @@ let AddedComplexity = 400 in { // Prefer non-temporal versions
"movntps\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v8f32 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_L;
def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntpd\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4f64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_L;
let ExeDomain = SSEPackedInt in
def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src),
"movntdq\t{$src, $dst|$dst, $src}",
[(alignednontemporalstore (v4i64 VR256:$src),
addr:$dst)],
- IIC_SSE_MOVNT>, VEX;
+ IIC_SSE_MOVNT>, VEX, VEX_L;
}
let AddedComplexity = 400 in { // Prefer non-temporal versions
@@ -3453,14 +3454,14 @@ def VMOVDQArr : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
VEX;
def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
- VEX;
+ VEX, VEX_L;
}
def VMOVDQUrr : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
VEX;
def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
- VEX;
+ VEX, VEX_L;
// For Disassembler
let isCodeGenOnly = 1 in {
@@ -3470,16 +3471,14 @@ def VMOVDQArr_REV : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
VEX;
def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVA_P_RR>,
- VEX;
+ IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
def VMOVDQUrr_REV : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
IIC_SSE_MOVU_P_RR>,
VEX;
def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
"movdqu\t{$src, $dst|$dst, $src}", [],
- IIC_SSE_MOVU_P_RR>,
- VEX;
+ IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
}
let canFoldAsLoad = 1, mayLoad = 1 in {
@@ -3488,14 +3487,14 @@ def VMOVDQArm : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
VEX;
def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
- VEX;
+ VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUrm : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
XS, VEX;
def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
- XS, VEX;
+ XS, VEX, VEX_L;
}
}
@@ -3507,14 +3506,14 @@ def VMOVDQAmr : VPDI<0x7F, MRMDestMem, (outs),
def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src),
"movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
- VEX;
+ VEX, VEX_L;
let Predicates = [HasAVX] in {
def VMOVDQUmr : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
XS, VEX;
def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
"vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
- XS, VEX;
+ XS, VEX, VEX_L;
}
}
@@ -3750,82 +3749,82 @@ defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw,
let Predicates = [HasAVX2] in {
defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64,
- i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTALUQ_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64,
- i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64,
- i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64,
- i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V;
+ i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
// Intrinsic forms
defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDSWY : PDI_binop_rm_int<0xED, "vpaddsw" , int_x86_avx2_padds_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDUSBY : PDI_binop_rm_int<0xDC, "vpaddusb", int_x86_avx2_paddus_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPADDUSWY : PDI_binop_rm_int<0xDD, "vpaddusw", int_x86_avx2_paddus_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULHUWY : PDI_binop_rm_int<0xE4, "vpmulhuw", int_x86_avx2_pmulhu_w,
VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMULHWY : PDI_binop_rm_int<0xE5, "vpmulhw" , int_x86_avx2_pmulh_w,
VR256, memopv4i64, i256mem,
- SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMADDWDY : PDI_binop_rm_int<0xF5, "vpmaddwd", int_x86_avx2_pmadd_wd,
VR256, memopv4i64, i256mem,
- SSE_PMADD, 1, 0>, VEX_4V;
+ SSE_PMADD, 1, 0>, VEX_4V, VEX_L;
defm VPAVGBY : PDI_binop_rm_int<0xE0, "vpavgb", int_x86_avx2_pavg_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPAVGWY : PDI_binop_rm_int<0xE3, "vpavgw", int_x86_avx2_pavg_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMINUBY : PDI_binop_rm_int<0xDA, "vpminub", int_x86_avx2_pminu_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMINSWY : PDI_binop_rm_int<0xEA, "vpminsw", int_x86_avx2_pmins_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMAXUBY : PDI_binop_rm_int<0xDE, "vpmaxub", int_x86_avx2_pmaxu_b,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPMAXSWY : PDI_binop_rm_int<0xEE, "vpmaxsw", int_x86_avx2_pmaxs_w,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPSADBWY : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_avx2_psad_bw,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -3961,30 +3960,30 @@ let ExeDomain = SSEPackedInt in {
let Predicates = [HasAVX2] in {
defm VPSLLWY : PDI_binop_rmi<0xF1, 0x71, MRM6r, "vpsllw", X86vshl, X86vshli,
VR256, v16i16, v8i16, bc_v8i16,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLDY : PDI_binop_rmi<0xF2, 0x72, MRM6r, "vpslld", X86vshl, X86vshli,
VR256, v8i32, v4i32, bc_v4i32,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSLLQY : PDI_binop_rmi<0xF3, 0x73, MRM6r, "vpsllq", X86vshl, X86vshli,
VR256, v4i64, v2i64, bc_v2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLWY : PDI_binop_rmi<0xD1, 0x71, MRM2r, "vpsrlw", X86vsrl, X86vsrli,
VR256, v16i16, v8i16, bc_v8i16,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLDY : PDI_binop_rmi<0xD2, 0x72, MRM2r, "vpsrld", X86vsrl, X86vsrli,
VR256, v8i32, v4i32, bc_v4i32,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRLQY : PDI_binop_rmi<0xD3, 0x73, MRM2r, "vpsrlq", X86vsrl, X86vsrli,
VR256, v4i64, v2i64, bc_v2i64,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRAWY : PDI_binop_rmi<0xE1, 0x71, MRM4r, "vpsraw", X86vsra, X86vsrai,
VR256, v16i16, v8i16, bc_v8i16,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
defm VPSRADY : PDI_binop_rmi<0xE2, 0x72, MRM4r, "vpsrad", X86vsra, X86vsrai,
VR256, v8i32, v4i32, bc_v4i32,
- SSE_INTSHIFT_ITINS_P, 0>, VEX_4V;
+ SSE_INTSHIFT_ITINS_P, 0>, VEX_4V, VEX_L;
let ExeDomain = SSEPackedInt in {
// 256-bit logical shifts.
@@ -3993,13 +3992,13 @@ let ExeDomain = SSEPackedInt in {
"vpslldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR256:$dst,
(int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
def VPSRLDQYri : PDIi8<0x73, MRM3r,
(outs VR256:$dst), (ins VR256:$src1, i32i8imm:$src2),
"vpsrldq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR256:$dst,
(int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
// PSRADQYri doesn't exist in SSE[1-3].
}
} // Predicates = [HasAVX2]
@@ -4113,22 +4112,22 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
defm VPCMPEQBY : PDI_binop_rm<0x74, "vpcmpeqb", X86pcmpeq, v32i8,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPCMPEQWY : PDI_binop_rm<0x75, "vpcmpeqw", X86pcmpeq, v16i16,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPCMPEQDY : PDI_binop_rm<0x76, "vpcmpeqd", X86pcmpeq, v8i32,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 1, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L;
defm VPCMPGTBY : PDI_binop_rm<0x64, "vpcmpgtb", X86pcmpgt, v32i8,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPCMPGTWY : PDI_binop_rm<0x65, "vpcmpgtw", X86pcmpgt, v16i16,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPCMPGTDY : PDI_binop_rm<0x66, "vpcmpgtd", X86pcmpgt, v8i32,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -4171,13 +4170,13 @@ defm VPACKUSWB : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_sse2_packuswb_128,
let Predicates = [HasAVX2] in {
defm VPACKSSWBY : PDI_binop_rm_int<0x63, "vpacksswb", int_x86_avx2_packsswb,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPACKSSDWY : PDI_binop_rm_int<0x6B, "vpackssdw", int_x86_avx2_packssdw,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
defm VPACKUSWBY : PDI_binop_rm_int<0x67, "vpackuswb", int_x86_avx2_packuswb,
VR256, memopv4i64, i256mem,
- SSE_INTALU_ITINS_P, 0, 0>, VEX_4V;
+ SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -4247,9 +4246,12 @@ let Predicates = [HasAVX] in {
}
let Predicates = [HasAVX2] in {
- defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>, TB, OpSize, VEX;
- defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>, XS, VEX;
- defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>, XD, VEX;
+ defm VPSHUFD : sse2_pshuffle_y<"vpshufd", v8i32, X86PShufd>,
                                 TB, OpSize, VEX, VEX_L;
+ defm VPSHUFHW : sse2_pshuffle_y<"vpshufhw", v16i16, X86PShufhw>,
+ XS, VEX, VEX_L;
+ defm VPSHUFLW : sse2_pshuffle_y<"vpshuflw", v16i16, X86PShuflw>,
+ XD, VEX, VEX_L;
}
let Predicates = [UseSSE2] in {
@@ -4328,22 +4330,22 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl,
- bc_v32i8>, VEX_4V;
+ bc_v32i8>, VEX_4V, VEX_L;
defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl,
- bc_v16i16>, VEX_4V;
+ bc_v16i16>, VEX_4V, VEX_L;
defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl,
- bc_v8i32>, VEX_4V;
+ bc_v8i32>, VEX_4V, VEX_L;
defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl,
- bc_v4i64>, VEX_4V;
+ bc_v4i64>, VEX_4V, VEX_L;
defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh,
- bc_v32i8>, VEX_4V;
+ bc_v32i8>, VEX_4V, VEX_L;
defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh,
- bc_v16i16>, VEX_4V;
+ bc_v16i16>, VEX_4V, VEX_L;
defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh,
- bc_v8i32>, VEX_4V;
+ bc_v8i32>, VEX_4V, VEX_L;
defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh,
- bc_v4i64>, VEX_4V;
+ bc_v4i64>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -4435,9 +4437,9 @@ def VPMOVMSKBr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR128:$src),
let Predicates = [HasAVX2] in {
def VPMOVMSKBYrr : VPDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR256:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX;
+ [(set GR32:$dst, (int_x86_avx2_pmovmskb VR256:$src))]>, VEX, VEX_L;
def VPMOVMSKBYr64r : VPDI<0xD7, MRMSrcReg, (outs GR64:$dst), (ins VR256:$src),
- "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX;
+ "pmovmskb\t{$src, $dst|$dst, $src}", []>, VEX, VEX_L;
}
def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32:$dst), (ins VR128:$src),
@@ -4900,9 +4902,9 @@ let Predicates = [HasAVX] in {
defm VMOVSLDUP : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
v4f32, VR128, memopv4f32, f128mem>, VEX;
defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
- v8f32, VR256, memopv8f32, f256mem>, VEX;
+ v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
- v8f32, VR256, memopv8f32, f256mem>, VEX;
+ v8f32, VR256, memopv8f32, f256mem>, VEX, VEX_L;
}
defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
memopv4f32, f128mem>;
@@ -4970,7 +4972,7 @@ def rm : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
let Predicates = [HasAVX] in {
defm VMOVDDUP : sse3_replicate_dfp<"vmovddup">, VEX;
- defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX;
+ defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
}
defm MOVDDUP : sse3_replicate_dfp<"movddup">;
@@ -5019,7 +5021,8 @@ let Predicates = [HasAVX] in {
[(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vlddqu\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>, VEX;
+ [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
+ VEX, VEX_L;
}
def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"lddqu\t{$src, $dst|$dst, $src}",
@@ -5052,13 +5055,13 @@ let Predicates = [HasAVX] in {
defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
f128mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V;
+ f256mem, SSE_ALU_F32P, 0>, TB, XD, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
f128mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V;
+ f256mem, SSE_ALU_F64P, 0>, TB, OpSize, VEX_4V, VEX_L;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
@@ -5113,9 +5116,9 @@ let Predicates = [HasAVX] in {
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
X86fhsub, 0>, VEX_4V;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V, VEX_L;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
@@ -5123,9 +5126,9 @@ let Predicates = [HasAVX] in {
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
X86fhsub, 0>, VEX_4V;
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, 0>, VEX_4V;
+ X86fhadd, 0>, VEX_4V, VEX_L;
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, 0>, VEX_4V;
+ X86fhsub, 0>, VEX_4V, VEX_L;
}
}
@@ -5191,11 +5194,11 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
defm VPABSB : SS3I_unop_rm_int_y<0x1C, "vpabsb",
- int_x86_avx2_pabs_b>, VEX;
+ int_x86_avx2_pabs_b>, VEX, VEX_L;
defm VPABSW : SS3I_unop_rm_int_y<0x1D, "vpabsw",
- int_x86_avx2_pabs_w>, VEX;
+ int_x86_avx2_pabs_w>, VEX, VEX_L;
defm VPABSD : SS3I_unop_rm_int_y<0x1E, "vpabsd",
- int_x86_avx2_pabs_d>, VEX;
+ int_x86_avx2_pabs_d>, VEX, VEX_L;
}
defm PABSB : SS3I_unop_rm_int<0x1C, "pabsb",
@@ -5334,37 +5337,37 @@ let ImmT = NoImm, Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPHADDWY : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHADDDY : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHSUBWY : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHSUBDY : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNBY : SS3I_binop_rm<0x08, "vpsignb", X86psign, v32i8, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNWY : SS3I_binop_rm<0x09, "vpsignw", X86psign, v16i16, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSIGNDY : SS3I_binop_rm<0x0A, "vpsignd", X86psign, v8i32, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPSHUFBY : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, VR256,
memopv4i64, i256mem,
- SSE_PHADDSUBW, 0>, VEX_4V;
+ SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
defm VPHADDSW : SS3I_binop_rm_int_y<0x03, "vphaddsw",
- int_x86_avx2_phadd_sw>, VEX_4V;
+ int_x86_avx2_phadd_sw>, VEX_4V, VEX_L;
defm VPHSUBSW : SS3I_binop_rm_int_y<0x07, "vphsubsw",
- int_x86_avx2_phsub_sw>, VEX_4V;
+ int_x86_avx2_phsub_sw>, VEX_4V, VEX_L;
defm VPMADDUBSW : SS3I_binop_rm_int_y<0x04, "vpmaddubsw",
- int_x86_avx2_pmadd_ub_sw>, VEX_4V;
+ int_x86_avx2_pmadd_ub_sw>, VEX_4V, VEX_L;
}
defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw",
- int_x86_avx2_pmul_hr_sw>, VEX_4V;
+ int_x86_avx2_pmul_hr_sw>, VEX_4V, VEX_L;
}
// None of these have i8 immediate fields.
@@ -5443,7 +5446,7 @@ multiclass ssse3_palign_y<string asm, bit Is2Addr = 1> {
let Predicates = [HasAVX] in
defm VPALIGN : ssse3_palign<"vpalignr", 0>, VEX_4V;
let Predicates = [HasAVX2] in
- defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V;
+ defm VPALIGN : ssse3_palign_y<"vpalignr", 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
defm PALIGN : ssse3_palign<"palignr">;
@@ -5550,17 +5553,17 @@ defm VPMOVZXDQ : SS41I_binop_rm_int8<0x35, "vpmovzxdq", int_x86_sse41_pmovzxdq>,
let Predicates = [HasAVX2] in {
defm VPMOVSXBW : SS41I_binop_rm_int16_y<0x20, "vpmovsxbw",
- int_x86_avx2_pmovsxbw>, VEX;
+ int_x86_avx2_pmovsxbw>, VEX, VEX_L;
defm VPMOVSXWD : SS41I_binop_rm_int16_y<0x23, "vpmovsxwd",
- int_x86_avx2_pmovsxwd>, VEX;
+ int_x86_avx2_pmovsxwd>, VEX, VEX_L;
defm VPMOVSXDQ : SS41I_binop_rm_int16_y<0x25, "vpmovsxdq",
- int_x86_avx2_pmovsxdq>, VEX;
+ int_x86_avx2_pmovsxdq>, VEX, VEX_L;
defm VPMOVZXBW : SS41I_binop_rm_int16_y<0x30, "vpmovzxbw",
- int_x86_avx2_pmovzxbw>, VEX;
+ int_x86_avx2_pmovzxbw>, VEX, VEX_L;
defm VPMOVZXWD : SS41I_binop_rm_int16_y<0x33, "vpmovzxwd",
- int_x86_avx2_pmovzxwd>, VEX;
+ int_x86_avx2_pmovzxwd>, VEX, VEX_L;
defm VPMOVZXDQ : SS41I_binop_rm_int16_y<0x35, "vpmovzxdq",
- int_x86_avx2_pmovzxdq>, VEX;
+ int_x86_avx2_pmovzxdq>, VEX, VEX_L;
}
defm PMOVSXBW : SS41I_binop_rm_int8<0x20, "pmovsxbw", int_x86_sse41_pmovsxbw>;
@@ -5576,31 +5579,43 @@ let Predicates = [HasAVX] in {
(VPMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
(VPMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (VPMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
(VPMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
(VPMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (VPMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
(VPMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
(VPMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (VPMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
(VPMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
(VPMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (VPMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
(VPMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
(VPMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (VPMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
(VPMOVZXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
(VPMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (VPMOVZXDQrm addr:$src)>;
}
let Predicates = [UseSSE41] in {
@@ -5609,31 +5624,43 @@ let Predicates = [UseSSE41] in {
(PMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxbw (vzload_v2i64 addr:$src)),
(PMOVSXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (PMOVSXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzmovl_v2i64 addr:$src)),
(PMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxwd (vzload_v2i64 addr:$src)),
(PMOVSXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (PMOVSXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzmovl_v2i64 addr:$src)),
(PMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovsxdq (vzload_v2i64 addr:$src)),
(PMOVSXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovsxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (PMOVSXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzmovl_v2i64 addr:$src)),
(PMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxbw (vzload_v2i64 addr:$src)),
(PMOVZXBWrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxbw (bc_v16i8 (loadv2i64 addr:$src))),
+ (PMOVZXBWrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzmovl_v2i64 addr:$src)),
(PMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxwd (vzload_v2i64 addr:$src)),
(PMOVZXWDrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxwd (bc_v8i16 (loadv2i64 addr:$src))),
+ (PMOVZXWDrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzmovl_v2i64 addr:$src)),
(PMOVZXDQrm addr:$src)>;
def : Pat<(int_x86_sse41_pmovzxdq (vzload_v2i64 addr:$src)),
(PMOVZXDQrm addr:$src)>;
+ def : Pat<(int_x86_sse41_pmovzxdq (bc_v4i32 (loadv2i64 addr:$src))),
+ (PMOVZXDQrm addr:$src)>;
}
let Predicates = [HasAVX2] in {
@@ -5697,13 +5724,13 @@ defm VPMOVZXWQ : SS41I_binop_rm_int4<0x34, "vpmovzxwq", int_x86_sse41_pmovzxwq>,
let Predicates = [HasAVX2] in {
defm VPMOVSXBD : SS41I_binop_rm_int8_y<0x21, "vpmovsxbd",
- int_x86_avx2_pmovsxbd>, VEX;
+ int_x86_avx2_pmovsxbd>, VEX, VEX_L;
defm VPMOVSXWQ : SS41I_binop_rm_int8_y<0x24, "vpmovsxwq",
- int_x86_avx2_pmovsxwq>, VEX;
+ int_x86_avx2_pmovsxwq>, VEX, VEX_L;
defm VPMOVZXBD : SS41I_binop_rm_int8_y<0x31, "vpmovzxbd",
- int_x86_avx2_pmovzxbd>, VEX;
+ int_x86_avx2_pmovzxbd>, VEX, VEX_L;
defm VPMOVZXWQ : SS41I_binop_rm_int8_y<0x34, "vpmovzxwq",
- int_x86_avx2_pmovzxwq>, VEX;
+ int_x86_avx2_pmovzxwq>, VEX, VEX_L;
}
defm PMOVSXBD : SS41I_binop_rm_int4<0x21, "pmovsxbd", int_x86_sse41_pmovsxbd>;
@@ -5772,9 +5799,9 @@ defm VPMOVZXBQ : SS41I_binop_rm_int2<0x32, "vpmovzxbq", int_x86_sse41_pmovzxbq>,
}
let Predicates = [HasAVX2] in {
defm VPMOVSXBQ : SS41I_binop_rm_int4_y<0x22, "vpmovsxbq",
- int_x86_avx2_pmovsxbq>, VEX;
+ int_x86_avx2_pmovsxbq>, VEX, VEX_L;
defm VPMOVZXBQ : SS41I_binop_rm_int4_y<0x32, "vpmovzxbq",
- int_x86_avx2_pmovzxbq>, VEX;
+ int_x86_avx2_pmovzxbq>, VEX, VEX_L;
}
defm PMOVSXBQ : SS41I_binop_rm_int2<0x22, "pmovsxbq", int_x86_sse41_pmovsxbq>;
defm PMOVZXBQ : SS41I_binop_rm_int2<0x32, "pmovzxbq", int_x86_sse41_pmovzxbq>;
@@ -6185,7 +6212,7 @@ let Predicates = [HasAVX] in {
defm VROUNDY : sse41_fp_unop_rm<0x08, 0x09, "vround", f256mem, VR256,
memopv8f32, memopv4f64,
int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX;
+ int_x86_avx_round_pd_256>, VEX, VEX_L;
defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround",
int_x86_sse41_round_ss,
int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
@@ -6275,11 +6302,11 @@ def VPTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
"vptest\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS,(X86ptest VR256:$src1, (memopv4i64 addr:$src2)))]>,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
}
let Defs = [EFLAGS] in {
@@ -6308,11 +6335,13 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC,
let Defs = [EFLAGS], Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>;
-defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>;
+defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>,
+ VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>;
-defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>;
+defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>,
+ VEX_L;
}
}
@@ -6435,25 +6464,25 @@ let Predicates = [HasAVX] in {
let Predicates = [HasAVX2] in {
let isCommutable = 0 in
defm VPACKUSDW : SS41I_binop_rm_int_y<0x2B, "vpackusdw",
- int_x86_avx2_packusdw>, VEX_4V;
+ int_x86_avx2_packusdw>, VEX_4V, VEX_L;
defm VPMINSB : SS41I_binop_rm_int_y<0x38, "vpminsb",
- int_x86_avx2_pmins_b>, VEX_4V;
+ int_x86_avx2_pmins_b>, VEX_4V, VEX_L;
defm VPMINSD : SS41I_binop_rm_int_y<0x39, "vpminsd",
- int_x86_avx2_pmins_d>, VEX_4V;
+ int_x86_avx2_pmins_d>, VEX_4V, VEX_L;
defm VPMINUD : SS41I_binop_rm_int_y<0x3B, "vpminud",
- int_x86_avx2_pminu_d>, VEX_4V;
+ int_x86_avx2_pminu_d>, VEX_4V, VEX_L;
defm VPMINUW : SS41I_binop_rm_int_y<0x3A, "vpminuw",
- int_x86_avx2_pminu_w>, VEX_4V;
+ int_x86_avx2_pminu_w>, VEX_4V, VEX_L;
defm VPMAXSB : SS41I_binop_rm_int_y<0x3C, "vpmaxsb",
- int_x86_avx2_pmaxs_b>, VEX_4V;
+ int_x86_avx2_pmaxs_b>, VEX_4V, VEX_L;
defm VPMAXSD : SS41I_binop_rm_int_y<0x3D, "vpmaxsd",
- int_x86_avx2_pmaxs_d>, VEX_4V;
+ int_x86_avx2_pmaxs_d>, VEX_4V, VEX_L;
defm VPMAXUD : SS41I_binop_rm_int_y<0x3F, "vpmaxud",
- int_x86_avx2_pmaxu_d>, VEX_4V;
+ int_x86_avx2_pmaxu_d>, VEX_4V, VEX_L;
defm VPMAXUW : SS41I_binop_rm_int_y<0x3E, "vpmaxuw",
- int_x86_avx2_pmaxu_w>, VEX_4V;
+ int_x86_avx2_pmaxu_w>, VEX_4V, VEX_L;
defm VPMULDQ : SS41I_binop_rm_int_y<0x28, "vpmuldq",
- int_x86_avx2_pmul_dq>, VEX_4V;
+ int_x86_avx2_pmul_dq>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -6499,9 +6528,9 @@ let Predicates = [HasAVX] in {
}
let Predicates = [HasAVX2] in {
defm VPMULLDY : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
- memopv4i64, i256mem, 0>, VEX_4V;
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
- memopv4i64, i256mem, 0>, VEX_4V;
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
@@ -6544,13 +6573,15 @@ let Predicates = [HasAVX] in {
defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps,
VR128, memopv4f32, f128mem, 0>, VEX_4V;
defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps",
- int_x86_avx_blend_ps_256, VR256, memopv8f32, f256mem, 0>, VEX_4V;
+ int_x86_avx_blend_ps_256, VR256, memopv8f32,
+ f256mem, 0>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd,
VR128, memopv2f64, f128mem, 0>, VEX_4V;
defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd",
- int_x86_avx_blend_pd_256, VR256, memopv4f64, f256mem, 0>, VEX_4V;
+                                  int_x86_avx_blend_pd_256, VR256, memopv4f64,
+ f256mem, 0>, VEX_4V, VEX_L;
}
defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw,
VR128, memopv2i64, i128mem, 0>, VEX_4V;
@@ -6565,15 +6596,15 @@ let Predicates = [HasAVX] in {
VR128, memopv2f64, f128mem, 0>, VEX_4V;
let ExeDomain = SSEPackedSingle in
defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
- VR256, memopv8f32, i256mem, 0>, VEX_4V;
+ VR256, memopv8f32, i256mem, 0>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
defm VPBLENDWY : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_avx2_pblendw,
- VR256, memopv4i64, i256mem, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
- VR256, memopv4i64, i256mem, 0>, VEX_4V;
+ VR256, memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
}
}
@@ -6624,13 +6655,13 @@ let ExeDomain = SSEPackedDouble in {
defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, f128mem,
memopv2f64, int_x86_sse41_blendvpd>;
defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, f256mem,
- memopv4f64, int_x86_avx_blendv_pd_256>;
+ memopv4f64, int_x86_avx_blendv_pd_256>, VEX_L;
} // ExeDomain = SSEPackedDouble
let ExeDomain = SSEPackedSingle in {
defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, f128mem,
memopv4f32, int_x86_sse41_blendvps>;
defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, f256mem,
- memopv8f32, int_x86_avx_blendv_ps_256>;
+ memopv8f32, int_x86_avx_blendv_ps_256>, VEX_L;
} // ExeDomain = SSEPackedSingle
defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
memopv2i64, int_x86_sse41_pblendvb>;
@@ -6638,7 +6669,7 @@ defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem,
let Predicates = [HasAVX2] in {
defm VPBLENDVBY : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR256, i256mem,
- memopv4i64, int_x86_avx2_pblendvb>;
+ memopv4i64, int_x86_avx2_pblendvb>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -6779,7 +6810,7 @@ let Predicates = [HasAVX2] in
def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
"vmovntdqa\t{$src, $dst|$dst, $src}",
[(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
- OpSize, VEX;
+ OpSize, VEX, VEX_L;
def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
"movntdqa\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
@@ -6815,7 +6846,7 @@ let Predicates = [HasAVX] in
let Predicates = [HasAVX2] in
defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
- memopv4i64, i256mem, 0>, VEX_4V;
+ memopv4i64, i256mem, 0>, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in
defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -7227,27 +7258,27 @@ let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem,
int_x86_avx_vbroadcast_ss>;
def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem,
- int_x86_avx_vbroadcast_ss_256>;
+ int_x86_avx_vbroadcast_ss_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem,
- int_x86_avx_vbroadcast_sd_256>;
+ int_x86_avx_vbroadcast_sd_256>, VEX_L;
def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem,
- int_x86_avx_vbroadcastf128_pd_256>;
+ int_x86_avx_vbroadcastf128_pd_256>, VEX_L;
let ExeDomain = SSEPackedSingle in {
def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128,
int_x86_avx2_vbroadcast_ss_ps>;
def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256,
- int_x86_avx2_vbroadcast_ss_ps_256>;
+ int_x86_avx2_vbroadcast_ss_ps_256>, VEX_L;
}
let ExeDomain = SSEPackedDouble in
def VBROADCASTSDYrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256,
- int_x86_avx2_vbroadcast_sd_pd_256>;
+ int_x86_avx2_vbroadcast_sd_pd_256>, VEX_L;
let Predicates = [HasAVX2] in
def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem,
- int_x86_avx2_vbroadcasti128>;
+ int_x86_avx2_vbroadcasti128>, VEX_L;
let Predicates = [HasAVX] in
def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src),
@@ -7261,12 +7292,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VINSERTF128rr : AVXAIi8<0x18, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f128mem:$src2, i8imm:$src3),
"vinsertf128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -7335,12 +7366,12 @@ let neverHasSideEffects = 1, ExeDomain = SSEPackedSingle in {
def VEXTRACTF128rr : AVXAIi8<0x19, MRMDestReg, (outs VR128:$dst),
(ins VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, VEX;
+ []>, VEX, VEX_L;
let mayStore = 1 in
def VEXTRACTF128mr : AVXAIi8<0x19, MRMDestMem, (outs),
(ins f128mem:$dst, VR256:$src1, i8imm:$src2),
"vextractf128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- []>, VEX;
+ []>, VEX, VEX_L;
}
// AVX1 patterns
@@ -7415,7 +7446,7 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7423,7 +7454,7 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
let ExeDomain = SSEPackedSingle in
@@ -7471,13 +7502,13 @@ let ExeDomain = SSEPackedSingle in {
defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem,
memopv2i64, int_x86_avx_vpermilvar_ps, v4f32>;
defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem,
- memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>;
+ memopv4i64, int_x86_avx_vpermilvar_ps_256, v8f32>, VEX_L;
}
let ExeDomain = SSEPackedDouble in {
defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem,
memopv2i64, int_x86_avx_vpermilvar_pd, v2f64>;
defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem,
- memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>;
+ memopv4i64, int_x86_avx_vpermilvar_pd_256, v4f64>, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -7505,12 +7536,12 @@ def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V;
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv8f32 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V;
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX] in {
@@ -7587,9 +7618,9 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
let Predicates = [HasAVX, HasF16C] in {
defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
}
//===----------------------------------------------------------------------===//
@@ -7621,7 +7652,7 @@ let isCommutable = 0 in {
defm VPBLENDD : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_128,
VR128, memopv2i64, i128mem>;
defm VPBLENDDY : AVX2_binop_rmi_int<0x02, "vpblendd", int_x86_avx2_pblendd_256,
- VR256, memopv4i64, i256mem>;
+ VR256, memopv4i64, i256mem>, VEX_L;
}
//===----------------------------------------------------------------------===//
@@ -7640,11 +7671,12 @@ multiclass avx2_broadcast<bits<8> opc, string OpcodeStr,
(Int128 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int256 VR128:$src))]>, VEX;
+ [(set VR256:$dst, (Int256 VR128:$src))]>, VEX, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (Int256 (scalar_to_vector (ld_frag addr:$src))))]>, VEX;
+ (Int256 (scalar_to_vector (ld_frag addr:$src))))]>,
+ VEX, VEX_L;
}
defm VPBROADCASTB : avx2_broadcast<0x78, "vpbroadcastb", i8mem, loadi8,
@@ -7779,7 +7811,8 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>, VEX_4V;
+ (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
+ VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr,
@@ -7787,7 +7820,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
[(set VR256:$dst,
(OpVT (X86VPermv VR256:$src1,
(bitconvert (mem_frag addr:$src2)))))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
}
defm VPERMD : avx2_perm<0x36, "vpermd", memopv4i64, v8i32>;
@@ -7801,14 +7834,15 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
- (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>, VEX;
+ (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
+ VEX, VEX_L;
def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, i8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(OpVT (X86VPermi (mem_frag addr:$src1),
- (i8 imm:$src2))))]>, VEX;
+ (i8 imm:$src2))))]>, VEX, VEX_L;
}
defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, v4i64>, VEX_W;
@@ -7822,12 +7856,12 @@ def VPERM2I128rr : AVX2AIi8<0x46, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
- (i8 imm:$src3))))]>, VEX_4V;
+ (i8 imm:$src3))))]>, VEX_4V, VEX_L;
def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, i8imm:$src3),
"vperm2i128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (memopv4i64 addr:$src2),
- (i8 imm:$src3)))]>, VEX_4V;
+ (i8 imm:$src3)))]>, VEX_4V, VEX_L;
let Predicates = [HasAVX2] in {
def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
@@ -7856,12 +7890,12 @@ let neverHasSideEffects = 1 in {
def VINSERTI128rr : AVX2AIi8<0x38, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR128:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
let mayLoad = 1 in
def VINSERTI128rm : AVX2AIi8<0x38, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i128mem:$src2, i8imm:$src3),
"vinserti128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, VEX_4V;
+ []>, VEX_4V, VEX_L;
}
let Predicates = [HasAVX2] in {
@@ -7911,11 +7945,12 @@ def VEXTRACTI128rr : AVX2AIi8<0x39, MRMDestReg, (outs VR128:$dst),
"vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_avx2_vextracti128 VR256:$src1, imm:$src2))]>,
- VEX;
+ VEX, VEX_L;
let neverHasSideEffects = 1, mayStore = 1 in
def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs),
(ins i128mem:$dst, VR256:$src1, i8imm:$src2),
- "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, VEX;
+ "vextracti128\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+ VEX, VEX_L;
let Predicates = [HasAVX2] in {
def : Pat<(vextractf128_extract:$ext VR256:$src1, (iPTR imm)),
@@ -7966,7 +8001,8 @@ multiclass avx2_pmovmask<string OpcodeStr,
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, VEX_4V;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
+ VEX_4V, VEX_L;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -7974,7 +8010,7 @@ multiclass avx2_pmovmask<string OpcodeStr,
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -8012,14 +8048,14 @@ multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1,
(vt256 (bitconvert (memopv4i64 addr:$src2))))))]>,
- VEX_4V;
+ VEX_4V, VEX_L;
}
defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>;
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 8ec2c688d3..2aa08fad78 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -75,10 +75,10 @@ multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, VEX;
+ [(set VR256:$dst, (Int VR256:$src))]>, VEX, VEX_L;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX;
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, VEX, VEX_L;
}
let isAsmParserOnly = 1 in {
@@ -238,7 +238,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (Int VR256:$src1, VR256:$src2, VR256:$src3))]>,
- VEX_4V, VEX_I8IMM;
+ VEX_4V, VEX_I8IMM, VEX_L;
def rmY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, i256mem:$src3),
!strconcat(OpcodeStr,
@@ -246,7 +246,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst,
(Int VR256:$src1, VR256:$src2,
(bitconvert (memopv4i64 addr:$src3))))]>,
- VEX_4V, VEX_I8IMM, VEX_W, MemOp4;
+ VEX_4V, VEX_I8IMM, VEX_W, MemOp4, VEX_L;
def mrY : IXOPi8<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
@@ -254,7 +254,7 @@ multiclass xop4op256<bits<8> opc, string OpcodeStr, Intrinsic Int> {
[(set VR256:$dst,
(Int VR256:$src1, (bitconvert (memopv4i64 addr:$src2)),
VR256:$src3))]>,
- VEX_4V, VEX_I8IMM;
+ VEX_4V, VEX_I8IMM, VEX_L;
}
let isAsmParserOnly = 1 in {
@@ -287,20 +287,21 @@ multiclass xop5op<bits<8> opc, string OpcodeStr, Intrinsic Int128,
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>;
+ (Int256 VR256:$src1, VR256:$src2, VR256:$src3, imm:$src4))]>, VEX_L;
def rmY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
(Int256 VR256:$src1, VR256:$src2, (ld_256 addr:$src3), imm:$src4))]>,
- VEX_W, MemOp4;
+ VEX_W, MemOp4, VEX_L;
def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3, i8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set VR256:$dst,
- (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>;
+ (Int256 VR256:$src1, (ld_256 addr:$src2), VR256:$src3, imm:$src4))]>,
+ VEX_L;
}
defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", int_x86_xop_vpermil2pd,
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 8643ffc19d..3695ce2324 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -298,7 +298,7 @@ LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const
DebugLoc DL = Op.getDebugLoc();
const BlockAddress *BA = cast<BlockAddressSDNode>(Op)->getBlockAddress();
- SDValue Result = DAG.getBlockAddress(BA, getPointerTy(), /*isTarget=*/true);
+ SDValue Result = DAG.getTargetBlockAddress(BA, getPointerTy());
return DAG.getNode(XCoreISD::PCRelativeWrapper, DL, getPointerTy(), Result);
}
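
A note on the XCore fix: this merge also reworks the SelectionDAG block-address
interface, and the dedicated helper now carries the target flag itself. A
minimal sketch of the assumed forwarding behavior (hypothetical expansion, not
quoted from this tree; see include/llvm/CodeGen/SelectionDAG.h):

    // Assumed shape of the new convenience wrapper: the target flag is
    // implied, so callers no longer thread a bare bool through the call.
    SDValue SelectionDAG::getTargetBlockAddress(const BlockAddress *BA, EVT VT) {
      return getBlockAddress(BA, VT, /*Offset=*/0, /*isTarget=*/true);
    }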
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 69a22fb1e6..a9263baa44 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -210,7 +210,8 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const {
Function *Caller = CS.getCaller();
bool OptSize = Caller && !Caller->isDeclaration() &&
Caller->hasFnAttr(Attribute::OptimizeForSize);
- if (!(InlineLimit.getNumOccurrences() > 0) && OptSize && OptSizeThreshold < thres)
+ if (!(InlineLimit.getNumOccurrences() > 0) && OptSize &&
+ OptSizeThreshold < thres)
thres = OptSizeThreshold;
// Listen to the inlinehint attribute when it would increase the threshold.
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 43b4ab5efa..c81b333813 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -40,6 +40,10 @@ UseGVNAfterVectorization("use-gvn-after-vectorization",
cl::init(false), cl::Hidden,
cl::desc("Run GVN instead of Early CSE after vectorization passes"));
+static cl::opt<bool> UseNewSROA("use-new-sroa",
+ cl::init(false), cl::Hidden,
+ cl::desc("Enable the new, experimental SROA pass"));
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
@@ -100,7 +104,10 @@ void PassManagerBuilder::populateFunctionPassManager(FunctionPassManager &FPM) {
addInitialAliasAnalysisPasses(FPM);
FPM.add(createCFGSimplificationPass());
- FPM.add(createScalarReplAggregatesPass());
+ if (UseNewSROA)
+ FPM.add(createSROAPass());
+ else
+ FPM.add(createScalarReplAggregatesPass());
FPM.add(createEarlyCSEPass());
FPM.add(createLowerExpectIntrinsicPass());
}
@@ -147,7 +154,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
// Start of function pass.
// Break up aggregate allocas, using SSAUpdater.
- MPM.add(createScalarReplAggregatesPass(-1, false));
+ if (UseNewSROA)
+ MPM.add(createSROAPass(/*RequiresDomTree*/ false));
+ else
+ MPM.add(createScalarReplAggregatesPass(-1, false));
MPM.add(createEarlyCSEPass()); // Catch trivial redundancies
if (!DisableSimplifyLibCalls)
MPM.add(createSimplifyLibCallsPass()); // Library Call Optimizations
@@ -265,7 +275,10 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM,
PM.add(createInstructionCombiningPass());
PM.add(createJumpThreadingPass());
// Break up allocas
- PM.add(createScalarReplAggregatesPass());
+ if (UseNewSROA)
+ PM.add(createSROAPass());
+ else
+ PM.add(createScalarReplAggregatesPass());
// Run a few AA driven optimizations here and now, to cleanup the code.
PM.add(createFunctionAttrsPass()); // Add nocapture.
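
The three pipeline hunks above all follow the same pattern: one hidden,
default-off flag selects the experimental pass everywhere it could replace
ScalarReplAggregates. A hedged usage sketch (hypothetical driver code; any
tool that parses LLVM command-line options inherits the flag automatically):

    #include "llvm/PassManager.h"
    #include "llvm/Support/CommandLine.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"
    using namespace llvm;

    // Invoking the tool with -use-new-sroa swaps in the new SROA pass for
    // every pipeline built through PassManagerBuilder below.
    void buildPipeline(PassManager &MPM, int argc, char **argv) {
      cl::ParseCommandLineOptions(argc, argv);
      PassManagerBuilder PMB;
      PMB.OptLevel = 2;   // -O2-style pipeline
      PMB.populateModulePassManager(MPM);
    }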
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index e7f3b21b15..23c08699ff 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -29,6 +29,26 @@ static Type *getPromotedType(Type *Ty) {
return Ty;
}
+/// reduceToSingleValueType - Given an aggregate type which ultimately holds a
+/// single scalar element, like {{{type}}} or [1 x type], return that type.
+static Type *reduceToSingleValueType(Type *T) {
+ while (!T->isSingleValueType()) {
+ if (StructType *STy = dyn_cast<StructType>(T)) {
+ if (STy->getNumElements() == 1)
+ T = STy->getElementType(0);
+ else
+ break;
+ } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) {
+ if (ATy->getNumElements() == 1)
+ T = ATy->getElementType();
+ else
+ break;
+ } else
+ break;
+ }
+
+ return T;
+}
Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), TD);
@@ -74,35 +94,34 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
// dest address will be promotable. See if we can find a better type than the
// integer datatype.
Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
+ MDNode *CopyMD = 0;
if (StrippedDest != MI->getArgOperand(0)) {
Type *SrcETy = cast<PointerType>(StrippedDest->getType())
->getElementType();
if (TD && SrcETy->isSized() && TD->getTypeStoreSize(SrcETy) == Size) {
// The SrcETy might be something like {{{double}}} or [1 x double]. Rip
// down through these levels if so.
- while (!SrcETy->isSingleValueType()) {
- if (StructType *STy = dyn_cast<StructType>(SrcETy)) {
- if (STy->getNumElements() == 1)
- SrcETy = STy->getElementType(0);
- else
- break;
- } else if (ArrayType *ATy = dyn_cast<ArrayType>(SrcETy)) {
- if (ATy->getNumElements() == 1)
- SrcETy = ATy->getElementType();
- else
- break;
- } else
- break;
- }
+ SrcETy = reduceToSingleValueType(SrcETy);
if (SrcETy->isSingleValueType()) {
NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp);
NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp);
+
+ // If the memcpy has metadata describing the members, see if we can
+ // get the TBAA tag describing our copy.
+ if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+ if (M->getNumOperands() == 3 &&
+ isa<ConstantInt>(M->getOperand(0)) &&
+ cast<ConstantInt>(M->getOperand(0))->isNullValue() &&
+ isa<ConstantInt>(M->getOperand(1)) &&
+ cast<ConstantInt>(M->getOperand(1))->getValue() == Size &&
+ isa<MDNode>(M->getOperand(2)))
+ CopyMD = cast<MDNode>(M->getOperand(2));
+ }
}
}
}
-
// If the memcpy/memmove provides better alignment info than we can
// infer, use it.
SrcAlign = std::max(SrcAlign, CopyAlign);
@@ -112,8 +131,12 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
Value *Dest = Builder->CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
LoadInst *L = Builder->CreateLoad(Src, MI->isVolatile());
L->setAlignment(SrcAlign);
+ if (CopyMD)
+ L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
StoreInst *S = Builder->CreateStore(L, Dest, MI->isVolatile());
S->setAlignment(DstAlign);
+ if (CopyMD)
+ S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
// Set the size of the copy to 0, it will be deleted on the next iteration.
MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
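
Two independent improvements land in the memcpy hunk above: the peeling loop
is factored into reduceToSingleValueType, and a plain !tbaa tag is forwarded
to the new load/store when the copy's !tbaa.struct metadata describes the
whole length as a single member. A small sketch of the helper's behavior
(hypothetical types; Ctx is an LLVMContext in scope):

    // [1 x {double}] ultimately wraps one scalar, so both levels peel away.
    Type *DoubleTy = Type::getDoubleTy(Ctx);
    Type *One[] = { DoubleTy };
    Type *Wrapped = ArrayType::get(StructType::get(Ctx, One), 1);
    assert(reduceToSingleValueType(Wrapped) == DoubleTy);

    // A two-element struct stops the walk and comes back unchanged.
    Type *Two[] = { DoubleTy, DoubleTy };
    Type *PairTy = StructType::get(Ctx, Two);
    assert(reduceToSingleValueType(PairTy) == PairTy);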
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 6ecb4c52c4..5b6cf4a4a8 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -246,12 +246,16 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
return &AI;
}
+ // If the alignment of the entry block alloca is 0 (unspecified),
+ // assign it the preferred alignment.
+ if (EntryAI->getAlignment() == 0)
+ EntryAI->setAlignment(
+ TD->getPrefTypeAlignment(EntryAI->getAllocatedType()));
// Replace this zero-sized alloca with the one at the start of the entry
// block after ensuring that the address will be aligned enough for both
// types.
- unsigned MaxAlign =
- std::max(TD->getPrefTypeAlignment(EntryAI->getAllocatedType()),
- TD->getPrefTypeAlignment(AI.getAllocatedType()));
+ unsigned MaxAlign = std::max(EntryAI->getAlignment(),
+ AI.getAlignment());
EntryAI->setAlignment(MaxAlign);
if (AI.getType() != EntryAI->getType())
return new BitCastInst(EntryAI, AI.getType());
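
The reordering above matters because getAlignment() returns 0 when no
alignment was specified: pinning the entry-block alloca to its preferred
alignment first keeps the std::max comparison meaningful. A condensed sketch
of the resulting logic (same names as the surrounding code):

    unsigned EntryAlign = EntryAI->getAlignment();        // 0 == unspecified
    if (EntryAlign == 0)
      EntryAlign = TD->getPrefTypeAlignment(EntryAI->getAllocatedType());
    EntryAI->setAlignment(std::max(EntryAlign, AI.getAlignment()));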
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 2a7182fc1d..3361a1e7fb 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -477,7 +477,8 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
if (match(Op1, m_Shl(m_Power2(CI), m_Value(N))) ||
match(Op1, m_ZExt(m_Shl(m_Power2(CI), m_Value(N))))) {
if (*CI != 1)
- N = Builder->CreateAdd(N, ConstantInt::get(I.getType(),CI->logBase2()));
+ N = Builder->CreateAdd(N,
+ ConstantInt::get(N->getType(), CI->logBase2()));
if (ZExtInst *Z = dyn_cast<ZExtInst>(Op1))
N = Builder->CreateZExt(N, Z->getDestTy());
if (I.isExact())
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index 99a02fc0df..ea654ae9ed 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -26,8 +26,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombineWorklist {
SmallVector<Instruction*, 256> Worklist;
DenseMap<Instruction*, unsigned> WorklistMap;
- void operator=(const InstCombineWorklist&RHS); // DO NOT IMPLEMENT
- InstCombineWorklist(const InstCombineWorklist&); // DO NOT IMPLEMENT
+ void operator=(const InstCombineWorklist&RHS) LLVM_DELETED_FUNCTION;
+ InstCombineWorklist(const InstCombineWorklist&) LLVM_DELETED_FUNCTION;
public:
InstCombineWorklist() {}
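
For context on the macro swap above: LLVM_DELETED_FUNCTION replaces the old
"declare privately, never define" trick. A rough sketch of what the macro is
assumed to expand to (simplified; the real definition lives in
include/llvm/Support/Compiler.h):

    // With C++11 deleted functions available, an accidental copy becomes a
    // compile-time error instead of a link-time one.
    #if defined(__GXX_EXPERIMENTAL_CXX0X__) || __cplusplus >= 201103L
    #define LLVM_DELETED_FUNCTION = delete
    #else
    #define LLVM_DELETED_FUNCTION
    #endif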
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 0775cf4a22..afa6a4b5e6 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -392,7 +392,7 @@ bool AddressSanitizer::HasDynamicInitializer(GlobalVariable *G) {
}
void AddressSanitizer::instrumentMop(AsanFunctionContext &AFC, Instruction *I) {
- bool IsWrite;
+ bool IsWrite = false;
Value *Addr = isInterestingMemoryAccess(I, &IsWrite);
assert(Addr);
if (ClOpt && ClOptGlobals) {
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index 9fcde316c0..bcee0c5f04 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -90,6 +90,7 @@ namespace {
// list.
void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
void insertIndirectCounterIncrement();
+ void insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >);
std::string mangleName(DICompileUnit CU, const char *NewStem);
@@ -518,6 +519,7 @@ bool GCOVProfiler::emitProfileArcs() {
}
insertCounterWriteout(CountersBySP);
+ insertFlush(CountersBySP);
}
if (InsertIndCounterIncrCode)
@@ -630,13 +632,14 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() {
void GCOVProfiler::insertCounterWriteout(
ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) {
- FunctionType *WriteoutFTy =
- FunctionType::get(Type::getVoidTy(*Ctx), false);
- Function *WriteoutF = Function::Create(WriteoutFTy,
- GlobalValue::InternalLinkage,
- "__llvm_gcov_writeout", M);
+ FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ if (!WriteoutF)
+ WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_writeout", M);
WriteoutF->setUnnamedAddr(true);
- BasicBlock *BB = BasicBlock::Create(*Ctx, "", WriteoutF);
+
+ BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
IRBuilder<> Builder(BB);
Constant *StartFile = getStartFileFunc();
@@ -647,8 +650,8 @@ void GCOVProfiler::insertCounterWriteout(
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
if (CU_Nodes) {
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
- DICompileUnit compile_unit(CU_Nodes->getOperand(i));
- std::string FilenameGcda = mangleName(compile_unit, "gcda");
+ DICompileUnit CU(CU_Nodes->getOperand(i));
+ std::string FilenameGcda = mangleName(CU, "gcda");
Builder.CreateCall(StartFile,
Builder.CreateGlobalStringPtr(FilenameGcda));
for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
@@ -744,3 +747,42 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
Builder.SetInsertPoint(Exit);
Builder.CreateRetVoid();
}
+
+void GCOVProfiler::
+insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *FlushF = M->getFunction("__gcov_flush");
+ if (!FlushF)
+ FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__gcov_flush", M);
+ else
+ FlushF->setLinkage(GlobalValue::InternalLinkage);
+ FlushF->setUnnamedAddr(true);
+
+ BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
+
+ // Write out the current counters.
+ Constant *WriteoutF = M->getFunction("__llvm_gcov_writeout");
+ assert(WriteoutF && "Need to create the writeout function first!");
+
+ IRBuilder<> Builder(Entry);
+ Builder.CreateCall(WriteoutF);
+
+ // Zero out the counters.
+ for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator
+ I = CountersBySP.begin(), E = CountersBySP.end();
+ I != E; ++I) {
+ GlobalVariable *GV = I->first;
+ Constant *Null = Constant::getNullValue(GV->getType()->getElementType());
+ Builder.CreateStore(Null, GV);
+ }
+
+ Type *RetTy = FlushF->getReturnType();
+ if (RetTy == Type::getVoidTy(*Ctx))
+ Builder.CreateRetVoid();
+ else if (RetTy->isIntegerTy())
+ // Used if __gcov_flush was implicitly declared.
+ Builder.CreateRet(ConstantInt::get(RetTy, 0));
+ else
+ report_fatal_error("invalid return type for __gcov_flush");
+}
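
The new __gcov_flush mirrors libgcov's entry point of the same name: it calls
__llvm_gcov_writeout to dump the live counters and then zeroes them, so
coverage data survives paths that never reach the atexit writeout. A hedged
usage sketch (hypothetical user code, not part of the patch):

    #include <unistd.h>

    // Checkpoint coverage before a hard exit; since the counters are reset
    // after the flush, increments from later work are attributed cleanly.
    extern "C" void __gcov_flush();

    void checkpoint_and_die(int status) {
      __gcov_flush();
      _exit(status);  // bypasses atexit handlers, hence the explicit flush
    }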
diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt
index 283758f395..06ef4b4a9b 100644
--- a/lib/Transforms/Scalar/CMakeLists.txt
+++ b/lib/Transforms/Scalar/CMakeLists.txt
@@ -25,6 +25,7 @@ add_llvm_library(LLVMScalarOpts
Reassociate.cpp
Reg2Mem.cpp
SCCP.cpp
+ SROA.cpp
Scalar.cpp
ScalarReplAggregates.cpp
SimplifyCFGPass.cpp
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index 59121078cb..495cdc6321 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -18,6 +18,7 @@
#include "llvm/Constants.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
#include "llvm/IRBuilder.h"
#include "llvm/InlineAsm.h"
#include "llvm/Instructions.h"
@@ -126,6 +127,7 @@ namespace {
bool OptimizeSelectInst(SelectInst *SI);
bool DupRetToEnableTailCallOpts(ReturnInst *RI);
bool PlaceDbgValues(Function &F);
+ bool ConvertLoadToSwitch(LoadInst *LI);
};
}
@@ -152,13 +154,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
/// This optimization identifies DIV instructions that can be
/// profitably bypassed and carried out with a shorter, faster divide.
if (TLI && TLI->isSlowDivBypassed()) {
- const DenseMap<Type *, Type *> &BypassTypeMap = TLI->getBypassSlowDivTypes();
-
- for (Function::iterator I = F.begin(); I != F.end(); I++) {
- EverMadeChange |= bypassSlowDivision(F,
- I,
- BypassTypeMap);
- }
+ const DenseMap<Type*, Type*> &BypassTypeMap = TLI->getBypassSlowDivTypes();
+ for (Function::iterator I = F.begin(); I != F.end(); I++)
+ EverMadeChange |= bypassSlowDivision(F, I, BypassTypeMap);
}
// Eliminate blocks that contain only PHI nodes and an
@@ -173,7 +171,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
bool MadeChange = true;
while (MadeChange) {
MadeChange = false;
- for (Function::iterator I = F.begin(), E = F.end(); I != E; ) {
+ for (Function::iterator I = F.begin(); I != F.end(); ) {
BasicBlock *BB = I++;
MadeChange |= OptimizeBlock(*BB);
}
@@ -662,6 +660,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// DupRetToEnableTailCallOpts - Look for opportunities to duplicate return
/// instructions to the predecessor to enable tail call optimizations. The
/// case it is currently looking for is:
+/// @code
/// bb0:
/// %tmp0 = tail call i32 @f0()
/// br label %return
@@ -674,9 +673,11 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// return:
/// %retval = phi i32 [ %tmp0, %bb0 ], [ %tmp1, %bb1 ], [ %tmp2, %bb2 ]
/// ret i32 %retval
+/// @endcode
///
/// =>
///
+/// @code
/// bb0:
/// %tmp0 = tail call i32 @f0()
/// ret i32 %tmp0
@@ -686,7 +687,7 @@ bool CodeGenPrepare::OptimizeCallInst(CallInst *CI) {
/// bb2:
/// %tmp2 = tail call i32 @f2()
/// ret i32 %tmp2
-///
+/// @endcode
bool CodeGenPrepare::DupRetToEnableTailCallOpts(ReturnInst *RI) {
if (!TLI)
return false;
@@ -1284,9 +1285,11 @@ bool CodeGenPrepare::OptimizeInst(Instruction *I) {
return OptimizeCmpExpression(CI);
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ bool Changed = false;
if (TLI)
- return OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
- return false;
+ Changed |= OptimizeMemoryInst(I, I->getOperand(0), LI->getType());
+ Changed |= ConvertLoadToSwitch(LI);
+ return Changed;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
@@ -1330,7 +1333,7 @@ bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
bool MadeChange = false;
CurInstIterator = BB.begin();
- for (BasicBlock::iterator E = BB.end(); CurInstIterator != E; )
+ while (CurInstIterator != BB.end())
MadeChange |= OptimizeInst(CurInstIterator++);
return MadeChange;
@@ -1366,3 +1369,109 @@ bool CodeGenPrepare::PlaceDbgValues(Function &F) {
}
return MadeChange;
}
+
+static bool TargetSupportsJumpTables(const TargetLowering &TLI) {
+ return TLI.supportJumpTables() &&
+ (TLI.isOperationLegalOrCustom(ISD::BR_JT, MVT::Other) ||
+ TLI.isOperationLegalOrCustom(ISD::BRIND, MVT::Other));
+}
+
+/// ConvertLoadToSwitch - Convert loads from constant lookup tables into
+/// switches. This undoes the switch-to-lookup table transformation in
+/// SimplifyCFG for targets where that is unprofitable.
+bool CodeGenPrepare::ConvertLoadToSwitch(LoadInst *LI) {
+ // This only applies to targets that don't support jump tables.
+ if (!TLI || TargetSupportsJumpTables(*TLI))
+ return false;
+
+ // FIXME: In the future, it would be desirable to have enough target
+ // information in SimplifyCFG, so we could decide at that stage whether to
+ // transform the switch to a lookup table or not, and this
+ // reverse-transformation could be removed.
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(LI->getPointerOperand());
+ if (!GEP || !GEP->isInBounds() || GEP->getPointerAddressSpace())
+ return false;
+ if (GEP->getNumIndices() != 2)
+ return false;
+ Value *FirstIndex = GEP->idx_begin()[0];
+ ConstantInt *FirstIndexInt = dyn_cast<ConstantInt>(FirstIndex);
+ if (!FirstIndexInt || !FirstIndexInt->isZero())
+ return false;
+
+ Value *TableIndex = GEP->idx_begin()[1];
+ IntegerType *TableIndexTy = cast<IntegerType>(TableIndex->getType());
+
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getPointerOperand());
+ if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer())
+ return false;
+
+ Constant *Arr = GV->getInitializer();
+ uint64_t NumElements;
+ if (ConstantArray *CA = dyn_cast<ConstantArray>(Arr))
+ NumElements = CA->getType()->getNumElements();
+ else if (ConstantDataArray *CDA = dyn_cast<ConstantDataArray>(Arr))
+ NumElements = CDA->getNumElements();
+ else
+ return false;
+ if (NumElements < 2)
+ return false;
+
+ // Split the block.
+ BasicBlock *OriginalBB = LI->getParent();
+ BasicBlock *PostSwitchBB = OriginalBB->splitBasicBlock(LI);
+
+ // Replace OriginalBB's terminator with a switch.
+ IRBuilder<> Builder(OriginalBB->getTerminator());
+ SwitchInst *Switch = Builder.CreateSwitch(TableIndex, PostSwitchBB,
+ NumElements - 1);
+ OriginalBB->getTerminator()->eraseFromParent();
+
+ // Count the frequency of each value to decide which to use as default.
+ SmallDenseMap<Constant*, uint64_t> ValueFreq;
+ for (uint64_t I = 0; I < NumElements; ++I)
+ ++ValueFreq[Arr->getAggregateElement(I)];
+ uint64_t MaxCount = 0;
+ Constant *DefaultValue = NULL;
+ for (SmallDenseMap<Constant*, uint64_t>::iterator I = ValueFreq.begin(),
+ E = ValueFreq.end(); I != E; ++I) {
+ if (I->second > MaxCount) {
+ MaxCount = I->second;
+ DefaultValue = I->first;
+ }
+ }
+ assert(DefaultValue && "No values in the array?");
+
+ // Create the phi node in PostSwitchBB, which will replace the load.
+ Builder.SetInsertPoint(PostSwitchBB->begin());
+ PHINode *PHI = Builder.CreatePHI(LI->getType(), NumElements);
+ PHI->addIncoming(DefaultValue, OriginalBB);
+
+ // Build basic blocks to target with the switch.
+ for (uint64_t I = 0; I < NumElements; ++I) {
+ Constant *C = Arr->getAggregateElement(I);
+ if (C == DefaultValue) continue; // Already covered by the default case.
+
+ BasicBlock *BB = BasicBlock::Create(PostSwitchBB->getContext(),
+ "lookup.bb",
+ PostSwitchBB->getParent(),
+ PostSwitchBB);
+ Switch->addCase(ConstantInt::get(TableIndexTy, I), BB);
+ Builder.SetInsertPoint(BB);
+ Builder.CreateBr(PostSwitchBB);
+ PHI->addIncoming(C, BB);
+ }
+
+ // Remove the load.
+ LI->replaceAllUsesWith(PHI);
+ LI->eraseFromParent();
+
+ // Clean up.
+ if (GEP->use_empty())
+ GEP->eraseFromParent();
+ if (GV->hasUnnamedAddr() && GV->hasPrivateLinkage() && GV->use_empty())
+ GV->eraseFromParent();
+
+ CurInstIterator = Switch;
+ return true;
+}
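At the source level, the reversal performed by ConvertLoadToSwitch
corresponds roughly to the sketch below (table contents and function names
are invented for illustration):

    // Before: SimplifyCFG replaced a switch with a constant lookup table.
    static const int Table[4] = {5, 7, 7, 7};
    int lookup(unsigned i) { return Table[i]; }  // load of Table[i]

    // After, on a target without jump-table support: the most frequent
    // value (7) becomes the switch default and only the minority entry
    // gets an explicit case.
    int lookup_as_switch(unsigned i) {
      switch (i) {
      case 0:  return 5;
      default: return 7;
      }
    }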
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 26271133e6..258a961df4 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -274,7 +274,8 @@ private:
CallScope(*availableCalls) {}
private:
- NodeScope(const NodeScope&); // DO NOT IMPLEMENT
+ NodeScope(const NodeScope&) LLVM_DELETED_FUNCTION;
+ void operator=(const NodeScope&) LLVM_DELETED_FUNCTION;
ScopedHTType::ScopeTy Scope;
LoadHTType::ScopeTy LoadScope;
@@ -313,7 +314,8 @@ private:
void process() { Processed = true; }
private:
- StackNode(const StackNode&); // DO NOT IMPLEMENT
+ StackNode(const StackNode&) LLVM_DELETED_FUNCTION;
+ void operator=(const StackNode&) LLVM_DELETED_FUNCTION;
// Members.
unsigned CurrentGeneration;
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 16ae6add86..e67d8af46c 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -632,7 +632,7 @@ INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo)
INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
INITIALIZE_PASS_END(GVN, "gvn", "Global Value Numbering", false, false)
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void GVN::dump(DenseMap<uint32_t, Value*>& d) {
errs() << "{\n";
for (DenseMap<uint32_t, Value*>::iterator I = d.begin(),
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a72e288303..1553328c07 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -175,6 +175,11 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
CurLoop = L;
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it; just give up.
+ if (!L->getLoopPreheader())
+ return false;
+
// Disable loop idiom recognition if the function's name is a common idiom.
StringRef Name = L->getHeader()->getParent()->getName();
if (Name == "memset" || Name == "memcpy")
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index d7495da5ef..1d3d156887 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -121,7 +121,7 @@ void RegSortData::print(raw_ostream &OS) const {
OS << "[NumUses=" << UsedByIndices.count() << ']';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void RegSortData::dump() const {
print(errs()); errs() << '\n';
}
@@ -416,7 +416,7 @@ void Formula::print(raw_ostream &OS) const {
}
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Formula::dump() const {
print(errs()); errs() << '\n';
}
@@ -978,7 +978,7 @@ void Cost::print(raw_ostream &OS) const {
OS << ", plus " << SetupCost << " setup cost";
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void Cost::dump() const {
print(errs()); errs() << '\n';
}
@@ -1066,7 +1066,7 @@ void LSRFixup::print(raw_ostream &OS) const {
OS << ", Offset=" << Offset;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRFixup::dump() const {
print(errs()); errs() << '\n';
}
@@ -1260,7 +1260,7 @@ void LSRUse::print(raw_ostream &OS) const {
OS << ", widest fixup type: " << *WidestFixupType;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRUse::dump() const {
print(errs()); errs() << '\n';
}
@@ -3446,7 +3446,7 @@ void WorkItem::print(raw_ostream &OS) const {
<< " , add offset " << Imm;
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void WorkItem::dump() const {
print(errs()); errs() << '\n';
}
@@ -4464,17 +4464,21 @@ void LSRInstance::RewriteForPHI(PHINode *PN,
SplitLandingPadPredecessors(Parent, BB, "", "", P, NewBBs);
NewBB = NewBBs[0];
}
-
- // If PN is outside of the loop and BB is in the loop, we want to
- // move the block to be immediately before the PHI block, not
- // immediately after BB.
- if (L->contains(BB) && !L->contains(PN))
- NewBB->moveBefore(PN->getParent());
-
- // Splitting the edge can reduce the number of PHI entries we have.
- e = PN->getNumIncomingValues();
- BB = NewBB;
- i = PN->getBasicBlockIndex(BB);
+ // If NewBB==NULL, then SplitCriticalEdge refused to split because all
+ // phi predecessors are identical. The simple thing to do is skip
+ // splitting in this case rather than complicate the API.
+ if (NewBB) {
+ // If PN is outside of the loop and BB is in the loop, we want to
+ // move the block to be immediately before the PHI block, not
+ // immediately after BB.
+ if (L->contains(BB) && !L->contains(PN))
+ NewBB->moveBefore(PN->getParent());
+
+ // Splitting the edge can reduce the number of PHI entries we have.
+ e = PN->getNumIncomingValues();
+ BB = NewBB;
+ i = PN->getBasicBlockIndex(BB);
+ }
}
}
@@ -4743,7 +4747,7 @@ void LSRInstance::print(raw_ostream &OS) const {
print_uses(OS);
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void LSRInstance::dump() const {
print(errs()); errs() << '\n';
}
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 2a5ee33eb1..35acc79238 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -38,7 +38,7 @@ STATISTIC(NumMemSetInfer, "Number of memsets inferred");
STATISTIC(NumMoveToCpy, "Number of memmoves converted to memcpy");
STATISTIC(NumCpyToSet, "Number of memcpys converted to memset");
-static int64_t GetOffsetFromIndex(const GetElementPtrInst *GEP, unsigned Idx,
+static int64_t GetOffsetFromIndex(const GEPOperator *GEP, unsigned Idx,
bool &VariableIdxFound, const TargetData &TD){
// Skip over the first indices.
gep_type_iterator GTI = gep_type_begin(GEP);
@@ -75,8 +75,8 @@ static bool IsPointerOffset(Value *Ptr1, Value *Ptr2, int64_t &Offset,
const TargetData &TD) {
Ptr1 = Ptr1->stripPointerCasts();
Ptr2 = Ptr2->stripPointerCasts();
- GetElementPtrInst *GEP1 = dyn_cast<GetElementPtrInst>(Ptr1);
- GetElementPtrInst *GEP2 = dyn_cast<GetElementPtrInst>(Ptr2);
+ GEPOperator *GEP1 = dyn_cast<GEPOperator>(Ptr1);
+ GEPOperator *GEP2 = dyn_cast<GEPOperator>(Ptr2);
bool VariableIdxFound = false;
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index dce8e8beb6..d83b069b1c 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -29,6 +29,7 @@
//===----------------------------------------------------------------------===//
#define DEBUG_TYPE "objc-arc"
+#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/DenseMap.h"
using namespace llvm;
@@ -1120,9 +1121,8 @@ namespace {
bool relatedSelect(const SelectInst *A, const Value *B);
bool relatedPHI(const PHINode *A, const Value *B);
- // Do not implement.
- void operator=(const ProvenanceAnalysis &);
- ProvenanceAnalysis(const ProvenanceAnalysis &);
+ void operator=(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
+ ProvenanceAnalysis(const ProvenanceAnalysis &) LLVM_DELETED_FUNCTION;
public:
ProvenanceAnalysis() {}
@@ -1597,6 +1597,12 @@ void BBState::MergePred(const BBState &Other) {
// loop backedge. Loop backedges are special.
TopDownPathCount += Other.TopDownPathCount;
+ // Check for overflow. If we have overflow, fall back to conservative behavior.
+ if (TopDownPathCount < Other.TopDownPathCount) {
+ clearTopDownPointers();
+ return;
+ }
+
// For each entry in the other set, if our set has an entry with the same key,
// merge the entries. Otherwise, copy the entry and merge it with an empty
// entry.
@@ -1622,6 +1628,12 @@ void BBState::MergeSucc(const BBState &Other) {
// loop backedge. Loop backedges are special.
BottomUpPathCount += Other.BottomUpPathCount;
+ // Check for overflow. If we have overflow, fall back to conservative behavior.
+ if (BottomUpPathCount < Other.BottomUpPathCount) {
+ clearBottomUpPointers();
+ return;
+ }
+
// For each entry in the other set, if our set has an entry with the
// same key, merge the entries. Otherwise, copy the entry and merge
// it with an empty entry.
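Both overflow guards added above use the same unsigned-wraparound idiom:
after "a += b", the sum is smaller than b exactly when the addition
wrapped. A minimal standalone demonstration (PathCount is a stand-in for
the path counters in BBState):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint32_t PathCount = 0xFFFFFFF0u;
      uint32_t Other = 0x20u;
      PathCount += Other;          // wraps modulo 2^32 to 0x10
      assert(PathCount < Other);   // the post-add compare detects the wrap
      return 0;
    }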
@@ -3537,19 +3549,19 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
}
/// OptimizeReturns - Look for this pattern:
-///
+/// \code
/// %call = call i8* @something(...)
/// %2 = call i8* @objc_retain(i8* %call)
/// %3 = call i8* @objc_autorelease(i8* %2)
/// ret i8* %3
-///
+/// \endcode
/// And delete the retain and autorelease.
///
/// Otherwise if it's just this:
-///
+/// \code
/// %3 = call i8* @objc_autorelease(i8* %2)
/// ret i8* %3
-///
+/// \endcode
/// convert the autorelease to autoreleaseRV.
void ObjCARCOpt::OptimizeReturns(Function &F) {
if (!F.getReturnType()->isPointerTy())
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
new file mode 100644
index 0000000000..e3182d319c
--- /dev/null
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -0,0 +1,3213 @@
+//===- SROA.cpp - Scalar Replacement Of Aggregates ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This transformation implements the well known scalar replacement of
+/// aggregates transformation. It tries to identify promotable elements of an
+/// aggregate alloca, and promote them to registers. It will also try to
+/// convert uses of an element (or set of elements) of an alloca into a vector
+/// or bitfield-style integer scalar if appropriate.
+///
+/// It attempts to do this with minimal slicing of the alloca so that regions
+/// which are merely transferred in and out of external memory remain unchanged
+/// and are not decomposed to scalar code.
+///
+/// Because this also performs alloca promotion, it can be thought of as also
+/// serving the purpose of SSA formation. The algorithm iterates on the
+/// function until all opportunities for promotion have been realized.
+///
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sroa"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Operator.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/ValueHandle.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
+using namespace llvm;
+
+STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement");
+STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced");
+STATISTIC(NumPromoted, "Number of allocas promoted to SSA values");
+STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
+STATISTIC(NumDeleted, "Number of instructions deleted");
+STATISTIC(NumVectorized, "Number of vectorized aggregates");
+
+/// Hidden option to force the pass to not use DomTree and mem2reg, instead
+/// forming SSA values through the SSAUpdater infrastructure.
+static cl::opt<bool>
+ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden);
+
+namespace {
+/// \brief Alloca partitioning representation.
+///
+/// This class represents a partitioning of an alloca into slices, and
+/// information about the nature of uses of each slice of the alloca. The goal
+/// is that this information is sufficient to decide if and how to split the
+/// alloca apart and replace slices with scalars. It is also intended that this
+/// structure can capture the relevant information needed both to decide about
+/// and to enact these transformations.
+class AllocaPartitioning {
+public:
+ /// \brief A common base class for representing a half-open byte range.
+ struct ByteRange {
+ /// \brief The beginning offset of the range.
+ uint64_t BeginOffset;
+
+ /// \brief The ending offset, not included in the range.
+ uint64_t EndOffset;
+
+ ByteRange() : BeginOffset(), EndOffset() {}
+ ByteRange(uint64_t BeginOffset, uint64_t EndOffset)
+ : BeginOffset(BeginOffset), EndOffset(EndOffset) {}
+
+ /// \brief Support for ordering ranges.
+ ///
+ /// This provides an ordering over ranges such that start offsets are
+ /// always increasing, and within equal start offsets, the end offsets are
+ /// decreasing. Thus the spanning range comes first in a cluster with the
+ /// same start position.
+ bool operator<(const ByteRange &RHS) const {
+ if (BeginOffset < RHS.BeginOffset) return true;
+ if (BeginOffset > RHS.BeginOffset) return false;
+ if (EndOffset > RHS.EndOffset) return true;
+ return false;
+ }
+
+ /// \brief Support comparison with a single offset to allow binary searches.
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) {
+ return LHS.BeginOffset < RHSOffset;
+ }
+
+ friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset,
+ const ByteRange &RHS) {
+ return LHSOffset < RHS.BeginOffset;
+ }
+
+ bool operator==(const ByteRange &RHS) const {
+ return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset;
+ }
+ bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); }
+ };
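A standalone sketch of how this ordering behaves under std::sort (not part
of the patch; the simplified Range type stands in for ByteRange):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct Range { unsigned Begin, End; };

    static bool operator<(const Range &L, const Range &R) {
      if (L.Begin != R.Begin) return L.Begin < R.Begin; // ascending starts
      return L.End > R.End;                             // descending ends
    }

    int main() {
      std::vector<Range> V = {{4, 8}, {0, 4}, {4, 16}};
      std::sort(V.begin(), V.end());
      for (size_t i = 0; i < V.size(); ++i)
        std::printf("[%u,%u) ", V[i].Begin, V[i].End);
      // Prints "[0,4) [4,16) [4,8)": the spanning range [4,16) sorts
      // first within the cluster that starts at offset 4.
      return 0;
    }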
+
+ /// \brief A partition of an alloca.
+ ///
+ /// This structure represents a contiguous partition of the alloca. These are
+ /// formed by examining the uses of the alloca. During formation, they may
+ /// overlap but once an AllocaPartitioning is built, the Partitions within it
+ /// are all disjoint.
+ struct Partition : public ByteRange {
+ /// \brief Whether this partition is splittable into smaller partitions.
+ ///
+ /// We flag partitions as splittable when they are formed entirely due to
+ /// accesses by trivially splittable operations such as memset and memcpy.
+ ///
+ /// FIXME: At some point we should consider loads and stores of FCAs to be
+ /// splittable and eagerly split them into scalar values.
+ bool IsSplittable;
+
+ Partition() : ByteRange(), IsSplittable() {}
+ Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable)
+ : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {}
+ };
+
+ /// \brief A particular use of a partition of the alloca.
+ ///
+ /// This structure is used to associate uses of a partition with it. They
+ /// mark the range of bytes which are referenced by a particular instruction,
+ /// and includes a handle to the user itself and the pointer value in use.
+ /// The bounds of these uses are determined by intersecting the bounds of the
+ /// memory use itself with a particular partition. As a consequence there is
+ /// intentionally overlap between various uses of the same partition.
+ struct PartitionUse : public ByteRange {
+ /// \brief The user of this range of the alloca.
+ AssertingVH<Instruction> User;
+
+ /// \brief The particular pointer value derived from this alloca in use.
+ AssertingVH<Instruction> Ptr;
+
+ PartitionUse() : ByteRange(), User(), Ptr() {}
+ PartitionUse(uint64_t BeginOffset, uint64_t EndOffset,
+ Instruction *User, Instruction *Ptr)
+ : ByteRange(BeginOffset, EndOffset), User(User), Ptr(Ptr) {}
+ };
+
+ /// \brief Construct a partitioning of a particular alloca.
+ ///
+ /// Construction does most of the work for partitioning the alloca. This
+ /// performs the necessary walks of users and builds a partitioning from it.
+ AllocaPartitioning(const TargetData &TD, AllocaInst &AI);
+
+ /// \brief Test whether a pointer to the allocation escapes our analysis.
+ ///
+ /// If this is true, the partitioning is never fully built and should be
+ /// ignored.
+ bool isEscaped() const { return PointerEscapingInstr; }
+
+ /// \brief Support for iterating over the partitions.
+ /// @{
+ typedef SmallVectorImpl<Partition>::iterator iterator;
+ iterator begin() { return Partitions.begin(); }
+ iterator end() { return Partitions.end(); }
+
+ typedef SmallVectorImpl<Partition>::const_iterator const_iterator;
+ const_iterator begin() const { return Partitions.begin(); }
+ const_iterator end() const { return Partitions.end(); }
+ /// @}
+
+ /// \brief Support for iterating over and manipulating a particular
+ /// partition's uses.
+ ///
+ /// The iteration support provided for uses is more limited, but also
+ /// includes some manipulation routines to support rewriting the uses of
+ /// partitions during SROA.
+ /// @{
+ typedef SmallVectorImpl<PartitionUse>::iterator use_iterator;
+ use_iterator use_begin(unsigned Idx) { return Uses[Idx].begin(); }
+ use_iterator use_begin(const_iterator I) { return Uses[I - begin()].begin(); }
+ use_iterator use_end(unsigned Idx) { return Uses[Idx].end(); }
+ use_iterator use_end(const_iterator I) { return Uses[I - begin()].end(); }
+ void use_insert(unsigned Idx, use_iterator UI, const PartitionUse &U) {
+ Uses[Idx].insert(UI, U);
+ }
+ void use_insert(const_iterator I, use_iterator UI, const PartitionUse &U) {
+ Uses[I - begin()].insert(UI, U);
+ }
+ void use_erase(unsigned Idx, use_iterator UI) { Uses[Idx].erase(UI); }
+ void use_erase(const_iterator I, use_iterator UI) {
+ Uses[I - begin()].erase(UI);
+ }
+
+ typedef SmallVectorImpl<PartitionUse>::const_iterator const_use_iterator;
+ const_use_iterator use_begin(unsigned Idx) const { return Uses[Idx].begin(); }
+ const_use_iterator use_begin(const_iterator I) const {
+ return Uses[I - begin()].begin();
+ }
+ const_use_iterator use_end(unsigned Idx) const { return Uses[Idx].end(); }
+ const_use_iterator use_end(const_iterator I) const {
+ return Uses[I - begin()].end();
+ }
+ /// @}
+
+ /// \brief Allow iterating the dead users for this alloca.
+ ///
+ /// These are instructions which will never actually use the alloca as they
+ /// are outside the allocated range. They are safe to replace with undef and
+ /// delete.
+ /// @{
+ typedef SmallVectorImpl<Instruction *>::const_iterator dead_user_iterator;
+ dead_user_iterator dead_user_begin() const { return DeadUsers.begin(); }
+ dead_user_iterator dead_user_end() const { return DeadUsers.end(); }
+ /// @}
+
+ /// \brief Allow iterating the dead expressions referring to this alloca.
+ ///
+ /// These are operands which cannot actually be used to refer to the
+ /// alloca as they are outside its range and the user doesn't correct for
+ /// that. These mostly consist of PHI node inputs and the like which we just
+ /// need to replace with undef.
+ /// @{
+ typedef SmallVectorImpl<Use *>::const_iterator dead_op_iterator;
+ dead_op_iterator dead_op_begin() const { return DeadOperands.begin(); }
+ dead_op_iterator dead_op_end() const { return DeadOperands.end(); }
+ /// @}
+
+ /// \brief MemTransferInst auxiliary data.
+ /// This struct provides some auxiliary data about memory transfer
+ /// intrinsics such as memcpy and memmove. These intrinsics can use two
+ /// different ranges within the same alloca, and are otherwise challenging to
+ /// represent correctly. We stash extra data to help us untangle this
+ /// after the partitioning is complete.
+ struct MemTransferOffsets {
+ uint64_t DestBegin, DestEnd;
+ uint64_t SourceBegin, SourceEnd;
+ bool IsSplittable;
+ };
+ MemTransferOffsets getMemTransferOffsets(MemTransferInst &II) const {
+ return MemTransferInstData.lookup(&II);
+ }
+
+ /// \brief Map from a PHI or select operand back to a partition.
+ ///
+ /// When manipulating PHI nodes or selects, they can use more than one
+ /// partition of an alloca. We store a special mapping to allow finding the
+ /// partition referenced by each of these operands, if any.
+ iterator findPartitionForPHIOrSelectOperand(Instruction &I, Value *Op) {
+ SmallDenseMap<std::pair<Instruction *, Value *>,
+ std::pair<unsigned, unsigned> >::const_iterator MapIt
+ = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
+ if (MapIt == PHIOrSelectOpMap.end())
+ return end();
+
+ return begin() + MapIt->second.first;
+ }
+
+ /// \brief Map from a PHI or select operand back to the specific use of
+ /// a partition.
+ ///
+ /// Similar to mapping these operands back to the partitions, this maps
+ /// directly to the use structure of that partition.
+ use_iterator findPartitionUseForPHIOrSelectOperand(Instruction &I,
+ Value *Op) {
+ SmallDenseMap<std::pair<Instruction *, Value *>,
+ std::pair<unsigned, unsigned> >::const_iterator MapIt
+ = PHIOrSelectOpMap.find(std::make_pair(&I, Op));
+ assert(MapIt != PHIOrSelectOpMap.end());
+ return Uses[MapIt->second.first].begin() + MapIt->second.second;
+ }
+
+ /// \brief Compute a common type among the uses of a particular partition.
+ ///
+ /// This routine walks all of the uses of a particular partition and tries
+ /// to find a common type between them. Untyped operations such as memset and
+ /// memcpy are ignored.
+ Type *getCommonType(iterator I) const;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+ void print(raw_ostream &OS, const_iterator I, StringRef Indent = " ") const;
+ void printUsers(raw_ostream &OS, const_iterator I,
+ StringRef Indent = " ") const;
+ void print(raw_ostream &OS) const;
+ void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump(const_iterator I) const;
+ void LLVM_ATTRIBUTE_NOINLINE LLVM_ATTRIBUTE_USED dump() const;
+#endif
+
+private:
+ template <typename DerivedT, typename RetT = void> class BuilderBase;
+ class PartitionBuilder;
+ friend class AllocaPartitioning::PartitionBuilder;
+ class UseBuilder;
+ friend class AllocaPartitioning::UseBuilder;
+
+#ifndef NDEBUG
+ /// \brief Handle to alloca instruction to simplify method interfaces.
+ AllocaInst &AI;
+#endif
+
+ /// \brief The instruction responsible for this alloca having no partitioning.
+ ///
+ /// When an instruction (potentially) escapes the pointer to the alloca, we
+ /// store a pointer to that here and abort trying to partition the alloca.
+ /// This will be null if the alloca is partitioned successfully.
+ Instruction *PointerEscapingInstr;
+
+ /// \brief The partitions of the alloca.
+ ///
+ /// We store a vector of the partitions over the alloca here. This vector is
+ /// sorted by increasing begin offset, and then by decreasing end offset. See
+ /// the Partition inner class for more details. Initially (during
+ /// construction) there are overlaps, but we form a disjoint sequence of
+ /// partitions while finishing construction and a fully constructed object is
+ /// expected to always have this as a disjoint space.
+ SmallVector<Partition, 8> Partitions;
+
+ /// \brief The uses of the partitions.
+ ///
+ /// This is essentially a mapping from each partition to a list of uses of
+ /// that partition. The mapping is done with a Uses vector that has the exact
+ /// same number of entries as the partition vector. Each entry is itself
+ /// a vector of the uses.
+ SmallVector<SmallVector<PartitionUse, 2>, 8> Uses;
+
+ /// \brief Instructions which will become dead if we rewrite the alloca.
+ ///
+ /// Note that these are not separated by partition. This is because we expect
+ /// a partitioned alloca to be completely rewritten or not rewritten at all.
+ /// If rewritten, all these instructions can simply be removed and replaced
+ /// with undef as they come from outside of the allocated space.
+ SmallVector<Instruction *, 8> DeadUsers;
+
+ /// \brief Operands which will become dead if we rewrite the alloca.
+ ///
+ /// These are operands that in their particular use can be replaced with
+ /// undef when we rewrite the alloca. These show up in out-of-bounds inputs
+ /// to PHI nodes and the like. They aren't entirely dead (there might be
+ /// a GEP back into the bounds using it elsewhere) nor is the PHI, but we
+ /// want to swap this particular input for undef to simplify the use lists of
+ /// the alloca.
+ SmallVector<Use *, 8> DeadOperands;
+
+ /// \brief The underlying storage for auxiliary memcpy and memmove info.
+ SmallDenseMap<MemTransferInst *, MemTransferOffsets, 4> MemTransferInstData;
+
+ /// \brief A side data structure used when building up the partitions and uses.
+ ///
+ /// This mapping is only really used during the initial building of the
+ /// partitioning so that we can retain information about PHI and select nodes
+ /// processed.
+ SmallDenseMap<Instruction *, std::pair<uint64_t, bool> > PHIOrSelectSizes;
+
+ /// \brief Auxiliary information for particular PHI or select operands.
+ SmallDenseMap<std::pair<Instruction *, Value *>,
+ std::pair<unsigned, unsigned>, 4> PHIOrSelectOpMap;
+
+ /// \brief A utility routine called from the constructor.
+ ///
+ /// This does what it says on the tin. It is the key of the alloca partition
+ /// splitting and merging. After it is called we have the desired disjoint
+ /// collection of partitions.
+ void splitAndMergePartitions();
+};
+}
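A minimal, hypothetical driver showing how the public interface above is
meant to be consumed (a sketch only; the pass itself does far more with
each partition):

    // Build a partitioning for one alloca and print its disjoint slices.
    static void inspectAlloca(const TargetData &TD, AllocaInst &AI) {
      AllocaPartitioning P(TD, AI);
      if (P.isEscaped())
        return;  // a pointer escaped; the partitioning was never completed
      for (AllocaPartitioning::const_iterator I = P.begin(), E = P.end();
           I != E; ++I)
        dbgs() << "slice [" << I->BeginOffset << "," << I->EndOffset << ")"
               << (I->IsSplittable ? " (splittable)" : "") << "\n";
    }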
+
+template <typename DerivedT, typename RetT>
+class AllocaPartitioning::BuilderBase
+ : public InstVisitor<DerivedT, RetT> {
+public:
+ BuilderBase(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
+ : TD(TD),
+ AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
+ P(P) {
+ enqueueUsers(AI, 0);
+ }
+
+protected:
+ const TargetData &TD;
+ const uint64_t AllocSize;
+ AllocaPartitioning &P;
+
+ struct OffsetUse {
+ Use *U;
+ int64_t Offset;
+ };
+ SmallVector<OffsetUse, 8> Queue;
+
+ // The active offset and use while visiting.
+ Use *U;
+ int64_t Offset;
+
+ void enqueueUsers(Instruction &I, int64_t UserOffset) {
+ SmallPtrSet<User *, 8> UserSet;
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+ UI != UE; ++UI) {
+ if (!UserSet.insert(*UI))
+ continue;
+
+ OffsetUse OU = { &UI.getUse(), UserOffset };
+ Queue.push_back(OU);
+ }
+ }
+
+ bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) {
+ GEPOffset = Offset;
+ for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ return false;
+ if (OpC->isZero())
+ continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = TD.getStructLayout(STy);
+ uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
+ // Check that we can continue to model this GEP in a signed 64-bit offset.
+ if (ElementOffset > INT64_MAX ||
+ (GEPOffset >= 0 &&
+ ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) {
+ DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
+ << "what can be represented in an int64_t!\n"
+ << " alloca: " << P.AI << "\n");
+ return false;
+ }
+ if (GEPOffset < 0)
+ GEPOffset = ElementOffset + (uint64_t)-GEPOffset;
+ else
+ GEPOffset += ElementOffset;
+ continue;
+ }
+
+ APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits());
+ Index *= APInt(Index.getBitWidth(),
+ TD.getTypeAllocSize(GTI.getIndexedType()));
+ Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset,
+ /*isSigned*/true);
+ // Check if the result can be stored in our int64_t offset.
+ if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) {
+ DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
+ << "what can be represented in an int64_t!\n"
+ << " alloca: " << P.AI << "\n");
+ return false;
+ }
+
+ GEPOffset = Index.getSExtValue();
+ }
+ return true;
+ }
+
+ Value *foldSelectInst(SelectInst &SI) {
+ // If the condition being selected on is a constant or the same value is
+ // being selected between, fold the select. Yes this does (rarely) happen
+ // early on.
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+ return SI.getOperand(1+CI->isZero());
+ if (SI.getOperand(1) == SI.getOperand(2)) {
+ assert(*U == SI.getOperand(1));
+ return SI.getOperand(1);
+ }
+ return 0;
+ }
+};
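A worked example of the arithmetic in computeConstantGEPOffset, for a
struct holding an i32 followed by an i64 on a typical 64-bit target (the
layout assumption is spelled out in the assert):

    #include <cstddef>

    struct S { int a; long long b; };  // i32 at offset 0, i64 at offset 8

    // For the IR of &arr[2].b, i.e. getelementptr %S* %arr, i64 2, i32 1,
    // the visitor accumulates 2 * sizeof(S) = 32 bytes for the array
    // index, then adds the field offset of b (8 bytes, obtained from
    // StructLayout in the real code), giving GEPOffset == 40.
    static_assert(2 * sizeof(S) + offsetof(S, b) == 40,
                  "sketch assumes a 16-byte struct with b at offset 8");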
+
+/// \brief Builder for the alloca partitioning.
+///
+/// This class builds an alloca partitioning by recursively visiting the uses
+/// of an alloca and splitting the partitions for each load and store at each
+/// offset.
+class AllocaPartitioning::PartitionBuilder
+ : public BuilderBase<PartitionBuilder, bool> {
+ friend class InstVisitor<PartitionBuilder, bool>;
+
+ SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
+
+public:
+ PartitionBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
+ : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}
+
+ /// \brief Run the builder over the allocation.
+ bool operator()() {
+ // Note that we have to re-evaluate size on each trip through the loop as
+ // the queue grows at the tail.
+ for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
+ U = Queue[Idx].U;
+ Offset = Queue[Idx].Offset;
+ if (!visit(cast<Instruction>(U->getUser())))
+ return false;
+ }
+ return true;
+ }
+
+private:
+ bool markAsEscaping(Instruction &I) {
+ P.PointerEscapingInstr = &I;
+ return false;
+ }
+
+ void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
+ bool IsSplittable = false) {
+ // Completely skip uses which don't overlap the allocation.
+ if ((Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
+ (Offset < 0 && (uint64_t)-Offset >= Size)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
+ << " which starts past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ return;
+ }
+
+ // Clamp the start to the beginning of the allocation.
+ if (Offset < 0) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to start at the beginning of the alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ Size -= (uint64_t)-Offset;
+ Offset = 0;
+ }
+
+ uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset) {
+ DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
+ << " to remain within the " << AllocSize << " byte alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ EndOffset = AllocSize;
+ }
+
+ // See if we can just add a user onto the last slot currently occupied.
+ if (!P.Partitions.empty() &&
+ P.Partitions.back().BeginOffset == BeginOffset &&
+ P.Partitions.back().EndOffset == EndOffset) {
+ P.Partitions.back().IsSplittable &= IsSplittable;
+ return;
+ }
+
+ Partition New(BeginOffset, EndOffset, IsSplittable);
+ P.Partitions.push_back(New);
+ }
+
+ bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
+ uint64_t Size = TD.getTypeStoreSize(Ty);
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of of the allocation, it's behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse. We also try to handle cases which might run the
+ // risk of overflow.
+ // FIXME: We should instead consider the pointer to have escaped if this
+ // function is being instrumented for addressing bugs or race conditions.
+ if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
+ Size > (AllocSize - (uint64_t)Offset)) {
+ DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
+ << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
+ << " which extends past the end of the " << AllocSize
+ << " byte alloca:\n"
+ << " alloca: " << P.AI << "\n"
+ << " use: " << I << "\n");
+ return true;
+ }
+
+ insertUse(I, Offset, Size);
+ return true;
+ }
+
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC, Offset);
+ return true;
+ }
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ int64_t GEPOffset;
+ if (!computeConstantGEPOffset(GEPI, GEPOffset))
+ return markAsEscaping(GEPI);
+
+ enqueueUsers(GEPI, GEPOffset);
+ return true;
+ }
+
+ bool visitLoadInst(LoadInst &LI) {
+ assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
+ "All simple FCA loads should have been pre-split");
+ return handleLoadOrStore(LI.getType(), LI, Offset);
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ Value *ValOp = SI.getValueOperand();
+ if (ValOp == *U)
+ return markAsEscaping(SI);
+
+ assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
+ "All simple FCA stores should have been pre-split");
+ return handleLoadOrStore(ValOp->getType(), SI, Offset);
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ assert(II.getRawDest() == *U && "Pointer use is not the destination?");
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ insertUse(II, Offset, Size, Length);
+ return true;
+ }
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ if (!Size)
+ // Zero-length mem transfer intrinsics can be ignored entirely.
+ return true;
+
+ MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
+
+ // Only intrinsics with a constant length can be split.
+ Offsets.IsSplittable = Length;
+
+ if (*U != II.getRawDest()) {
+ assert(*U == II.getRawSource());
+ Offsets.SourceBegin = Offset;
+ Offsets.SourceEnd = Offset + Size;
+ } else {
+ Offsets.DestBegin = Offset;
+ Offsets.DestEnd = Offset + Size;
+ }
+
+ insertUse(II, Offset, Size, Offsets.IsSplittable);
+ unsigned NewIdx = P.Partitions.size() - 1;
+
+ SmallDenseMap<Instruction *, unsigned>::const_iterator PMI;
+ bool Inserted = false;
+ llvm::tie(PMI, Inserted)
+ = MemTransferPartitionMap.insert(std::make_pair(&II, NewIdx));
+ if (!Inserted && Offsets.IsSplittable) {
+ // We've found a memory transfer intrinsic which refers to the alloca as
+ // both a source and dest. We refuse to split these to simplify splitting
+ // logic. If possible, SROA will still split them into separate allocas
+ // and then re-analyze.
+ Offsets.IsSplittable = false;
+ P.Partitions[PMI->second].IsSplittable = false;
+ P.Partitions[NewIdx].IsSplittable = false;
+ }
+
+ return true;
+ }
+
+ // Disable SROA for any intrinsics except for the lifetime markers.
+ // FIXME: What about debug intrinsics? This matches old behavior, but
+ // doesn't make sense.
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end) {
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
+ insertUse(II, Offset, Size, true);
+ return true;
+ }
+
+ return markAsEscaping(II);
+ }
+
+ Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
+ // We consider any PHI or select that results in a direct load or store of
+ // the same offset to be a viable use for partitioning purposes. These uses
+ // are considered unsplittable and the size is the maximum loaded or stored
+ // size.
+ SmallPtrSet<Instruction *, 4> Visited;
+ SmallVector<std::pair<Instruction *, Instruction *>, 4> Uses;
+ Visited.insert(Root);
+ Uses.push_back(std::make_pair(cast<Instruction>(*U), Root));
+ do {
+ Instruction *I, *UsedI;
+ llvm::tie(UsedI, I) = Uses.pop_back_val();
+
+ if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+ Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
+ continue;
+ }
+ if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+ Value *Op = SI->getOperand(0);
+ if (Op == UsedI)
+ return SI;
+ Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
+ continue;
+ }
+
+ if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (!GEP->hasAllZeroIndices())
+ return GEP;
+ } else if (!isa<BitCastInst>(I) && !isa<PHINode>(I) &&
+ !isa<SelectInst>(I)) {
+ return I;
+ }
+
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end(); UI != UE;
+ ++UI)
+ if (Visited.insert(cast<Instruction>(*UI)))
+ Uses.push_back(std::make_pair(I, cast<Instruction>(*UI)));
+ } while (!Uses.empty());
+
+ return 0;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ // See if we already have computed info on this node.
+ std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
+ if (PHIInfo.first) {
+ PHIInfo.second = true;
+ insertUse(PN, Offset, PHIInfo.first);
+ return true;
+ }
+
+ // Check for an unsafe use of the PHI node.
+ if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
+ return markAsEscaping(*EscapingI);
+
+ insertUse(PN, Offset, PHIInfo.first);
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ if (Value *Result = foldSelectInst(SI)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the select as if we had RAUW'ed it.
+ enqueueUsers(SI, Offset);
+
+ return true;
+ }
+
+ // See if we already have computed info on this node.
+ std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
+ if (SelectInfo.first) {
+ SelectInfo.second = true;
+ insertUse(SI, Offset, SelectInfo.first);
+ return true;
+ }
+
+ // Check for an unsafe use of the select.
+ if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
+ return markAsEscaping(*EscapingI);
+
+ insertUse(SI, Offset, SelectInfo.first);
+ return true;
+ }
+
+ /// \brief Disable SROA entirely if there are unhandled users of the alloca.
+ bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
+};
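For intuition about what ends up in markAsEscaping via the catch-all
visitInstruction above, a source-level contrast (sink is a hypothetical
unknown function):

    extern void sink(int *);

    int analyzable() {
      int a[2] = {1, 2};   // every use is a load, store, or constant GEP,
      return a[0] + a[1];  // so the builder can partition the alloca
    }

    int escaping() {
      int a[2] = {1, 2};
      sink(a);             // the call becomes the PointerEscapingInstr
      return a[0];         // and partitioning is abandoned
    }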
+
+
+/// \brief Use adder for the alloca partitioning.
+///
+/// This class adds the uses of an alloca to all of the partitions which they
+/// use. For splittable partitions, this can end up doing essentially a linear
+/// walk of the partitions, but the number of steps remains bounded by the
+/// total result instruction size:
+/// - The number of partitions is a result of the number of unsplittable
+/// instructions using the alloca.
+/// - The number of users of each partition is at worst the total number of
+/// splittable instructions using the alloca.
+/// Thus we will produce N * M instructions in the end, where N is the number
+/// of unsplittable uses and M the number of splittable ones. This visitor does
+/// the exact same number of updates to the partitioning.
+///
+/// In the more common case, this visitor will leverage the fact that the
+/// partition space is pre-sorted, and do a logarithmic search for the
+/// partition needed, making the total visit a classical ((N + M) * log(N))
+/// complexity operation.
+class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
+ friend class InstVisitor<UseBuilder>;
+
+ /// \brief Set to de-duplicate dead instructions found in the use walk.
+ SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
+
+public:
+ UseBuilder(const TargetData &TD, AllocaInst &AI, AllocaPartitioning &P)
+ : BuilderBase<UseBuilder>(TD, AI, P) {}
+
+ /// \brief Run the builder over the allocation.
+ void operator()() {
+ // Note that we have to re-evaluate size on each trip through the loop as
+ // the queue grows at the tail.
+ for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
+ U = Queue[Idx].U;
+ Offset = Queue[Idx].Offset;
+ this->visit(cast<Instruction>(U->getUser()));
+ }
+ }
+
+private:
+ void markAsDead(Instruction &I) {
+ if (VisitedDeadInsts.insert(&I))
+ P.DeadUsers.push_back(&I);
+ }
+
+ void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
+ // If the use extends outside of the allocation, record it as a dead use
+ // for elimination later.
+ if ((uint64_t)Offset >= AllocSize ||
+ (Offset < 0 && (uint64_t)-Offset >= Size))
+ return markAsDead(User);
+
+ // Clamp the start to the beginning of the allocation.
+ if (Offset < 0) {
+ Size -= (uint64_t)-Offset;
+ Offset = 0;
+ }
+
+ uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+
+ // Clamp the end offset to the end of the allocation. Note that this is
+ // formulated to handle even the case where "BeginOffset + Size" overflows.
+ assert(AllocSize >= BeginOffset); // Established above.
+ if (Size > AllocSize - BeginOffset)
+ EndOffset = AllocSize;
+
+ // NB: This only works if we have zero overlapping partitions.
+ iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset);
+ if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset)
+ B = llvm::prior(B);
+ for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset;
+ ++I) {
+ PartitionUse NewUse(std::max(I->BeginOffset, BeginOffset),
+ std::min(I->EndOffset, EndOffset),
+ &User, cast<Instruction>(*U));
+ P.Uses[I - P.begin()].push_back(NewUse);
+ if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser()))
+ P.PHIOrSelectOpMap[std::make_pair(&User, U->get())]
+ = std::make_pair(I - P.begin(), P.Uses[I - P.begin()].size() - 1);
+ }
+ }
+
+ void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
+ uint64_t Size = TD.getTypeStoreSize(Ty);
+
+ // If this memory access can be shown to *statically* extend outside the
+ // bounds of the allocation, its behavior is undefined, so simply
+ // ignore it. Note that this is more strict than the generic clamping
+ // behavior of insertUse.
+ if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
+ Size > (AllocSize - (uint64_t)Offset))
+ return markAsDead(I);
+
+ insertUse(I, Offset, Size);
+ }
+
+ void visitBitCastInst(BitCastInst &BC) {
+ if (BC.use_empty())
+ return markAsDead(BC);
+
+ enqueueUsers(BC, Offset);
+ }
+
+ void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (GEPI.use_empty())
+ return markAsDead(GEPI);
+
+ int64_t GEPOffset;
+ if (!computeConstantGEPOffset(GEPI, GEPOffset))
+ llvm_unreachable("Unable to compute constant offset for use");
+
+ enqueueUsers(GEPI, GEPOffset);
+ }
+
+ void visitLoadInst(LoadInst &LI) {
+ handleLoadOrStore(LI.getType(), LI, Offset);
+ }
+
+ void visitStoreInst(StoreInst &SI) {
+ handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
+ }
+
+ void visitMemSetInst(MemSetInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ insertUse(II, Offset, Size);
+ }
+
+ void visitMemTransferInst(MemTransferInst &II) {
+ ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
+ uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
+ insertUse(II, Offset, Size);
+ }
+
+ void visitIntrinsicInst(IntrinsicInst &II) {
+ assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end);
+
+ ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
+ insertUse(II, Offset,
+ std::min(AllocSize - Offset, Length->getLimitedValue()));
+ }
+
+ void insertPHIOrSelect(Instruction &User, uint64_t Offset) {
+ uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
+
+ // For PHI and select operands outside the alloca, we can't nuke the entire
+ // phi or select -- the other side might still be relevant, so we special
+ // case them here and use a separate structure to track the operands
+ // themselves which should be replaced with undef.
+ if (Offset >= AllocSize) {
+ P.DeadOperands.push_back(U);
+ return;
+ }
+
+ insertUse(User, Offset, Size);
+ }
+ void visitPHINode(PHINode &PN) {
+ if (PN.use_empty())
+ return markAsDead(PN);
+
+ insertPHIOrSelect(PN, Offset);
+ }
+ void visitSelectInst(SelectInst &SI) {
+ if (SI.use_empty())
+ return markAsDead(SI);
+
+ if (Value *Result = foldSelectInst(SI)) {
+ if (Result == *U)
+ // If the result of the constant fold will be the pointer, recurse
+ // through the select as if we had RAUW'ed it.
+ enqueueUsers(SI, Offset);
+ else
+ // Otherwise the operand to the select is dead, and we can replace it
+ // with undef.
+ P.DeadOperands.push_back(U);
+
+ return;
+ }
+
+ insertPHIOrSelect(SI, Offset);
+ }
+
+ /// \brief Unreachable, we've already visited the alloca once.
+ void visitInstruction(Instruction &I) {
+ llvm_unreachable("Unhandled instruction in use builder.");
+ }
+};
+
+void AllocaPartitioning::splitAndMergePartitions() {
+ size_t NumDeadPartitions = 0;
+
+ // Track the range of splittable partitions that we pass when accumulating
+ // overlapping unsplittable partitions.
+ uint64_t SplitEndOffset = 0ull;
+
+ Partition New(0ull, 0ull, false);
+
+ for (unsigned i = 0, j = i, e = Partitions.size(); i != e; i = j) {
+ ++j;
+
+ if (!Partitions[i].IsSplittable || New.BeginOffset == New.EndOffset) {
+ assert(New.BeginOffset == New.EndOffset);
+ New = Partitions[i];
+ } else {
+ assert(New.IsSplittable);
+ New.EndOffset = std::max(New.EndOffset, Partitions[i].EndOffset);
+ }
+ assert(New.BeginOffset != New.EndOffset);
+
+ // Scan the overlapping partitions.
+ while (j != e && New.EndOffset > Partitions[j].BeginOffset) {
+ // If the new partition we are forming is splittable, stop at the first
+ // unsplittable partition.
+ if (New.IsSplittable && !Partitions[j].IsSplittable)
+ break;
+
+ // Grow the new partition to include any equally splittable range. 'j' is
+ // always equally splittable when New is splittable, but when New is not
+ // splittable, we may subsume some (or part of some) splittable partition
+ // without growing the new one.
+ if (New.IsSplittable == Partitions[j].IsSplittable) {
+ New.EndOffset = std::max(New.EndOffset, Partitions[j].EndOffset);
+ } else {
+ assert(!New.IsSplittable);
+ assert(Partitions[j].IsSplittable);
+ SplitEndOffset = std::max(SplitEndOffset, Partitions[j].EndOffset);
+ }
+
+ Partitions[j].BeginOffset = Partitions[j].EndOffset = UINT64_MAX;
+ ++NumDeadPartitions;
+ ++j;
+ }
+
+ // If the new partition is splittable, chop off the end as soon as the
+ // unsplittable subsequent partition starts and ensure we eventually cover
+ // the splittable area.
+ if (j != e && New.IsSplittable) {
+ SplitEndOffset = std::max(SplitEndOffset, New.EndOffset);
+ New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+ }
+
+ // Add the new partition if it differs from the original one and is
+ // non-empty. We can end up with an empty partition here if it was
+ // splittable but there is an unsplittable one that starts at the same
+ // offset.
+ if (New != Partitions[i]) {
+ if (New.BeginOffset != New.EndOffset)
+ Partitions.push_back(New);
+ // Mark the old one for removal.
+ Partitions[i].BeginOffset = Partitions[i].EndOffset = UINT64_MAX;
+ ++NumDeadPartitions;
+ }
+
+ New.BeginOffset = New.EndOffset;
+ if (!New.IsSplittable) {
+ New.EndOffset = std::max(New.EndOffset, SplitEndOffset);
+ if (j != e && !Partitions[j].IsSplittable)
+ New.EndOffset = std::min(New.EndOffset, Partitions[j].BeginOffset);
+ New.IsSplittable = true;
+ // If there is a trailing splittable partition which won't be fused into
+ // the next splittable partition go ahead and add it onto the partitions
+ // list.
+ if (New.BeginOffset < New.EndOffset &&
+ (j == e || !Partitions[j].IsSplittable ||
+ New.EndOffset < Partitions[j].BeginOffset)) {
+ Partitions.push_back(New);
+ New.BeginOffset = New.EndOffset = 0ull;
+ }
+ }
+ }
+
+ // Re-sort the partitions now that they have been split and merged into
+ // a disjoint set of partitions. Also remove any of the dead partitions we've
+ // replaced in the process.
+ std::sort(Partitions.begin(), Partitions.end());
+ if (NumDeadPartitions) {
+ assert(Partitions.back().BeginOffset == UINT64_MAX);
+ assert(Partitions.back().EndOffset == UINT64_MAX);
+ assert((ptrdiff_t)NumDeadPartitions ==
+ std::count(Partitions.begin(), Partitions.end(), Partitions.back()));
+ }
+ Partitions.erase(Partitions.end() - NumDeadPartitions, Partitions.end());
+}
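A small worked example of what this routine produces (illustrative only):

    // Input (sorted):  [0,16) splittable    <- memcpy over the whole alloca
    //                   [4,8)  unsplittable  <- an i32 load
    //
    // The loop keeps the unsplittable range intact, chops the splittable
    // range around it, and emits the disjoint cover:
    //   [0,4)  splittable
    //   [4,8)  unsplittable
    //   [8,16) splittable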
+
+AllocaPartitioning::AllocaPartitioning(const TargetData &TD, AllocaInst &AI)
+ :
+#ifndef NDEBUG
+ AI(AI),
+#endif
+ PointerEscapingInstr(0) {
+ PartitionBuilder PB(TD, AI, *this);
+ if (!PB())
+ return;
+
+ if (Partitions.size() > 1) {
+ // Sort the partitions. This arranges for the offsets to be in ascending
+ // order, and the sizes to be in descending order.
+ std::sort(Partitions.begin(), Partitions.end());
+
+ // Intersect splittability for all partitions with equal offsets and sizes.
+ // Then remove all but the first so that we have a sequence of non-equal but
+ // potentially overlapping partitions.
+ for (iterator I = Partitions.begin(), J = I, E = Partitions.end(); I != E;
+ I = J) {
+ ++J;
+ while (J != E && *I == *J) {
+ I->IsSplittable &= J->IsSplittable;
+ ++J;
+ }
+ }
+ Partitions.erase(std::unique(Partitions.begin(), Partitions.end()),
+ Partitions.end());
+
+ // Split splittable and merge unsplittable partitions into a disjoint set
+ // of partitions over the used space of the allocation.
+ splitAndMergePartitions();
+ }
+
+ // Now build up the user lists for each of these disjoint partitions by
+ // re-walking the recursive users of the alloca.
+ Uses.resize(Partitions.size());
+ UseBuilder UB(TD, AI, *this);
+ UB();
+ for (iterator I = Partitions.begin(), E = Partitions.end(); I != E; ++I)
+ std::stable_sort(use_begin(I), use_end(I));
+}
+
+Type *AllocaPartitioning::getCommonType(iterator I) const {
+ Type *Ty = 0;
+ for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) {
+ if (isa<IntrinsicInst>(*UI->User))
+ continue;
+ if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset)
+ continue;
+
+ Type *UserTy = 0;
+ if (LoadInst *LI = dyn_cast<LoadInst>(&*UI->User)) {
+ UserTy = LI->getType();
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&*UI->User)) {
+ UserTy = SI->getValueOperand()->getType();
+ } else if (SelectInst *SI = dyn_cast<SelectInst>(&*UI->User)) {
+ if (PointerType *PtrTy = dyn_cast<PointerType>(SI->getType()))
+ UserTy = PtrTy->getElementType();
+ } else if (PHINode *PN = dyn_cast<PHINode>(&*UI->User)) {
+ if (PointerType *PtrTy = dyn_cast<PointerType>(PN->getType()))
+ UserTy = PtrTy->getElementType();
+ }
+
+ if (Ty && Ty != UserTy)
+ return 0;
+
+ Ty = UserTy;
+ }
+ return Ty;
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+void AllocaPartitioning::print(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ OS << Indent << "partition #" << (I - begin())
+ << " [" << I->BeginOffset << "," << I->EndOffset << ")"
+ << (I->IsSplittable ? " (splittable)" : "")
+ << (Uses[I - begin()].empty() ? " (zero uses)" : "")
+ << "\n";
+}
+
+void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I,
+ StringRef Indent) const {
+ for (const_use_iterator UI = use_begin(I), UE = use_end(I);
+ UI != UE; ++UI) {
+ OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") "
+ << "used by: " << *UI->User << "\n";
+ if (MemTransferInst *II = dyn_cast<MemTransferInst>(&*UI->User)) {
+ const MemTransferOffsets &MTO = MemTransferInstData.lookup(II);
+ bool IsDest;
+ if (!MTO.IsSplittable)
+ IsDest = UI->BeginOffset == MTO.DestBegin;
+ else
+ IsDest = MTO.DestBegin != 0u;
+ OS << Indent << " (original " << (IsDest ? "dest" : "source") << ": "
+ << "[" << (IsDest ? MTO.DestBegin : MTO.SourceBegin)
+ << "," << (IsDest ? MTO.DestEnd : MTO.SourceEnd) << ")\n";
+ }
+ }
+}
+
+void AllocaPartitioning::print(raw_ostream &OS) const {
+ if (PointerEscapingInstr) {
+ OS << "No partitioning for alloca: " << AI << "\n"
+ << " A pointer to this alloca escaped by:\n"
+ << " " << *PointerEscapingInstr << "\n";
+ return;
+ }
+
+ OS << "Partitioning of alloca: " << AI << "\n";
+ unsigned Num = 0;
+ for (const_iterator I = begin(), E = end(); I != E; ++I, ++Num) {
+ print(OS, I);
+ printUsers(OS, I);
+ }
+}
+
+void AllocaPartitioning::dump(const_iterator I) const { print(dbgs(), I); }
+void AllocaPartitioning::dump() const { print(dbgs()); }
+
+#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+
+
+namespace {
+/// \brief Implementation of LoadAndStorePromoter for promoting allocas.
+///
+/// This subclass of LoadAndStorePromoter adds overrides to handle promoting
+/// the loads and stores of an alloca instruction, as well as updating its
+/// debug information. This is used when a domtree is unavailable and thus
+/// mem2reg in its full form can't be used to handle promotion of allocas to
+/// scalar values.
+class AllocaPromoter : public LoadAndStorePromoter {
+ AllocaInst &AI;
+ DIBuilder &DIB;
+
+ SmallVector<DbgDeclareInst *, 4> DDIs;
+ SmallVector<DbgValueInst *, 4> DVIs;
+
+public:
+ AllocaPromoter(const SmallVectorImpl<Instruction*> &Insts, SSAUpdater &S,
+ AllocaInst &AI, DIBuilder &DIB)
+ : LoadAndStorePromoter(Insts, S), AI(AI), DIB(DIB) {}
+
+ void run(const SmallVectorImpl<Instruction*> &Insts) {
+    // Collect any dbg.declare and dbg.value intrinsics describing this alloca
+    // so we can update them as we promote and remove them afterwards.
+ if (MDNode *DebugNode = MDNode::getIfExists(AI.getContext(), &AI)) {
+ for (Value::use_iterator UI = DebugNode->use_begin(),
+ UE = DebugNode->use_end();
+ UI != UE; ++UI)
+ if (DbgDeclareInst *DDI = dyn_cast<DbgDeclareInst>(*UI))
+ DDIs.push_back(DDI);
+ else if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(*UI))
+ DVIs.push_back(DVI);
+ }
+
+ LoadAndStorePromoter::run(Insts);
+ AI.eraseFromParent();
+ while (!DDIs.empty())
+ DDIs.pop_back_val()->eraseFromParent();
+ while (!DVIs.empty())
+ DVIs.pop_back_val()->eraseFromParent();
+ }
+
+ virtual bool isInstInList(Instruction *I,
+ const SmallVectorImpl<Instruction*> &Insts) const {
+ if (LoadInst *LI = dyn_cast<LoadInst>(I))
+ return LI->getOperand(0) == &AI;
+ return cast<StoreInst>(I)->getPointerOperand() == &AI;
+ }
+
+ virtual void updateDebugInfo(Instruction *Inst) const {
+ for (SmallVector<DbgDeclareInst *, 4>::const_iterator I = DDIs.begin(),
+ E = DDIs.end(); I != E; ++I) {
+ DbgDeclareInst *DDI = *I;
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst))
+ ConvertDebugDeclareToDebugValue(DDI, SI, DIB);
+ else if (LoadInst *LI = dyn_cast<LoadInst>(Inst))
+ ConvertDebugDeclareToDebugValue(DDI, LI, DIB);
+ }
+ for (SmallVector<DbgValueInst *, 4>::const_iterator I = DVIs.begin(),
+ E = DVIs.end(); I != E; ++I) {
+ DbgValueInst *DVI = *I;
+ Value *Arg = NULL;
+ if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+        // If an argument is zero- or sign-extended then use the argument
+        // directly. The extension may be zapped by an optimization pass in
+        // the future.
+ if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0)))
+ Arg = dyn_cast<Argument>(ZExt->getOperand(0));
+ if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0)))
+ Arg = dyn_cast<Argument>(SExt->getOperand(0));
+ if (!Arg)
+ Arg = SI->getOperand(0);
+ } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ Arg = LI->getOperand(0);
+ } else {
+ continue;
+ }
+ Instruction *DbgVal =
+ DIB.insertDbgValueIntrinsic(Arg, 0, DIVariable(DVI->getVariable()),
+ Inst);
+ DbgVal->setDebugLoc(DVI->getDebugLoc());
+ }
+ }
+};
+} // end anon namespace
+
+
+namespace {
+/// \brief An optimization pass providing Scalar Replacement of Aggregates.
+///
+/// This pass takes allocations which can be completely analyzed (that is, they
+/// don't escape) and tries to turn them into scalar SSA values. There are
+/// a few steps to this process.
+///
+/// 1) It takes allocations of aggregates and analyzes the ways in which they
+/// are used to try to split them into smaller allocations, ideally of
+/// a single scalar data type. It will split up memcpy and memset accesses
+///    as necessary and try to isolate individual scalar accesses.
+/// 2) It will transform accesses into forms which are suitable for SSA value
+/// promotion. This can be replacing a memset with a scalar store of an
+/// integer value, or it can involve speculating operations on a PHI or
+/// select to be a PHI or select of the results.
+/// 3) Finally, this will try to detect a pattern of accesses which map cleanly
+/// onto insert and extract operations on a vector value, and convert them to
+/// this form. By doing so, it will enable promotion of vector aggregates to
+/// SSA vector values.
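+///
+/// For example (illustrative IR, not taken from a test case), an aggregate
+/// alloca whose fields are only ever accessed independently:
+///   %agg = alloca { i32, i32 }
+///   %f0p = getelementptr { i32, i32 }* %agg, i32 0, i32 0
+///   %f1p = getelementptr { i32, i32 }* %agg, i32 0, i32 1
+///   store i32 1, i32* %f0p
+///   store i32 2, i32* %f1p
+/// can be split into two scalar allocas, each of which mem2reg can then
+/// promote to an SSA value.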
+class SROA : public FunctionPass {
+ const bool RequiresDomTree;
+
+ LLVMContext *C;
+ const TargetData *TD;
+ DominatorTree *DT;
+
+ /// \brief Worklist of alloca instructions to simplify.
+ ///
+ /// Each alloca in the function is added to this. Each new alloca formed gets
+ /// added to it as well to recursively simplify unless that alloca can be
+ /// directly promoted. Finally, each time we rewrite a use of an alloca other
+  /// directly promoted. Finally, each time we rewrite a use of an alloca
+  /// other than the one being actively rewritten, we add it back onto the
+  /// list if not already present to ensure it is re-visited.
+ SetVector<AllocaInst *, SmallVector<AllocaInst *, 16> > Worklist;
+
+ /// \brief A collection of instructions to delete.
+ /// We try to batch deletions to simplify code and make things a bit more
+ /// efficient.
+ SmallVector<Instruction *, 8> DeadInsts;
+
+ /// \brief A set to prevent repeatedly marking an instruction split into many
+ /// uses as dead. Only used to guard insertion into DeadInsts.
+ SmallPtrSet<Instruction *, 4> DeadSplitInsts;
+
+ /// \brief A collection of alloca instructions we can directly promote.
+ std::vector<AllocaInst *> PromotableAllocas;
+
+public:
+ SROA(bool RequiresDomTree = true)
+ : FunctionPass(ID), RequiresDomTree(RequiresDomTree),
+ C(0), TD(0), DT(0) {
+ initializeSROAPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F);
+ void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ const char *getPassName() const { return "SROA"; }
+ static char ID;
+
+private:
+ friend class AllocaPartitionRewriter;
+ friend class AllocaPartitionVectorRewriter;
+
+ bool rewriteAllocaPartition(AllocaInst &AI,
+ AllocaPartitioning &P,
+ AllocaPartitioning::iterator PI);
+ bool splitAlloca(AllocaInst &AI, AllocaPartitioning &P);
+ bool runOnAlloca(AllocaInst &AI);
+ void deleteDeadInstructions(SmallPtrSet<AllocaInst *, 4> &DeletedAllocas);
+ bool promoteAllocas(Function &F);
+};
+}
+
+char SROA::ID = 0;
+
+FunctionPass *llvm::createSROAPass(bool RequiresDomTree) {
+ return new SROA(RequiresDomTree);
+}
+
+INITIALIZE_PASS_BEGIN(SROA, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTree)
+INITIALIZE_PASS_END(SROA, "sroa", "Scalar Replacement Of Aggregates",
+ false, false)
+
+/// \brief Accumulate the constant offsets in a GEP into a single APInt offset.
+///
+/// If the provided GEP is all-constant, the total byte offset formed by the
+/// GEP is computed and Offset is set to it. If the GEP has any non-constant
+/// operands, the function returns false and the value of Offset is unmodified.
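+///
+/// For example (illustrative, assuming a target where i64 is 8 bytes and
+/// 8-byte aligned), the GEP
+///   getelementptr { i32, [2 x i64] }* %p, i32 0, i32 1, i32 1
+/// accumulates to 8 (the field offset of the array) + 1 * 8 = 16 bytes.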
+static bool accumulateGEPOffsets(const TargetData &TD, GEPOperator &GEP,
+ APInt &Offset) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ for (gep_type_iterator GTI = gep_type_begin(GEP), GTE = gep_type_end(GEP);
+ GTI != GTE; ++GTI) {
+ ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
+ if (!OpC)
+ return false;
+ if (OpC->isZero()) continue;
+
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (StructType *STy = dyn_cast<StructType>(*GTI)) {
+ unsigned ElementIdx = OpC->getZExtValue();
+ const StructLayout *SL = TD.getStructLayout(STy);
+ GEPOffset += APInt(Offset.getBitWidth(),
+ SL->getElementOffset(ElementIdx));
+ continue;
+ }
+
+ APInt TypeSize(Offset.getBitWidth(),
+ TD.getTypeAllocSize(GTI.getIndexedType()));
+ if (VectorType *VTy = dyn_cast<VectorType>(*GTI)) {
+ assert((VTy->getScalarSizeInBits() % 8) == 0 &&
+ "vector element size is not a multiple of 8, cannot GEP over it");
+ TypeSize = VTy->getScalarSizeInBits() / 8;
+ }
+
+ GEPOffset += OpC->getValue().sextOrTrunc(Offset.getBitWidth()) * TypeSize;
+ }
+ Offset = GEPOffset;
+ return true;
+}
+
+/// \brief Build a GEP out of a base pointer and indices.
+///
+/// This will return the BasePtr if that is valid, or build a new GEP
+/// instruction using the IRBuilder if GEP-ing is needed.
+static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ if (Indices.empty())
+ return BasePtr;
+
+ // A single zero index is a no-op, so check for this and avoid building a GEP
+ // in that case.
+ if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero())
+ return BasePtr;
+
+ return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx");
+}
+
+/// \brief Get a natural GEP off of the BasePtr walking through Ty toward
+/// TargetTy without changing the offset of the pointer.
+///
+/// This routine assumes we've already established a properly offset GEP with
+/// Indices, and arrived at the Ty type. The goal is to continue to GEP with
+/// zero indices down through type layers until we find one matching
+/// TargetTy. If we can't find one with the same type, we at least try to use
+/// one with the same size. If none of that works, we just produce the GEP as
+/// indicated by Indices to have the correct offset.
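+///
+/// For example (illustrative), with Ty = { { float } } and TargetTy = float,
+/// this appends two zero indices, descending through both struct layers to
+/// the inner float without changing the pointer's byte offset.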
+static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const TargetData &TD,
+ Value *BasePtr, Type *Ty, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ if (Ty == TargetTy)
+ return buildGEP(IRB, BasePtr, Indices, Prefix);
+
+ // See if we can descend into a struct and locate a field with the correct
+ // type.
+ unsigned NumLayers = 0;
+ Type *ElementTy = Ty;
+ do {
+ if (ElementTy->isPointerTy())
+ break;
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(ElementTy)) {
+ ElementTy = SeqTy->getElementType();
+ Indices.push_back(IRB.getInt(APInt(TD.getPointerSizeInBits(), 0)));
+ } else if (StructType *STy = dyn_cast<StructType>(ElementTy)) {
+ ElementTy = *STy->element_begin();
+ Indices.push_back(IRB.getInt32(0));
+ } else {
+ break;
+ }
+ ++NumLayers;
+ } while (ElementTy != TargetTy);
+ if (ElementTy != TargetTy)
+ Indices.erase(Indices.end() - NumLayers, Indices.end());
+
+ return buildGEP(IRB, BasePtr, Indices, Prefix);
+}
+
+/// \brief Recursively compute indices for a natural GEP.
+///
+/// This is the recursive step for getNaturalGEPWithOffset that walks down the
+/// element types adding appropriate indices for the GEP.
+static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const TargetData &TD,
+ Value *Ptr, Type *Ty, APInt &Offset,
+ Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ if (Offset == 0)
+ return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix);
+
+ // We can't recurse through pointer types.
+ if (Ty->isPointerTy())
+ return 0;
+
+ // We try to analyze GEPs over vectors here, but note that these GEPs are
+ // extremely poorly defined currently. The long-term goal is to remove GEPing
+ // over a vector from the IR completely.
+ if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
+ unsigned ElementSizeInBits = VecTy->getScalarSizeInBits();
+ if (ElementSizeInBits % 8)
+ return 0; // GEPs over non-multiple of 8 size vector elements are invalid.
+ APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
+ APInt NumSkippedElements = Offset.udiv(ElementSize);
+ if (NumSkippedElements.ugt(VecTy->getNumElements()))
+ return 0;
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(),
+ Offset, TargetTy, Indices, Prefix);
+ }
+
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
+ Type *ElementTy = ArrTy->getElementType();
+ APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+ APInt NumSkippedElements = Offset.udiv(ElementSize);
+ if (NumSkippedElements.ugt(ArrTy->getNumElements()))
+ return 0;
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+ Indices, Prefix);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return 0;
+
+ const StructLayout *SL = TD.getStructLayout(STy);
+ uint64_t StructOffset = Offset.getZExtValue();
+ if (StructOffset >= SL->getSizeInBytes())
+ return 0;
+ unsigned Index = SL->getElementContainingOffset(StructOffset);
+ Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
+ Type *ElementTy = STy->getElementType(Index);
+ if (Offset.uge(TD.getTypeAllocSize(ElementTy)))
+ return 0; // The offset points into alignment padding.
+
+ Indices.push_back(IRB.getInt32(Index));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+ Indices, Prefix);
+}
+
+/// \brief Get a natural GEP from a base pointer to a particular offset and
+/// resulting in a particular type.
+///
+/// The goal is to produce a "natural" looking GEP that works with the existing
+/// composite types to arrive at the appropriate offset and element type for
+/// a pointer. TargetTy is the element type the returned GEP should point-to if
+/// possible. We recurse by decreasing Offset, adding the appropriate index to
+/// Indices, and setting Ty to the result subtype.
+///
+/// If no natural GEP can be constructed, this function returns null.
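+///
+/// For example (illustrative), given a Ptr of type { i32, i32 }*, an Offset
+/// of 4, and a TargetTy of i32, this produces the equivalent of:
+///   getelementptr inbounds { i32, i32 }* %ptr, i64 0, i32 1
+/// (the exact index widths may differ).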
+static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const TargetData &TD,
+ Value *Ptr, APInt Offset, Type *TargetTy,
+ SmallVectorImpl<Value *> &Indices,
+ const Twine &Prefix) {
+ PointerType *Ty = cast<PointerType>(Ptr->getType());
+
+  // Don't consider any GEPs through an i8* as natural when the TargetTy is
+  // an i8: that case is just a raw byte offset, which the caller's i8*
+  // fallback path already handles.
+ if (Ty == IRB.getInt8PtrTy() && TargetTy->isIntegerTy(8))
+ return 0;
+
+ Type *ElementTy = Ty->getElementType();
+ if (!ElementTy->isSized())
+ return 0; // We can't GEP through an unsized element.
+ APInt ElementSize(Offset.getBitWidth(), TD.getTypeAllocSize(ElementTy));
+ if (ElementSize == 0)
+ return 0; // Zero-length arrays can't help us build a natural GEP.
+ APInt NumSkippedElements = Offset.udiv(ElementSize);
+
+ Offset -= NumSkippedElements * ElementSize;
+ Indices.push_back(IRB.getInt(NumSkippedElements));
+ return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy,
+ Indices, Prefix);
+}
+
+/// \brief Compute an adjusted pointer from Ptr by Offset bytes where the
+/// resulting pointer has PointerTy.
+///
+/// This tries very hard to compute a "natural" GEP which arrives at the offset
+/// and produces the pointer type desired. Where it cannot, it will try to use
+/// the natural GEP to arrive at the offset and bitcast to the type. Where that
+/// fails, it will try to use an existing i8* and GEP to the byte offset and
+/// bitcast to the type.
+///
+/// The strategy for finding the more natural GEPs is to peel off layers of the
+/// pointer, walking back through bit casts and GEPs, searching for a base
+/// pointer from which we can compute a natural GEP with the desired
+/// properties. The algorithm tries to fold as many constant indices into
+/// a single GEP as possible, thus making each GEP more independent of the
+/// surrounding code.
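+///
+/// For example (illustrative), adjusting an { i32, i32 }* by 4 bytes to an
+/// i32* can be done with a single natural GEP of the second field, whereas
+/// an offset with no type-based path falls back to an i8* GEP by the raw
+/// byte offset followed by a bitcast to the desired pointer type.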
+static Value *getAdjustedPtr(IRBuilder<> &IRB, const TargetData &TD,
+ Value *Ptr, APInt Offset, Type *PointerTy,
+ const Twine &Prefix) {
+ // Even though we don't look through PHI nodes, we could be called on an
+ // instruction in an unreachable block, which may be on a cycle.
+ SmallPtrSet<Value *, 4> Visited;
+ Visited.insert(Ptr);
+ SmallVector<Value *, 4> Indices;
+
+ // We may end up computing an offset pointer that has the wrong type. If we
+  // are never able to compute one directly that has the correct type, we'll
+ // fall back to it, so keep it around here.
+ Value *OffsetPtr = 0;
+
+ // Remember any i8 pointer we come across to re-use if we need to do a raw
+ // byte offset.
+ Value *Int8Ptr = 0;
+ APInt Int8PtrOffset(Offset.getBitWidth(), 0);
+
+ Type *TargetTy = PointerTy->getPointerElementType();
+
+ do {
+ // First fold any existing GEPs into the offset.
+ while (GEPOperator *GEP = dyn_cast<GEPOperator>(Ptr)) {
+ APInt GEPOffset(Offset.getBitWidth(), 0);
+ if (!accumulateGEPOffsets(TD, *GEP, GEPOffset))
+ break;
+ Offset += GEPOffset;
+ Ptr = GEP->getPointerOperand();
+ if (!Visited.insert(Ptr))
+ break;
+ }
+
+ // See if we can perform a natural GEP here.
+ Indices.clear();
+ if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy,
+ Indices, Prefix)) {
+ if (P->getType() == PointerTy) {
+ // Zap any offset pointer that we ended up computing in previous rounds.
+ if (OffsetPtr && OffsetPtr->use_empty())
+ if (Instruction *I = dyn_cast<Instruction>(OffsetPtr))
+ I->eraseFromParent();
+ return P;
+ }
+      if (!OffsetPtr)
+        OffsetPtr = P;
+ }
+
+ // Stash this pointer if we've found an i8*.
+ if (Ptr->getType()->isIntegerTy(8)) {
+ Int8Ptr = Ptr;
+ Int8PtrOffset = Offset;
+ }
+
+ // Peel off a layer of the pointer and update the offset appropriately.
+ if (Operator::getOpcode(Ptr) == Instruction::BitCast) {
+ Ptr = cast<Operator>(Ptr)->getOperand(0);
+ } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(Ptr)) {
+ if (GA->mayBeOverridden())
+ break;
+ Ptr = GA->getAliasee();
+ } else {
+ break;
+ }
+ assert(Ptr->getType()->isPointerTy() && "Unexpected operand type!");
+ } while (Visited.insert(Ptr));
+
+ if (!OffsetPtr) {
+ if (!Int8Ptr) {
+ Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(),
+ Prefix + ".raw_cast");
+ Int8PtrOffset = Offset;
+ }
+
+ OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr :
+ IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset),
+ Prefix + ".raw_idx");
+ }
+ Ptr = OffsetPtr;
+
+ // On the off chance we were targeting i8*, guard the bitcast here.
+ if (Ptr->getType() != PointerTy)
+ Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast");
+
+ return Ptr;
+}
+
+/// \brief Test whether the given alloca partition can be promoted to a vector.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into a vector alloca with only
+/// whole-vector loads and stores such that it could be promoted to a vector
+/// SSA value. We only can ensure this for a limited set of operations, and we
+/// don't want to do the rewrites unless we are confident that the result will
+/// be promotable, so we have an early test here.
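+///
+/// For example (illustrative), a <4 x i32> partition whose uses are i32
+/// loads and stores at offsets 0, 4, 8, and 12 (each exactly one element
+/// wide) is viable, while a 2-byte access at offset 1 would rule it out.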
+static bool isVectorPromotionViable(const TargetData &TD,
+ Type *AllocaTy,
+ AllocaPartitioning &P,
+ uint64_t PartitionBeginOffset,
+ uint64_t PartitionEndOffset,
+ AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ VectorType *Ty = dyn_cast<VectorType>(AllocaTy);
+ if (!Ty)
+ return false;
+
+ uint64_t VecSize = TD.getTypeSizeInBits(Ty);
+ uint64_t ElementSize = Ty->getScalarSizeInBits();
+
+ // While the definition of LLVM vectors is bitpacked, we don't support sizes
+ // that aren't byte sized.
+ if (ElementSize % 8)
+ return false;
+ assert((VecSize % 8) == 0 && "vector size not a multiple of element size?");
+ VecSize /= 8;
+ ElementSize /= 8;
+
+ for (; I != E; ++I) {
+ uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset;
+ uint64_t BeginIndex = BeginOffset / ElementSize;
+ if (BeginIndex * ElementSize != BeginOffset ||
+ BeginIndex >= Ty->getNumElements())
+ return false;
+ uint64_t EndOffset = I->EndOffset - PartitionBeginOffset;
+ uint64_t EndIndex = EndOffset / ElementSize;
+ if (EndIndex * ElementSize != EndOffset ||
+ EndIndex > Ty->getNumElements())
+ return false;
+
+ // FIXME: We should build shuffle vector instructions to handle
+ // non-element-sized accesses.
+ if ((EndOffset - BeginOffset) != ElementSize &&
+ (EndOffset - BeginOffset) != VecSize)
+ return false;
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&*I->User)) {
+ if (MI->isVolatile())
+ return false;
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(&*I->User)) {
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(*MTI);
+ if (!MTO.IsSplittable)
+ return false;
+ }
+ } else if (I->Ptr->getType()->getPointerElementType()->isStructTy()) {
+ // Disable vector promotion when there are loads or stores of an FCA.
+ return false;
+ } else if (!isa<LoadInst>(*I->User) && !isa<StoreInst>(*I->User)) {
+ return false;
+ }
+ }
+ return true;
+}
+
+/// \brief Test whether the given alloca partition can be promoted to an int.
+///
+/// This is a quick test to check whether we can rewrite a particular alloca
+/// partition (and its newly formed alloca) into an integer alloca suitable for
+/// promotion to an SSA value. We only can ensure this for a limited set of
+/// operations, and we don't want to do the rewrites unless we are confident
+/// that the result will be promotable, so we have an early test here.
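+///
+/// For example (illustrative), an i64 partition with at least one i64 load
+/// or store covering the whole partition, plus narrower integer loads and
+/// stores, is viable; with only the narrow accesses and no covering
+/// operation it is rejected.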
+static bool isIntegerPromotionViable(const TargetData &TD,
+ Type *AllocaTy,
+ AllocaPartitioning &P,
+ AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ IntegerType *Ty = dyn_cast<IntegerType>(AllocaTy);
+ if (!Ty)
+ return false;
+
+  // Check that the uses are (likely) promotable integer uses. Also ensure
+  // that the alloca has a covering load or store. We don't want to promote
+  // because of some other unsplittable entry (which we may make splittable
+  // later) and lose the ability to promote each element access.
+ bool WholeAllocaOp = false;
+ for (; I != E; ++I) {
+ if (LoadInst *LI = dyn_cast<LoadInst>(&*I->User)) {
+ if (LI->isVolatile() || !LI->getType()->isIntegerTy())
+ return false;
+ if (LI->getType() == Ty)
+ WholeAllocaOp = true;
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&*I->User)) {
+ if (SI->isVolatile() || !SI->getValueOperand()->getType()->isIntegerTy())
+ return false;
+ if (SI->getValueOperand()->getType() == Ty)
+ WholeAllocaOp = true;
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&*I->User)) {
+ if (MI->isVolatile())
+ return false;
+ if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(&*I->User)) {
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(*MTI);
+ if (!MTO.IsSplittable)
+ return false;
+ }
+ } else {
+ return false;
+ }
+ }
+ return WholeAllocaOp;
+}
+
+namespace {
+/// \brief Visitor to rewrite instructions using a partition of an alloca to
+/// use a new alloca.
+///
+/// Also implements the rewriting to vector-based accesses when the partition
+/// passes the isVectorPromotionViable predicate. Most of the rewriting logic
+/// lives here.
+class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter,
+ bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AllocaPartitionRewriter, bool>;
+
+ const TargetData &TD;
+ AllocaPartitioning &P;
+ SROA &Pass;
+ AllocaInst &OldAI, &NewAI;
+ const uint64_t NewAllocaBeginOffset, NewAllocaEndOffset;
+
+ // If we are rewriting an alloca partition which can be written as pure
+ // vector operations, we stash extra information here. When VecTy is
+  // non-null, we have some strict guarantees about the rewritten alloca:
+ // - The new alloca is exactly the size of the vector type here.
+ // - The accesses all either map to the entire vector or to a single
+ // element.
+  //   - The set of accessing instructions is limited to those handled above
+ // in isVectorPromotionViable. Generally these are the same access kinds
+ // which are promotable via mem2reg.
+ VectorType *VecTy;
+ Type *ElementTy;
+ uint64_t ElementSize;
+
+  // This is a convenience and flag variable that will be null unless the new
+  // alloca has a promotion-targeted integer type due to passing
+  // isIntegerPromotionViable above. If it is non-null, the desired integer
+  // type will be stored here for easy access during rewriting.
+ IntegerType *IntPromotionTy;
+
+ // The offset of the partition user currently being rewritten.
+ uint64_t BeginOffset, EndOffset;
+ Instruction *OldPtr;
+
+ // The name prefix to use when rewriting instructions for this alloca.
+ std::string NamePrefix;
+
+public:
+ AllocaPartitionRewriter(const TargetData &TD, AllocaPartitioning &P,
+ AllocaPartitioning::iterator PI,
+ SROA &Pass, AllocaInst &OldAI, AllocaInst &NewAI,
+ uint64_t NewBeginOffset, uint64_t NewEndOffset)
+ : TD(TD), P(P), Pass(Pass),
+ OldAI(OldAI), NewAI(NewAI),
+ NewAllocaBeginOffset(NewBeginOffset),
+ NewAllocaEndOffset(NewEndOffset),
+ VecTy(), ElementTy(), ElementSize(), IntPromotionTy(),
+ BeginOffset(), EndOffset() {
+ }
+
+ /// \brief Visit the users of the alloca partition and rewrite them.
+ bool visitUsers(AllocaPartitioning::const_use_iterator I,
+ AllocaPartitioning::const_use_iterator E) {
+ if (isVectorPromotionViable(TD, NewAI.getAllocatedType(), P,
+ NewAllocaBeginOffset, NewAllocaEndOffset,
+ I, E)) {
+ ++NumVectorized;
+ VecTy = cast<VectorType>(NewAI.getAllocatedType());
+ ElementTy = VecTy->getElementType();
+ assert((VecTy->getScalarSizeInBits() % 8) == 0 &&
+ "Only multiple-of-8 sized vector elements are viable");
+ ElementSize = VecTy->getScalarSizeInBits() / 8;
+ } else if (isIntegerPromotionViable(TD, NewAI.getAllocatedType(),
+ P, I, E)) {
+ IntPromotionTy = cast<IntegerType>(NewAI.getAllocatedType());
+ }
+ bool CanSROA = true;
+ for (; I != E; ++I) {
+ BeginOffset = I->BeginOffset;
+ EndOffset = I->EndOffset;
+ OldPtr = I->Ptr;
+ NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str();
+ CanSROA &= visit(I->User);
+ }
+ if (VecTy) {
+ assert(CanSROA);
+ VecTy = 0;
+ ElementTy = 0;
+ ElementSize = 0;
+ }
+ return CanSROA;
+ }
+
+private:
+ // Every instruction which can end up as a user must have a rewrite rule.
+ bool visitInstruction(Instruction &I) {
+ DEBUG(dbgs() << " !!!! Cannot rewrite: " << I << "\n");
+ llvm_unreachable("No rewrite rule for this instruction!");
+ }
+
+ Twine getName(const Twine &Suffix) {
+ return NamePrefix + Suffix;
+ }
+
+ Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) {
+ assert(BeginOffset >= NewAllocaBeginOffset);
+ APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset);
+ return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName(""));
+ }
+
+ ConstantInt *getIndex(IRBuilder<> &IRB, uint64_t Offset) {
+ assert(VecTy && "Can only call getIndex when rewriting a vector");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ assert(RelOffset / ElementSize < UINT32_MAX && "Index out of bounds");
+ uint32_t Index = RelOffset / ElementSize;
+ assert(Index * ElementSize == RelOffset);
+ return IRB.getInt32(Index);
+ }
+
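+  /// \brief Extract an integer of type TargetTy from the promoted alloca.
+  ///
+  /// A sketch of the arithmetic (illustrative): extracting an i8 at byte
+  /// offset 1 from an i32 alloca holding 0xAABBCCDD loads the whole i32,
+  /// shifts it right by 8 bits, and truncates, yielding 0xCC.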
+ Value *extractInteger(IRBuilder<> &IRB, IntegerType *TargetTy,
+ uint64_t Offset) {
+ assert(IntPromotionTy && "Alloca is not an integer we can extract from");
+ Value *V = IRB.CreateLoad(&NewAI, getName(".load"));
+ assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ if (RelOffset)
+ V = IRB.CreateLShr(V, RelOffset*8, getName(".shift"));
+ if (TargetTy != IntPromotionTy) {
+ assert(TargetTy->getBitWidth() < IntPromotionTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ V = IRB.CreateTrunc(V, TargetTy, getName(".trunc"));
+ }
+ return V;
+ }
+
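+  /// \brief Insert the integer V into the promoted alloca at Offset.
+  ///
+  /// A sketch of the arithmetic (illustrative): inserting i8 0xEE at byte
+  /// offset 1 of an i32 alloca zero-extends it to 0x000000EE, shifts left
+  /// by 8 bits to 0x0000EE00, masks the old value with ~0x0000FF00, and
+  /// ORs the two together before storing.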
+ StoreInst *insertInteger(IRBuilder<> &IRB, Value *V, uint64_t Offset) {
+ IntegerType *Ty = cast<IntegerType>(V->getType());
+ if (Ty == IntPromotionTy)
+ return IRB.CreateStore(V, &NewAI);
+
+ assert(Ty->getBitWidth() < IntPromotionTy->getBitWidth() &&
+ "Cannot insert a larger integer!");
+ V = IRB.CreateZExt(V, IntPromotionTy, getName(".ext"));
+ assert(Offset >= NewAllocaBeginOffset && "Out of bounds offset");
+ uint64_t RelOffset = Offset - NewAllocaBeginOffset;
+ if (RelOffset)
+ V = IRB.CreateShl(V, RelOffset*8, getName(".shift"));
+
+ APInt Mask = ~Ty->getMask().zext(IntPromotionTy->getBitWidth())
+ .shl(RelOffset*8);
+ Value *Old = IRB.CreateAnd(IRB.CreateLoad(&NewAI, getName(".oldload")),
+ Mask, getName(".mask"));
+ return IRB.CreateStore(IRB.CreateOr(Old, V, getName(".insert")),
+ &NewAI);
+ }
+
+ void deleteIfTriviallyDead(Value *V) {
+ Instruction *I = cast<Instruction>(V);
+ if (isInstructionTriviallyDead(I))
+ Pass.DeadInsts.push_back(I);
+ }
+
+ Value *getValueCast(IRBuilder<> &IRB, Value *V, Type *Ty) {
+ if (V->getType()->isIntegerTy() && Ty->isPointerTy())
+ return IRB.CreateIntToPtr(V, Ty);
+ if (V->getType()->isPointerTy() && Ty->isIntegerTy())
+ return IRB.CreatePtrToInt(V, Ty);
+
+ return IRB.CreateBitCast(V, Ty);
+ }
+
+ bool rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) {
+ Value *Result;
+ if (LI.getType() == VecTy->getElementType() ||
+ BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
+ Result
+ = IRB.CreateExtractElement(IRB.CreateLoad(&NewAI, getName(".load")),
+ getIndex(IRB, BeginOffset),
+ getName(".extract"));
+ } else {
+ Result = IRB.CreateLoad(&NewAI, getName(".load"));
+ }
+ if (Result->getType() != LI.getType())
+ Result = getValueCast(IRB, Result, LI.getType());
+ LI.replaceAllUsesWith(Result);
+ Pass.DeadInsts.push_back(&LI);
+
+ DEBUG(dbgs() << " to: " << *Result << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) {
+ assert(!LI.isVolatile());
+ Value *Result = extractInteger(IRB, cast<IntegerType>(LI.getType()),
+ BeginOffset);
+ LI.replaceAllUsesWith(Result);
+ Pass.DeadInsts.push_back(&LI);
+ DEBUG(dbgs() << " to: " << *Result << "\n");
+ return true;
+ }
+
+ bool visitLoadInst(LoadInst &LI) {
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ Value *OldOp = LI.getOperand(0);
+ assert(OldOp == OldPtr);
+ IRBuilder<> IRB(&LI);
+
+ if (VecTy)
+ return rewriteVectorizedLoadInst(IRB, LI, OldOp);
+ if (IntPromotionTy)
+ return rewriteIntegerLoad(IRB, LI);
+
+ Value *NewPtr = getAdjustedAllocaPtr(IRB,
+ LI.getPointerOperand()->getType());
+ LI.setOperand(0, NewPtr);
+ DEBUG(dbgs() << " to: " << LI << "\n");
+
+ deleteIfTriviallyDead(OldOp);
+ return NewPtr == &NewAI && !LI.isVolatile();
+ }
+
+ bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, StoreInst &SI,
+ Value *OldOp) {
+ Value *V = SI.getValueOperand();
+ if (V->getType() == ElementTy ||
+ BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset) {
+ if (V->getType() != ElementTy)
+ V = getValueCast(IRB, V, ElementTy);
+ V = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
+ getIndex(IRB, BeginOffset),
+ getName(".insert"));
+ } else if (V->getType() != VecTy) {
+ V = getValueCast(IRB, V, VecTy);
+ }
+ StoreInst *Store = IRB.CreateStore(V, &NewAI);
+ Pass.DeadInsts.push_back(&SI);
+
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool rewriteIntegerStore(IRBuilder<> &IRB, StoreInst &SI) {
+ assert(!SI.isVolatile());
+ StoreInst *Store = insertInteger(IRB, SI.getValueOperand(), BeginOffset);
+ Pass.DeadInsts.push_back(&SI);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ bool visitStoreInst(StoreInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ Value *OldOp = SI.getOperand(1);
+ assert(OldOp == OldPtr);
+ IRBuilder<> IRB(&SI);
+
+ if (VecTy)
+ return rewriteVectorizedStoreInst(IRB, SI, OldOp);
+ if (IntPromotionTy)
+ return rewriteIntegerStore(IRB, SI);
+
+ Value *NewPtr = getAdjustedAllocaPtr(IRB,
+ SI.getPointerOperand()->getType());
+ SI.setOperand(1, NewPtr);
+ DEBUG(dbgs() << " to: " << SI << "\n");
+
+ deleteIfTriviallyDead(OldOp);
+ return NewPtr == &NewAI && !SI.isVolatile();
+ }
+
+ bool visitMemSetInst(MemSetInst &II) {
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+ assert(II.getRawDest() == OldPtr);
+
+    // If the memset has a variable size, it cannot be split; just adjust the
+    // pointer to the new alloca.
+ if (!isa<Constant>(II.getLength())) {
+ II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ // Record this instruction for deletion.
+ if (Pass.DeadSplitInsts.insert(&II))
+ Pass.DeadInsts.push_back(&II);
+
+ Type *AllocaTy = NewAI.getAllocatedType();
+ Type *ScalarTy = AllocaTy->getScalarType();
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memset.
+ if (!VecTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset ||
+ !AllocaTy->isSingleValueType() ||
+ !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) {
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+
+ CallInst *New
+ = IRB.CreateMemSet(getAdjustedAllocaPtr(IRB,
+ II.getRawDest()->getType()),
+ II.getValue(), Size, II.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ // If we can represent this as a simple value, we have to build the actual
+ // value to store, which requires expanding the byte present in memset to
+ // a sensible representation for the alloca type. This is essentially
+ // splatting the byte to a sufficiently wide integer, bitcasting to the
+ // desired scalar type, and splatting it across any desired vector type.
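+    //
+    // A worked example (illustrative): splatting the byte 0xAB to an i32
+    // computes 0xAB * (0xFFFFFFFF udiv 0xFF) = 0xAB * 0x01010101
+    // = 0xABABABAB.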
+ Value *V = II.getValue();
+ IntegerType *VTy = cast<IntegerType>(V->getType());
+ Type *IntTy = Type::getIntNTy(VTy->getContext(),
+ TD.getTypeSizeInBits(ScalarTy));
+ if (TD.getTypeSizeInBits(ScalarTy) > VTy->getBitWidth())
+ V = IRB.CreateMul(IRB.CreateZExt(V, IntTy, getName(".zext")),
+ ConstantExpr::getUDiv(
+ Constant::getAllOnesValue(IntTy),
+ ConstantExpr::getZExt(
+ Constant::getAllOnesValue(V->getType()),
+ IntTy)),
+ getName(".isplat"));
+ if (V->getType() != ScalarTy) {
+ if (ScalarTy->isPointerTy())
+ V = IRB.CreateIntToPtr(V, ScalarTy);
+ else if (ScalarTy->isPrimitiveType() || ScalarTy->isVectorTy())
+ V = IRB.CreateBitCast(V, ScalarTy);
+ else if (ScalarTy->isIntegerTy())
+ llvm_unreachable("Computed different integer types with equal widths");
+ else
+ llvm_unreachable("Invalid scalar type");
+ }
+
+ // If this is an element-wide memset of a vectorizable alloca, insert it.
+ if (VecTy && (BeginOffset > NewAllocaBeginOffset ||
+ EndOffset < NewAllocaEndOffset)) {
+ StoreInst *Store = IRB.CreateStore(
+ IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")), V,
+ getIndex(IRB, BeginOffset),
+ getName(".insert")),
+ &NewAI);
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return true;
+ }
+
+ // Splat to a vector if needed.
+ if (VectorType *VecTy = dyn_cast<VectorType>(AllocaTy)) {
+ VectorType *SplatSourceTy = VectorType::get(V->getType(), 1);
+ V = IRB.CreateShuffleVector(
+ IRB.CreateInsertElement(UndefValue::get(SplatSourceTy), V,
+ IRB.getInt32(0), getName(".vsplat.insert")),
+ UndefValue::get(SplatSourceTy),
+ ConstantVector::getSplat(VecTy->getNumElements(), IRB.getInt32(0)),
+ getName(".vsplat.shuffle"));
+ assert(V->getType() == VecTy);
+ }
+
+ Value *New = IRB.CreateStore(V, &NewAI, II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitMemTransferInst(MemTransferInst &II) {
+ // Rewriting of memory transfer instructions can be a bit tricky. We break
+ // them into two categories: split intrinsics and unsplit intrinsics.
+
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+
+ assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr);
+ bool IsDest = II.getRawDest() == OldPtr;
+
+ const AllocaPartitioning::MemTransferOffsets &MTO
+ = P.getMemTransferOffsets(II);
+
+ // For unsplit intrinsics, we simply modify the source and destination
+ // pointers in place. This isn't just an optimization, it is a matter of
+ // correctness. With unsplit intrinsics we may be dealing with transfers
+ // within a single alloca before SROA ran, or with transfers that have
+ // a variable length. We may also be dealing with memmove instead of
+    // memcpy, and so simply updating the pointers in place is necessary for
+    // us to update both the source and dest of a single call.
+ if (!MTO.IsSplittable) {
+ Value *OldOp = IsDest ? II.getRawDest() : II.getRawSource();
+ if (IsDest)
+ II.setDest(getAdjustedAllocaPtr(IRB, II.getRawDest()->getType()));
+ else
+ II.setSource(getAdjustedAllocaPtr(IRB, II.getRawSource()->getType()));
+
+ DEBUG(dbgs() << " to: " << II << "\n");
+ deleteIfTriviallyDead(OldOp);
+ return false;
+ }
+ // For split transfer intrinsics we have an incredibly useful assurance:
+ // the source and destination do not reside within the same alloca, and at
+ // least one of them does not escape. This means that we can replace
+ // memmove with memcpy, and we don't need to worry about all manner of
+ // downsides to splitting and transforming the operations.
+
+ // Compute the relative offset within the transfer.
+ unsigned IntPtrWidth = TD.getPointerSizeInBits();
+ APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin
+ : MTO.SourceBegin));
+
+ // If this doesn't map cleanly onto the alloca type, and that type isn't
+ // a single value type, just emit a memcpy.
+ bool EmitMemCpy
+ = !VecTy && (BeginOffset != NewAllocaBeginOffset ||
+ EndOffset != NewAllocaEndOffset ||
+ !NewAI.getAllocatedType()->isSingleValueType());
+
+    // If we're just going to emit a memcpy, the alloca hasn't changed, and
+    // the size hasn't been shrunk based on analysis of the viable range,
+    // then this is a no-op.
+ if (EmitMemCpy && &OldAI == &NewAI) {
+ uint64_t OrigBegin = IsDest ? MTO.DestBegin : MTO.SourceBegin;
+ uint64_t OrigEnd = IsDest ? MTO.DestEnd : MTO.SourceEnd;
+ // Ensure the start lines up.
+ assert(BeginOffset == OrigBegin);
+ (void)OrigBegin;
+
+ // Rewrite the size as needed.
+ if (EndOffset != OrigEnd)
+ II.setLength(ConstantInt::get(II.getLength()->getType(),
+ EndOffset - BeginOffset));
+ return false;
+ }
+ // Record this instruction for deletion.
+ if (Pass.DeadSplitInsts.insert(&II))
+ Pass.DeadInsts.push_back(&II);
+
+ bool IsVectorElement = VecTy && (BeginOffset > NewAllocaBeginOffset ||
+ EndOffset < NewAllocaEndOffset);
+
+ Type *OtherPtrTy = IsDest ? II.getRawSource()->getType()
+ : II.getRawDest()->getType();
+ if (!EmitMemCpy)
+ OtherPtrTy = IsVectorElement ? VecTy->getElementType()->getPointerTo()
+ : NewAI.getType();
+
+ // Compute the other pointer, folding as much as possible to produce
+ // a single, simple GEP in most cases.
+ Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest();
+ OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy,
+ getName("." + OtherPtr->getName()));
+
+ // Strip all inbounds GEPs and pointer casts to try to dig out any root
+ // alloca that should be re-examined after rewriting this instruction.
+ if (AllocaInst *AI
+ = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets()))
+ Pass.Worklist.insert(AI);
+
+ if (EmitMemCpy) {
+ Value *OurPtr
+ = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType()
+ : II.getRawSource()->getType());
+ Type *SizeTy = II.getLength()->getType();
+ Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset);
+
+ CallInst *New = IRB.CreateMemCpy(IsDest ? OurPtr : OtherPtr,
+ IsDest ? OtherPtr : OurPtr,
+ Size, II.getAlignment(),
+ II.isVolatile());
+ (void)New;
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return false;
+ }
+
+ Value *SrcPtr = OtherPtr;
+ Value *DstPtr = &NewAI;
+ if (!IsDest)
+ std::swap(SrcPtr, DstPtr);
+
+ Value *Src;
+ if (IsVectorElement && !IsDest) {
+ // We have to extract rather than load.
+ Src = IRB.CreateExtractElement(IRB.CreateLoad(SrcPtr,
+ getName(".copyload")),
+ getIndex(IRB, BeginOffset),
+ getName(".copyextract"));
+ } else {
+ Src = IRB.CreateLoad(SrcPtr, II.isVolatile(), getName(".copyload"));
+ }
+
+ if (IsVectorElement && IsDest) {
+ // We have to insert into a loaded copy before storing.
+ Src = IRB.CreateInsertElement(IRB.CreateLoad(&NewAI, getName(".load")),
+ Src, getIndex(IRB, BeginOffset),
+ getName(".insert"));
+ }
+
+ Value *Store = IRB.CreateStore(Src, DstPtr, II.isVolatile());
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ return !II.isVolatile();
+ }
+
+ bool visitIntrinsicInst(IntrinsicInst &II) {
+ assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
+ II.getIntrinsicID() == Intrinsic::lifetime_end);
+ DEBUG(dbgs() << " original: " << II << "\n");
+ IRBuilder<> IRB(&II);
+ assert(II.getArgOperand(1) == OldPtr);
+
+ // Record this instruction for deletion.
+ if (Pass.DeadSplitInsts.insert(&II))
+ Pass.DeadInsts.push_back(&II);
+
+ ConstantInt *Size
+ = ConstantInt::get(cast<IntegerType>(II.getArgOperand(0)->getType()),
+ EndOffset - BeginOffset);
+ Value *Ptr = getAdjustedAllocaPtr(IRB, II.getArgOperand(1)->getType());
+ Value *New;
+ if (II.getIntrinsicID() == Intrinsic::lifetime_start)
+ New = IRB.CreateLifetimeStart(Ptr, Size);
+ else
+ New = IRB.CreateLifetimeEnd(Ptr, Size);
+
+ DEBUG(dbgs() << " to: " << *New << "\n");
+ return true;
+ }
+
+ /// PHI instructions that use an alloca and are subsequently loaded can be
+  /// rewritten to load all input pointers in the pred blocks and then PHI the
+ /// results, allowing the load of the alloca to be promoted.
+ /// From this:
+ /// %P2 = phi [i32* %Alloca, i32* %Other]
+ /// %V = load i32* %P2
+ /// to:
+ /// %V1 = load i32* %Alloca -> will be mem2reg'd
+ /// ...
+ /// %V2 = load i32* %Other
+ /// ...
+ /// %V = phi [i32 %V1, i32 %V2]
+ ///
+  /// We can do this to a PHI if its only uses are loads and if each of its
+  /// incoming pointers can safely be loaded in its predecessor block.
+ ///
+ /// FIXME: This should be hoisted into a generic utility, likely in
+ /// Transforms/Util/Local.h
+ bool isSafePHIToSpeculate(PHINode &PN, SmallVectorImpl<LoadInst *> &Loads) {
+ // For now, we can only do this promotion if the load is in the same block
+ // as the PHI, and if there are no stores between the phi and load.
+ // TODO: Allow recursive phi users.
+ // TODO: Allow stores.
+ BasicBlock *BB = PN.getParent();
+ unsigned MaxAlign = 0;
+ for (Value::use_iterator UI = PN.use_begin(), UE = PN.use_end();
+ UI != UE; ++UI) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI);
+ if (LI == 0 || !LI->isSimple()) return false;
+
+ // For now we only allow loads in the same block as the PHI. This is
+ // a common case that happens when instcombine merges two loads through
+ // a PHI.
+ if (LI->getParent() != BB) return false;
+
+ // Ensure that there are no instructions between the PHI and the load that
+ // could store.
+ for (BasicBlock::iterator BBI = &PN; &*BBI != LI; ++BBI)
+ if (BBI->mayWriteToMemory())
+ return false;
+
+ MaxAlign = std::max(MaxAlign, LI->getAlignment());
+ Loads.push_back(LI);
+ }
+
+ // We can only transform this if it is safe to push the loads into the
+    // predecessor blocks. The only thing to watch out for is that we can't
+    // put a possibly trapping load in the predecessor if the edge from it
+    // is critical.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num;
+ ++Idx) {
+ TerminatorInst *TI = PN.getIncomingBlock(Idx)->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+
+ // If the value is produced by the terminator of the predecessor (an
+ // invoke) or it has side-effects, there is no valid place to put a load
+ // in the predecessor.
+ if (TI == InVal || TI->mayHaveSideEffects())
+ return false;
+
+ // If the predecessor has a single successor, then the edge isn't
+ // critical.
+ if (TI->getNumSuccessors() == 1)
+ continue;
+
+ // If this pointer is always safe to load, or if we can prove that there
+ // is already a load in the block, then we can move the load to the pred
+ // block.
+ if (InVal->isDereferenceablePointer() ||
+ isSafeToLoadUnconditionally(InVal, TI, MaxAlign, &TD))
+ continue;
+
+ return false;
+ }
+
+ return true;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ DEBUG(dbgs() << " original: " << PN << "\n");
+ // We would like to compute a new pointer in only one place, but have it be
+ // as local as possible to the PHI. To do that, we re-use the location of
+ // the old pointer, which necessarily must be in the right position to
+ // dominate the PHI.
+ IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr));
+
+ SmallVector<LoadInst *, 4> Loads;
+ if (!isSafePHIToSpeculate(PN, Loads)) {
+ Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
+ // Replace the operands which were using the old pointer.
+ User::op_iterator OI = PN.op_begin(), OE = PN.op_end();
+ for (; OI != OE; ++OI)
+ if (*OI == OldPtr)
+ *OI = NewPtr;
+
+ DEBUG(dbgs() << " to: " << PN << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+ assert(!Loads.empty());
+
+ Type *LoadTy = cast<PointerType>(PN.getType())->getElementType();
+ IRBuilder<> PHIBuilder(&PN);
+ PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues());
+ NewPN->takeName(&PN);
+
+    // Get the TBAA tag and alignment to use from one of the loads. It doesn't
+    // matter which one we pick, even if they happen to differ.
+ LoadInst *SomeLoad = cast<LoadInst>(Loads.back());
+ MDNode *TBAATag = SomeLoad->getMetadata(LLVMContext::MD_tbaa);
+ unsigned Align = SomeLoad->getAlignment();
+ Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType());
+
+ // Rewrite all loads of the PN to use the new PHI.
+ do {
+ LoadInst *LI = Loads.pop_back_val();
+ LI->replaceAllUsesWith(NewPN);
+ Pass.DeadInsts.push_back(LI);
+ } while (!Loads.empty());
+
+ // Inject loads into all of the pred blocks.
+ for (unsigned Idx = 0, Num = PN.getNumIncomingValues(); Idx != Num; ++Idx) {
+ BasicBlock *Pred = PN.getIncomingBlock(Idx);
+ TerminatorInst *TI = Pred->getTerminator();
+ Value *InVal = PN.getIncomingValue(Idx);
+ IRBuilder<> PredBuilder(TI);
+
+ // Map the value to the new alloca pointer if this was the old alloca
+ // pointer.
+ bool ThisOperand = InVal == OldPtr;
+ if (ThisOperand)
+ InVal = NewPtr;
+
+ LoadInst *Load
+ = PredBuilder.CreateLoad(InVal, getName(".sroa.speculate." +
+ Pred->getName()));
+ ++NumLoadsSpeculated;
+ Load->setAlignment(Align);
+ if (TBAATag)
+ Load->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+ NewPN->addIncoming(Load, Pred);
+
+ if (ThisOperand)
+ continue;
+ Instruction *OtherPtr = dyn_cast<Instruction>(InVal);
+ if (!OtherPtr)
+ // No uses to rewrite.
+ continue;
+
+ // Try to lookup and rewrite any partition uses corresponding to this phi
+ // input.
+ AllocaPartitioning::iterator PI
+ = P.findPartitionForPHIOrSelectOperand(PN, OtherPtr);
+ if (PI != P.end()) {
+ // If the other pointer is within the partitioning, replace the PHI in
+ // its uses with the load we just speculated, or add another load for
+ // it to rewrite if we've already replaced the PHI.
+ AllocaPartitioning::use_iterator UI
+ = P.findPartitionUseForPHIOrSelectOperand(PN, OtherPtr);
+ if (isa<PHINode>(*UI->User))
+ UI->User = Load;
+ else {
+ AllocaPartitioning::PartitionUse OtherUse = *UI;
+ OtherUse.User = Load;
+ P.use_insert(PI, std::upper_bound(UI, P.use_end(PI), OtherUse),
+ OtherUse);
+ }
+ }
+ }
+ DEBUG(dbgs() << " speculated to: " << *NewPN << "\n");
+ return NewPtr == &NewAI;
+ }
+
+ /// Select instructions that use an alloca and are subsequently loaded can be
+ /// rewritten to load both input pointers and then select between the result,
+ /// allowing the load of the alloca to be promoted.
+ /// From this:
+ /// %P2 = select i1 %cond, i32* %Alloca, i32* %Other
+ /// %V = load i32* %P2
+ /// to:
+ /// %V1 = load i32* %Alloca -> will be mem2reg'd
+ /// %V2 = load i32* %Other
+ /// %V = select i1 %cond, i32 %V1, i32 %V2
+ ///
+ /// We can do this to a select if its only uses are loads and if the operand
+ /// to the select can be loaded unconditionally.
+ bool isSafeSelectToSpeculate(SelectInst &SI,
+ SmallVectorImpl<LoadInst *> &Loads) {
+ Value *TValue = SI.getTrueValue();
+ Value *FValue = SI.getFalseValue();
+ bool TDerefable = TValue->isDereferenceablePointer();
+ bool FDerefable = FValue->isDereferenceablePointer();
+
+ for (Value::use_iterator UI = SI.use_begin(), UE = SI.use_end();
+ UI != UE; ++UI) {
+ LoadInst *LI = dyn_cast<LoadInst>(*UI);
+ if (LI == 0 || !LI->isSimple()) return false;
+
+      // Both operands to the select need to be dereferenceable, either
+ // absolutely (e.g. allocas) or at this point because we can see other
+ // accesses to it.
+ if (!TDerefable && !isSafeToLoadUnconditionally(TValue, LI,
+ LI->getAlignment(), &TD))
+ return false;
+ if (!FDerefable && !isSafeToLoadUnconditionally(FValue, LI,
+ LI->getAlignment(), &TD))
+ return false;
+ Loads.push_back(LI);
+ }
+
+ return true;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ IRBuilder<> IRB(&SI);
+
+ // Find the operand we need to rewrite here.
+ bool IsTrueVal = SI.getTrueValue() == OldPtr;
+ if (IsTrueVal)
+ assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!");
+ else
+ assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!");
+ Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType());
+
+ // If the select isn't safe to speculate, just use simple logic to emit it.
+ SmallVector<LoadInst *, 4> Loads;
+ if (!isSafeSelectToSpeculate(SI, Loads)) {
+ SI.setOperand(IsTrueVal ? 1 : 2, NewPtr);
+ DEBUG(dbgs() << " to: " << SI << "\n");
+ deleteIfTriviallyDead(OldPtr);
+ return false;
+ }
+
+ Value *OtherPtr = IsTrueVal ? SI.getFalseValue() : SI.getTrueValue();
+ AllocaPartitioning::iterator PI
+ = P.findPartitionForPHIOrSelectOperand(SI, OtherPtr);
+ AllocaPartitioning::PartitionUse OtherUse;
+ if (PI != P.end()) {
+ // If the other pointer is within the partitioning, remove the select
+ // from its uses. We'll add in the new loads below.
+ AllocaPartitioning::use_iterator UI
+ = P.findPartitionUseForPHIOrSelectOperand(SI, OtherPtr);
+ OtherUse = *UI;
+ P.use_erase(PI, UI);
+ }
+
+ Value *TV = IsTrueVal ? NewPtr : SI.getTrueValue();
+ Value *FV = IsTrueVal ? SI.getFalseValue() : NewPtr;
+ // Replace the loads of the select with a select of two loads.
+ while (!Loads.empty()) {
+ LoadInst *LI = Loads.pop_back_val();
+
+ IRB.SetInsertPoint(LI);
+ LoadInst *TL =
+ IRB.CreateLoad(TV, getName("." + LI->getName() + ".true"));
+ LoadInst *FL =
+ IRB.CreateLoad(FV, getName("." + LI->getName() + ".false"));
+ NumLoadsSpeculated += 2;
+ if (PI != P.end()) {
+ LoadInst *OtherLoad = IsTrueVal ? FL : TL;
+ assert(OtherUse.Ptr == OtherLoad->getOperand(0));
+ OtherUse.User = OtherLoad;
+ P.use_insert(PI, P.use_end(PI), OtherUse);
+ }
+
+ // Transfer alignment and TBAA info if present.
+ TL->setAlignment(LI->getAlignment());
+ FL->setAlignment(LI->getAlignment());
+ if (MDNode *Tag = LI->getMetadata(LLVMContext::MD_tbaa)) {
+ TL->setMetadata(LLVMContext::MD_tbaa, Tag);
+ FL->setMetadata(LLVMContext::MD_tbaa, Tag);
+ }
+
+ Value *V = IRB.CreateSelect(SI.getCondition(), TL, FL);
+ V->takeName(LI);
+ DEBUG(dbgs() << " speculated to: " << *V << "\n");
+ LI->replaceAllUsesWith(V);
+ Pass.DeadInsts.push_back(LI);
+ }
+ if (PI != P.end())
+ std::stable_sort(P.use_begin(PI), P.use_end(PI));
+
+ deleteIfTriviallyDead(OldPtr);
+ return NewPtr == &NewAI;
+ }
+
+};
+}
+
+namespace {
+/// \brief Visitor to rewrite aggregate loads and stores as scalar.
+///
+/// This pass aggressively rewrites all aggregate loads and stores on
+/// a particular pointer (or any pointer derived from it which we can identify)
+/// with scalar loads and stores.
+class AggLoadStoreRewriter : public InstVisitor<AggLoadStoreRewriter, bool> {
+ // Befriend the base class so it can delegate to private visit methods.
+ friend class llvm::InstVisitor<AggLoadStoreRewriter, bool>;
+
+ const TargetData &TD;
+
+ /// Queue of pointer uses to analyze and potentially rewrite.
+ SmallVector<Use *, 8> Queue;
+
+ /// Set to prevent us from cycling with phi nodes and loops.
+ SmallPtrSet<User *, 8> Visited;
+
+ /// The current pointer use being rewritten. This is used to dig up the used
+ /// value (as opposed to the user).
+ Use *U;
+
+public:
+ AggLoadStoreRewriter(const TargetData &TD) : TD(TD) {}
+
+ /// Rewrite loads and stores through a pointer and all pointers derived from
+ /// it.
+ bool rewrite(Instruction &I) {
+ DEBUG(dbgs() << " Rewriting FCA loads and stores...\n");
+ enqueueUsers(I);
+ bool Changed = false;
+ while (!Queue.empty()) {
+ U = Queue.pop_back_val();
+ Changed |= visit(cast<Instruction>(U->getUser()));
+ }
+ return Changed;
+ }
+
+private:
+ /// Enqueue all the users of the given instruction for further processing.
+ /// This uses a set to de-duplicate users.
+ void enqueueUsers(Instruction &I) {
+ for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+ ++UI)
+ if (Visited.insert(*UI))
+ Queue.push_back(&UI.getUse());
+ }
+
+ // Conservative default is to not rewrite anything.
+ bool visitInstruction(Instruction &I) { return false; }
+
+ /// \brief Generic recursive split emission class.
+ template <typename Derived>
+ class OpSplitter {
+ protected:
+ /// The builder used to form new instructions.
+ IRBuilder<> IRB;
+    /// The indices to be used with insertvalue or extractvalue to select the
+    /// appropriate value within the aggregate.
+ SmallVector<unsigned, 4> Indices;
+ /// The indices to a GEP instruction which will move Ptr to the correct slot
+ /// within the aggregate.
+ SmallVector<Value *, 4> GEPIndices;
+ /// The base pointer of the original op, used as a base for GEPing the
+ /// split operations.
+ Value *Ptr;
+
+    /// Initialize the splitter with an insertion point and Ptr, starting
+    /// with a single zero GEP index.
+ OpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : IRB(InsertionPoint), GEPIndices(1, IRB.getInt32(0)), Ptr(Ptr) {}
+
+ public:
+ /// \brief Generic recursive split emission routine.
+ ///
+ /// This method recursively splits an aggregate op (load or store) into
+ /// scalar or vector ops. It splits recursively until it hits a single value
+ /// and emits that single value operation via the template argument.
+ ///
+ /// The logic of this routine relies on GEPs and insertvalue and
+ /// extractvalue all operating with the same fundamental index list, merely
+ /// formatted differently (GEPs need actual values).
+ ///
+ /// \param Ty The type being split recursively into smaller ops.
+ /// \param Agg The aggregate value being built up or stored, depending on
+ /// whether this is splitting a load or a store respectively.
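+    ///
+    /// For example (illustrative), splitting a load of { i32, [2 x float] }
+    /// emits one i32 load and two float loads, each through a GEP built from
+    /// the accumulated indices, and rebuilds the aggregate with insertvalue.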
+ void emitSplitOps(Type *Ty, Value *&Agg, const Twine &Name) {
+ if (Ty->isSingleValueType())
+ return static_cast<Derived *>(this)->emitFunc(Ty, Agg, Name);
+
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = ATy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(ATy->getElementType(), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ if (StructType *STy = dyn_cast<StructType>(Ty)) {
+ unsigned OldSize = Indices.size();
+ (void)OldSize;
+ for (unsigned Idx = 0, Size = STy->getNumElements(); Idx != Size;
+ ++Idx) {
+ assert(Indices.size() == OldSize && "Did not return to the old size");
+ Indices.push_back(Idx);
+ GEPIndices.push_back(IRB.getInt32(Idx));
+ emitSplitOps(STy->getElementType(Idx), Agg, Name + "." + Twine(Idx));
+ GEPIndices.pop_back();
+ Indices.pop_back();
+ }
+ return;
+ }
+
+ llvm_unreachable("Only arrays and structs are aggregate loadable types");
+ }
+ };
+
+ struct LoadOpSplitter : public OpSplitter<LoadOpSplitter> {
+ LoadOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<LoadOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf load of a single value. This is called at the leaves of the
+ /// recursive emission to actually load values.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Load the single value and insert it using the indices.
+ Value *Load = IRB.CreateLoad(IRB.CreateInBoundsGEP(Ptr, GEPIndices,
+ Name + ".gep"),
+ Name + ".load");
+ Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
+ DEBUG(dbgs() << " to: " << *Load << "\n");
+ }
+ };
+
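+ // As an illustration (names are schematic), the splitter above turns a
+ // simple load %x of a { i32, float } aggregate through %p into roughly:
+ //   %x.fca.0.gep = getelementptr inbounds { i32, float }* %p, i32 0, i32 0
+ //   %x.fca.0.load = load i32* %x.fca.0.gep
+ //   %x.fca.0.insert = insertvalue { i32, float } undef, i32 %x.fca.0.load, 0
+ //   %x.fca.1.gep = getelementptr inbounds { i32, float }* %p, i32 0, i32 1
+ //   %x.fca.1.load = load float* %x.fca.1.gep
+ //   %x.fca.1.insert = insertvalue { i32, float } %x.fca.0.insert, float %x.fca.1.load, 1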
+ bool visitLoadInst(LoadInst &LI) {
+ assert(LI.getPointerOperand() == *U);
+ if (!LI.isSimple() || LI.getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being loaded, split it apart.
+ DEBUG(dbgs() << " original: " << LI << "\n");
+ LoadOpSplitter Splitter(&LI, *U);
+ Value *V = UndefValue::get(LI.getType());
+ Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ LI.replaceAllUsesWith(V);
+ LI.eraseFromParent();
+ return true;
+ }
+
+ struct StoreOpSplitter : public OpSplitter<StoreOpSplitter> {
+ StoreOpSplitter(Instruction *InsertionPoint, Value *Ptr)
+ : OpSplitter<StoreOpSplitter>(InsertionPoint, Ptr) {}
+
+ /// Emit a leaf store of a single value. This is called at the leaves of the
+ /// recursive emission to actually produce stores.
+ void emitFunc(Type *Ty, Value *&Agg, const Twine &Name) {
+ assert(Ty->isSingleValueType());
+ // Extract the single value and store it using the indices.
+ Value *Store = IRB.CreateStore(
+ IRB.CreateExtractValue(Agg, Indices, Name + ".extract"),
+ IRB.CreateInBoundsGEP(Ptr, GEPIndices, Name + ".gep"));
+ (void)Store;
+ DEBUG(dbgs() << " to: " << *Store << "\n");
+ }
+ };
+
+ bool visitStoreInst(StoreInst &SI) {
+ if (!SI.isSimple() || SI.getPointerOperand() != *U)
+ return false;
+ Value *V = SI.getValueOperand();
+ if (V->getType()->isSingleValueType())
+ return false;
+
+ // We have an aggregate being stored, split it apart.
+ DEBUG(dbgs() << " original: " << SI << "\n");
+ StoreOpSplitter Splitter(&SI, *U);
+ Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ SI.eraseFromParent();
+ return true;
+ }
+
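+ // The visitors below rewrite nothing themselves; they simply walk through
+ // pointer-propagating instructions so that loads and stores reached via
+ // bitcasts, GEPs, PHIs, and selects get enqueued and rewritten too.
+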
+ bool visitBitCastInst(BitCastInst &BC) {
+ enqueueUsers(BC);
+ return false;
+ }
+
+ bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ enqueueUsers(GEPI);
+ return false;
+ }
+
+ bool visitPHINode(PHINode &PN) {
+ enqueueUsers(PN);
+ return false;
+ }
+
+ bool visitSelectInst(SelectInst &SI) {
+ enqueueUsers(SI);
+ return false;
+ }
+};
+}
+
+/// \brief Try to find a partition of the aggregate type passed in for a given
+/// offset and size.
+///
+/// This recurses through the aggregate type and tries to compute a subtype
+/// based on the offset and size. When the offset and size span a sub-section
+/// of an array, it will even compute a new array type for that sub-section,
+/// and the same for structs.
+///
+/// Note that this routine is very strict and tries to find a partition of the
+/// type which produces the *exact* right offset and size. It is not forgiving
+/// when the size or offset causes either end of a type-based partition to be
+/// off.
+/// Also, this is a best-effort routine. It is reasonable to give up and not
+/// return a type if necessary.
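+///
+/// For example, with a typical data layout, asking for Offset 4 and Size 8
+/// within { i32, { i32, i32 }, i64 } yields the inner { i32, i32 } sub-struct,
+/// while an offset or size that cuts through the middle of a field yields
+/// null.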
+static Type *getTypePartition(const TargetData &TD, Type *Ty,
+ uint64_t Offset, uint64_t Size) {
+ if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size)
+ return Ty;
+
+ if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
+ // We can't partition pointers...
+ if (SeqTy->isPointerTy())
+ return 0;
+
+ Type *ElementTy = SeqTy->getElementType();
+ uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+ uint64_t NumSkippedElements = Offset / ElementSize;
+ if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy))
+ if (NumSkippedElements >= ArrTy->getNumElements())
+ return 0;
+ if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy))
+ if (NumSkippedElements >= VecTy->getNumElements())
+ return 0;
+ Offset -= NumSkippedElements * ElementSize;
+
+ // First check if we need to recurse.
+ if (Offset > 0 || Size < ElementSize) {
+ // Bail if the partition ends in a different array element.
+ if ((Offset + Size) > ElementSize)
+ return 0;
+ // Recurse through the element type trying to peel off offset bytes.
+ return getTypePartition(TD, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return ElementTy;
+ assert(Size > ElementSize);
+ uint64_t NumElements = Size / ElementSize;
+ if (NumElements * ElementSize != Size)
+ return 0;
+ return ArrayType::get(ElementTy, NumElements);
+ }
+
+ StructType *STy = dyn_cast<StructType>(Ty);
+ if (!STy)
+ return 0;
+
+ const StructLayout *SL = TD.getStructLayout(STy);
+ if (Offset >= SL->getSizeInBytes())
+ return 0;
+ uint64_t EndOffset = Offset + Size;
+ if (EndOffset > SL->getSizeInBytes())
+ return 0;
+
+ unsigned Index = SL->getElementContainingOffset(Offset);
+ Offset -= SL->getElementOffset(Index);
+
+ Type *ElementTy = STy->getElementType(Index);
+ uint64_t ElementSize = TD.getTypeAllocSize(ElementTy);
+ if (Offset >= ElementSize)
+ return 0; // The offset points into alignment padding.
+
+ // See if any partition must be contained by the element.
+ if (Offset > 0 || Size < ElementSize) {
+ if ((Offset + Size) > ElementSize)
+ return 0;
+ return getTypePartition(TD, ElementTy, Offset, Size);
+ }
+ assert(Offset == 0);
+
+ if (Size == ElementSize)
+ return ElementTy;
+
+ StructType::element_iterator EI = STy->element_begin() + Index,
+ EE = STy->element_end();
+ if (EndOffset < SL->getSizeInBytes()) {
+ unsigned EndIndex = SL->getElementContainingOffset(EndOffset);
+ if (Index == EndIndex)
+ return 0; // Within a single element and its padding.
+
+ // Don't try to form "natural" types if the elements don't line up with the
+ // expected size.
+ // FIXME: We could potentially recurse down through the last element in the
+ // sub-struct to find a natural end point.
+ if (SL->getElementOffset(EndIndex) != EndOffset)
+ return 0;
+
+ assert(Index < EndIndex);
+ EE = STy->element_begin() + EndIndex;
+ }
+
+ // Try to build up a sub-structure.
+ SmallVector<Type *, 4> ElementTys;
+ do {
+ ElementTys.push_back(*EI++);
+ } while (EI != EE);
+ StructType *SubTy = StructType::get(STy->getContext(), ElementTys,
+ STy->isPacked());
+ const StructLayout *SubSL = TD.getStructLayout(SubTy);
+ if (Size != SubSL->getSizeInBytes())
+ return 0; // The sub-struct doesn't have quite the size needed.
+
+ return SubTy;
+}
+
+/// \brief Rewrite an alloca partition's users.
+///
+/// This routine drives both of the rewriting goals of the SROA pass. It tries
+/// to rewrite uses of an alloca partition to be conducive to SSA value
+/// promotion. If the partition needs a new, more refined alloca, this will
+/// build that new alloca, preserving as much type information as possible, and
+/// rewrite the uses of the old alloca to point at the new one and have the
+/// appropriate new offsets. It also evaluates how successful the rewrite was
+/// at enabling promotion and if it was successful queues the alloca to be
+/// promoted.
+bool SROA::rewriteAllocaPartition(AllocaInst &AI,
+ AllocaPartitioning &P,
+ AllocaPartitioning::iterator PI) {
+ uint64_t AllocaSize = PI->EndOffset - PI->BeginOffset;
+ if (P.use_begin(PI) == P.use_end(PI))
+ return false; // No live uses left of this partition.
+
+ // Try to compute a friendly type for this partition of the alloca. This
+ // won't always succeed, in which case we fall back to a legal integer type
+ // or an i8 array of an appropriate size.
+ Type *AllocaTy = 0;
+ if (Type *PartitionTy = P.getCommonType(PI))
+ if (TD->getTypeAllocSize(PartitionTy) >= AllocaSize)
+ AllocaTy = PartitionTy;
+ if (!AllocaTy)
+ if (Type *PartitionTy = getTypePartition(*TD, AI.getAllocatedType(),
+ PI->BeginOffset, AllocaSize))
+ AllocaTy = PartitionTy;
+ if ((!AllocaTy ||
+ (AllocaTy->isArrayTy() &&
+ AllocaTy->getArrayElementType()->isIntegerTy())) &&
+ TD->isLegalInteger(AllocaSize * 8))
+ AllocaTy = Type::getIntNTy(*C, AllocaSize * 8);
+ if (!AllocaTy)
+ AllocaTy = ArrayType::get(Type::getInt8Ty(*C), AllocaSize);
+ assert(TD->getTypeAllocSize(AllocaTy) >= AllocaSize);
+
+ // Check for the case where we're going to rewrite to a new alloca of the
+ // exact same type as the original, and with the same access offsets. In that
+ // case, re-use the existing alloca, but still run through the rewriter to
+ // perform phi and select speculation.
+ AllocaInst *NewAI;
+ if (AllocaTy == AI.getAllocatedType()) {
+ assert(PI->BeginOffset == 0 &&
+ "Non-zero begin offset but same alloca type");
+ assert(PI == P.begin() && "Begin offset is zero on later partition");
+ NewAI = &AI;
+ } else {
+ // FIXME: The alignment here is overly conservative -- we could in many
+ // cases get away with much weaker alignment constraints.
+ NewAI = new AllocaInst(AllocaTy, 0, AI.getAlignment(),
+ AI.getName() + ".sroa." + Twine(PI - P.begin()),
+ &AI);
+ ++NumNewAllocas;
+ }
+
+ DEBUG(dbgs() << "Rewriting alloca partition "
+ << "[" << PI->BeginOffset << "," << PI->EndOffset << ") to: "
+ << *NewAI << "\n");
+
+ AllocaPartitionRewriter Rewriter(*TD, P, PI, *this, AI, *NewAI,
+ PI->BeginOffset, PI->EndOffset);
+ DEBUG(dbgs() << " rewriting ");
+ DEBUG(P.print(dbgs(), PI, ""));
+ if (Rewriter.visitUsers(P.use_begin(PI), P.use_end(PI))) {
+ DEBUG(dbgs() << " and queuing for promotion\n");
+ PromotableAllocas.push_back(NewAI);
+ } else if (NewAI != &AI) {
+ // If we can't promote the alloca, iterate on it to check for new
+ // refinements exposed by splitting the current alloca. Don't iterate on an
+ // alloca which didn't actually change and didn't get promoted.
+ Worklist.insert(NewAI);
+ }
+ return true;
+}
+
+/// \brief Walks the partitioning of an alloca rewriting uses of each partition.
+bool SROA::splitAlloca(AllocaInst &AI, AllocaPartitioning &P) {
+ bool Changed = false;
+ for (AllocaPartitioning::iterator PI = P.begin(), PE = P.end(); PI != PE;
+ ++PI)
+ Changed |= rewriteAllocaPartition(AI, P, PI);
+
+ return Changed;
+}
+
+/// \brief Analyze an alloca for SROA.
+///
+/// This analyzes the alloca to ensure we can reason about it, builds
+/// a partitioning of the alloca, and then hands it off to be split and
+/// rewritten as needed.
+bool SROA::runOnAlloca(AllocaInst &AI) {
+ DEBUG(dbgs() << "SROA alloca: " << AI << "\n");
+ ++NumAllocasAnalyzed;
+
+ // Special case dead allocas, as they're trivial.
+ if (AI.use_empty()) {
+ AI.eraseFromParent();
+ return true;
+ }
+
+ // Skip alloca forms that this analysis can't handle.
+ if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
+ TD->getTypeAllocSize(AI.getAllocatedType()) == 0)
+ return false;
+
+ // First check if this is a non-aggregate type that we should simply promote.
+ if (!AI.getAllocatedType()->isAggregateType() && isAllocaPromotable(&AI)) {
+ DEBUG(dbgs() << " Trivially scalar type, queuing for promotion...\n");
+ PromotableAllocas.push_back(&AI);
+ return false;
+ }
+
+ bool Changed = false;
+
+ // First, split any FCA loads and stores touching this alloca to expose
+ // better splitting and promotion opportunities.
+ AggLoadStoreRewriter AggRewriter(*TD);
+ Changed |= AggRewriter.rewrite(AI);
+
+ // Build the partition set using a recursive instruction-visiting builder.
+ AllocaPartitioning P(*TD, AI);
+ DEBUG(P.print(dbgs()));
+ if (P.isEscaped())
+ return Changed;
+
+ // No partitions to split. Leave the dead alloca for a later pass to clean up.
+ if (P.begin() == P.end())
+ return Changed;
+
+ // Delete all the dead users of this alloca before splitting and rewriting it.
+ for (AllocaPartitioning::dead_user_iterator DI = P.dead_user_begin(),
+ DE = P.dead_user_end();
+ DI != DE; ++DI) {
+ Changed = true;
+ (*DI)->replaceAllUsesWith(UndefValue::get((*DI)->getType()));
+ DeadInsts.push_back(*DI);
+ }
+ for (AllocaPartitioning::dead_op_iterator DO = P.dead_op_begin(),
+ DE = P.dead_op_end();
+ DO != DE; ++DO) {
+ Value *OldV = **DO;
+ // Clobber the use with an undef value.
+ **DO = UndefValue::get(OldV->getType());
+ if (Instruction *OldI = dyn_cast<Instruction>(OldV))
+ if (isInstructionTriviallyDead(OldI)) {
+ Changed = true;
+ DeadInsts.push_back(OldI);
+ }
+ }
+
+ return splitAlloca(AI, P) || Changed;
+}
+
+/// \brief Delete the dead instructions accumulated in this run.
+///
+/// Recursively deletes the dead instructions we've accumulated. This is done
+/// at the very end to maximize locality of the recursive delete and to
+/// minimize the problems of invalidated instruction pointers as such pointers
+/// are used heavily in the intermediate stages of the algorithm.
+///
+/// We also record the alloca instructions deleted here so that they aren't
+/// subsequently handed to mem2reg to promote.
+void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) {
+ DeadSplitInsts.clear();
+ while (!DeadInsts.empty()) {
+ Instruction *I = DeadInsts.pop_back_val();
+ DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
+
+ for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+ if (Instruction *U = dyn_cast<Instruction>(*OI)) {
+ // Zero out the operand and see if it becomes trivially dead.
+ *OI = 0;
+ if (isInstructionTriviallyDead(U))
+ DeadInsts.push_back(U);
+ }
+
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ DeletedAllocas.insert(AI);
+
+ ++NumDeleted;
+ I->eraseFromParent();
+ }
+}
+
+/// \brief Promote the allocas, using the best available technique.
+///
+/// This attempts to promote whatever allocas have been identified as viable in
+/// the PromotableAllocas list. If that list is empty, there is nothing to do.
+/// If there is a domtree available, we attempt to promote using the full power
+/// of mem2reg. Otherwise, we build and use the AllocaPromoter above which is
+/// based on the SSAUpdater utilities. This function returns whether any
+/// promotion occurred.
+bool SROA::promoteAllocas(Function &F) {
+ if (PromotableAllocas.empty())
+ return false;
+
+ NumPromoted += PromotableAllocas.size();
+
+ if (DT && !ForceSSAUpdater) {
+ DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
+ PromoteMemToReg(PromotableAllocas, *DT);
+ PromotableAllocas.clear();
+ return true;
+ }
+
+ DEBUG(dbgs() << "Promoting allocas with SSAUpdater...\n");
+ SSAUpdater SSA;
+ DIBuilder DIB(*F.getParent());
+ SmallVector<Instruction*, 64> Insts;
+
+ for (unsigned Idx = 0, Size = PromotableAllocas.size(); Idx != Size; ++Idx) {
+ AllocaInst *AI = PromotableAllocas[Idx];
+ for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+ UI != UE;) {
+ Instruction *I = cast<Instruction>(*UI++);
+ // FIXME: Currently the SSAUpdater infrastructure doesn't reason about
+ // lifetime intrinsics and so we strip them (and the bitcasts+GEPs
+ // leading to them) here. Eventually it should use them to optimize the
+ // scalar values produced.
+ if (isa<BitCastInst>(I) || isa<GetElementPtrInst>(I)) {
+ assert(onlyUsedByLifetimeMarkers(I) &&
+ "Found a bitcast used outside of a lifetime marker.");
+ while (!I->use_empty())
+ cast<Instruction>(*I->use_begin())->eraseFromParent();
+ I->eraseFromParent();
+ continue;
+ }
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ assert(II->getIntrinsicID() == Intrinsic::lifetime_start ||
+ II->getIntrinsicID() == Intrinsic::lifetime_end);
+ II->eraseFromParent();
+ continue;
+ }
+
+ Insts.push_back(I);
+ }
+ AllocaPromoter(Insts, SSA, *AI, DIB).run(Insts);
+ Insts.clear();
+ }
+
+ PromotableAllocas.clear();
+ return true;
+}
+
+namespace {
+ /// \brief A predicate to test whether an alloca belongs to a set.
+ class IsAllocaInSet {
+ typedef SmallPtrSet<AllocaInst *, 4> SetType;
+ const SetType &Set;
+
+ public:
+ IsAllocaInSet(const SetType &Set) : Set(Set) {}
+ bool operator()(AllocaInst *AI) { return Set.count(AI); }
+ };
+}
+
+bool SROA::runOnFunction(Function &F) {
+ DEBUG(dbgs() << "SROA function: " << F.getName() << "\n");
+ C = &F.getContext();
+ TD = getAnalysisIfAvailable<TargetData>();
+ if (!TD) {
+ DEBUG(dbgs() << " Skipping SROA -- no target data!\n");
+ return false;
+ }
+ DT = getAnalysisIfAvailable<DominatorTree>();
+
+ BasicBlock &EntryBB = F.getEntryBlock();
+ for (BasicBlock::iterator I = EntryBB.begin(), E = llvm::prior(EntryBB.end());
+ I != E; ++I)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+ Worklist.insert(AI);
+
+ bool Changed = false;
+ // A set of deleted alloca instruction pointers which should be removed from
+ // the list of promotable allocas.
+ SmallPtrSet<AllocaInst *, 4> DeletedAllocas;
+
+ while (!Worklist.empty()) {
+ Changed |= runOnAlloca(*Worklist.pop_back_val());
+ deleteDeadInstructions(DeletedAllocas);
+ if (!DeletedAllocas.empty()) {
+ PromotableAllocas.erase(std::remove_if(PromotableAllocas.begin(),
+ PromotableAllocas.end(),
+ IsAllocaInSet(DeletedAllocas)),
+ PromotableAllocas.end());
+ DeletedAllocas.clear();
+ }
+ }
+
+ Changed |= promoteAllocas(F);
+
+ return Changed;
+}
+
+void SROA::getAnalysisUsage(AnalysisUsage &AU) const {
+ if (RequiresDomTree)
+ AU.addRequired<DominatorTree>();
+ AU.setPreservesCFG();
+}
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 48318c8a55..95b6fa7c7f 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -59,6 +59,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
initializeRegToMemPass(Registry);
initializeSCCPPass(Registry);
initializeIPSCCPPass(Registry);
+ initializeSROAPass(Registry);
initializeSROA_DTPass(Registry);
initializeSROA_SSAUpPass(Registry);
initializeCFGSimplifyPassPass(Registry);
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
index 1e6586bf0d..0f9dfddf57 100644
--- a/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -55,7 +55,7 @@ void ExtAddrMode::print(raw_ostream &OS) const {
OS << ']';
}
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void ExtAddrMode::dump() const {
print(dbgs());
dbgs() << '\n';
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 30d60be277..ac18b7d5a0 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -221,7 +221,7 @@ static bool reuseOrInsertFastDiv(Function &F,
// be profitably bypassed and carried out with a shorter, faster divide.
bool llvm::bypassSlowDivision(Function &F,
Function::iterator &I,
- const DenseMap<Type *, Type *> &BypassTypeMap) {
+ const DenseMap<Type*, Type*> &BypassTypeMap) {
DivCacheTy DivCache;
bool MadeChange = false;
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index aace75b745..c3f72b13af 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -11,6 +11,7 @@ add_llvm_library(LLVMTransformUtils
DemoteRegToStack.cpp
InlineFunction.cpp
InstructionNamer.cpp
+ IntegerDivision.cpp
LCSSA.cpp
Local.cpp
LoopSimplify.cpp
diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
new file mode 100644
index 0000000000..9d630349ab
--- /dev/null
+++ b/lib/Transforms/Utils/IntegerDivision.cpp
@@ -0,0 +1,312 @@
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32-bit scalar integer division for
+// targets that don't have native support. It's largely derived from
+// compiler-rt's implementation of __udivsi3, but hand-tuned to reduce the
+// amount of control flow.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "integer-division"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+
+using namespace llvm;
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing at the sdiv
+/// instruction. This will generate a udiv in the process, and Builder's insert
+/// point will be pointing at the udiv (if present, i.e. not folded), ready to
+/// be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // Implementation taken from compiler-rt's __divsi3
+
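+ // The ashr-by-31 below smears the sign bit across the word, producing 0 or
+ // -1. With that smear s, abs(x) == (x ^ s) - s, which is used to reduce
+ // both operands to their magnitudes; the quotient's sign is the xor of the
+ // two smears and is applied the same way at the end.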
+ ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+ // ; %tmp = ashr i32 %dividend, 31
+ // ; %tmp1 = ashr i32 %divisor, 31
+ // ; %tmp2 = xor i32 %tmp, %dividend
+ // ; %u_dvnd = sub nsw i32 %tmp2, %tmp
+ // ; %tmp3 = xor i32 %tmp1, %divisor
+ // ; %u_dvsr = sub nsw i32 %tmp3, %tmp1
+ // ; %q_sgn = xor i32 %tmp1, %tmp
+ // ; %q_mag = udiv i32 %u_dvnd, %u_dvsr
+ // ; %tmp4 = xor i32 %q_mag, %q_sgn
+ // ; %q = sub i32 %tmp4, %q_sgn
+ Value *Tmp = Builder.CreateAShr(Dividend, ThirtyOne);
+ Value *Tmp1 = Builder.CreateAShr(Divisor, ThirtyOne);
+ Value *Tmp2 = Builder.CreateXor(Tmp, Dividend);
+ Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+ Value *Tmp3 = Builder.CreateXor(Tmp1, Divisor);
+ Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+ Value *Q_Sgn = Builder.CreateXor(Tmp1, Tmp);
+ Value *Q_Mag = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+ Value *Tmp4 = Builder.CreateXor(Q_Mag, Q_Sgn);
+ Value *Q = Builder.CreateSub(Tmp4, Q_Sgn);
+
+ if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+ Builder.SetInsertPoint(UDiv);
+
+ return Q;
+}
+
+/// Generates code to divide two unsigned scalar 32-bit integers. Returns the
+/// quotient, rounded towards 0. Builder's insert point should be pointing at
+/// the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+ IRBuilder<> &Builder) {
+ // The basic algorithm can be found in the compiler-rt project's
+ // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+ // that's been hand-tuned to lessen the amount of control flow involved.
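+ // At its heart this is a shift-and-subtract long division: each iteration
+ // shifts the remainder and quotient left one bit, conditionally subtracts
+ // the divisor from the remainder (via an arithmetic-shift mask rather than
+ // a branch), and feeds the resulting carry in as the next quotient bit.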
+
+ // Some helper values
+ IntegerType *I32Ty = Builder.getInt32Ty();
+
+ ConstantInt *Zero = Builder.getInt32(0);
+ ConstantInt *One = Builder.getInt32(1);
+ ConstantInt *ThirtyOne = Builder.getInt32(31);
+ ConstantInt *NegOne = ConstantInt::getSigned(I32Ty, -1);
+ ConstantInt *True = Builder.getTrue();
+
+ BasicBlock *IBB = Builder.GetInsertBlock();
+ Function *F = IBB->getParent();
+ Function *CTLZi32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+ I32Ty);
+
+ // Our CFG is going to look like:
+ // +---------------------+
+ // | special-cases |
+ // | ... |
+ // +---------------------+
+ // | |
+ // | +----------+
+ // | | bb1 |
+ // | | ... |
+ // | +----------+
+ // | | |
+ // | | +------------+
+ // | | | preheader |
+ // | | | ... |
+ // | | +------------+
+ // | | |
+ // | | | +---+
+ // | | | | |
+ // | | +------------+ |
+ // | | | do-while | |
+ // | | | ... | |
+ // | | +------------+ |
+ // | | | | |
+ // | +-----------+ +---+
+ // | | loop-exit |
+ // | | ... |
+ // | +-----------+
+ // | |
+ // +-------+
+ // | ... |
+ // | end |
+ // +-------+
+ BasicBlock *SpecialCases = Builder.GetInsertBlock();
+ SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+ BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+ "udiv-end");
+ BasicBlock *LoopExit = BasicBlock::Create(Builder.getContext(),
+ "udiv-loop-exit", F, End);
+ BasicBlock *DoWhile = BasicBlock::Create(Builder.getContext(),
+ "udiv-do-while", F, End);
+ BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+ "udiv-preheader", F, End);
+ BasicBlock *BB1 = BasicBlock::Create(Builder.getContext(),
+ "udiv-bb1", F, End);
+
+ // We'll be overwriting the terminator to insert our extra blocks
+ SpecialCases->getTerminator()->eraseFromParent();
+
+ // First off, check for special cases: dividend or divisor is zero, divisor
+ // is greater than dividend, and divisor is 1.
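+ // Note that %sr below is ctlz(divisor) - ctlz(dividend); when the divisor
+ // has more significant bits than the dividend, the difference is negative
+ // and wraps to a large unsigned value, so the ugt-31 compare catches that
+ // case as well.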
+ // ; special-cases:
+ // ; %ret0_1 = icmp eq i32 %divisor, 0
+ // ; %ret0_2 = icmp eq i32 %dividend, 0
+ // ; %ret0_3 = or i1 %ret0_1, %ret0_2
+ // ; %tmp0 = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+ // ; %tmp1 = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+ // ; %sr = sub nsw i32 %tmp0, %tmp1
+ // ; %ret0_4 = icmp ugt i32 %sr, 31
+ // ; %ret0 = or i1 %ret0_3, %ret0_4
+ // ; %retDividend = icmp eq i32 %sr, 31
+ // ; %retVal = select i1 %ret0, i32 0, i32 %dividend
+ // ; %earlyRet = or i1 %ret0, %retDividend
+ // ; br i1 %earlyRet, label %end, label %bb1
+ Builder.SetInsertPoint(SpecialCases);
+ Value *Ret0_1 = Builder.CreateICmpEQ(Divisor, Zero);
+ Value *Ret0_2 = Builder.CreateICmpEQ(Dividend, Zero);
+ Value *Ret0_3 = Builder.CreateOr(Ret0_1, Ret0_2);
+ Value *Tmp0 = Builder.CreateCall2(CTLZi32, Divisor, True);
+ Value *Tmp1 = Builder.CreateCall2(CTLZi32, Dividend, True);
+ Value *SR = Builder.CreateSub(Tmp0, Tmp1);
+ Value *Ret0_4 = Builder.CreateICmpUGT(SR, ThirtyOne);
+ Value *Ret0 = Builder.CreateOr(Ret0_3, Ret0_4);
+ Value *RetDividend = Builder.CreateICmpEQ(SR, ThirtyOne);
+ Value *RetVal = Builder.CreateSelect(Ret0, Zero, Dividend);
+ Value *EarlyRet = Builder.CreateOr(Ret0, RetDividend);
+ Builder.CreateCondBr(EarlyRet, End, BB1);
+
+ // ; bb1: ; preds = %special-cases
+ // ; %sr_1 = add i32 %sr, 1
+ // ; %tmp2 = sub i32 31, %sr
+ // ; %q = shl i32 %dividend, %tmp2
+ // ; %skipLoop = icmp eq i32 %sr_1, 0
+ // ; br i1 %skipLoop, label %loop-exit, label %preheader
+ Builder.SetInsertPoint(BB1);
+ Value *SR_1 = Builder.CreateAdd(SR, One);
+ Value *Tmp2 = Builder.CreateSub(ThirtyOne, SR);
+ Value *Q = Builder.CreateShl(Dividend, Tmp2);
+ Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+ Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+ // ; preheader: ; preds = %bb1
+ // ; %tmp3 = lshr i32 %dividend, %sr_1
+ // ; %tmp4 = add i32 %divisor, -1
+ // ; br label %do-while
+ Builder.SetInsertPoint(Preheader);
+ Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+ Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+ Builder.CreateBr(DoWhile);
+
+ // ; do-while: ; preds = %do-while, %preheader
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ // ; %tmp5 = shl i32 %r_1, 1
+ // ; %tmp6 = lshr i32 %q_2, 31
+ // ; %tmp7 = or i32 %tmp5, %tmp6
+ // ; %tmp8 = shl i32 %q_2, 1
+ // ; %q_1 = or i32 %carry_1, %tmp8
+ // ; %tmp9 = sub i32 %tmp4, %tmp7
+ // ; %tmp10 = ashr i32 %tmp9, 31
+ // ; %carry = and i32 %tmp10, 1
+ // ; %tmp11 = and i32 %tmp10, %divisor
+ // ; %r = sub i32 %tmp7, %tmp11
+ // ; %sr_2 = add i32 %sr_3, -1
+ // ; %tmp12 = icmp eq i32 %sr_2, 0
+ // ; br i1 %tmp12, label %loop-exit, label %do-while
+ Builder.SetInsertPoint(DoWhile);
+ PHINode *Carry_1 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *SR_3 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *R_1 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *Q_2 = Builder.CreatePHI(I32Ty, 2);
+ Value *Tmp5 = Builder.CreateShl(R_1, One);
+ Value *Tmp6 = Builder.CreateLShr(Q_2, ThirtyOne);
+ Value *Tmp7 = Builder.CreateOr(Tmp5, Tmp6);
+ Value *Tmp8 = Builder.CreateShl(Q_2, One);
+ Value *Q_1 = Builder.CreateOr(Carry_1, Tmp8);
+ Value *Tmp9 = Builder.CreateSub(Tmp4, Tmp7);
+ Value *Tmp10 = Builder.CreateAShr(Tmp9, 31);
+ Value *Carry = Builder.CreateAnd(Tmp10, One);
+ Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+ Value *R = Builder.CreateSub(Tmp7, Tmp11);
+ Value *SR_2 = Builder.CreateAdd(SR_3, NegOne);
+ Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+ Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+ // ; loop-exit: ; preds = %do-while, %bb1
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ // ; %tmp13 = shl i32 %q_3, 1
+ // ; %q_4 = or i32 %carry_2, %tmp13
+ // ; br label %end
+ Builder.SetInsertPoint(LoopExit);
+ PHINode *Carry_2 = Builder.CreatePHI(I32Ty, 2);
+ PHINode *Q_3 = Builder.CreatePHI(I32Ty, 2);
+ Value *Tmp13 = Builder.CreateShl(Q_3, One);
+ Value *Q_4 = Builder.CreateOr(Carry_2, Tmp13);
+ Builder.CreateBr(End);
+
+ // ; end: ; preds = %loop-exit, %special-cases
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ // ; ret i32 %q_5
+ Builder.SetInsertPoint(End, End->begin());
+ PHINode *Q_5 = Builder.CreatePHI(I32Ty, 2);
+
+ // Populate the Phis, since all values have now been created. Our Phis were:
+ // ; %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+ Carry_1->addIncoming(Zero, Preheader);
+ Carry_1->addIncoming(Carry, DoWhile);
+ // ; %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+ SR_3->addIncoming(SR_1, Preheader);
+ SR_3->addIncoming(SR_2, DoWhile);
+ // ; %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+ R_1->addIncoming(Tmp3, Preheader);
+ R_1->addIncoming(R, DoWhile);
+ // ; %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+ Q_2->addIncoming(Q, Preheader);
+ Q_2->addIncoming(Q_1, DoWhile);
+ // ; %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+ Carry_2->addIncoming(Zero, BB1);
+ Carry_2->addIncoming(Carry, DoWhile);
+ // ; %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+ Q_3->addIncoming(Q, BB1);
+ Q_3->addIncoming(Q_1, DoWhile);
+ // ; %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+ Q_5->addIncoming(Q_4, LoopExit);
+ Q_5->addIncoming(RetVal, SpecialCases);
+
+ return Q_5;
+}
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similar to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands is known. It currently only
+/// implements 32-bit scalar division, but future work will remove this
+/// limitation.
+///
+/// @brief Replace Div with generated code.
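+///
+/// A caller would typically locate a 32-bit scalar sdiv or udiv and hand it
+/// straight to this routine; as a sketch:
+///   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+///     if (BO->getOpcode() == Instruction::UDiv && BO->getType()->isIntegerTy(32))
+///       expandDivision(BO);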
+bool llvm::expandDivision(BinaryOperator *Div) {
+ assert((Div->getOpcode() == Instruction::SDiv ||
+ Div->getOpcode() == Instruction::UDiv) &&
+ "Trying to expand division from a non-division function");
+
+ IRBuilder<> Builder(Div);
+
+ if (Div->getType()->isVectorTy())
+ llvm_unreachable("Div over vectors not supported");
+
+ // First prepare the sign if it's a signed division
+ if (Div->getOpcode() == Instruction::SDiv) {
+ // Lower the code to unsigned division, and reset Div to point to the udiv.
+ Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1), Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ // If we didn't actually generate a udiv instruction, we're done
+ BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+ if (!BO || BO->getOpcode() != Instruction::UDiv)
+ return true;
+
+ Div = BO;
+ }
+
+ // Insert the unsigned division code
+ Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+ Div->getOperand(1),
+ Builder);
+ Div->replaceAllUsesWith(Quotient);
+ Div->dropAllReferences();
+ Div->eraseFromParent();
+
+ return true;
+}
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 0601433565..2d89516c43 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -23,6 +23,7 @@
#include "llvm/Instructions.h"
#include "llvm/IntrinsicInst.h"
#include "llvm/Intrinsics.h"
+#include "llvm/MDBuilder.h"
#include "llvm/Metadata.h"
#include "llvm/Operator.h"
#include "llvm/ADT/DenseMap.h"
@@ -122,6 +123,27 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
// Check to see if this branch is going to the same place as the default
// dest. If so, eliminate it as an explicit compare.
if (i.getCaseSuccessor() == DefaultDest) {
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ // MD should have 2 + NumCases operands.
+ if (MD && MD->getNumOperands() == 2 + SI->getNumCases()) {
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+ assert(CI);
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
+ // Merge weight of this case to the default weight.
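+ // For example, weights {10, 3, 5} (default, case 0, case 1) become {13, 5}
+ // once case 0 is folded into the default.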
+ unsigned idx = i.getCaseIndex();
+ Weights[0] += Weights[idx+1];
+ // Remove weight for this case.
+ std::swap(Weights[idx+1], Weights.back());
+ Weights.pop_back();
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(Weights));
+ }
// Remove this entry.
DefaultDest->removePredecessor(SI->getParent());
SI->removeCase(i);
@@ -178,8 +200,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
"cond");
// Insert the new branch.
- Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(),
- SI->getDefaultDest());
+ BranchInst *NewBr = Builder.CreateCondBr(Cond,
+ FirstCase.getCaseSuccessor(),
+ SI->getDefaultDest());
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ if (MD && MD->getNumOperands() == 3) {
+ ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+ ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+ assert(SICase && SIDef);
+ // The TrueWeight should be the weight for the single case of SI.
+ NewBr->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BB->getContext()).
+ createBranchWeights(SICase->getValue().getZExtValue(),
+ SIDef->getValue().getZExtValue()));
+ }
// Delete the old switch.
SI->eraseFromParent();
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
index 60f031e16f..233bc12d3c 100644
--- a/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -13,15 +13,15 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/Transforms/IPO.h"
+#include "llvm/DerivedTypes.h"
#include "llvm/Function.h"
#include "llvm/Module.h"
#include "llvm/Pass.h"
#include "llvm/Type.h"
#include "llvm/TypeFinder.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
using namespace llvm;
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 142f157c3c..876ff2c337 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -54,8 +54,13 @@ static cl::opt<bool>
DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
cl::desc("Duplicate return instructions into unconditional branches"));
+static cl::opt<bool>
+SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+ cl::desc("Sink common instructions down to the end block"));
+
STATISTIC(NumSpeculations, "Number of speculative executed instructions");
STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
+STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
namespace {
/// ValueEqualityComparisonCase - Represents a case of a switch.
@@ -667,13 +672,32 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
<< "Through successor TI: " << *TI);
+ // Collect branch weights into a vector.
+ SmallVector<uint32_t, 8> Weights;
+ MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+ bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases());
+ if (HasWeight)
+ for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+ ++MD_i) {
+ ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+ assert(CI);
+ Weights.push_back(CI->getValue().getZExtValue());
+ }
for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
--i;
if (DeadCases.count(i.getCaseValue())) {
+ if (HasWeight) {
+ std::swap(Weights[i.getCaseIndex()+1], Weights.back());
+ Weights.pop_back();
+ }
i.getCaseSuccessor()->removePredecessor(TI->getParent());
SI->removeCase(i);
}
}
+ if (HasWeight)
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getParent()->getContext()).
+ createBranchWeights(Weights));
DEBUG(dbgs() << "Leaving: " << *TI << "\n");
return true;
@@ -830,18 +854,24 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
bool PredHasWeights = HasBranchWeights(PTI);
bool SuccHasWeights = HasBranchWeights(TI);
- if (PredHasWeights)
+ if (PredHasWeights) {
GetBranchWeights(PTI, Weights);
- else if (SuccHasWeights)
+ // The branch-weight metadata is inconsistent here.
+ if (Weights.size() != 1 + PredCases.size())
+ PredHasWeights = SuccHasWeights = false;
+ } else if (SuccHasWeights)
// If there are no predecessor weights but there are successor weights,
// populate Weights with 1, which will later be scaled to the sum of
// successor's weights
Weights.assign(1 + PredCases.size(), 1);
SmallVector<uint64_t, 8> SuccWeights;
- if (SuccHasWeights)
+ if (SuccHasWeights) {
GetBranchWeights(TI, SuccWeights);
- else if (PredHasWeights)
+ // The branch-weight metadata is inconsistent here.
+ if (SuccWeights.size() != 1 + BBCases.size())
+ PredHasWeights = SuccHasWeights = false;
+ } else if (PredHasWeights)
SuccWeights.assign(1 + BBCases.size(), 1);
if (PredDefault == BB) {
@@ -898,18 +928,21 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
Weights[0] *= SuccWeights[0];
}
} else {
- // FIXME: preserve branch weight metadata, similarly to the 'then'
- // above. For now, drop it.
- PredHasWeights = false;
- SuccHasWeights = false;
-
// If this is not the default destination from PSI, only the edges
// in SI that occur in PSI with a destination of BB will be
// activated.
std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+ std::map<ConstantInt*, uint64_t> WeightsForHandled;
for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
if (PredCases[i].Dest == BB) {
PTIHandled.insert(PredCases[i].Value);
+
+ if (PredHasWeights || SuccHasWeights) {
+ WeightsForHandled[PredCases[i].Value] = Weights[i+1];
+ std::swap(Weights[i+1], Weights.back());
+ Weights.pop_back();
+ }
+
std::swap(PredCases[i], PredCases.back());
PredCases.pop_back();
--i; --e;
@@ -920,6 +953,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
if (PTIHandled.count(BBCases[i].Value)) {
// If this is one we are capable of getting...
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[BBCases[i].Value]);
PredCases.push_back(BBCases[i]);
NewSuccessors.push_back(BBCases[i].Dest);
PTIHandled.erase(BBCases[i].Value);// This constant is taken care of
@@ -930,6 +965,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
PTIHandled.begin(),
E = PTIHandled.end(); I != E; ++I) {
+ if (PredHasWeights || SuccHasWeights)
+ Weights.push_back(WeightsForHandled[*I]);
PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
NewSuccessors.push_back(BBDefault);
}
@@ -1124,6 +1161,171 @@ HoistTerminator:
return true;
}
+/// SinkThenElseCodeToEnd - Given an unconditional branch that goes to BBEnd,
+/// check whether BBEnd has only two predecessors and the other predecessor
+/// ends with an unconditional branch. If so, sink any code common to the
+/// two predecessors down into BBEnd.
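+///
+/// As a sketch, if %bb1 computes %x1 = add i32 %a, 1 and %bb2 computes
+/// %x2 = add i32 %b, 1, and %end has a phi of %x1 and %x2, the two adds are
+/// sunk into %end as a single add fed by a new phi of %a and %b, and the
+/// original phi goes away.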
+static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
+ assert(BI1->isUnconditional());
+ BasicBlock *BB1 = BI1->getParent();
+ BasicBlock *BBEnd = BI1->getSuccessor(0);
+
+ // Check that BBEnd has two predecessors and the other predecessor ends with
+ // an unconditional branch.
+ SmallVector<BasicBlock*, 16> Preds(pred_begin(BBEnd), pred_end(BBEnd));
+ if (Preds.size() != 2)
+ return false;
+ BasicBlock *BB2 = (Preds[0] == BB1) ? Preds[1] : Preds[0];
+ BranchInst *BI2 = dyn_cast<BranchInst>(BB2->getTerminator());
+ if (!BI2 || !BI2->isUnconditional())
+ return false;
+
+ // Gather the PHI nodes in BBEnd.
+ std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2;
+ Instruction *FirstNonPhiInBBEnd = 0;
+ for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end();
+ I != E; ++I) {
+ if (PHINode *PN = dyn_cast<PHINode>(I)) {
+ Value *BB1V = PN->getIncomingValueForBlock(BB1);
+ Value *BB2V = PN->getIncomingValueForBlock(BB2);
+ MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN);
+ } else {
+ FirstNonPhiInBBEnd = &*I;
+ break;
+ }
+ }
+ if (!FirstNonPhiInBBEnd)
+ return false;
+
+ // This does very trivial matching, with limited scanning, to find identical
+ // instructions in the two blocks. We scan backward for obviously identical
+ // instructions in an identical order.
+ BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(),
+ RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(),
+ RE2 = BB2->getInstList().rend();
+ // Skip debug info.
+ while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+ if (RI1 == RE1)
+ return false;
+ while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+ if (RI2 == RE2)
+ return false;
+ // Skip the unconditional branches.
+ ++RI1;
+ ++RI2;
+
+ bool Changed = false;
+ while (RI1 != RE1 && RI2 != RE2) {
+ // Skip debug info.
+ while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+ if (RI1 == RE1)
+ return Changed;
+ while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+ if (RI2 == RE2)
+ return Changed;
+
+ Instruction *I1 = &*RI1, *I2 = &*RI2;
+ // I1 and I2 should each have a single use in the same PHI node, and they
+ // should perform the same operation.
+ // We cannot move instructions that involve control flow, volatile loads,
+ // vaarg, etc.
+ if (isa<PHINode>(I1) || isa<PHINode>(I2) ||
+ isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) ||
+ isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) ||
+ isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||
+ I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
+ I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
+ !I1->hasOneUse() || !I2->hasOneUse() ||
+ MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() ||
+ MapValueFromBB1ToBB2[I1].first != I2)
+ return Changed;
+
+ // Check whether we should swap the operands of ICmpInst.
+ ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
+ bool SwapOpnds = false;
+ if (ICmp1 && ICmp2 &&
+ ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
+ ICmp1->getOperand(1) != ICmp2->getOperand(1) &&
+ (ICmp1->getOperand(0) == ICmp2->getOperand(1) ||
+ ICmp1->getOperand(1) == ICmp2->getOperand(0))) {
+ ICmp2->swapOperands();
+ SwapOpnds = true;
+ }
+ if (!I1->isSameOperationAs(I2)) {
+ if (SwapOpnds)
+ ICmp2->swapOperands();
+ return Changed;
+ }
+
+ // The operands should be either the same or they need to be generated
+ // with a PHI node after sinking. We only handle the case where there is
+ // a single pair of different operands.
+ Value *DifferentOp1 = 0, *DifferentOp2 = 0;
+ unsigned Op1Idx = 0;
+ for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) {
+ if (I1->getOperand(I) == I2->getOperand(I))
+ continue;
+ // Early exit if we have more than one pair of different operands or
+ // the different operand is already in MapValueFromBB1ToBB2.
+ // Early exit if we need a PHI node to replace a constant.
+ if (DifferentOp1 ||
+ MapValueFromBB1ToBB2.find(I1->getOperand(I)) !=
+ MapValueFromBB1ToBB2.end() ||
+ isa<Constant>(I1->getOperand(I)) ||
+ isa<Constant>(I2->getOperand(I))) {
+ // If we can't sink the instructions, undo the swapping.
+ if (SwapOpnds)
+ ICmp2->swapOperands();
+ return Changed;
+ }
+ DifferentOp1 = I1->getOperand(I);
+ Op1Idx = I;
+ DifferentOp2 = I2->getOperand(I);
+ }
+
+ // We insert the pair of different operands to MapValueFromBB1ToBB2 and
+ // remove (I1, I2) from MapValueFromBB1ToBB2.
+ if (DifferentOp1) {
+ PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2,
+ DifferentOp1->getName() + ".sink",
+ BBEnd->begin());
+ MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN);
+ // I1 should use NewPN instead of DifferentOp1.
+ I1->setOperand(Op1Idx, NewPN);
+ NewPN->addIncoming(DifferentOp1, BB1);
+ NewPN->addIncoming(DifferentOp2, BB2);
+ DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
+ }
+ PHINode *OldPN = MapValueFromBB1ToBB2[I1].second;
+ MapValueFromBB1ToBB2.erase(I1);
+
+ DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";);
+ DEBUG(dbgs() << " " << *I2 << "\n";);
+ // We need to update RE1 and RE2 if we are going to sink the first
+ // instruction in the basic block down.
+ bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());
+ // Sink the instruction.
+ BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1);
+ if (!OldPN->use_empty())
+ OldPN->replaceAllUsesWith(I1);
+ OldPN->eraseFromParent();
+
+ if (!I2->use_empty())
+ I2->replaceAllUsesWith(I1);
+ I1->intersectOptionalDataWith(I2);
+ I2->eraseFromParent();
+
+ if (UpdateRE1)
+ RE1 = BB1->getInstList().rend();
+ if (UpdateRE2)
+ RE2 = BB2->getInstList().rend();
+ FirstNonPhiInBBEnd = I1;
+ NumSinkCommons++;
+ Changed = true;
+ }
+ return Changed;
+}
+
/// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1
/// and an BB2 and the only successor of BB1 is BB2, hoist simple code
/// (for now, restricted to a single instruction that's side effect free) from
@@ -1626,7 +1828,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
/// parameters and return true, or returns false if no or invalid metadata was
/// found.
static bool ExtractBranchMetadata(BranchInst *BI,
- APInt &ProbTrue, APInt &ProbFalse) {
+ uint64_t &ProbTrue, uint64_t &ProbFalse) {
assert(BI->isConditional() &&
"Looking for probabilities on unconditional branch?");
MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
@@ -1634,35 +1836,11 @@ static bool ExtractBranchMetadata(BranchInst *BI,
ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1));
ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2));
if (!CITrue || !CIFalse) return false;
- ProbTrue = CITrue->getValue();
- ProbFalse = CIFalse->getValue();
- assert(ProbTrue.getBitWidth() == 32 && ProbFalse.getBitWidth() == 32 &&
- "Branch probability metadata must be 32-bit integers");
+ ProbTrue = CITrue->getValue().getZExtValue();
+ ProbFalse = CIFalse->getValue().getZExtValue();
return true;
}
-/// MultiplyAndLosePrecision - Multiplies A and B, then returns the result. In
-/// the event of overflow, logically-shifts all four inputs right until the
-/// multiply fits.
-static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D,
- unsigned &BitsLost) {
- BitsLost = 0;
- bool Overflow = false;
- APInt Result = A.umul_ov(B, Overflow);
- if (Overflow) {
- APInt MaxB = APInt::getMaxValue(A.getBitWidth()).udiv(A);
- do {
- B = B.lshr(1);
- ++BitsLost;
- } while (B.ugt(MaxB));
- A = A.lshr(BitsLost);
- C = C.lshr(BitsLost);
- D = D.lshr(BitsLost);
- Result = A * B;
- }
- return Result;
-}
-
/// checkCSEInPredecessor - Return true if the given instruction is available
/// in its predecessor block. If yes, the instruction will be removed.
///
@@ -1788,7 +1966,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
continue;
// Determine if the two branches share a common destination.
- Instruction::BinaryOps Opc;
+ Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd;
bool InvertPredCond = false;
if (BI->isConditional()) {
@@ -1887,14 +2065,53 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
New, "or.cond"));
PBI->setCondition(NewCond);
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+ PredFalseWeight);
+ bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+ SuccFalseWeight);
+ SmallVector<uint64_t, 8> NewWeights;
+
if (PBI->getSuccessor(0) == BB) {
+ if (PredHasWeights && SuccHasWeights) {
+ // PBI: br i1 %x, BB, FalseDest
+ // BI: br i1 %y, TrueDest, FalseDest
+ // TrueWeight is TrueWeight for PBI * TrueWeight for BI.
+ NewWeights.push_back(PredTrueWeight * SuccTrueWeight);
+ // FalseWeight is FalseWeight for PBI * TotalWeight for BI +
+ // TrueWeight for PBI * FalseWeight for BI.
+ // We assume that total weights of a BranchInst can fit into 32 bits.
+ // Therefore, we will not have overflow using 64-bit arithmetic.
+ NewWeights.push_back(PredFalseWeight * (SuccFalseWeight +
+ SuccTrueWeight) + PredTrueWeight * SuccFalseWeight);
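+ // For example, weights (3, 1) on PBI and (4, 6) on BI combine to
+ // (3*4, 1*(4+6) + 3*6) = (12, 28).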
+ }
AddPredecessorToBlock(TrueDest, PredBlock, BB);
PBI->setSuccessor(0, TrueDest);
}
if (PBI->getSuccessor(1) == BB) {
+ if (PredHasWeights && SuccHasWeights) {
+ // PBI: br i1 %x, TrueDest, BB
+ // BI: br i1 %y, TrueDest, FalseDest
+ // TrueWeight is TrueWeight for PBI * TotalWeight for BI +
+ // FalseWeight for PBI * TrueWeight for BI.
+ NewWeights.push_back(PredTrueWeight * (SuccFalseWeight +
+ SuccTrueWeight) + PredFalseWeight * SuccTrueWeight);
+ // FalseWeight is FalseWeight for PBI * FalseWeight for BI.
+ NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
+ }
AddPredecessorToBlock(FalseDest, PredBlock, BB);
PBI->setSuccessor(1, FalseDest);
}
+ if (NewWeights.size() == 2) {
+ // Halve the weights if any of them cannot fit in a uint32_t.
+ FitWeights(NewWeights);
+
+ SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(), NewWeights.end());
+ PBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext()).
+ createBranchWeights(MDWeights));
+ } else
+ PBI->setMetadata(LLVMContext::MD_prof, NULL);
} else {
// Update PHI nodes in the common successors.
for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
@@ -1949,90 +2166,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
// TODO: If BB is reachable from all paths through PredBlock, then we
// could replace PBI's branch probabilities with BI's.
- // Merge probability data into PredBlock's branch.
- APInt A, B, C, D;
- if (PBI->isConditional() && BI->isConditional() &&
- ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) {
- // Given IR which does:
- // bbA:
- // br i1 %x, label %bbB, label %bbC
- // bbB:
- // br i1 %y, label %bbD, label %bbC
- // Let's call the probability that we take the edge from %bbA to %bbB
- // 'a', from %bbA to %bbC, 'b', from %bbB to %bbD 'c' and from %bbB to
- // %bbC probability 'd'.
- //
- // We transform the IR into:
- // bbA:
- // br i1 %z, label %bbD, label %bbC
- // where the probability of going to %bbD is (a*c) and going to bbC is
- // (b+a*d).
- //
- // Probabilities aren't stored as ratios directly. Using branch weights,
- // we get:
- // (a*c)% = A*C, (b+(a*d))% = A*D+B*C+B*D.
-
- // In the event of overflow, we want to drop the LSB of the input
- // probabilities.
- unsigned BitsLost;
-
- // Ignore overflow result on ProbTrue.
- APInt ProbTrue = MultiplyAndLosePrecision(A, C, B, D, BitsLost);
-
- APInt Tmp1 = MultiplyAndLosePrecision(B, D, A, C, BitsLost);
- if (BitsLost) {
- ProbTrue = ProbTrue.lshr(BitsLost*2);
- }
-
- APInt Tmp2 = MultiplyAndLosePrecision(A, D, C, B, BitsLost);
- if (BitsLost) {
- ProbTrue = ProbTrue.lshr(BitsLost*2);
- Tmp1 = Tmp1.lshr(BitsLost*2);
- }
-
- APInt Tmp3 = MultiplyAndLosePrecision(B, C, A, D, BitsLost);
- if (BitsLost) {
- ProbTrue = ProbTrue.lshr(BitsLost*2);
- Tmp1 = Tmp1.lshr(BitsLost*2);
- Tmp2 = Tmp2.lshr(BitsLost*2);
- }
-
- bool Overflow1 = false, Overflow2 = false;
- APInt Tmp4 = Tmp2.uadd_ov(Tmp3, Overflow1);
- APInt ProbFalse = Tmp4.uadd_ov(Tmp1, Overflow2);
-
- if (Overflow1 || Overflow2) {
- ProbTrue = ProbTrue.lshr(1);
- Tmp1 = Tmp1.lshr(1);
- Tmp2 = Tmp2.lshr(1);
- Tmp3 = Tmp3.lshr(1);
- Tmp4 = Tmp2 + Tmp3;
- ProbFalse = Tmp4 + Tmp1;
- }
-
- // The sum of branch weights must fit in 32-bits.
- if (ProbTrue.isNegative() && ProbFalse.isNegative()) {
- ProbTrue = ProbTrue.lshr(1);
- ProbFalse = ProbFalse.lshr(1);
- }
-
- if (ProbTrue != ProbFalse) {
- // Normalize the result.
- APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse);
- ProbTrue = ProbTrue.udiv(GCD);
- ProbFalse = ProbFalse.udiv(GCD);
-
- MDBuilder MDB(BI->getContext());
- MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(),
- ProbFalse.getZExtValue());
- PBI->setMetadata(LLVMContext::MD_prof, N);
- } else {
- PBI->setMetadata(LLVMContext::MD_prof, NULL);
- }
- } else {
- PBI->setMetadata(LLVMContext::MD_prof, NULL);
- }
-
// Copy any debug value intrinsics into the end of PredBlock.
for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
if (isa<DbgInfoIntrinsic>(*I))
@@ -2187,6 +2320,33 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
PBI->setSuccessor(0, CommonDest);
PBI->setSuccessor(1, OtherDest);
+ // Update branch weight for PBI.
+ uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+ bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+ PredFalseWeight);
+ bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+ SuccFalseWeight);
+ if (PredHasWeights && SuccHasWeights) {
+ uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+ uint64_t PredOther = PBIOp ? PredTrueWeight : PredFalseWeight;
+ uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+ uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+ // The weight to CommonDest should be PredCommon * SuccTotal +
+ // PredOther * SuccCommon.
+ // The weight to OtherDest should be PredOther * SuccOther.
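+ // For example, PredCommon = 3, PredOther = 1, SuccCommon = 4 and
+ // SuccOther = 6 give (3*(4+6) + 1*4, 1*6) = (34, 6).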
+ SmallVector<uint64_t, 2> NewWeights;
+ NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) +
+ PredOther * SuccCommon);
+ NewWeights.push_back(PredOther * SuccOther);
+ // Halve the weights if any of them cannot fit in a uint32_t.
+ FitWeights(NewWeights);
+
+ SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(), NewWeights.end());
+ PBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(BI->getContext()).
+ createBranchWeights(MDWeights));
+ }
+
// OtherDest may have phi nodes. If so, add an entry from PBI's
// block that are identical to the entries for BI's block.
AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
@@ -2223,7 +2383,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
// Also makes sure not to introduce new successors by assuming that edges to
// non-successor TrueBBs and FalseBBs aren't reachable.
static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
- BasicBlock *TrueBB, BasicBlock *FalseBB){
+ BasicBlock *TrueBB, BasicBlock *FalseBB,
+ uint32_t TrueWeight,
+ uint32_t FalseWeight) {
// Remove any superfluous successor edges from the CFG.
// First, figure out which successors to preserve.
// If TrueBB and FalseBB are equal, only try to preserve one copy of that
@@ -2252,10 +2414,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
// We were only looking for one successor, and it was present.
// Create an unconditional branch to it.
Builder.CreateBr(TrueBB);
- else
+ else {
// We found both of the successors we were looking for.
// Create a conditional branch sharing the condition of the select.
- Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+ if (TrueWeight != FalseWeight)
+ NewBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(OldTerm->getContext()).
+ createBranchWeights(TrueWeight, FalseWeight));
+ }
} else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
// Neither of the selected blocks were successors, so this
// terminator must be unreachable.
@@ -2292,8 +2459,23 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
+ // Get weight for TrueBB and FalseBB.
+ uint32_t TrueWeight = 0, FalseWeight = 0;
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal).
+ getSuccessorIndex()];
+ FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal).
+ getSuccessorIndex()];
+ }
+ }
+
// Perform the actual simplification.
- return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB);
+ return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB,
+ TrueWeight, FalseWeight);
}
// SimplifyIndirectBrOnSelect - Replaces
@@ -2313,7 +2495,8 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
BasicBlock *FalseBB = FBA->getBasicBlock();
// Perform the actual simplification.
- return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB);
+ return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
+ 0, 0);
}
/// TryToSimplifyUncondBranchWithICmpInIt - This is called when we find an icmp
@@ -2412,6 +2595,21 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
// the switch to the merge point on the compared value.
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge",
BB->getParent(), BB);
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ // Split the weight for the default case between the default case and
+ // the new case for "Cst".
+ Weights[0] = (Weights[0] + 1) >> 1;
+ Weights.push_back(Weights[0]);
+
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getContext()).
+ createBranchWeights(MDWeights));
+ }
+ }
SI->addCase(Cst, NewBB);
// NewBB branches to the phi block, add the uncond branch and the phi entry.
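
The weight split above carves the new "Cst" case out of the default edge, so the default's weight is divided between the two; rounding up keeps a weight of 1 from collapsing to 0. The same step as a toy (plain C++, not the LLVM vector types):

#include <cstdint>
#include <vector>

void splitDefaultWeight(std::vector<uint64_t> &Weights) {
  // Weights[0] is the default edge; the appended entry is the new case.
  Weights[0] = (Weights[0] + 1) >> 1; // round up so 1 does not become 0
  Weights.push_back(Weights[0]);
}
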
@@ -2825,9 +3023,28 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
if (!Offset->isNullValue())
Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
- Builder.CreateCondBr(
+ BranchInst *NewBI = Builder.CreateCondBr(
Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
+ // Update weight for the newly-created conditional branch.
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeights = HasBranchWeights(SI);
+ if (HasWeights) {
+ GetBranchWeights(SI, Weights);
+ if (Weights.size() == 1 + SI->getNumCases()) {
+ // Combine the weights of all the cases into the true weight of NewBI.
+ // We assume that the sum of all weights for a Terminator can fit into 32
+ // bits.
+ uint32_t NewTrueWeight = 0;
+ for (unsigned I = 1, E = Weights.size(); I != E; ++I)
+ NewTrueWeight += (uint32_t)Weights[I];
+ NewBI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getContext()).
+ createBranchWeights(NewTrueWeight,
+ (uint32_t)Weights[0]));
+ }
+ }
+
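
When a contiguous switch becomes a single `br (icmp ult ...)` as above, every case edge funnels into the true successor and the default edge becomes the false successor, so the case weights are simply summed. A toy restatement (plain C++; as the comment above says, the sum is assumed to fit in 32 bits):

#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

std::pair<uint32_t, uint32_t>
foldSwitchWeights(const std::vector<uint64_t> &Weights) {
  // Weights[0] is the default edge; the remaining entries are the cases.
  uint64_t True =
      std::accumulate(Weights.begin() + 1, Weights.end(), uint64_t(0));
  return std::make_pair((uint32_t)True, (uint32_t)Weights[0]);
}
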
// Prune obsolete incoming values off the successor's PHI nodes.
for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
isa<PHINode>(BBI); ++BBI) {
@@ -2858,15 +3075,33 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) {
}
}
+ SmallVector<uint64_t, 8> Weights;
+ bool HasWeight = HasBranchWeights(SI);
+ if (HasWeight) {
+ GetBranchWeights(SI, Weights);
+ HasWeight = (Weights.size() == 1 + SI->getNumCases());
+ }
+
// Remove dead cases from the switch.
for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]);
assert(Case != SI->case_default() &&
"Case was not found. Probably mistake in DeadCases forming.");
+ if (HasWeight) {
+ std::swap(Weights[Case.getCaseIndex()+1], Weights.back());
+ Weights.pop_back();
+ }
+
// Prune unused values from PHI nodes.
Case.getCaseSuccessor()->removePredecessor(SI->getParent());
SI->removeCase(Case);
}
+ if (HasWeight) {
+ SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+ SI->setMetadata(LLVMContext::MD_prof,
+ MDBuilder(SI->getParent()->getContext()).
+ createBranchWeights(MDWeights));
+ }
return !DeadCases.empty();
}
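
The swap-and-pop above is what keeps Weights parallel to the successor list: removeCase() fills the removed slot with the last case, so the weight vector must shuffle the same way. In isolation (toy C++, with the +1 accounting for the default edge at index 0):

#include <cstdint>
#include <utility>
#include <vector>

void removeWeightForCase(std::vector<uint64_t> &Weights, unsigned CaseIndex) {
  std::swap(Weights[CaseIndex + 1], Weights.back()); // +1 skips the default
  Weights.pop_back();
}
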
@@ -3193,17 +3428,16 @@ static bool SwitchToLookupTable(SwitchInst *SI,
"switch.gep");
Value *Result = Builder.CreateLoad(GEP, "switch.load");
- // If the result is only going to be used to return from the function,
- // we want to do that right here.
- if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin())) {
- if (CommonDest->getFirstNonPHIOrDbg() == CommonDest->getTerminator()) {
- Builder.CreateRet(Result);
- ReturnedEarly = true;
- }
+ // If the result is used to return immediately from the function, we want to
+ // do that right here.
+ if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin()) &&
+ *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg()) {
+ Builder.CreateRet(Result);
+ ReturnedEarly = true;
+ break;
}
- if (!ReturnedEarly)
- PHI->addIncoming(Result, LookupBB);
+ PHI->addIncoming(Result, LookupBB);
}
if (!ReturnedEarly)
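
The tightened condition above no longer just checks that the destination block ends in a return; it requires that the PHI's only user is a ReturnInst and that this return is the first real instruction of CommonDest, so the loaded value genuinely flows straight out of the function. A hedged sketch of that predicate (names are placeholders, not helpers from this file):

static bool phiFeedsImmediateReturn(PHINode *PHI, BasicBlock *CommonDest) {
  if (!PHI->hasOneUse() || !isa<ReturnInst>(*PHI->use_begin()))
    return false;
  // The return must be the first non-PHI, non-debug instruction.
  return *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg();
}
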
@@ -3306,6 +3540,9 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
BasicBlock *BB = BI->getParent();
+ if (SinkCommon && SinkThenElseCodeToEnd(BI))
+ return true;
+
// If the Terminator is the only non-phi instruction, simplify the block.
BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime();
if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp
index f3f24ae5c8..19fe156b53 100644
--- a/lib/VMCore/AsmWriter.cpp
+++ b/lib/VMCore/AsmWriter.cpp
@@ -66,6 +66,24 @@ static const Module *getModuleFromVal(const Value *V) {
return 0;
}
+static void PrintCallingConv(unsigned cc, raw_ostream &Out)
+{
+ switch (cc) {
+ case CallingConv::Fast: Out << "fastcc"; break;
+ case CallingConv::Cold: Out << "coldcc"; break;
+ case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break;
+ case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break;
+ case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break;
+ case CallingConv::ARM_APCS: Out << "arm_apcscc"; break;
+ case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break;
+ case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break;
+ case CallingConv::MSP430_INTR: Out << "msp430_intrcc"; break;
+ case CallingConv::PTX_Kernel: Out << "ptx_kernel"; break;
+ case CallingConv::PTX_Device: Out << "ptx_device"; break;
+ default: Out << "cc" << cc; break;
+ }
+}
+
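
The new PrintCallingConv helper becomes the single source of truth for calling-convention spellings; the three near-identical switch statements later in this file collapse onto it, and the default arm keeps unknown conventions printable as "cc<N>". Callers remain responsible for skipping the default C convention and for surrounding whitespace, so each call site below takes this shape (hedged sketch of the pattern, not a quote of one site):

if (F->getCallingConv() != CallingConv::C) { // C is the default, not printed
  PrintCallingConv(F->getCallingConv(), Out);
  Out << ' ';
}
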
// PrintEscapedString - Print each character of the specified string, escaping
// it if it is not printable or if it is an escape char.
static void PrintEscapedString(StringRef Name, raw_ostream &Out) {
@@ -141,8 +159,8 @@ static void PrintLLVMName(raw_ostream &OS, const Value *V) {
/// TypePrinting - Type printing machinery.
namespace {
class TypePrinting {
- TypePrinting(const TypePrinting &); // DO NOT IMPLEMENT
- void operator=(const TypePrinting&); // DO NOT IMPLEMENT
+ TypePrinting(const TypePrinting &) LLVM_DELETED_FUNCTION;
+ void operator=(const TypePrinting&) LLVM_DELETED_FUNCTION;
public:
/// NamedTypes - The named types that are used by the current module.
@@ -380,8 +398,8 @@ private:
/// Add all of the functions arguments, basic blocks, and instructions.
void processFunction();
- SlotTracker(const SlotTracker &); // DO NOT IMPLEMENT
- void operator=(const SlotTracker &); // DO NOT IMPLEMENT
+ SlotTracker(const SlotTracker &) LLVM_DELETED_FUNCTION;
+ void operator=(const SlotTracker &) LLVM_DELETED_FUNCTION;
};
} // end anonymous namespace
@@ -1226,7 +1244,7 @@ void AssemblyWriter::writeParamOperand(const Value *Operand,
TypePrinter.print(Operand->getType(), Out);
// Print parameter attributes list
if (Attrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(Attrs);
+ Out << ' ' << Attrs.getAsString();
Out << ' ';
// Print the operand
WriteAsOperandInternal(Out, Operand, &TypePrinter, &Machine, TheModule);
@@ -1288,8 +1306,9 @@ void AssemblyWriter::printModule(const Module *M) {
// Output all globals.
if (!M->global_empty()) Out << '\n';
for (Module::const_global_iterator I = M->global_begin(), E = M->global_end();
- I != E; ++I)
- printGlobal(I);
+ I != E; ++I) {
+ printGlobal(I); Out << '\n';
+ }
// Output all aliases.
if (!M->alias_empty()) Out << "\n";
@@ -1439,7 +1458,6 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) {
Out << ", align " << GV->getAlignment();
printInfoComment(*GV);
- Out << '\n';
}
void AssemblyWriter::printAlias(const GlobalAlias *GA) {
@@ -1530,27 +1548,16 @@ void AssemblyWriter::printFunction(const Function *F) {
PrintVisibility(F->getVisibility(), Out);
// Print the calling convention.
- switch (F->getCallingConv()) {
- case CallingConv::C: break; // default
- case CallingConv::Fast: Out << "fastcc "; break;
- case CallingConv::Cold: Out << "coldcc "; break;
- case CallingConv::X86_StdCall: Out << "x86_stdcallcc "; break;
- case CallingConv::X86_FastCall: Out << "x86_fastcallcc "; break;
- case CallingConv::X86_ThisCall: Out << "x86_thiscallcc "; break;
- case CallingConv::ARM_APCS: Out << "arm_apcscc "; break;
- case CallingConv::ARM_AAPCS: Out << "arm_aapcscc "; break;
- case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc "; break;
- case CallingConv::MSP430_INTR: Out << "msp430_intrcc "; break;
- case CallingConv::PTX_Kernel: Out << "ptx_kernel "; break;
- case CallingConv::PTX_Device: Out << "ptx_device "; break;
- default: Out << "cc" << F->getCallingConv() << " "; break;
+ if (F->getCallingConv() != CallingConv::C) {
+ PrintCallingConv(F->getCallingConv(), Out);
+ Out << " ";
}
FunctionType *FT = F->getFunctionType();
const AttrListPtr &Attrs = F->getAttributes();
Attributes RetAttrs = Attrs.getRetAttributes();
if (RetAttrs != Attribute::None)
- Out << Attribute::getAsString(Attrs.getRetAttributes()) << ' ';
+ Out << Attrs.getRetAttributes().getAsString() << ' ';
TypePrinter.print(F->getReturnType(), Out);
Out << ' ';
WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
@@ -1580,7 +1587,7 @@ void AssemblyWriter::printFunction(const Function *F) {
Attributes ArgAttrs = Attrs.getParamAttributes(i+1);
if (ArgAttrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(ArgAttrs);
+ Out << ' ' << ArgAttrs.getAsString();
}
}
@@ -1594,7 +1601,7 @@ void AssemblyWriter::printFunction(const Function *F) {
Out << " unnamed_addr";
Attributes FnAttrs = Attrs.getFnAttributes();
if (FnAttrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(Attrs.getFnAttributes());
+ Out << ' ' << Attrs.getFnAttributes().getAsString();
if (F->hasSection()) {
Out << " section \"";
PrintEscapedString(F->getSection(), Out);
@@ -1628,7 +1635,7 @@ void AssemblyWriter::printArgument(const Argument *Arg,
// Output parameter attributes list
if (Attrs != Attribute::None)
- Out << ' ' << Attribute::getAsString(Attrs);
+ Out << ' ' << Attrs.getAsString();
// Output name, if available...
if (Arg->hasName()) {
@@ -1831,20 +1838,9 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << " void";
} else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
// Print the calling convention being used.
- switch (CI->getCallingConv()) {
- case CallingConv::C: break; // default
- case CallingConv::Fast: Out << " fastcc"; break;
- case CallingConv::Cold: Out << " coldcc"; break;
- case CallingConv::X86_StdCall: Out << " x86_stdcallcc"; break;
- case CallingConv::X86_FastCall: Out << " x86_fastcallcc"; break;
- case CallingConv::X86_ThisCall: Out << " x86_thiscallcc"; break;
- case CallingConv::ARM_APCS: Out << " arm_apcscc "; break;
- case CallingConv::ARM_AAPCS: Out << " arm_aapcscc "; break;
- case CallingConv::ARM_AAPCS_VFP:Out << " arm_aapcs_vfpcc "; break;
- case CallingConv::MSP430_INTR: Out << " msp430_intrcc "; break;
- case CallingConv::PTX_Kernel: Out << " ptx_kernel"; break;
- case CallingConv::PTX_Device: Out << " ptx_device"; break;
- default: Out << " cc" << CI->getCallingConv(); break;
+ if (CI->getCallingConv() != CallingConv::C) {
+ Out << " ";
+ PrintCallingConv(CI->getCallingConv(), Out);
}
Operand = CI->getCalledValue();
@@ -1854,7 +1850,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
const AttrListPtr &PAL = CI->getAttributes();
if (PAL.getRetAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getRetAttributes());
+ Out << ' ' << PAL.getRetAttributes().getAsString();
// If possible, print out the short form of the call instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -1878,7 +1874,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
}
Out << ')';
if (PAL.getFnAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getFnAttributes());
+ Out << ' ' << PAL.getFnAttributes().getAsString();
} else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
Operand = II->getCalledValue();
PointerType *PTy = cast<PointerType>(Operand->getType());
@@ -1887,24 +1883,13 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
const AttrListPtr &PAL = II->getAttributes();
// Print the calling convention being used.
- switch (II->getCallingConv()) {
- case CallingConv::C: break; // default
- case CallingConv::Fast: Out << " fastcc"; break;
- case CallingConv::Cold: Out << " coldcc"; break;
- case CallingConv::X86_StdCall: Out << " x86_stdcallcc"; break;
- case CallingConv::X86_FastCall: Out << " x86_fastcallcc"; break;
- case CallingConv::X86_ThisCall: Out << " x86_thiscallcc"; break;
- case CallingConv::ARM_APCS: Out << " arm_apcscc "; break;
- case CallingConv::ARM_AAPCS: Out << " arm_aapcscc "; break;
- case CallingConv::ARM_AAPCS_VFP:Out << " arm_aapcs_vfpcc "; break;
- case CallingConv::MSP430_INTR: Out << " msp430_intrcc "; break;
- case CallingConv::PTX_Kernel: Out << " ptx_kernel"; break;
- case CallingConv::PTX_Device: Out << " ptx_device"; break;
- default: Out << " cc" << II->getCallingConv(); break;
+ if (II->getCallingConv() != CallingConv::C) {
+ Out << " ";
+ PrintCallingConv(II->getCallingConv(), Out);
}
if (PAL.getRetAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getRetAttributes());
+ Out << ' ' << PAL.getRetAttributes().getAsString();
// If possible, print out the short form of the invoke instruction. We can
// only do this if the first argument is a pointer to a nonvararg function,
@@ -1929,7 +1914,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << ')';
if (PAL.getFnAttributes() != Attribute::None)
- Out << ' ' << Attribute::getAsString(PAL.getFnAttributes());
+ Out << ' ' << PAL.getFnAttributes().getAsString();
Out << "\n to ";
writeOperand(II->getNormalDest(), true);
diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp
index d466ac60b2..af8163fd40 100644
--- a/lib/VMCore/Attributes.cpp
+++ b/lib/VMCore/Attributes.cpp
@@ -26,66 +26,66 @@ using namespace llvm;
// Attribute Function Definitions
//===----------------------------------------------------------------------===//
-std::string Attribute::getAsString(Attributes Attrs) {
+std::string Attributes::getAsString() const {
std::string Result;
- if (Attrs & Attribute::ZExt)
+ if (hasZExtAttr())
Result += "zeroext ";
- if (Attrs & Attribute::SExt)
+ if (hasSExtAttr())
Result += "signext ";
- if (Attrs & Attribute::NoReturn)
+ if (hasNoReturnAttr())
Result += "noreturn ";
- if (Attrs & Attribute::NoUnwind)
+ if (hasNoUnwindAttr())
Result += "nounwind ";
- if (Attrs & Attribute::UWTable)
+ if (hasUWTableAttr())
Result += "uwtable ";
- if (Attrs & Attribute::ReturnsTwice)
+ if (hasReturnsTwiceAttr())
Result += "returns_twice ";
- if (Attrs & Attribute::InReg)
+ if (hasInRegAttr())
Result += "inreg ";
- if (Attrs & Attribute::NoAlias)
+ if (hasNoAliasAttr())
Result += "noalias ";
- if (Attrs & Attribute::NoCapture)
+ if (hasNoCaptureAttr())
Result += "nocapture ";
- if (Attrs & Attribute::StructRet)
+ if (hasStructRetAttr())
Result += "sret ";
- if (Attrs & Attribute::ByVal)
+ if (hasByValAttr())
Result += "byval ";
- if (Attrs & Attribute::Nest)
+ if (hasNestAttr())
Result += "nest ";
- if (Attrs & Attribute::ReadNone)
+ if (hasReadNoneAttr())
Result += "readnone ";
- if (Attrs & Attribute::ReadOnly)
+ if (hasReadOnlyAttr())
Result += "readonly ";
- if (Attrs & Attribute::OptimizeForSize)
+ if (hasOptimizeForSizeAttr())
Result += "optsize ";
- if (Attrs & Attribute::NoInline)
+ if (hasNoInlineAttr())
Result += "noinline ";
- if (Attrs & Attribute::InlineHint)
+ if (hasInlineHintAttr())
Result += "inlinehint ";
- if (Attrs & Attribute::AlwaysInline)
+ if (hasAlwaysInlineAttr())
Result += "alwaysinline ";
- if (Attrs & Attribute::StackProtect)
+ if (hasStackProtectAttr())
Result += "ssp ";
- if (Attrs & Attribute::StackProtectReq)
+ if (hasStackProtectReqAttr())
Result += "sspreq ";
- if (Attrs & Attribute::NoRedZone)
+ if (hasNoRedZoneAttr())
Result += "noredzone ";
- if (Attrs & Attribute::NoImplicitFloat)
+ if (hasNoImplicitFloatAttr())
Result += "noimplicitfloat ";
- if (Attrs & Attribute::Naked)
+ if (hasNakedAttr())
Result += "naked ";
- if (Attrs & Attribute::NonLazyBind)
+ if (hasNonLazyBindAttr())
Result += "nonlazybind ";
- if (Attrs & Attribute::AddressSafety)
+ if (hasAddressSafetyAttr())
Result += "address_safety ";
- if (Attrs & Attribute::StackAlignment) {
+ if (hasStackAlignmentAttr()) {
Result += "alignstack(";
- Result += utostr(Attribute::getStackAlignmentFromAttrs(Attrs));
+ Result += utostr(getStackAlignment());
Result += ") ";
}
- if (Attrs & Attribute::Alignment) {
+ if (hasAlignmentAttr()) {
Result += "align ";
- Result += utostr(Attribute::getAlignmentFromAttrs(Attrs));
+ Result += utostr(getAlignment());
Result += " ";
}
// Trim the trailing space.
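
The mechanical rewrite above is part of a larger encapsulation: raw bitmask tests such as `Attrs & Attribute::NoReturn` become named predicates on the Attributes class, so the bit-level representation can later change without touching callers. A toy model of the idea (illustrative only; the bit positions are invented, not LLVM's):

#include <cstdint>
#include <string>

class ToyAttributes {
  uint64_t Bits;
public:
  explicit ToyAttributes(uint64_t B = 0) : Bits(B) {}
  // Callers ask questions; only this class knows the encoding.
  bool hasNoReturnAttr() const { return Bits & (1ULL << 2); }
  bool hasNoUnwindAttr() const { return Bits & (1ULL << 3); }
  std::string getAsString() const {
    std::string Result;
    if (hasNoReturnAttr()) Result += "noreturn ";
    if (hasNoUnwindAttr()) Result += "nounwind ";
    if (!Result.empty())
      Result.erase(Result.end() - 1); // trim the trailing space
    return Result;
  }
};
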
@@ -125,8 +125,8 @@ class AttributeListImpl : public FoldingSetNode {
sys::cas_flag RefCount;
// AttributesList is uniqued, these should not be publicly available.
- void operator=(const AttributeListImpl &); // Do not implement
- AttributeListImpl(const AttributeListImpl &); // Do not implement
+ void operator=(const AttributeListImpl &) LLVM_DELETED_FUNCTION;
+ AttributeListImpl(const AttributeListImpl &) LLVM_DELETED_FUNCTION;
~AttributeListImpl(); // Private implementation
public:
SmallVector<AttributeWithIndex, 4> Attrs;
@@ -174,7 +174,7 @@ AttrListPtr AttrListPtr::get(ArrayRef<AttributeWithIndex> Attrs) {
#ifndef NDEBUG
for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
- assert(Attrs[i].Attrs != Attribute::None &&
+ assert(Attrs[i].Attrs.hasAttributes() &&
"Pointless attribute!");
assert((!i || Attrs[i-1].Index < Attrs[i].Index) &&
"Misordered AttributesList!");
@@ -247,13 +247,14 @@ const AttributeWithIndex &AttrListPtr::getSlot(unsigned Slot) const {
/// returned. Attributes for the result are denoted with Idx = 0.
/// Function notes are denoted with idx = ~0.
Attributes AttrListPtr::getAttributes(unsigned Idx) const {
- if (AttrList == 0) return Attribute::None;
+ if (AttrList == 0) return Attributes();
const SmallVector<AttributeWithIndex, 4> &Attrs = AttrList->Attrs;
for (unsigned i = 0, e = Attrs.size(); i != e && Attrs[i].Index <= Idx; ++i)
if (Attrs[i].Index == Idx)
return Attrs[i].Attrs;
- return Attribute::None;
+
+ return Attributes();
}
/// hasAttrSomewhere - Return true if the specified attribute is set for at
@@ -263,7 +264,7 @@ bool AttrListPtr::hasAttrSomewhere(Attributes Attr) const {
const SmallVector<AttributeWithIndex, 4> &Attrs = AttrList->Attrs;
for (unsigned i = 0, e = Attrs.size(); i != e; ++i)
- if (Attrs[i].Attrs & Attr)
+ if (Attrs[i].Attrs.hasAttributes(Attr))
return true;
return false;
}
@@ -274,8 +275,8 @@ AttrListPtr AttrListPtr::addAttr(unsigned Idx, Attributes Attrs) const {
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment.
// For now, say we can't change a known alignment.
- Attributes OldAlign = OldAttrs & Attribute::Alignment;
- Attributes NewAlign = Attrs & Attribute::Alignment;
+ unsigned OldAlign = OldAttrs.getAlignment();
+ unsigned NewAlign = Attrs.getAlignment();
assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
"Attempt to change alignment!");
#endif
@@ -314,7 +315,7 @@ AttrListPtr AttrListPtr::removeAttr(unsigned Idx, Attributes Attrs) const {
#ifndef NDEBUG
// FIXME it is not obvious how this should work for alignment.
// For now, say we can't pass in alignment, which no current use does.
- assert(!(Attrs & Attribute::Alignment) && "Attempt to exclude alignment!");
+ assert(!Attrs.hasAlignmentAttr() && "Attempt to exclude alignment!");
#endif
if (AttrList == 0) return AttrListPtr();
diff --git a/lib/VMCore/ConstantsContext.h b/lib/VMCore/ConstantsContext.h
index 0f81b3ee4e..996eb12d69 100644
--- a/lib/VMCore/ConstantsContext.h
+++ b/lib/VMCore/ConstantsContext.h
@@ -33,7 +33,7 @@ struct ConstantTraits;
/// behind the scenes to implement unary constant exprs.
class UnaryConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -50,7 +50,7 @@ public:
/// behind the scenes to implement binary constant exprs.
class BinaryConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -71,7 +71,7 @@ public:
/// behind the scenes to implement select constant exprs.
class SelectConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly three operands
void *operator new(size_t s) {
@@ -92,7 +92,7 @@ public:
/// extractelement constant exprs.
class ExtractElementConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
@@ -113,7 +113,7 @@ public:
/// insertelement constant exprs.
class InsertElementConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly three operands
void *operator new(size_t s) {
@@ -135,7 +135,7 @@ public:
/// shufflevector constant exprs.
class ShuffleVectorConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly three operands
void *operator new(size_t s) {
@@ -160,7 +160,7 @@ public:
/// extractvalue constant exprs.
class ExtractValueConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -186,7 +186,7 @@ public:
/// insertvalue constant exprs.
class InsertValueConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly one operand
void *operator new(size_t s) {
@@ -234,7 +234,7 @@ public:
// needed in order to store the predicate value for these instructions.
class CompareConstantExpr : public ConstantExpr {
virtual void anchor();
- void *operator new(size_t, unsigned); // DO NOT IMPLEMENT
+ void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION;
public:
// allocate space for exactly two operands
void *operator new(size_t s) {
diff --git a/lib/VMCore/Core.cpp b/lib/VMCore/Core.cpp
index a56f1b282b..90ecdaecf4 100644
--- a/lib/VMCore/Core.cpp
+++ b/lib/VMCore/Core.cpp
@@ -568,6 +568,19 @@ const char *LLVMGetMDString(LLVMValueRef V, unsigned* Len) {
return 0;
}
+unsigned LLVMGetMDNodeNumOperands(LLVMValueRef V)
+{
+ return cast<MDNode>(unwrap(V))->getNumOperands();
+}
+
+void LLVMGetMDNodeOperands(LLVMValueRef V, LLVMValueRef *Dest)
+{
+ const MDNode *N = cast<MDNode>(unwrap(V));
+ const unsigned numOperands = N->getNumOperands();
+ for (unsigned i = 0; i < numOperands; i++)
+ Dest[i] = wrap(N->getOperand(i));
+}
+
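
A usage sketch for the two new C API entry points above: the caller sizes a buffer with LLVMGetMDNodeNumOperands and then has it filled in one call. `MD` is a hypothetical LLVMValueRef known to wrap an MDNode:

#include <vector>

unsigned NumOps = LLVMGetMDNodeNumOperands(MD);
std::vector<LLVMValueRef> Ops(NumOps);
if (NumOps)
  LLVMGetMDNodeOperands(MD, &Ops[0]);
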
unsigned LLVMGetNamedMetadataNumOperands(LLVMModuleRef M, const char* name)
{
if (NamedMDNode *N = unwrap(M)->getNamedMetadata(name)) {
@@ -1462,7 +1475,7 @@ LLVMAttribute LLVMGetAttribute(LLVMValueRef Arg) {
void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
unwrap<Argument>(Arg)->addAttr(
- Attribute::constructAlignmentFromInt(align));
+ Attributes::constructAlignmentFromInt(align));
}
/*--.. Operations on basic blocks ..........................................--*/
@@ -1667,7 +1680,7 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
CallSite Call = CallSite(unwrap<Instruction>(Instr));
Call.setAttributes(
Call.getAttributes().addAttr(index,
- Attribute::constructAlignmentFromInt(align)));
+ Attributes::constructAlignmentFromInt(align)));
}
/*--.. Operations on call instructions (only) ..............................--*/
diff --git a/lib/VMCore/DIBuilder.cpp b/lib/VMCore/DIBuilder.cpp
index f5894e9a32..6a29a02399 100644
--- a/lib/VMCore/DIBuilder.cpp
+++ b/lib/VMCore/DIBuilder.cpp
@@ -640,6 +640,30 @@ DIType DIBuilder::createArtificialType(DIType Ty) {
return DIType(MDNode::get(VMContext, Elts));
}
+/// createObjectPointerType - Create a new DIType with "object pointer" flag set.
+DIType DIBuilder::createObjectPointerType(DIType Ty) {
+ if (Ty.isObjectPointer())
+ return Ty;
+
+ SmallVector<Value *, 9> Elts;
+ MDNode *N = Ty;
+ assert(N && "Unexpected input DIType!");
+ for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
+ if (Value *V = N->getOperand(i))
+ Elts.push_back(V);
+ else
+ Elts.push_back(Constant::getNullValue(Type::getInt32Ty(VMContext)));
+ }
+
+ unsigned CurFlags = Ty.getFlags();
+ CurFlags = CurFlags | (DIType::FlagObjectPointer | DIType::FlagArtificial);
+
+ // Flags are stored in operand slot 8.
+ Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), CurFlags);
+
+ return DIType(MDNode::get(VMContext, Elts));
+}
+
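
MDNodes are immutable once created, so the function above cannot flip a flag in place: it copies every operand (substituting a null i32 for missing ones), rewrites the flags word, and asks MDNode::get for a fresh, uniqued node. The same copy-and-modify pattern generalized to an arbitrary extra flag, as a hedged sketch (slot 8 holding the flags is an assumption carried over from the code above):

static DIType setExtraTypeFlag(LLVMContext &VMContext, DIType Ty,
                               unsigned Flag) {
  MDNode *N = Ty;
  SmallVector<Value *, 9> Elts;
  for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i)
    Elts.push_back(N->getOperand(i)
                       ? N->getOperand(i)
                       : Constant::getNullValue(Type::getInt32Ty(VMContext)));
  Elts[8] = ConstantInt::get(Type::getInt32Ty(VMContext), // flags slot
                             Ty.getFlags() | Flag);
  return DIType(MDNode::get(VMContext, Elts));
}
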
/// retainType - Retain DIType in a module even if it is not referenced
/// through debug info anchors.
void DIBuilder::retainType(DIType T) {
diff --git a/lib/VMCore/GCOV.cpp b/lib/VMCore/GCOV.cpp
index 5bc1ac9f5d..ea2f0a6d55 100644
--- a/lib/VMCore/GCOV.cpp
+++ b/lib/VMCore/GCOV.cpp
@@ -48,7 +48,7 @@ bool GCOVFile::read(GCOVBuffer &Buffer) {
GCOVFunction *GFun = NULL;
if (isGCDAFile(Format)) {
// Use existing function while reading .gcda file.
- assert (i < Functions.size() && ".gcda data does not match .gcno data");
+ assert(i < Functions.size() && ".gcda data does not match .gcno data");
GFun = Functions[i];
} else if (isGCNOFile(Format)){
GFun = new GCOVFunction();
@@ -113,7 +113,9 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
LineNumber = Buff.readInt();
// read blocks.
- assert (Buff.readBlockTag() && "Block Tag not found!");
+ bool BlockTagFound = Buff.readBlockTag();
+ (void)BlockTagFound;
+ assert(BlockTagFound && "Block Tag not found!");
uint32_t BlockCount = Buff.readInt();
for (int i = 0, e = BlockCount; i != e; ++i) {
Buff.readInt(); // Block flags;
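
The BlockTagFound rewrite above fixes a classic pitfall: assert() compiles away entirely under NDEBUG, taking any side effects of its expression with it, so readBlockTag() would silently never run in release builds. The general pattern (doRequiredWork is a hypothetical stand-in for a call with a needed side effect):

#include <cassert>

bool doRequiredWork(); // hypothetical helper with a required side effect

void example() {
  bool Ok = doRequiredWork(); // always executed, even with NDEBUG
  (void)Ok;                   // silences -Wunused-variable in release builds
  assert(Ok && "doRequiredWork failed!");
}
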
@@ -124,7 +126,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
while (Buff.readEdgeTag()) {
uint32_t EdgeCount = (Buff.readInt() - 1) / 2;
uint32_t BlockNo = Buff.readInt();
- assert (BlockNo < BlockCount && "Unexpected Block number!");
+ assert(BlockNo < BlockCount && "Unexpected Block number!");
for (int i = 0, e = EdgeCount; i != e; ++i) {
Blocks[BlockNo]->addEdge(Buff.readInt());
Buff.readInt(); // Edge flag
@@ -136,7 +138,7 @@ bool GCOVFunction::read(GCOVBuffer &Buff, GCOV::GCOVFormat Format) {
uint32_t LineTableLength = Buff.readInt();
uint32_t Size = Buff.getCursor() + LineTableLength*4;
uint32_t BlockNo = Buff.readInt();
- assert (BlockNo < BlockCount && "Unexpected Block number!");
+ assert(BlockNo < BlockCount && "Unexpected Block number!");
GCOVBlock *Block = Blocks[BlockNo];
Buff.readInt(); // flag
while (Buff.getCursor() != (Size - 4)) {
diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp
index 9af98e8a9b..d5b756dac0 100644
--- a/lib/VMCore/Instructions.cpp
+++ b/lib/VMCore/Instructions.cpp
@@ -2836,7 +2836,7 @@ BitCastInst::BitCastInst(
// CmpInst Classes
//===----------------------------------------------------------------------===//
-void CmpInst::Anchor() const {}
+void CmpInst::anchor() {}
CmpInst::CmpInst(Type *ty, OtherOps op, unsigned short predicate,
Value *LHS, Value *RHS, const Twine &Name,
diff --git a/lib/VMCore/LLVMContext.cpp b/lib/VMCore/LLVMContext.cpp
index f07f0b3939..2446ec996d 100644
--- a/lib/VMCore/LLVMContext.cpp
+++ b/lib/VMCore/LLVMContext.cpp
@@ -53,6 +53,11 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
unsigned RangeID = getMDKindID("range");
assert(RangeID == MD_range && "range kind id drifted");
(void)RangeID;
+
+ // Create the 'tbaa.struct' metadata kind.
+ unsigned TBAAStructID = getMDKindID("tbaa.struct");
+ assert(TBAAStructID == MD_tbaa_struct && "tbaa.struct kind id drifted");
+ (void)TBAAStructID;
}
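
The assert above guards an ordering invariant: the first few metadata kinds are registered in the LLVMContext constructor, so getMDKindID's first-come numbering must line up with the fixed enumerators. Illustratively (the values reflect the registration order in that constructor; the earlier kinds are assumed, not quoted from the header):

enum FixedMDKindIDs {
  MD_dbg = 0,         // "dbg"
  MD_tbaa = 1,        // "tbaa"
  MD_prof = 2,        // "prof"
  MD_fpmath = 3,      // "fpmath"
  MD_range = 4,       // "range"
  MD_tbaa_struct = 5  // "tbaa.struct", added by this hunk
};
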
LLVMContext::~LLVMContext() { delete pImpl; }
diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp
index 5e9a00fc08..1a7a650989 100644
--- a/lib/VMCore/Type.cpp
+++ b/lib/VMCore/Type.cpp
@@ -220,8 +220,6 @@ Type *Type::getStructElementType(unsigned N) const {
return cast<StructType>(this)->getElementType(N);
}
-
-
Type *Type::getSequentialElementType() const {
return cast<SequentialType>(this)->getElementType();
}
@@ -239,8 +237,6 @@ unsigned Type::getPointerAddressSpace() const {
}
-
-
//===----------------------------------------------------------------------===//
// Primitive 'Type' data
//===----------------------------------------------------------------------===//
@@ -400,12 +396,10 @@ FunctionType *FunctionType::get(Type *ReturnType,
return FT;
}
-
FunctionType *FunctionType::get(Type *Result, bool isVarArg) {
return get(Result, ArrayRef<Type *>(), isVarArg);
}
-
/// isValidReturnType - Return true if the specified type is valid as a return
/// type.
bool FunctionType::isValidReturnType(Type *RetTy) {
@@ -553,7 +547,6 @@ StructType *StructType::create(LLVMContext &Context) {
return create(Context, StringRef());
}
-
StructType *StructType::create(ArrayRef<Type*> Elements, StringRef Name,
bool isPacked) {
assert(!Elements.empty() &&
@@ -637,7 +630,6 @@ bool StructType::isLayoutIdentical(StructType *Other) const {
return std::equal(element_begin(), element_end(), Other->element_begin());
}
-
/// getTypeByName - Return the type with the specified name, or null if there
/// is none by that name.
StructType *Module::getTypeByName(StringRef Name) const {
@@ -700,7 +692,6 @@ ArrayType::ArrayType(Type *ElType, uint64_t NumEl)
NumElements = NumEl;
}
-
ArrayType *ArrayType::get(Type *elementType, uint64_t NumElements) {
Type *ElementType = const_cast<Type*>(elementType);
assert(isValidElementType(ElementType) && "Invalid type for array element!");
diff --git a/lib/VMCore/ValueTypes.cpp b/lib/VMCore/ValueTypes.cpp
index d1ca953175..e9370f62e6 100644
--- a/lib/VMCore/ValueTypes.cpp
+++ b/lib/VMCore/ValueTypes.cpp
@@ -55,6 +55,14 @@ bool EVT::isExtendedVector() const {
return LLVMTy->isVectorTy();
}
+bool EVT::isExtended16BitVector() const {
+ return isExtendedVector() && getSizeInBits() == 16;
+}
+
+bool EVT::isExtended32BitVector() const {
+ return isExtendedVector() && getSizeInBits() == 32;
+}
+
bool EVT::isExtended64BitVector() const {
return isExtendedVector() && getSizeInBits() == 64;
}
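
The two new predicates above extend an existing family: an "extended" EVT is one with no simple MVT, so its width is queried through the underlying IR type via getSizeInBits(). The pattern generalizes to any width; a sketch of the next member, on the assumption that the file does not already define it:

bool EVT::isExtended128BitVector() const {
  return isExtendedVector() && getSizeInBits() == 128;
}
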
@@ -120,15 +128,21 @@ std::string EVT::getEVTString() const {
case MVT::Other: return "ch";
case MVT::Glue: return "glue";
case MVT::x86mmx: return "x86mmx";
+ case MVT::v2i1: return "v2i1";
+ case MVT::v4i1: return "v4i1";
+ case MVT::v8i1: return "v8i1";
+ case MVT::v16i1: return "v16i1";
case MVT::v2i8: return "v2i8";
case MVT::v4i8: return "v4i8";
case MVT::v8i8: return "v8i8";
case MVT::v16i8: return "v16i8";
case MVT::v32i8: return "v32i8";
+ case MVT::v1i16: return "v1i16";
case MVT::v2i16: return "v2i16";
case MVT::v4i16: return "v4i16";
case MVT::v8i16: return "v8i16";
case MVT::v16i16: return "v16i16";
+ case MVT::v1i32: return "v1i32";
case MVT::v2i32: return "v2i32";
case MVT::v4i32: return "v4i32";
case MVT::v8i32: return "v8i32";
@@ -171,15 +185,21 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::f128: return Type::getFP128Ty(Context);
case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
case MVT::x86mmx: return Type::getX86_MMXTy(Context);
+ case MVT::v2i1: return VectorType::get(Type::getInt1Ty(Context), 2);
+ case MVT::v4i1: return VectorType::get(Type::getInt1Ty(Context), 4);
+ case MVT::v8i1: return VectorType::get(Type::getInt1Ty(Context), 8);
+ case MVT::v16i1: return VectorType::get(Type::getInt1Ty(Context), 16);
case MVT::v2i8: return VectorType::get(Type::getInt8Ty(Context), 2);
case MVT::v4i8: return VectorType::get(Type::getInt8Ty(Context), 4);
case MVT::v8i8: return VectorType::get(Type::getInt8Ty(Context), 8);
case MVT::v16i8: return VectorType::get(Type::getInt8Ty(Context), 16);
case MVT::v32i8: return VectorType::get(Type::getInt8Ty(Context), 32);
+ case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1);
case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2);
case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4);
case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8);
case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16);
+ case MVT::v1i32: return VectorType::get(Type::getInt32Ty(Context), 1);
case MVT::v2i32: return VectorType::get(Type::getInt32Ty(Context), 2);
case MVT::v4i32: return VectorType::get(Type::getInt32Ty(Context), 4);
case MVT::v8i32: return VectorType::get(Type::getInt32Ty(Context), 8);
diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp
index c932d9e539..647a52fbdd 100644
--- a/lib/VMCore/Verifier.cpp
+++ b/lib/VMCore/Verifier.cpp
@@ -530,12 +530,12 @@ void Verifier::VerifyParameterAttrs(Attributes Attrs, Type *Ty,
return;
Attributes FnCheckAttr = Attrs & Attribute::FunctionOnly;
- Assert1(!FnCheckAttr, "Attribute " + Attribute::getAsString(FnCheckAttr) +
+ Assert1(!FnCheckAttr, "Attribute " + FnCheckAttr.getAsString() +
" only applies to the function!", V);
if (isReturnValue) {
Attributes RetI = Attrs & Attribute::ParameterOnly;
- Assert1(!RetI, "Attribute " + Attribute::getAsString(RetI) +
+ Assert1(!RetI, "Attribute " + RetI.getAsString() +
" does not apply to return values!", V);
}
@@ -543,21 +543,21 @@ void Verifier::VerifyParameterAttrs(Attributes Attrs, Type *Ty,
i < array_lengthof(Attribute::MutuallyIncompatible); ++i) {
Attributes MutI = Attrs & Attribute::MutuallyIncompatible[i];
Assert1(MutI.isEmptyOrSingleton(), "Attributes " +
- Attribute::getAsString(MutI) + " are incompatible!", V);
+ MutI.getAsString() + " are incompatible!", V);
}
Attributes TypeI = Attrs & Attribute::typeIncompatible(Ty);
Assert1(!TypeI, "Wrong type for attribute " +
- Attribute::getAsString(TypeI), V);
+ TypeI.getAsString(), V);
Attributes ByValI = Attrs & Attribute::ByVal;
if (PointerType *PTy = dyn_cast<PointerType>(Ty)) {
Assert1(!ByValI || PTy->getElementType()->isSized(),
- "Attribute " + Attribute::getAsString(ByValI) +
+ "Attribute " + ByValI.getAsString() +
" does not support unsized types!", V);
} else {
Assert1(!ByValI,
- "Attribute " + Attribute::getAsString(ByValI) +
+ "Attribute " + ByValI.getAsString() +
" only applies to parameters with pointer type!", V);
}
}
@@ -585,25 +585,25 @@ void Verifier::VerifyFunctionAttrs(FunctionType *FT,
VerifyParameterAttrs(Attr.Attrs, Ty, Attr.Index == 0, V);
- if (Attr.Attrs & Attribute::Nest) {
+ if (Attr.Attrs.hasNestAttr()) {
Assert1(!SawNest, "More than one parameter has attribute nest!", V);
SawNest = true;
}
- if (Attr.Attrs & Attribute::StructRet)
+ if (Attr.Attrs.hasStructRetAttr())
Assert1(Attr.Index == 1, "Attribute sret not on first parameter!", V);
}
Attributes FAttrs = Attrs.getFnAttributes();
Attributes NotFn = FAttrs & (~Attribute::FunctionOnly);
- Assert1(!NotFn, "Attribute " + Attribute::getAsString(NotFn) +
+ Assert1(!NotFn, "Attribute " + NotFn.getAsString() +
" does not apply to the function!", V);
for (unsigned i = 0;
i < array_lengthof(Attribute::MutuallyIncompatible); ++i) {
Attributes MutI = FAttrs & Attribute::MutuallyIncompatible[i];
Assert1(MutI.isEmptyOrSingleton(), "Attributes " +
- Attribute::getAsString(MutI) + " are incompatible!", V);
+ MutI.getAsString() + " are incompatible!", V);
}
}
@@ -1171,7 +1171,7 @@ void Verifier::VerifyCallSite(CallSite CS) {
VerifyParameterAttrs(Attr, CS.getArgument(Idx-1)->getType(), false, I);
Attributes VArgI = Attr & Attribute::VarArgsIncompatible;
- Assert1(!VArgI, "Attribute " + Attribute::getAsString(VArgI) +
+ Assert1(!VArgI, "Attribute " + VArgI.getAsString() +
" cannot be used for vararg call arguments!", I);
}
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index c4567a35b0..a9d2af6ad2 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -14,6 +14,7 @@ endforeach(entry)
# Also add in the compiler-rt tree if present and we have a sufficiently
# recent version of CMake.
if(${CMAKE_VERSION} VERSION_GREATER 2.8.7 AND
+ ${LLVM_BUILD_RUNTIME} AND
IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt AND
EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt/CMakeLists.txt)
add_subdirectory(compiler-rt)
diff --git a/test/Bitcode/blockaddress.ll b/test/Bitcode/blockaddress.ll
index b9f334176c..8ac54be00d 100644
--- a/test/Bitcode/blockaddress.ll
+++ b/test/Bitcode/blockaddress.ll
@@ -28,3 +28,18 @@ here:
end:
ret void
}
+
+; PR13895
+define void @doitagain(i8** nocapture %pptr) {
+; CHECK: define void @doitagain
+entry:
+ br label %here
+
+here:
+ store i8* blockaddress(@doit, %here), i8** %pptr, align 8
+; CHECK: blockaddress(@doit, %here)
+ br label %end
+
+end:
+ ret void
+}
diff --git a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
index 42b1491481..6e0ef96196 100644
--- a/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
+++ b/test/CodeGen/ARM/2011-10-26-memset-with-neon.ll
@@ -9,8 +9,8 @@ entry:
}
; Trigger multiple NEON stores.
-; CHECK: vstmia
-; CHECK-NEXT: vstmia
+; CHECK: vst1.64
+; CHECK-NEXT: vst1.64
define void @f_0_40(i8* nocapture %c) nounwind optsize {
entry:
call void @llvm.memset.p0i8.i64(i8* %c, i8 0, i64 40, i32 16, i1 false)
diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index 1769ee5d71..f9ede7401a 100644
--- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -8,12 +8,12 @@ define void @test_sqrt(<4 x float>* %X) nounwind {
; CHECK: movw r1, :lower16:{{.*}}
; CHECK: movt r1, :upper16:{{.*}}
-; CHECK: vldmia r1
+; CHECK: vld1.64 {{.*}}, [r1, :128]
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
; CHECK: vsqrt.f32 {{s[0-9]+}}, {{s[0-9]+}}
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64 {{.*}}
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -31,7 +31,7 @@ define void @test_cos(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
@@ -45,7 +45,7 @@ define void @test_cos(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}cosf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -62,7 +62,7 @@ define void @test_exp(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
@@ -76,7 +76,7 @@ define void @test_exp(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}expf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -93,7 +93,7 @@ define void @test_exp2(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
@@ -107,7 +107,7 @@ define void @test_exp2(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}exp2f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -124,7 +124,7 @@ define void @test_log10(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
@@ -138,7 +138,7 @@ define void @test_log10(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log10f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -155,7 +155,7 @@ define void @test_log(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
@@ -169,7 +169,7 @@ define void @test_log(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}logf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -186,7 +186,7 @@ define void @test_log2(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
@@ -200,7 +200,7 @@ define void @test_log2(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}log2f
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -218,7 +218,7 @@ define void @test_pow(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
@@ -232,7 +232,7 @@ define void @test_pow(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}powf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
@@ -252,10 +252,10 @@ define void @test_powi(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia [[reg0]], {{.*}}
+; CHECK: vld1.64 {{.*}}, :128
; CHECK: vmul.f32 {{.*}}
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
@@ -275,7 +275,7 @@ define void @test_sin(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
@@ -289,7 +289,7 @@ define void @test_sin(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}sinf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
@@ -306,7 +306,7 @@ define void @test_floor(<4 x float>* %X) nounwind {
; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
; CHECK: movt [[reg0]], :upper16:{{.*}}
-; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+; CHECK: vld1.64
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}floorf
@@ -320,7 +320,7 @@ define void @test_floor(<4 x float>* %X) nounwind {
; CHECK: {{v?mov(.32)?}} r0,
; CHECK: bl {{.*}}floorf
-; CHECK: vstmia {{.*}}
+; CHECK: vst1.64
L.entry:
%0 = load <4 x float>* @A, align 16
diff --git a/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
new file mode 100644
index 0000000000..e761ffe72c
--- /dev/null
+++ b/test/CodeGen/ARM/2012-09-18-ARMv4ISelBug.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -march=arm -mcpu=arm7tdmi | FileCheck %s
+
+; movw is only legal for V6T2 and later.
+; rdar://12300648
+
+define i32 @t(i32 %x) {
+; CHECK: t:
+; CHECK-NOT: movw
+ %tmp = add i32 %x, -65535
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/a15-mla.ll b/test/CodeGen/ARM/a15-mla.ll
new file mode 100644
index 0000000000..25f6de4762
--- /dev/null
+++ b/test/CodeGen/ARM/a15-mla.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=arm -float-abi=hard -mcpu=cortex-a15 -mattr=+neon,+neonfp | FileCheck %s
+
+; This test checks that the VMLxForwarding feature is disabled for A15.
+; CHECK: fun_a
+define <4 x i32> @fun_a(<4 x i32> %x, <4 x i32> %y) nounwind {
+ %1 = add <4 x i32> %x, %y
+; CHECK-NOT: vmul
+; CHECK: vmla
+ %2 = mul <4 x i32> %1, %1
+ %3 = add <4 x i32> %y, %2
+ ret <4 x i32> %3
+}
diff --git a/test/CodeGen/ARM/a15.ll b/test/CodeGen/ARM/a15.ll
new file mode 100644
index 0000000000..6f816c1c2c
--- /dev/null
+++ b/test/CodeGen/ARM/a15.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -mcpu=cortex-a15 | FileCheck %s
+
+; CHECK: a
+define i32 @a(i32 %x) {
+ ret i32 %x
+}
diff --git a/test/CodeGen/ARM/coalesce-subregs.ll b/test/CodeGen/ARM/coalesce-subregs.ll
index fb0f4c67c9..8d7ded5be0 100644
--- a/test/CodeGen/ARM/coalesce-subregs.ll
+++ b/test/CodeGen/ARM/coalesce-subregs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=cortex-a9 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a9 -new-coalescer | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
target triple = "thumbv7-apple-ios0.0.0"
@@ -66,3 +66,126 @@ do.end: ; preds = %do.body
declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32) nounwind readonly
declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind
+
+; CHECK: f3
+; This function has lane insertions that span basic blocks.
+; The trivial REG_SEQUENCE lowering can't handle that, but the coalescer can.
+;
+; void f3(float *p, float *q) {
+; float32x2_t x;
+; x[1] = p[3];
+; if (q)
+; x[0] = q[0] + q[1];
+; else
+; x[0] = p[2];
+; vst1_f32(p+4, x);
+; }
+;
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+define void @f3(float* %p, float* %q) nounwind ssp {
+entry:
+ %arrayidx = getelementptr inbounds float* %p, i32 3
+ %0 = load float* %arrayidx, align 4
+ %vecins = insertelement <2 x float> undef, float %0, i32 1
+ %tobool = icmp eq float* %q, null
+ br i1 %tobool, label %if.else, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load float* %q, align 4
+ %arrayidx2 = getelementptr inbounds float* %q, i32 1
+ %2 = load float* %arrayidx2, align 4
+ %add = fadd float %1, %2
+ %vecins3 = insertelement <2 x float> %vecins, float %add, i32 0
+ br label %if.end
+
+if.else: ; preds = %entry
+ %arrayidx4 = getelementptr inbounds float* %p, i32 2
+ %3 = load float* %arrayidx4, align 4
+ %vecins5 = insertelement <2 x float> %vecins, float %3, i32 0
+ br label %if.end
+
+if.end: ; preds = %if.else, %if.then
+ %x.0 = phi <2 x float> [ %vecins3, %if.then ], [ %vecins5, %if.else ]
+ %add.ptr = getelementptr inbounds float* %p, i32 4
+ %4 = bitcast float* %add.ptr to i8*
+ tail call void @llvm.arm.neon.vst1.v2f32(i8* %4, <2 x float> %x.0, i32 4)
+ ret void
+}
+
+declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind
+declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32) nounwind readonly
+
+; CHECK: f4
+; This function inserts a lane into a fully defined vector.
+; The destination lane isn't read, so the subregs can coalesce.
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+define void @f4(float* %p, float* %q) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %0, i32 4)
+ %tobool = icmp eq float* %q, null
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %1 = load float* %q, align 4
+ %arrayidx1 = getelementptr inbounds float* %q, i32 1
+ %2 = load float* %arrayidx1, align 4
+ %add = fadd float %1, %2
+ %vecins = insertelement <2 x float> %vld1, float %add, i32 1
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %x.0 = phi <2 x float> [ %vecins, %if.then ], [ %vld1, %entry ]
+ tail call void @llvm.arm.neon.vst1.v2f32(i8* %0, <2 x float> %x.0, i32 4)
+ ret void
+}
+
+; CHECK: f5
+; Coalesce vector lanes through phis.
+; CHECK: vmov.f32 {{.*}}, #1.0
+; CHECK-NOT: vmov
+; CHECK-NOT: vorr
+; CHECK: %if.end
+; We may leave the last insertelement in the if.end block.
+; It is inserting the %add value into a dead lane, but %add causes interference
+; in the entry block, and we don't do dead lane checks across basic blocks.
+define void @f5(float* %p, float* %q) nounwind ssp {
+entry:
+ %0 = bitcast float* %p to i8*
+ %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %0, i32 4)
+ %vecext = extractelement <4 x float> %vld1, i32 0
+ %vecext1 = extractelement <4 x float> %vld1, i32 1
+ %vecext2 = extractelement <4 x float> %vld1, i32 2
+ %vecext3 = extractelement <4 x float> %vld1, i32 3
+ %add = fadd float %vecext3, 1.000000e+00
+ %tobool = icmp eq float* %q, null
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %arrayidx = getelementptr inbounds float* %q, i32 1
+ %1 = load float* %arrayidx, align 4
+ %add4 = fadd float %vecext, %1
+ %2 = load float* %q, align 4
+ %add6 = fadd float %vecext1, %2
+ %arrayidx7 = getelementptr inbounds float* %q, i32 2
+ %3 = load float* %arrayidx7, align 4
+ %add8 = fadd float %vecext2, %3
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %a.0 = phi float [ %add4, %if.then ], [ %vecext, %entry ]
+ %b.0 = phi float [ %add6, %if.then ], [ %vecext1, %entry ]
+ %c.0 = phi float [ %add8, %if.then ], [ %vecext2, %entry ]
+ %vecinit = insertelement <4 x float> undef, float %a.0, i32 0
+ %vecinit9 = insertelement <4 x float> %vecinit, float %b.0, i32 1
+ %vecinit10 = insertelement <4 x float> %vecinit9, float %c.0, i32 2
+ %vecinit11 = insertelement <4 x float> %vecinit10, float %add, i32 3
+ tail call void @llvm.arm.neon.vst1.v4f32(i8* %0, <4 x float> %vecinit11, i32 4)
+ ret void
+}
+
+declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind
diff --git a/test/CodeGen/ARM/constants.ll b/test/CodeGen/ARM/constants.ll
index 0ac8b48d39..3baa103e3d 100644
--- a/test/CodeGen/ARM/constants.ll
+++ b/test/CodeGen/ARM/constants.ll
@@ -45,6 +45,16 @@ r:
ret void
}
+define i32 @f8() nounwind {
+; Check that constant propagation through (i32)-1 => (float)NaN => (i32)-1
+; gives the expected result.
+; CHECK: f8
+; CHECK: mvn r0, #0
+ %tmp0 = bitcast i32 -1 to float
+ %tmp1 = bitcast float %tmp0 to i32
+ ret i32 %tmp1
+}
+
%t1 = type { <3 x float>, <3 x float> }
@const1 = global %t1 { <3 x float> zeroinitializer,
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
index e19185b0eb..18e169357b 100644
--- a/test/CodeGen/ARM/domain-conv-vmovs.ll
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -79,6 +79,22 @@ define float @test_ineligible(float, float %in) {
; internal fault).
call void @bar()
; CHECK: bl bar
-; CHECK: vmov.f32 {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK: vext.32
+; CHECK: vext.32
ret float %val
-} \ No newline at end of file
+}
+
+define i32 @test_vmovs_no_sreg(i32 %in) {
+; CHECK: test_vmovs_no_sreg:
+
+ ; Check that the movement to and from GPRs takes place in the NEON domain.
+; CHECK: vmov.32 d
+ %x = bitcast i32 %in to float
+
+ %res = fadd float %x, %x
+
+; CHECK: vmov.32 r{{[0-9]+}}, d
+ %resi = bitcast float %res to i32
+
+ ret i32 %resi
+}
diff --git a/test/CodeGen/ARM/fast-isel.ll b/test/CodeGen/ARM/fast-isel.ll
index ecd5fe27a4..41fda41326 100644
--- a/test/CodeGen/ARM/fast-isel.ll
+++ b/test/CodeGen/ARM/fast-isel.ll
@@ -1,5 +1,7 @@
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM
; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARM-STRICT-ALIGN
+; RUN: llc < %s -O0 -arm-strict-align -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB-STRICT-ALIGN
; Very basic fast-isel functionality.
define i32 @add(i32 %a, i32 %b) nounwind {
@@ -238,3 +240,67 @@ entry:
}
declare void @llvm.trap() nounwind
+
+define void @unaligned_i16_store(i16 %x, i16* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+ store i16 %x, i16* %y, align 1
+ ret void
+}
+
+define i16 @unaligned_i16_load(i16* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i16_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i16_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+ %0 = load i16* %x, align 1
+ ret i16 %0
+}
+
+define void @unaligned_i32_store(i32 %x, i32* %y) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_store
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+; ARM-STRICT-ALIGN: strb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_store
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+; THUMB-STRICT-ALIGN: strb
+
+ store i32 %x, i32* %y, align 1
+ ret void
+}
+
+define i32 @unaligned_i32_load(i32* %x) nounwind {
+entry:
+; ARM-STRICT-ALIGN: @unaligned_i32_load
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+; ARM-STRICT-ALIGN: ldrb
+
+; THUMB-STRICT-ALIGN: @unaligned_i32_load
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+; THUMB-STRICT-ALIGN: ldrb
+
+ %0 = load i32* %x, align 1
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM/neon_ld2.ll b/test/CodeGen/ARM/neon_ld2.ll
index 944bfe0602..630db93035 100644
--- a/test/CodeGen/ARM/neon_ld2.ll
+++ b/test/CodeGen/ARM/neon_ld2.ll
@@ -1,10 +1,10 @@
; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
; CHECK: t1
-; CHECK: vldmia
-; CHECK: vldmia
+; CHECK: vld1.64
+; CHECK: vld1.64
; CHECK: vadd.i64 q
-; CHECK: vstmia
+; CHECK: vst1.64
define void @t1(<4 x i32>* %r, <2 x i64>* %a, <2 x i64>* %b) nounwind {
entry:
%0 = load <2 x i64>* %a, align 16 ; <<2 x i64>> [#uses=1]
@@ -16,8 +16,8 @@ entry:
}
; CHECK: t2
-; CHECK: vldmia
-; CHECK: vldmia
+; CHECK: vld1.64
+; CHECK: vld1.64
; CHECK: vsub.i64 q
; CHECK: vmov r0, r1, d
; CHECK: vmov r2, r3, d
diff --git a/test/CodeGen/ARM/reg_sequence.ll b/test/CodeGen/ARM/reg_sequence.ll
index 05794e4ebd..206b96cd07 100644
--- a/test/CodeGen/ARM/reg_sequence.ll
+++ b/test/CodeGen/ARM/reg_sequence.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=arm -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mcpu=cortex-a8 -regalloc=basic | FileCheck %s
; Implementing vld / vst as REG_SEQUENCE eliminates the extra vmov's.
%struct.int16x8_t = type { <8 x i16> }
@@ -137,7 +137,7 @@ return2:
define <8 x i16> @t5(i16* %A, <8 x i16>* %B) nounwind {
; CHECK: t5:
-; CHECK: vldmia
+; CHECK: vld1.32
; How can FileCheck match Q and D registers? We need a lisp interpreter.
; CHECK: vorr {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}}
; CHECK-NOT: vmov
@@ -243,8 +243,8 @@ define arm_aapcs_vfpcc float @t9(%0* nocapture, %3* nocapture) nounwind {
; CHECK: vldr
; CHECK-NOT: vmov d{{.*}}, d16
; CHECK: vmov.i32 d17
-; CHECK-NEXT: vstmia r0, {d16, d17}
-; CHECK-NEXT: vstmia r0, {d16, d17}
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128]
+; CHECK-NEXT: vst1.64 {d16, d17}, [r0, :128]
%3 = bitcast double 0.000000e+00 to <2 x float> ; <<2 x float>> [#uses=2]
%4 = shufflevector <2 x float> %3, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; <<4 x float>> [#uses=1]
store <4 x float> %4, <4 x float>* undef, align 16
diff --git a/test/CodeGen/ARM/sub.ll b/test/CodeGen/ARM/sub.ll
index 474043afc1..7f82ca7012 100644
--- a/test/CodeGen/ARM/sub.ll
+++ b/test/CodeGen/ARM/sub.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=arm < %s | FileCheck %s
+; RUN: llc -march=arm -mcpu=cortex-a8 < %s | FileCheck %s
; 171 = 0x000000ab
define i64 @f1(i64 %a) {
diff --git a/test/CodeGen/ARM/twoaddrinstr.ll b/test/CodeGen/ARM/twoaddrinstr.ll
index 4e227dd5be..fc2aa1e568 100644
--- a/test/CodeGen/ARM/twoaddrinstr.ll
+++ b/test/CodeGen/ARM/twoaddrinstr.ll
@@ -4,18 +4,18 @@
define void @PR13378() nounwind {
; This was originally a crasher when trying to schedule the instructions.
; CHECK: PR13378:
-; CHECK: vldmia
+; CHECK: vld1.32
+; CHECK-NEXT: vst1.32
+; CHECK-NEXT: vst1.32
; CHECK-NEXT: vmov.f32
-; CHECK-NEXT: vstmia
-; CHECK-NEXT: vstmia
; CHECK-NEXT: vmov.f32
-; CHECK-NEXT: vstmia
+; CHECK-NEXT: vst1.32
entry:
- %0 = load <4 x float>* undef
- store <4 x float> zeroinitializer, <4 x float>* undef
- store <4 x float> %0, <4 x float>* undef
+ %0 = load <4 x float>* undef, align 4
+ store <4 x float> zeroinitializer, <4 x float>* undef, align 4
+ store <4 x float> %0, <4 x float>* undef, align 4
%1 = insertelement <4 x float> %0, float 1.000000e+00, i32 3
- store <4 x float> %1, <4 x float>* undef
+ store <4 x float> %1, <4 x float>* undef, align 4
unreachable
}
diff --git a/test/CodeGen/ARM/unaligned_load_store.ll b/test/CodeGen/ARM/unaligned_load_store.ll
index 869b92675d..3064202eb3 100644
--- a/test/CodeGen/ARM/unaligned_load_store.ll
+++ b/test/CodeGen/ARM/unaligned_load_store.ll
@@ -1,5 +1,5 @@
; RUN: llc < %s -march=arm -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
-; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
+; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 -mattr=-neon -arm-strict-align -pre-RA-sched=source | FileCheck %s -check-prefix=EXPANDED
; RUN: llc < %s -mtriple=armv6-apple-darwin -mcpu=cortex-a8 | FileCheck %s -check-prefix=UNALIGNED
; rdar://7113725
@@ -59,3 +59,19 @@ entry:
store double %tmp, double* %b, align 1
ret void
}
+
+define void @byte_word_ops(i32* %a, i32* %b) nounwind {
+entry:
+; EXPANDED: byte_word_ops:
+; EXPANDED: ldrb
+; EXPANDED: strb
+
+; UNALIGNED: byte_word_ops:
+; UNALIGNED-NOT: ldrb
+; UNALIGNED: ldr
+; UNALIGNED-NOT: strb
+; UNALIGNED: str
+ %tmp = load i32* %a, align 1
+ store i32 %tmp, i32* %b, align 1
+ ret void
+}
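+
+; Minimal sketch of the expansion under test (illustrative only; the pointer
+; and value names below are hypothetical, and the recombination is shown for
+; just two of the four bytes):
+;   %b0 = load i8* %p, align 1            ; separate byte loads
+;   %b1 = load i8* %p1, align 1           ; %p1 = %p + 1
+;   %z0 = zext i8 %b0 to i32
+;   %z1 = zext i8 %b1 to i32
+;   %s1 = shl i32 %z1, 8                  ; shift each byte into place
+;   %w  = or i32 %z0, %s1                 ; and OR into a full word (LE)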
diff --git a/test/CodeGen/ARM/unaligned_load_store_vector.ll b/test/CodeGen/ARM/unaligned_load_store_vector.ll
new file mode 100644
index 0000000000..25ae651793
--- /dev/null
+++ b/test/CodeGen/ARM/unaligned_load_store_vector.ll
@@ -0,0 +1,487 @@
+;RUN: llc < %s -march=arm -mattr=+v7 -mattr=+neon | FileCheck %s
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.8
+ %v1 = load <8 x i8>* %vi, align 1
+;CHECK: vst1.8
+ store <8 x i8> %v1, <8 x i8>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.8
+ %v1 = load <4 x i16>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x i16> %v1, <4 x i16>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.8
+ %v1 = load <2 x i32>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x i32> %v1, <2 x i32>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.8
+ %v1 = load <2 x float>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x float> %v1, <2 x float>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.8
+ %v1 = load <16 x i8>* %vi, align 1
+;CHECK: vst1.8
+ store <16 x i8> %v1, <16 x i8>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.8
+ %v1 = load <8 x i16>* %vi, align 1
+;CHECK: vst1.8
+ store <8 x i16> %v1, <8 x i16>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.8
+ %v1 = load <4 x i32>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x i32> %v1, <4 x i32>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.8
+ %v1 = load <2 x i64>* %vi, align 1
+;CHECK: vst1.8
+ store <2 x i64> %v1, <2 x i64>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 1
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_1(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_1:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.8
+ %v1 = load <4 x float>* %vi, align 1
+;CHECK: vst1.8
+ store <4 x float> %v1, <4 x float>* %vo, align 1
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vld1.16
+ %v1 = load <8 x i8>* %vi, align 2
+;CHECK: vst1.16
+ store <8 x i8> %v1, <8 x i8>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vld1.16
+ %v1 = load <4 x i16>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x i16> %v1, <4 x i16>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vld1.16
+ %v1 = load <2 x i32>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x i32> %v1, <2 x i32>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vld1.16
+ %v1 = load <2 x float>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x float> %v1, <2 x float>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.16
+ %v1 = load <16 x i8>* %vi, align 2
+;CHECK: vst1.16
+ store <16 x i8> %v1, <16 x i8>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.16
+ %v1 = load <8 x i16>* %vi, align 2
+;CHECK: vst1.16
+ store <8 x i16> %v1, <8 x i16>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.16
+ %v1 = load <4 x i32>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x i32> %v1, <4 x i32>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.16
+ %v1 = load <2 x i64>* %vi, align 2
+;CHECK: vst1.16
+ store <2 x i64> %v1, <2 x i64>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 2
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_2(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_2:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.16
+ %v1 = load <4 x float>* %vi, align 2
+;CHECK: vst1.16
+ store <4 x float> %v1, <4 x float>* %vo, align 2
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <8 x i8>
+define void @v64_v8i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v8i8_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i8>*
+ %vo = bitcast i8* %po to <8 x i8>*
+;CHECK: vldr
+ %v1 = load <8 x i8>* %vi, align 4
+;CHECK: vstr
+ store <8 x i8> %v1, <8 x i8>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <4 x i16>
+define void @v64_v4i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v4i16_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i16>*
+ %vo = bitcast i8* %po to <4 x i16>*
+;CHECK: vldr
+ %v1 = load <4 x i16>* %vi, align 4
+;CHECK: vstr
+ store <4 x i16> %v1, <4 x i16>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <2 x i32>
+define void @v64_v2i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2i32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i32>*
+ %vo = bitcast i8* %po to <2 x i32>*
+;CHECK: vldr
+ %v1 = load <2 x i32>* %vi, align 4
+;CHECK: vstr
+ store <2 x i32> %v1, <2 x i32>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 64
+;TYPE = <2 x float>
+define void @v64_v2f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v64_v2f32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x float>*
+ %vo = bitcast i8* %po to <2 x float>*
+;CHECK: vldr
+ %v1 = load <2 x float>* %vi, align 4
+;CHECK: vstr
+ store <2 x float> %v1, <2 x float>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <16 x i8>
+define void @v128_v16i8_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v16i8_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <16 x i8>*
+ %vo = bitcast i8* %po to <16 x i8>*
+;CHECK: vld1.32
+ %v1 = load <16 x i8>* %vi, align 4
+;CHECK: vst1.32
+ store <16 x i8> %v1, <16 x i8>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <8 x i16>
+define void @v128_v8i16_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v8i16_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <8 x i16>*
+ %vo = bitcast i8* %po to <8 x i16>*
+;CHECK: vld1.32
+ %v1 = load <8 x i16>* %vi, align 4
+;CHECK: vst1.32
+ store <8 x i16> %v1, <8 x i16>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <4 x i32>
+define void @v128_v4i32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4i32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x i32>*
+ %vo = bitcast i8* %po to <4 x i32>*
+;CHECK: vld1.32
+ %v1 = load <4 x i32>* %vi, align 4
+;CHECK: vst1.32
+ store <4 x i32> %v1, <4 x i32>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <2 x i64>
+define void @v128_v2i64_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v2i64_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <2 x i64>*
+ %vo = bitcast i8* %po to <2 x i64>*
+;CHECK: vld1.32
+ %v1 = load <2 x i64>* %vi, align 4
+;CHECK: vst1.32
+ store <2 x i64> %v1, <2 x i64>* %vo, align 4
+ ret void
+}
+
+
+;ALIGN = 4
+;SIZE = 128
+;TYPE = <4 x float>
+define void @v128_v4f32_4(i8* noalias nocapture %out, i8* noalias nocapture %in) nounwind {
+;CHECK: v128_v4f32_4:
+entry:
+ %po = getelementptr i8* %out, i32 0
+ %pi = getelementptr i8* %in, i32 0
+ %vi = bitcast i8* %pi to <4 x float>*
+ %vo = bitcast i8* %po to <4 x float>*
+;CHECK: vld1.32
+ %v1 = load <4 x float>* %vi, align 4
+;CHECK: vst1.32
+ store <4 x float> %v1, <4 x float>* %vo, align 4
+ ret void
+}
+
diff --git a/test/CodeGen/ARM/vbsl-constant.ll b/test/CodeGen/ARM/vbsl-constant.ll
index f157dbdb97..ffda0a51bd 100644
--- a/test/CodeGen/ARM/vbsl-constant.ll
+++ b/test/CodeGen/ARM/vbsl-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s
+; RUN: llc < %s -mtriple=arm-apple-ios -mattr=+neon | FileCheck %s
define <8 x i8> @v_bsli8(<8 x i8>* %A, <8 x i8>* %B, <8 x i8>* %C) nounwind {
;CHECK: v_bsli8:
@@ -59,8 +59,8 @@ define <1 x i64> @v_bsli64(<1 x i64>* %A, <1 x i64>* %B, <1 x i64>* %C) nounwind
define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind {
;CHECK: v_bslQi8:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
;CHECK: vbsl
%tmp1 = load <16 x i8>* %A
%tmp2 = load <16 x i8>* %B
@@ -73,8 +73,8 @@ define <16 x i8> @v_bslQi8(<16 x i8>* %A, <16 x i8>* %B, <16 x i8>* %C) nounwind
define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwind {
;CHECK: v_bslQi16:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
;CHECK: vbsl
%tmp1 = load <8 x i16>* %A
%tmp2 = load <8 x i16>* %B
@@ -87,8 +87,8 @@ define <8 x i16> @v_bslQi16(<8 x i16>* %A, <8 x i16>* %B, <8 x i16>* %C) nounwin
define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwind {
;CHECK: v_bslQi32:
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
;CHECK: vbsl
%tmp1 = load <4 x i32>* %A
%tmp2 = load <4 x i32>* %B
@@ -101,9 +101,9 @@ define <4 x i32> @v_bslQi32(<4 x i32>* %A, <4 x i32>* %B, <4 x i32>* %C) nounwin
define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwind {
;CHECK: v_bslQi64:
-;CHECK: vldmia
-;CHECK: vldmia
-;CHECK: vldmia
+;CHECK: vld1.32
+;CHECK: vld1.32
+;CHECK: vld1.64
;CHECK: vbsl
%tmp1 = load <2 x i64>* %A
%tmp2 = load <2 x i64>* %B
diff --git a/test/CodeGen/ARM/vbsl.ll b/test/CodeGen/ARM/vbsl.ll
index 9f3bb4e103..8ca2fd26b6 100644
--- a/test/CodeGen/ARM/vbsl.ll
+++ b/test/CodeGen/ARM/vbsl.ll
@@ -103,3 +103,52 @@ define <2 x i64> @v_bslQi64(<2 x i64>* %A, <2 x i64>* %B, <2 x i64>* %C) nounwin
%tmp7 = or <2 x i64> %tmp4, %tmp6
ret <2 x i64> %tmp7
}
+
+define <8 x i8> @f1(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind readnone optsize ssp {
+; CHECK: f1:
+; CHECK: vbsl
+ %vbsl.i = tail call <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c) nounwind
+ ret <8 x i8> %vbsl.i
+}
+
+define <4 x i16> @f2(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind readnone optsize ssp {
+; CHECK: f2:
+; CHECK: vbsl
+ %vbsl3.i = tail call <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16> %a, <4 x i16> %b, <4 x i16> %c) nounwind
+ ret <4 x i16> %vbsl3.i
+}
+
+define <2 x i32> @f3(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind readnone optsize ssp {
+; CHECK: f3:
+; CHECK: vbsl
+ %vbsl3.i = tail call <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32> %a, <2 x i32> %b, <2 x i32> %c) nounwind
+ ret <2 x i32> %vbsl3.i
+}
+
+define <16 x i8> @g1(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind readnone optsize ssp {
+; CHECK: g1:
+; CHECK: vbsl
+ %vbsl.i = tail call <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c) nounwind
+ ret <16 x i8> %vbsl.i
+}
+
+define <8 x i16> @g2(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind readnone optsize ssp {
+; CHECK: g2:
+; CHECK: vbsl
+ %vbsl3.i = tail call <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c) nounwind
+ ret <8 x i16> %vbsl3.i
+}
+
+define <4 x i32> @g3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind readnone optsize ssp {
+; CHECK: g3:
+; CHECK: vbsl
+ %vbsl3.i = tail call <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) nounwind
+ ret <4 x i32> %vbsl3.i
+}
+
+declare <4 x i32> @llvm.arm.neon.vbsl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.arm.neon.vbsl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
+declare <16 x i8> @llvm.arm.neon.vbsl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
+declare <2 x i32> @llvm.arm.neon.vbsl.v2i32(<2 x i32>, <2 x i32>, <2 x i32>) nounwind readnone
+declare <4 x i16> @llvm.arm.neon.vbsl.v4i16(<4 x i16>, <4 x i16>, <4 x i16>) nounwind readnone
+declare <8 x i8> @llvm.arm.neon.vbsl.v8i8(<8 x i8>, <8 x i8>, <8 x i8>) nounwind readnone
diff --git a/test/CodeGen/Mips/ul1.ll b/test/CodeGen/Mips/ul1.ll
new file mode 100644
index 0000000000..7e64ff4d90
--- /dev/null
+++ b/test/CodeGen/Mips/ul1.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16
+%struct.ua = type <{ i16, i32 }>
+
+@foo = common global %struct.ua zeroinitializer, align 1
+
+define i32 @main() nounwind {
+entry:
+ store i32 10, i32* getelementptr inbounds (%struct.ua* @foo, i32 0, i32 1), align 1
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+; 16: sb ${{[0-9]+}}, {{[0-9]+}}(${{[0-9]+}})
+ ret i32 0
+}
+
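+; Illustrative note (an assumption about the mips16 lowering, not a checked
+; pattern): the align-1 store of the i32 is split into four single-byte sb
+; instructions, one per byte of the word, instead of one unaligned sw.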
diff --git a/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll
new file mode 100644
index 0000000000..9d2e390c1c
--- /dev/null
+++ b/test/CodeGen/PowerPC/2012-09-16-TOC-entry-check.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; This test checks that the TOC entry symbol names won't clash with the
+; global .LC0 and .LC2 symbols defined in the module.
+
+@.LC0 = internal global [5 x i8] c".LC0\00"
+@.LC2 = internal global [5 x i8] c".LC2\00"
+
+define i32 @foo(double %X, double %Y) nounwind readnone {
+ ; The 1.0 and 3.0 constants generate two TOC entries
+ %cmp = fcmp oeq double %X, 1.000000e+00
+ %conv = zext i1 %cmp to i32
+ %cmp1 = fcmp oeq double %Y, 3.000000e+00
+ %conv2 = zext i1 %cmp1 to i32
+ %add = add nsw i32 %conv2, %conv
+ ret i32 %add
+}
+
+; Check the creation of two .tc entries for the two double constants. They
+; should be .LC1 and .LC3 to avoid a name clash with the global constants
+; .LC0 and .LC2.
+; CHECK: .LC{{[13]}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK: .LC{{[13]}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
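+
+; For illustration only (the operand symbol names are hypothetical; the
+; assembler generates them): a matching pair of lines could look like
+;   .LC1:
+;           .tc .LCPI0_0[TC],.LCPI0_0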
diff --git a/test/CodeGen/PowerPC/bl8_elf_nop.ll b/test/CodeGen/PowerPC/bl8_elf_nop.ll
deleted file mode 100644
index 386c59e322..0000000000
--- a/test/CodeGen/PowerPC/bl8_elf_nop.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-declare i32 @clock() nounwind
-
-define i32 @func() {
-entry:
- %call = call i32 @clock() nounwind
- %call2 = add i32 %call, 7
- ret i32 %call2
-}
-
-; CHECK: bl clock
-; CHECK-NEXT: nop
-
diff --git a/test/CodeGen/PowerPC/crsave.ll b/test/CodeGen/PowerPC/crsave.ll
new file mode 100644
index 0000000000..3e98dbd254
--- /dev/null
+++ b/test/CodeGen/PowerPC/crsave.ll
@@ -0,0 +1,49 @@
+; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC32
+; RUN: llc -O0 -disable-fp-elim -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s -check-prefix=PPC64
+
+declare void @foo()
+
+define i32 @test_cr2() nounwind {
+entry:
+ %ret = alloca i32, align 4
+ %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09mfcr $0", "=r,r,r,r,r,~{cr2}"(i32 1, i32 2, i32 3, i32 0) nounwind
+ store i32 %0, i32* %ret, align 4
+ call void @foo()
+ %1 = load i32* %ret, align 4
+ ret i32 %1
+}
+
+; PPC32: mfcr 12
+; PPC32-NEXT: stw 12, {{[0-9]+}}(31)
+; PPC32: lwz 12, {{[0-9]+}}(31)
+; PPC32-NEXT: mtcrf 32, 12
+
+; PPC64: mfcr 12
+; PPC64-NEXT: stw 12, 8(1)
+; PPC64: lwz 12, 8(1)
+; PPC64-NEXT: mtcrf 32, 12
+
+define i32 @test_cr234() nounwind {
+entry:
+ %ret = alloca i32, align 4
+ %0 = call i32 asm sideeffect "\0A\09mtcr $4\0A\09cmp 2,$2,$1\0A\09cmp 3,$2,$2\0A\09cmp 4,$2,$3\0A\09mfcr $0", "=r,r,r,r,r,~{cr2},~{cr3},~{cr4}"(i32 1, i32 2, i32 3, i32 0) nounwind
+ store i32 %0, i32* %ret, align 4
+ call void @foo()
+ %1 = load i32* %ret, align 4
+ ret i32 %1
+}
+
+; PPC32: mfcr 12
+; PPC32-NEXT: stw 12, {{[0-9]+}}(31)
+; PPC32: lwz 12, {{[0-9]+}}(31)
+; PPC32-NEXT: mtcrf 32, 12
+; PPC32-NEXT: mtcrf 16, 12
+; PPC32-NEXT: mtcrf 8, 12
+
+; PPC64: mfcr 12
+; PPC64-NEXT: stw 12, 8(1)
+; PPC64: lwz 12, 8(1)
+; PPC64-NEXT: mtcrf 32, 12
+; PPC64-NEXT: mtcrf 16, 12
+; PPC64-NEXT: mtcrf 8, 12
+
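+; Note on the mtcrf masks (PowerPC architectural detail): the 8-bit CRM field
+; selects CR fields from the left, so mask 128 names cr0, 64 cr1, 32 cr2,
+; 16 cr3, and 8 cr4. The 32/16/8 masks above therefore restore exactly the
+; cr2-cr4 fields clobbered by the inline asm.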
diff --git a/test/CodeGen/PowerPC/ppc64-calls.ll b/test/CodeGen/PowerPC/ppc64-calls.ll
new file mode 100644
index 0000000000..c382edbbce
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-calls.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -march=ppc64 | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind readnone noinline {
+ ret void
+}
+
+define weak void @foo_weak() nounwind {
+ ret void
+}
+
+; Calls to a local function do not require the TOC restore 'nop'
+define void @test_direct() nounwind readnone {
+; CHECK: test_direct:
+ tail call void @foo() nounwind
+; CHECK: bl foo
+; CHECK-NOT: nop
+ ret void
+}
+
+; Calls to a weak function require a TOC restore 'nop' because the callee
+; may be overridden in a different module.
+define void @test_weak() nounwind readnone {
+; CHECK: test_weak:
+ tail call void @foo_weak() nounwind
+; CHECK: bl foo
+; CHECK-NEXT: nop
+ ret void
+}
+
+; Indirect calls require full stub creation
+define void @test_indirect(void ()* nocapture %fp) nounwind {
+; CHECK: test_indirect:
+ tail call void %fp() nounwind
+; CHECK: ld [[FP:[0-9]+]], 0(3)
+; CHECK: ld 11, 16(3)
+; CHECK: ld 2, 8(3)
+; CHECK-NEXT: mtctr [[FP]]
+; CHECK-NEXT: bctrl
+; CHECK-NEXT: ld 2, 40(1)
+ ret void
+}
+
+; Absolute values should have the TOC restore 'nop'
+define void @test_abs() nounwind {
+; CHECK: test_abs:
+ tail call void inttoptr (i64 1024 to void ()*)() nounwind
+; CHECK: bla 1024
+; CHECK-NEXT: nop
+ ret void
+}
+
+declare double @sin(double) nounwind
+
+; Calls to external functions should also have a 'nop'
+define double @test_external(double %x) nounwind {
+; CHECK: test_external:
+ %call = tail call double @sin(double %x) nounwind
+; CHECK: bl sin
+; CHECK-NEXT: nop
+ ret double %call
+}
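+
+; Background for test_indirect (a sketch of the assumed ELF function
+; descriptor layout, for illustration only): the pointer names a
+; three-doubleword descriptor, and the emitted sequence loads
+;   0(3)  - the code entry point (moved into CTR)
+;   8(3)  - the callee's TOC pointer (into r2)
+;   16(3) - the environment pointer (into r11)
+; after which the caller's own TOC is reloaded from its stack slot, 40(1).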
diff --git a/test/CodeGen/PowerPC/ppc64-ind-call.ll b/test/CodeGen/PowerPC/ppc64-ind-call.ll
deleted file mode 100644
index d5c4d468c6..0000000000
--- a/test/CodeGen/PowerPC/ppc64-ind-call.ll
+++ /dev/null
@@ -1,16 +0,0 @@
-; RUN: llc < %s -march=ppc64 | FileCheck %s
-target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
-target triple = "powerpc64-unknown-linux-gnu"
-
-define void @test1() {
-entry:
- %call.i75 = call zeroext i8 undef(i8* undef, i8 zeroext 10)
- unreachable
-}
-
-; CHECK: @test1
-; CHECK: ld 11, 0(3)
-; CHECK: ld 2, 8(3)
-; CHECK: bctrl
-; CHECK: ld 2, 40(1)
-
diff --git a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
index e5aa1f169f..e1d50bac51 100644
--- a/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
+++ b/test/CodeGen/PowerPC/ppc64-linux-func-size.ll
@@ -5,6 +5,7 @@
; CHECK-NEXT: .align 3
; CHECK-NEXT: .quad .L.test1
; CHECK-NEXT: .quad .TOC.@tocbase
+; CHECK-NEXT: .quad 0
; CHECK-NEXT: .text
; CHECK-NEXT: .L.test1:
diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll
index f1326ba992..b992ab1ad3 100644
--- a/test/CodeGen/PowerPC/ppc64-toc.ll
+++ b/test/CodeGen/PowerPC/ppc64-toc.ll
@@ -12,6 +12,7 @@ entry:
; CHECK-NEXT: .align 3
; CHECK-NEXT: .quad .L.access_int64
; CHECK-NEXT: .quad .TOC.@tocbase
+; CHECK-NEXT: .quad 0
; CHECK-NEXT: .text
%0 = load i64* @number64, align 8
; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
new file mode 100644
index 0000000000..ffd228bc81
--- /dev/null
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -0,0 +1,205 @@
+; RUN: llc -mcpu=pwr7 -O0 -disable-fp-elim < %s | FileCheck %s
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+%struct.s1 = type { i8 }
+%struct.s2 = type { i16 }
+%struct.s4 = type { i32 }
+%struct.t1 = type { i8 }
+%struct.t3 = type <{ i16, i8 }>
+%struct.t5 = type <{ i32, i8 }>
+%struct.t6 = type <{ i32, i16 }>
+%struct.t7 = type <{ i32, i16, i8 }>
+%struct.s3 = type { i16, i8 }
+%struct.s5 = type { i32, i8 }
+%struct.s6 = type { i32, i16 }
+%struct.s7 = type { i32, i16, i8 }
+%struct.t2 = type <{ i16 }>
+%struct.t4 = type <{ i32 }>
+
+@caller1.p1 = private unnamed_addr constant %struct.s1 { i8 1 }, align 1
+@caller1.p2 = private unnamed_addr constant %struct.s2 { i16 2 }, align 2
+@caller1.p3 = private unnamed_addr constant { i16, i8, i8 } { i16 4, i8 8, i8 undef }, align 2
+@caller1.p4 = private unnamed_addr constant %struct.s4 { i32 16 }, align 4
+@caller1.p5 = private unnamed_addr constant { i32, i8, [3 x i8] } { i32 32, i8 64, [3 x i8] undef }, align 4
+@caller1.p6 = private unnamed_addr constant { i32, i16, [2 x i8] } { i32 128, i16 256, [2 x i8] undef }, align 4
+@caller1.p7 = private unnamed_addr constant { i32, i16, i8, i8 } { i32 512, i16 1024, i8 -3, i8 undef }, align 4
+@caller2.p1 = private unnamed_addr constant %struct.t1 { i8 1 }, align 1
+@caller2.p2 = private unnamed_addr constant { i16 } { i16 2 }, align 1
+@caller2.p3 = private unnamed_addr constant %struct.t3 <{ i16 4, i8 8 }>, align 1
+@caller2.p4 = private unnamed_addr constant { i32 } { i32 16 }, align 1
+@caller2.p5 = private unnamed_addr constant %struct.t5 <{ i32 32, i8 64 }>, align 1
+@caller2.p6 = private unnamed_addr constant %struct.t6 <{ i32 128, i16 256 }>, align 1
+@caller2.p7 = private unnamed_addr constant %struct.t7 <{ i32 512, i16 1024, i8 -3 }>, align 1
+
+define i32 @caller1() nounwind {
+entry:
+ %p1 = alloca %struct.s1, align 1
+ %p2 = alloca %struct.s2, align 2
+ %p3 = alloca %struct.s3, align 2
+ %p4 = alloca %struct.s4, align 4
+ %p5 = alloca %struct.s5, align 4
+ %p6 = alloca %struct.s6, align 4
+ %p7 = alloca %struct.s7, align 4
+ %0 = bitcast %struct.s1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.s1* @caller1.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.s2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast (%struct.s2* @caller1.p2 to i8*), i64 2, i32 2, i1 false)
+ %2 = bitcast %struct.s3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast ({ i16, i8, i8 }* @caller1.p3 to i8*), i64 4, i32 2, i1 false)
+ %3 = bitcast %struct.s4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast (%struct.s4* @caller1.p4 to i8*), i64 4, i32 4, i1 false)
+ %4 = bitcast %struct.s5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast ({ i32, i8, [3 x i8] }* @caller1.p5 to i8*), i64 8, i32 4, i1 false)
+ %5 = bitcast %struct.s6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast ({ i32, i16, [2 x i8] }* @caller1.p6 to i8*), i64 8, i32 4, i1 false)
+ %6 = bitcast %struct.s7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast ({ i32, i16, i8, i8 }* @caller1.p7 to i8*), i64 8, i32 4, i1 false)
+ %call = call i32 @callee1(%struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
+ ret i32 %call
+
+; CHECK: ld 9, 128(31)
+; CHECK: ld 8, 136(31)
+; CHECK: ld 7, 144(31)
+; CHECK: lwz 6, 152(31)
+; CHECK: lwz 5, 160(31)
+; CHECK: lhz 4, 168(31)
+; CHECK: lbz 3, 176(31)
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+define internal i32 @callee1(%struct.s1* byval %v1, %struct.s2* byval %v2, %struct.s3* byval %v3, %struct.s4* byval %v4, %struct.s5* byval %v5, %struct.s6* byval %v6, %struct.s7* byval %v7) nounwind {
+entry:
+ %a = getelementptr inbounds %struct.s1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.s2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 2
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.s3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 2
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.s4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 4
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.s5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 4
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.s6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 4
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.s7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 4
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: std 9, 96(1)
+; CHECK: std 8, 88(1)
+; CHECK: std 7, 80(1)
+; CHECK: stw 6, 72(1)
+; CHECK: stw 5, 64(1)
+; CHECK: sth 4, 58(1)
+; CHECK: stb 3, 51(1)
+; CHECK: lha {{[0-9]+}}, 58(1)
+; CHECK: lbz {{[0-9]+}}, 51(1)
+; CHECK: lha {{[0-9]+}}, 64(1)
+; CHECK: lwz {{[0-9]+}}, 72(1)
+; CHECK: lwz {{[0-9]+}}, 80(1)
+; CHECK: lwz {{[0-9]+}}, 88(1)
+; CHECK: lwz {{[0-9]+}}, 96(1)
+}
+
+define i32 @caller2() nounwind {
+entry:
+ %p1 = alloca %struct.t1, align 1
+ %p2 = alloca %struct.t2, align 1
+ %p3 = alloca %struct.t3, align 1
+ %p4 = alloca %struct.t4, align 1
+ %p5 = alloca %struct.t5, align 1
+ %p6 = alloca %struct.t6, align 1
+ %p7 = alloca %struct.t7, align 1
+ %0 = bitcast %struct.t1* %p1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* getelementptr inbounds (%struct.t1* @caller2.p1, i32 0, i32 0), i64 1, i32 1, i1 false)
+ %1 = bitcast %struct.t2* %p2 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* bitcast ({ i16 }* @caller2.p2 to i8*), i64 2, i32 1, i1 false)
+ %2 = bitcast %struct.t3* %p3 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* bitcast (%struct.t3* @caller2.p3 to i8*), i64 3, i32 1, i1 false)
+ %3 = bitcast %struct.t4* %p4 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %3, i8* bitcast ({ i32 }* @caller2.p4 to i8*), i64 4, i32 1, i1 false)
+ %4 = bitcast %struct.t5* %p5 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* bitcast (%struct.t5* @caller2.p5 to i8*), i64 5, i32 1, i1 false)
+ %5 = bitcast %struct.t6* %p6 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %5, i8* bitcast (%struct.t6* @caller2.p6 to i8*), i64 6, i32 1, i1 false)
+ %6 = bitcast %struct.t7* %p7 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %6, i8* bitcast (%struct.t7* @caller2.p7 to i8*), i64 7, i32 1, i1 false)
+ %call = call i32 @callee2(%struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
+ ret i32 %call
+
+; CHECK: stb {{[0-9]+}}, 71(1)
+; CHECK: sth {{[0-9]+}}, 69(1)
+; CHECK: stb {{[0-9]+}}, 87(1)
+; CHECK: stw {{[0-9]+}}, 83(1)
+; CHECK: sth {{[0-9]+}}, 94(1)
+; CHECK: stw {{[0-9]+}}, 90(1)
+; CHECK: stb {{[0-9]+}}, 103(1)
+; CHECK: sth {{[0-9]+}}, 101(1)
+; CHECK: stw {{[0-9]+}}, 97(1)
+; CHECK: ld 9, 96(1)
+; CHECK: ld 8, 88(1)
+; CHECK: ld 7, 80(1)
+; CHECK: lwz 6, 152(31)
+; CHECK: ld 5, 64(1)
+; CHECK: lhz 4, 168(31)
+; CHECK: lbz 3, 176(31)
+}
+
+define internal i32 @callee2(%struct.t1* byval %v1, %struct.t2* byval %v2, %struct.t3* byval %v3, %struct.t4* byval %v4, %struct.t5* byval %v5, %struct.t6* byval %v6, %struct.t7* byval %v7) nounwind {
+entry:
+ %a = getelementptr inbounds %struct.t1* %v1, i32 0, i32 0
+ %0 = load i8* %a, align 1
+ %conv = zext i8 %0 to i32
+ %a1 = getelementptr inbounds %struct.t2* %v2, i32 0, i32 0
+ %1 = load i16* %a1, align 1
+ %conv2 = sext i16 %1 to i32
+ %add = add nsw i32 %conv, %conv2
+ %a3 = getelementptr inbounds %struct.t3* %v3, i32 0, i32 0
+ %2 = load i16* %a3, align 1
+ %conv4 = sext i16 %2 to i32
+ %add5 = add nsw i32 %add, %conv4
+ %a6 = getelementptr inbounds %struct.t4* %v4, i32 0, i32 0
+ %3 = load i32* %a6, align 1
+ %add7 = add nsw i32 %add5, %3
+ %a8 = getelementptr inbounds %struct.t5* %v5, i32 0, i32 0
+ %4 = load i32* %a8, align 1
+ %add9 = add nsw i32 %add7, %4
+ %a10 = getelementptr inbounds %struct.t6* %v6, i32 0, i32 0
+ %5 = load i32* %a10, align 1
+ %add11 = add nsw i32 %add9, %5
+ %a12 = getelementptr inbounds %struct.t7* %v7, i32 0, i32 0
+ %6 = load i32* %a12, align 1
+ %add13 = add nsw i32 %add11, %6
+ ret i32 %add13
+
+; CHECK: sldi 9, 9, 8
+; CHECK: sldi 8, 8, 16
+; CHECK: sldi 7, 7, 24
+; CHECK: sldi 5, 5, 40
+; CHECK: stw 6, 72(1)
+; CHECK: sth 4, 58(1)
+; CHECK: stb 3, 51(1)
+; CHECK: std 9, 96(1)
+; CHECK: std 8, 88(1)
+; CHECK: std 7, 80(1)
+; CHECK: std 5, 64(1)
+; CHECK: lha {{[0-9]+}}, 58(1)
+; CHECK: lbz {{[0-9]+}}, 51(1)
+; CHECK: lha {{[0-9]+}}, 64(1)
+; CHECK: lwz {{[0-9]+}}, 72(1)
+; CHECK: lwz {{[0-9]+}}, 80(1)
+; CHECK: lwz {{[0-9]+}}, 88(1)
+; CHECK: lwz {{[0-9]+}}, 96(1)
+}
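+
+; Illustrative note (an assumption about the ABI convention under test):
+; byval aggregates smaller than a doubleword arrive right-justified in their
+; GPR, so callee2 shifts each packed struct left (sldi by 8, 16, 24 and 40
+; bits for the 7-, 6-, 5- and 3-byte structs) before spilling it, placing
+; the bytes at the slot's big-endian start.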
diff --git a/test/CodeGen/SPARC/load_to_switch.ll b/test/CodeGen/SPARC/load_to_switch.ll
new file mode 100644
index 0000000000..8d62de527e
--- /dev/null
+++ b/test/CodeGen/SPARC/load_to_switch.ll
@@ -0,0 +1,84 @@
+; RUN: llc -march=sparc < %s | FileCheck %s
+
+; Check that all the switches turned into lookup tables by SimplifyCFG are
+; turned back into switches for targets that don't like lookup tables.
+
+@.str = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+@.str1 = private unnamed_addr constant [4 x i8] c"bar\00", align 1
+@.str2 = private unnamed_addr constant [4 x i8] c"baz\00", align 1
+@.str3 = private unnamed_addr constant [4 x i8] c"qux\00", align 1
+@.str4 = private unnamed_addr constant [6 x i8] c"error\00", align 1
+@switch.table = private unnamed_addr constant [7 x i32] [i32 55, i32 123, i32 0, i32 -1, i32 27, i32 62, i32 1]
+@switch.table1 = private unnamed_addr constant [4 x i8] c"*\09X\05"
+@switch.table2 = private unnamed_addr constant [4 x float] [float 0x40091EB860000000, float 0x3FF3BE76C0000000, float 0x4012449BA0000000, float 0x4001AE1480000000]
+@switch.table3 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)]
+
+define i32 @f(i32 %c) {
+entry:
+ %switch.tableidx = sub i32 %c, 42
+ %0 = icmp ult i32 %switch.tableidx, 7
+ br i1 %0, label %switch.lookup, label %return
+
+switch.lookup:
+ %switch.gep = getelementptr inbounds [7 x i32]* @switch.table, i32 0, i32 %switch.tableidx
+ %switch.load = load i32* %switch.gep
+ ret i32 %switch.load
+
+return:
+ ret i32 15
+
+; CHECK: f:
+; CHECK: %switch.lookup
+; CHECK-NOT: sethi %hi(.Lswitch.table)
+}
+
+declare void @dummy(i8 signext, float)
+
+define void @h(i32 %x) {
+entry:
+ %switch.tableidx = sub i32 %x, 0
+ %0 = icmp ult i32 %switch.tableidx, 4
+ br i1 %0, label %switch.lookup, label %sw.epilog
+
+switch.lookup:
+ %switch.gep = getelementptr inbounds [4 x i8]* @switch.table1, i32 0, i32 %switch.tableidx
+ %switch.load = load i8* %switch.gep
+ %switch.gep1 = getelementptr inbounds [4 x float]* @switch.table2, i32 0, i32 %switch.tableidx
+ %switch.load2 = load float* %switch.gep1
+ br label %sw.epilog
+
+sw.epilog:
+ %a.0 = phi i8 [ %switch.load, %switch.lookup ], [ 7, %entry ]
+ %b.0 = phi float [ %switch.load2, %switch.lookup ], [ 0x4023FAE140000000, %entry ]
+ call void @dummy(i8 signext %a.0, float %b.0)
+ ret void
+
+; CHECK: h:
+; CHECK: %switch.lookup
+; CHECK-NOT: sethi %hi(.Lswitch.table{{[0-9]}})
+; CHECK-NOT: sethi %hi(.Lswitch.table{{[0-9]}})
+}
+
+define i8* @foostring(i32 %x) {
+entry:
+ %switch.tableidx = sub i32 %x, 0
+ %0 = icmp ult i32 %switch.tableidx, 4
+ br i1 %0, label %switch.lookup, label %return
+
+switch.lookup:
+ %switch.gep = getelementptr inbounds [4 x i8*]* @switch.table3, i32 0, i32 %switch.tableidx
+ %switch.load = load i8** %switch.gep
+ ret i8* %switch.load
+
+return:
+ ret i8* getelementptr inbounds ([6 x i8]* @.str4, i64 0, i64 0)
+
+; CHECK: foostring:
+; CHECK: %switch.lookup
+; CHECK-NOT: sethi %hi(.Lswitch.table3)
+}
+
+; CHECK-NOT: .Lswitch.table
+; CHECK-NOT: .Lswitch.table1
+; CHECK-NOT: .Lswitch.table2
+; CHECK-NOT: .Lswitch.table3
diff --git a/test/CodeGen/Thumb2/buildvector-crash.ll b/test/CodeGen/Thumb2/buildvector-crash.ll
index 01ef472d31..ce42f4b377 100644
--- a/test/CodeGen/Thumb2/buildvector-crash.ll
+++ b/test/CodeGen/Thumb2/buildvector-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -mtriple=thumbv7-apple-darwin10 -mcpu=cortex-a8 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s
; Formerly crashed, 3573915.
define void @RotateStarsFP_Vec() nounwind {
@@ -13,5 +13,5 @@ bb8: ; preds = %bb8, %bb.nph372
store <4 x float> %3, <4 x float>* undef, align 4
br label %bb8
; CHECK: RotateStarsFP_Vec:
-; CHECK: vldmia
+; CHECK: vld1.64
}
diff --git a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
index 8b55bd79aa..7d1cda35a2 100644
--- a/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
+++ b/test/CodeGen/X86/2010-01-08-Atomic64Bug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://r7512579
; PHI defs in the atomic loop should be used by the add / adc
@@ -7,17 +7,16 @@
define void @t(i64* nocapture %p) nounwind ssp {
entry:
; CHECK: t:
-; CHECK: movl $1
-; CHECK: movl (%ebp), %eax
-; CHECK: movl 4(%ebp), %edx
+; CHECK: movl ([[REG:%[a-z]+]]), %eax
+; CHECK: movl 4([[REG]]), %edx
; CHECK: LBB0_1:
-; CHECK-NOT: movl $1
-; CHECK-NOT: movl $0
+; CHECK: movl $1
; CHECK: addl
+; CHECK: movl $0
; CHECK: adcl
; CHECK: lock
-; CHECK: cmpxchg8b
-; CHECK: jne
+; CHECK-NEXT: cmpxchg8b ([[REG]])
+; CHECK-NEXT: jne
%0 = atomicrmw add i64* %p, i64 1 seq_cst
ret void
}
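+; Note (explanatory, not a checked pattern): the i64 increment of the
+; edx:eax pair inside the cmpxchg8b loop is performed as
+;   addl $1, %eax
+;   adcl $0, %edx
+; so the $1/$0 immediates are expected inside the loop body, not hoisted.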
diff --git a/test/CodeGen/X86/2012-09-13-dagco-fneg.ll b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
new file mode 100644
index 0000000000..7b9bab97be
--- /dev/null
+++ b/test/CodeGen/X86/2012-09-13-dagco-fneg.ll
@@ -0,0 +1,21 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: foo
+; Make sure we are not trying to use scalar xor on the high bits of the vector.
+; CHECK-NOT: xorq
+; CHECK: xorl
+; CHECK-NEXT: ret
+
+define i32 @foo() {
+bb:
+ %tmp44.i = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, float 0.000000e+00>
+ %0 = bitcast <4 x float> %tmp44.i to i128
+ %1 = zext i128 %0 to i512
+ %2 = shl nuw nsw i512 %1, 256
+ %ins = or i512 %2, 3325764857622480139933400731976840738652108318779753826115024029985671937147149347761402413803120180680770390816681124225944317364750115981129923635970048
+ store i512 %ins, i512* undef, align 64
+ ret i32 0
+}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index 0f36ce29b9..f8ae74f292 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -7,7 +7,6 @@ target triple = "x86_64-apple-macosx10.8.0"
;YESCOLOR: subq $136, %rsp
;NOCOLOR: subq $264, %rsp
-
define i32 @myCall_w2(i32 %in) {
entry:
%a = alloca [17 x i8*], align 8
@@ -328,7 +327,6 @@ entry:
;YESCOLOR: subq $272, %rsp
;NOCOLOR: subq $272, %rsp
-
define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
entry:
%a = alloca [17 x i8*], align 8
@@ -352,11 +350,61 @@ bb3:
ret i32 0
}
+; Check that we don't assert or crash even when allocas are used
+; outside their declared lifetime regions.
+;YESCOLOR: bad_range
+;NOCOLOR: bad_range
+define void @bad_range() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ br label %block2
+
+block2:
+ ; I am used outside the marked lifetime.
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ ret void
+}
+
+
+; Check that we don't assert or crash even when an alloca has uses outside
+; the declared lifetime regions that do not read or write memory.
+;YESCOLOR: shady_range
+;NOCOLOR: shady_range
+
+%struct.Klass = type { i32, i32 }
+
+define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
+ %a.i = alloca [4 x %struct.Klass], align 16
+ %b.i = alloca [4 x %struct.Klass], align 16
+ %a8 = bitcast [4 x %struct.Klass]* %a.i to i8*
+ %b8 = bitcast [4 x %struct.Klass]* %b.i to i8*
+ ; I am used outside the lifetime zone below:
+ %z2 = getelementptr inbounds [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0
+ call void @llvm.lifetime.start(i64 -1, i8* %a8)
+ call void @llvm.lifetime.start(i64 -1, i8* %b8)
+ %z3 = load i32* %z2, align 16
+ %r = call i32 @foo(i32 %z3, i8* %a8)
+ %r2 = call i32 @foo(i32 %z3, i8* %b8)
+ call void @llvm.lifetime.end(i64 -1, i8* %a8)
+ call void @llvm.lifetime.end(i64 -1, i8* %b8)
+ ret i32 9
+}
+
declare void @bar([100 x i32]* , [100 x i32]*) nounwind
declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
- declare i32 @foo(i32, i8*)
+declare i32 @foo(i32, i8*)
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
new file mode 100644
index 0000000000..824995d6cb
--- /dev/null
+++ b/test/CodeGen/X86/atomic16.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mcpu=corei7 -show-mc-encoding | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -mtriple=i386-unknown-unknown -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc16 = external global i16
+
+define void @atomic_fetch_add16() nounwind {
+; X64: atomic_fetch_add16
+; X32: atomic_fetch_add16
+entry:
+; 32-bit
+ %t1 = atomicrmw add i16* @sc16, i16 1 acquire
+; X64: lock
+; X64: incw
+; X32: lock
+; X32: incw
+ %t2 = atomicrmw add i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: addw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: addw $3
+ %t3 = atomicrmw add i16* @sc16, i16 5 acquire
+; X64: lock
+; X64: xaddw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xaddw
+ %t4 = atomicrmw add i16* @sc16, i16 %t3 acquire
+; X64: lock
+; X64: addw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: addw
+ ret void
+; X64: ret
+; X32: ret
+}
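+
+; Note on the encoding checks (x86 instruction-format detail): 0xf0 is the
+; LOCK prefix byte and 0x66 the operand-size override that makes the RMW
+; operation 16 bits wide, so matching the byte pair [0xf0,0x66 verifies that
+; the lock prefix is emitted directly on the 16-bit instruction.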
+
+define void @atomic_fetch_sub16() nounwind {
+; X64: atomic_fetch_sub16
+; X32: atomic_fetch_sub16
+ %t1 = atomicrmw sub i16* @sc16, i16 1 acquire
+; X64: lock
+; X64: decw
+; X32: lock
+; X32: decw
+ %t2 = atomicrmw sub i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: subw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: subw $3
+ %t3 = atomicrmw sub i16* @sc16, i16 5 acquire
+; X64: lock
+; X64: xaddw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xaddw
+ %t4 = atomicrmw sub i16* @sc16, i16 %t3 acquire
+; X64: lock
+; X64: subw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: subw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and16() nounwind {
+; X64: atomic_fetch_and16
+; X32: atomic_fetch_and16
+ %t1 = atomicrmw and i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: andw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: andw $3
+ %t2 = atomicrmw and i16* @sc16, i16 5 acquire
+; X64: andw
+; X64: lock
+; X64: cmpxchgw
+; X32: andw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw and i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: andw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: andw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or16() nounwind {
+; X64: atomic_fetch_or16
+; X32: atomic_fetch_or16
+ %t1 = atomicrmw or i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: orw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: orw $3
+ %t2 = atomicrmw or i16* @sc16, i16 5 acquire
+; X64: orw
+; X64: lock
+; X64: cmpxchgw
+; X32: orw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw or i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: orw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: orw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor16() nounwind {
+; X64: atomic_fetch_xor16
+; X32: atomic_fetch_xor16
+ %t1 = atomicrmw xor i16* @sc16, i16 3 acquire
+; X64: lock
+; X64: xorw $3, {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xorw $3
+ %t2 = atomicrmw xor i16* @sc16, i16 5 acquire
+; X64: xorw
+; X64: lock
+; X64: cmpxchgw
+; X32: xorw
+; X32: lock
+; X32: cmpxchgw
+ %t3 = atomicrmw xor i16* @sc16, i16 %t2 acquire
+; X64: lock
+; X64: xorw {{.*}} # encoding: [0xf0,0x66
+; X32: lock
+; X32: xorw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand16(i16 %x) nounwind {
+; X64: atomic_fetch_nand16
+; X32: atomic_fetch_nand16
+ %t1 = atomicrmw nand i16* @sc16, i16 %x acquire
+; X64: andw
+; X64: notw
+; X64: lock
+; X64: cmpxchgw
+; X32: andw
+; X32: notw
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max16(i16 %x) nounwind {
+ %t1 = atomicrmw max i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min16(i16 %x) nounwind {
+ %t1 = atomicrmw min i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax16(i16 %x) nounwind {
+ %t1 = atomicrmw umax i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin16(i16 %x) nounwind {
+ %t1 = atomicrmw umin i16* @sc16, i16 %x acquire
+; X64: cmpw
+; X64: cmov
+; X64: lock
+; X64: cmpxchgw
+; X32: cmpw
+; X32: cmov
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg16() nounwind {
+ %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
+; X64: lock
+; X64: cmpxchgw
+; X32: lock
+; X32: cmpxchgw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store16(i16 %x) nounwind {
+ store atomic i16 %x, i16* @sc16 release, align 4
+; X64-NOT: lock
+; X64: movw
+; X32-NOT: lock
+; X32: movw
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap16(i16 %x) nounwind {
+ %t1 = atomicrmw xchg i16* @sc16, i16 %x acquire
+; X64-NOT: lock
+; X64: xchgw
+; X32-NOT: lock
+; X32: xchgw
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
new file mode 100644
index 0000000000..dc927d8cb6
--- /dev/null
+++ b/test/CodeGen/X86/atomic32.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc32 = external global i32
+
+define void @atomic_fetch_add32() nounwind {
+; X64: atomic_fetch_add32
+; X32: atomic_fetch_add32
+entry:
+; 32-bit
+ %t1 = atomicrmw add i32* @sc32, i32 1 acquire
+; X64: lock
+; X64: incl
+; X32: lock
+; X32: incl
+ %t2 = atomicrmw add i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: addl $3
+; X32: lock
+; X32: addl $3
+ %t3 = atomicrmw add i32* @sc32, i32 5 acquire
+; X64: lock
+; X64: xaddl
+; X32: lock
+; X32: xaddl
+ %t4 = atomicrmw add i32* @sc32, i32 %t3 acquire
+; X64: lock
+; X64: addl
+; X32: lock
+; X32: addl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub32() nounwind {
+; X64: atomic_fetch_sub32
+; X32: atomic_fetch_sub32
+ %t1 = atomicrmw sub i32* @sc32, i32 1 acquire
+; X64: lock
+; X64: decl
+; X32: lock
+; X32: decl
+ %t2 = atomicrmw sub i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: subl $3
+; X32: lock
+; X32: subl $3
+ %t3 = atomicrmw sub i32* @sc32, i32 5 acquire
+; X64: lock
+; X64: xaddl
+; X32: lock
+; X32: xaddl
+ %t4 = atomicrmw sub i32* @sc32, i32 %t3 acquire
+; X64: lock
+; X64: subl
+; X32: lock
+; X32: subl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and32() nounwind {
+; X64: atomic_fetch_and32
+; X32: atomic_fetch_and32
+ %t1 = atomicrmw and i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: andl $3
+; X32: lock
+; X32: andl $3
+ %t2 = atomicrmw and i32* @sc32, i32 5 acquire
+; X64: andl
+; X64: lock
+; X64: cmpxchgl
+; X32: andl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw and i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: andl
+; X32: lock
+; X32: andl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or32() nounwind {
+; X64: atomic_fetch_or32
+; X32: atomic_fetch_or32
+ %t1 = atomicrmw or i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: orl $3
+; X32: lock
+; X32: orl $3
+ %t2 = atomicrmw or i32* @sc32, i32 5 acquire
+; X64: orl
+; X64: lock
+; X64: cmpxchgl
+; X32: orl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw or i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: orl
+; X32: lock
+; X32: orl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor32() nounwind {
+; X64: atomic_fetch_xor32
+; X32: atomic_fetch_xor32
+ %t1 = atomicrmw xor i32* @sc32, i32 3 acquire
+; X64: lock
+; X64: xorl $3
+; X32: lock
+; X32: xorl $3
+ %t2 = atomicrmw xor i32* @sc32, i32 5 acquire
+; X64: xorl
+; X64: lock
+; X64: cmpxchgl
+; X32: xorl
+; X32: lock
+; X32: cmpxchgl
+ %t3 = atomicrmw xor i32* @sc32, i32 %t2 acquire
+; X64: lock
+; X64: xorl
+; X32: lock
+; X32: xorl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand32(i32 %x) nounwind {
+; X64: atomic_fetch_nand32
+; X32: atomic_fetch_nand32
+ %t1 = atomicrmw nand i32* @sc32, i32 %x acquire
+; X64: andl
+; X64: notl
+; X64: lock
+; X64: cmpxchgl
+; X32: andl
+; X32: notl
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max32(i32 %x) nounwind {
+ %t1 = atomicrmw max i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min32(i32 %x) nounwind {
+ %t1 = atomicrmw min i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax32(i32 %x) nounwind {
+ %t1 = atomicrmw umax i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin32(i32 %x) nounwind {
+ %t1 = atomicrmw umin i32* @sc32, i32 %x acquire
+; X64: cmpl
+; X64: cmov
+; X64: lock
+; X64: cmpxchgl
+; X32: cmpl
+; X32: cmov
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg32() nounwind {
+ %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
+; X64: lock
+; X64: cmpxchgl
+; X32: lock
+; X32: cmpxchgl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store32(i32 %x) nounwind {
+ store atomic i32 %x, i32* @sc32 release, align 4
+; X64-NOT: lock
+; X64: movl
+; X32-NOT: lock
+; X32: movl
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap32(i32 %x) nounwind {
+ %t1 = atomicrmw xchg i32* @sc32, i32 %x acquire
+; X64-NOT: lock
+; X64: xchgl
+; X32-NOT: lock
+; X32: xchgl
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
new file mode 100644
index 0000000000..45785cc8fe
--- /dev/null
+++ b/test/CodeGen/X86/atomic64.ll
@@ -0,0 +1,216 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X64: atomic_fetch_add64
+entry:
+ %t1 = atomicrmw add i64* @sc64, i64 1 acquire
+; X64: lock
+; X64: incq
+ %t2 = atomicrmw add i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: addq $3
+ %t3 = atomicrmw add i64* @sc64, i64 5 acquire
+; X64: lock
+; X64: xaddq
+ %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire
+; X64: lock
+; X64: addq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X64: atomic_fetch_sub64
+ %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
+; X64: lock
+; X64: decq
+ %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: subq $3
+ %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
+; X64: lock
+; X64: xaddq
+ %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
+; X64: lock
+; X64: subq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X64: atomic_fetch_and64
+ %t1 = atomicrmw and i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: andq $3
+ %t2 = atomicrmw and i64* @sc64, i64 5 acquire
+; X64: andq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: andq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X64: atomic_fetch_or64
+ %t1 = atomicrmw or i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: orq $3
+ %t2 = atomicrmw or i64* @sc64, i64 5 acquire
+; X64: orq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: orq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X64: atomic_fetch_xor64
+ %t1 = atomicrmw xor i64* @sc64, i64 3 acquire
+; X64: lock
+; X64: xorq $3
+ %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
+; X64: xorq
+; X64: lock
+; X64: cmpxchgq
+ %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
+; X64: lock
+; X64: xorq
+ ret void
+; X64: ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X64: atomic_fetch_nand64
+; X32: atomic_fetch_nand64
+ %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X64: andq
+; X64: notq
+; X64: lock
+; X64: cmpxchgq
+; X32: andl
+; X32: andl
+; X32: notl
+; X32: notl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+ %t1 = atomicrmw max i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+ %t1 = atomicrmw min i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+ %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+ %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X64: cmpq
+; X64: cmov
+; X64: lock
+; X64: cmpxchgq
+
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X64: lock
+; X64: cmpxchgq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+ store atomic i64 %x, i64* @sc64 release, align 8
+; X64-NOT: lock
+; X64: movq
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+ %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X64-NOT: lock
+; X64: xchgq
+; X32: lock
+; X32: xchg8b
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
new file mode 100644
index 0000000000..556c36ebfd
--- /dev/null
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -0,0 +1,209 @@
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+; XFAIL: *
+
+@sc64 = external global i64
+
+define void @atomic_fetch_add64() nounwind {
+; X32: atomic_fetch_add64
+entry:
+ %t1 = atomicrmw add i64* @sc64, i64 1 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw add i64* @sc64, i64 3 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw add i64* @sc64, i64 5 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ %t4 = atomicrmw add i64* @sc64, i64 %t3 acquire
+; X32: addl
+; X32: adcl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_sub64() nounwind {
+; X32: atomic_fetch_sub64
+ %t1 = atomicrmw sub i64* @sc64, i64 1 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw sub i64* @sc64, i64 3 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw sub i64* @sc64, i64 5 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ %t4 = atomicrmw sub i64* @sc64, i64 %t3 acquire
+; X32: subl
+; X32: sbbl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_and64() nounwind {
+; X32: atomic_fetch_and64
+ %t1 = atomicrmw and i64* @sc64, i64 3 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw and i64* @sc64, i64 5 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw and i64* @sc64, i64 %t2 acquire
+; X32: andl
+; X32: andl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_or64() nounwind {
+; X32: atomic_fetch_or64
+ %t1 = atomicrmw or i64* @sc64, i64 3 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw or i64* @sc64, i64 5 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw or i64* @sc64, i64 %t2 acquire
+; X32: orl
+; X32: orl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_xor64() nounwind {
+; X32: atomic_fetch_xor64
+ %t1 = atomicrmw xor i64* @sc64, i64 3 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ %t2 = atomicrmw xor i64* @sc64, i64 5 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ %t3 = atomicrmw xor i64* @sc64, i64 %t2 acquire
+; X32: xorl
+; X32: xorl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_nand64(i64 %x) nounwind {
+; X32: atomic_fetch_nand64
+ %t1 = atomicrmw nand i64* @sc64, i64 %x acquire
+; X32: andl
+; X32: andl
+; X32: notl
+; X32: notl
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_max64(i64 %x) nounwind {
+ %t1 = atomicrmw max i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_min64(i64 %x) nounwind {
+ %t1 = atomicrmw min i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_umax64(i64 %x) nounwind {
+ %t1 = atomicrmw umax i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_umin64(i64 %x) nounwind {
+ %t1 = atomicrmw umin i64* @sc64, i64 %x acquire
+; X32: cmpl
+; X32: cmpl
+; X32: cmov
+; X32: cmov
+; X32: cmov
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg64() nounwind {
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_store64(i64 %x) nounwind {
+ store atomic i64 %x, i64* @sc64 release, align 8
+; X32: lock
+; X32: cmpxchg8b
+ ret void
+; X32: ret
+}
+
+define void @atomic_fetch_swap64(i64 %x) nounwind {
+ %t1 = atomicrmw xchg i64* @sc64, i64 %x acquire
+; X32: lock
+; X32: xchg8b
+ ret void
+; X32: ret
+}
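
All of the i386 lowering above funnels through cmpxchg8b, the only 64-bit atomic primitive on 32-bit x86: the current value is held in EDX:EAX, the replacement in ECX:EBX, and every RMW becomes a retry loop around it. A standalone sketch in the same 2012 IR syntax (@g64 and the function name are hypothetical):

@g64 = external global i64

define void @store64_demo(i64 %v) nounwind {
  ; Even a plain atomic store is too wide for a single i386 instruction,
  ; so it is expected to compile to a lock cmpxchg8b loop, as checked above.
  store atomic i64 %v, i64* @g64 release, align 8
  ret void
}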
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
new file mode 100644
index 0000000000..412428406d
--- /dev/null
+++ b/test/CodeGen/X86/atomic8.ll
@@ -0,0 +1,250 @@
+; RUN: llc < %s -O0 -march=x86-64 -mcpu=corei7 | FileCheck %s --check-prefix X64
+; RUN: llc < %s -O0 -march=x86 -mcpu=corei7 | FileCheck %s --check-prefix X32
+
+@sc8 = external global i8
+
+define void @atomic_fetch_add8() nounwind {
+; X64: atomic_fetch_add8
+; X32: atomic_fetch_add8
+entry:
+; Exercised for both the X64 and X32 RUN lines above.
+ %t1 = atomicrmw add i8* @sc8, i8 1 acquire
+; X64: lock
+; X64: incb
+; X32: lock
+; X32: incb
+ %t2 = atomicrmw add i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: addb $3
+; X32: lock
+; X32: addb $3
+ %t3 = atomicrmw add i8* @sc8, i8 5 acquire
+; X64: lock
+; X64: xaddb
+; X32: lock
+; X32: xaddb
+ %t4 = atomicrmw add i8* @sc8, i8 %t3 acquire
+; X64: lock
+; X64: addb
+; X32: lock
+; X32: addb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_sub8() nounwind {
+; X64: atomic_fetch_sub8
+; X32: atomic_fetch_sub8
+ %t1 = atomicrmw sub i8* @sc8, i8 1 acquire
+; X64: lock
+; X64: decb
+; X32: lock
+; X32: decb
+ %t2 = atomicrmw sub i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: subb $3
+; X32: lock
+; X32: subb $3
+ %t3 = atomicrmw sub i8* @sc8, i8 5 acquire
+; X64: lock
+; X64: xaddb
+; X32: lock
+; X32: xaddb
+ %t4 = atomicrmw sub i8* @sc8, i8 %t3 acquire
+; X64: lock
+; X64: subb
+; X32: lock
+; X32: subb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_and8() nounwind {
+; X64: atomic_fetch_and8
+; X32: atomic_fetch_and8
+ %t1 = atomicrmw and i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: andb $3
+; X32: lock
+; X32: andb $3
+ %t2 = atomicrmw and i8* @sc8, i8 5 acquire
+; X64: andb
+; X64: lock
+; X64: cmpxchgb
+; X32: andb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw and i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: andb
+; X32: lock
+; X32: andb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_or8() nounwind {
+; X64: atomic_fetch_or8
+; X32: atomic_fetch_or8
+ %t1 = atomicrmw or i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: orb $3
+; X32: lock
+; X32: orb $3
+ %t2 = atomicrmw or i8* @sc8, i8 5 acquire
+; X64: orb
+; X64: lock
+; X64: cmpxchgb
+; X32: orb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw or i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: orb
+; X32: lock
+; X32: orb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_xor8() nounwind {
+; X64: atomic_fetch_xor8
+; X32: atomic_fetch_xor8
+ %t1 = atomicrmw xor i8* @sc8, i8 3 acquire
+; X64: lock
+; X64: xorb $3
+; X32: lock
+; X32: xorb $3
+ %t2 = atomicrmw xor i8* @sc8, i8 5 acquire
+; X64: xorb
+; X64: lock
+; X64: cmpxchgb
+; X32: xorb
+; X32: lock
+; X32: cmpxchgb
+ %t3 = atomicrmw xor i8* @sc8, i8 %t2 acquire
+; X64: lock
+; X64: xorb
+; X32: lock
+; X32: xorb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_nand8(i8 %x) nounwind {
+; X64: atomic_fetch_nand8
+; X32: atomic_fetch_nand8
+ %t1 = atomicrmw nand i8* @sc8, i8 %x acquire
+; X64: andb
+; X64: notb
+; X64: lock
+; X64: cmpxchgb
+; X32: andb
+; X32: notb
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_max8(i8 %x) nounwind {
+ %t1 = atomicrmw max i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_min8(i8 %x) nounwind {
+ %t1 = atomicrmw min i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umax8(i8 %x) nounwind {
+ %t1 = atomicrmw umax i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_umin8(i8 %x) nounwind {
+ %t1 = atomicrmw umin i8* @sc8, i8 %x acquire
+; X64: cmpb
+; X64: cmov
+; X64: lock
+; X64: cmpxchgb
+; X32: cmpb
+; X32: cmov
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_cmpxchg8() nounwind {
+ %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
+; X64: lock
+; X64: cmpxchgb
+; X32: lock
+; X32: cmpxchgb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_store8(i8 %x) nounwind {
+ store atomic i8 %x, i8* @sc8 release, align 4
+; X64-NOT: lock
+; X64: movb
+; X32-NOT: lock
+; X32: movb
+ ret void
+; X64: ret
+; X32: ret
+}
+
+define void @atomic_fetch_swap8(i8 %x) nounwind {
+ %t1 = atomicrmw xchg i8* @sc8, i8 %x acquire
+; X64-NOT: lock
+; X64: xchgb
+; X32-NOT: lock
+; X32: xchgb
+ ret void
+; X64: ret
+; X32: ret
+}
diff --git a/test/CodeGen/X86/atomic_add.ll b/test/CodeGen/X86/atomic_add.ll
index 1fce256a8a..d94499889d 100644
--- a/test/CodeGen/X86/atomic_add.ll
+++ b/test/CodeGen/X86/atomic_add.ll
@@ -178,7 +178,8 @@ entry:
define void @sub2(i16* nocapture %p, i32 %v) nounwind ssp {
entry:
; CHECK: sub2:
-; CHECK: negl
+; CHECK-NOT: negl
+; CHECK: subw
%0 = trunc i32 %v to i16 ; <i16> [#uses=1]
%1 = atomicrmw sub i16* %p, i16 %0 monotonic
ret void
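
A hedged reading of the tightened checks: when the value fetched by an atomic subtract is unused, negating the operand to reuse the locked-add path is wasted work, and a direct lock subw is expected instead. In isolation (names hypothetical):

define void @sub_demo(i16* %p, i16 %v) nounwind {
  ; Result unused: should emit "lock subw", not "negl" feeding a locked add.
  %0 = atomicrmw sub i16* %p, i16 %v monotonic
  ret void
}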
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index 152bece424..c5fa07d07d 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=generic -march=x86 -mattr=+cmov | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
@@ -107,13 +107,12 @@ entry:
; CHECK: cmpxchgl
%17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
store i32 %17, i32* %old
+ ; CHECK: movl [[R17atomic:.*]], %eax
; CHECK: movl $1401, %[[R17mask:[a-z]*]]
- ; CHECK: movl [[R17atomic:.*]], %eax
- ; CHECK: movl %eax, %[[R17newval:[a-z]*]]
- ; CHECK: andl %[[R17mask]], %[[R17newval]]
- ; CHECK: notl %[[R17newval]]
+ ; CHECK: andl %eax, %[[R17mask]]
+ ; CHECK: notl %[[R17mask]]
; CHECK: lock
- ; CHECK: cmpxchgl %[[R17newval]], [[R17atomic]]
+ ; CHECK: cmpxchgl %[[R17mask]], [[R17atomic]]
; CHECK: jne
; CHECK: movl %eax,
%18 = atomicrmw nand i32* %val2, i32 1401 monotonic
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index 267a806672..a414e6880c 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -28,7 +28,7 @@ entry:
}
; CHECK: vpshufb_test
-; CHECK; vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
; CHECK: ret
define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
%S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
@@ -39,7 +39,7 @@ define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
}
; CHECK: vpshufb1_test
-; CHECK; vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
; CHECK: ret
define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
%S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
@@ -51,7 +51,7 @@ define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
; CHECK: vpshufb2_test
-; CHECK; vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
; CHECK: ret
define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
%S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
diff --git a/test/CodeGen/X86/bitcast-i256.ll b/test/CodeGen/X86/bitcast-i256.ll
new file mode 100644
index 0000000000..85ac2fed6f
--- /dev/null
+++ b/test/CodeGen/X86/bitcast-i256.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=core-avx-i < %s | FileCheck %s --check-prefix CHECK
+
+define i256 @foo(<8 x i32> %a) {
+ %r = bitcast <8 x i32> %a to i256
+ ret i256 %r
+; CHECK: foo
+; CHECK: vextractf128
+; CHECK: vpextrq
+; CHECK: vpextrq
+; CHECK: ret
+}
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index 48e21061d2..0cf397d4af 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=pentiumpro | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=pentiumpro -verify-machineinstrs | FileCheck %s
define i32 @f(i32 %X) {
entry:
@@ -253,3 +253,28 @@ return:
%retval.0 = phi i8* [ %add.ptr, %if.end ], [ null, %entry ]
ret i8* %retval.0
}
+
+; Test optimizations of dec/inc.
+define i32 @dec(i32 %a) nounwind {
+entry:
+; CHECK: dec:
+; CHECK: decl
+; CHECK-NOT: test
+; CHECK: cmovsl
+ %sub = sub nsw i32 %a, 1
+ %cmp = icmp sgt i32 %sub, 0
+ %cond = select i1 %cmp, i32 %sub, i32 0
+ ret i32 %cond
+}
+
+define i32 @inc(i32 %a) nounwind {
+entry:
+; CHECK: inc:
+; CHECK: incl
+; CHECK-NOT: test
+; CHECK: cmovsl
+ %add = add nsw i32 %a, 1
+ %cmp = icmp sgt i32 %add, 0
+ %cond = select i1 %cmp, i32 %add, i32 0
+ ret i32 %cond
+}
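
Both new functions depend on decl/incl setting SF as a side effect: only the sign flag needs consulting (in the zero case the selected value is identical either way), so the compare-and-select folds to a cmovsl reading the flags the arithmetic already produced, and the CHECK-NOT: test lines guard against regressing to a redundant flag-setting test instruction.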
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 8db86580a6..cd06fe68a6 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -12,3 +12,15 @@ entry:
; CHECK: .att_syntax
; CHECK: {{## InlineAsm End|#NO_APP}}
}
+
+define void @t2() nounwind {
+entry:
+ call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
+ ret void
+; CHECK: t2
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, 1
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
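
t2 exercises the MS (Intel-dialect) inline-asm path end to end: the call is bracketed by .intel_syntax/.att_syntax markers, and the doubled $$ escape in the IR asm string is expected to surface as the plain immediate 1 in the emitted text, which is what the CHECK line pins down.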
diff --git a/test/CodeGen/X86/pmovext.ll b/test/CodeGen/X86/pmovext.ll
new file mode 100644
index 0000000000..16e9c28fcd
--- /dev/null
+++ b/test/CodeGen/X86/pmovext.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; rdar://11897677
+
+;CHECK: intrin_pmov
+;CHECK: pmovzxbw (%{{.*}}), %xmm0
+;CHECK-NEXT: movdqu
+;CHECK-NEXT: ret
+define void @intrin_pmov(i16* noalias %dest, i8* noalias %src) nounwind uwtable ssp {
+ %1 = bitcast i8* %src to <2 x i64>*
+ %2 = load <2 x i64>* %1, align 16
+ %3 = bitcast <2 x i64> %2 to <16 x i8>
+ %4 = tail call <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8> %3) nounwind
+ %5 = bitcast i16* %dest to i8*
+ %6 = bitcast <8 x i16> %4 to <16 x i8>
+ tail call void @llvm.x86.sse2.storeu.dq(i8* %5, <16 x i8> %6) nounwind
+ ret void
+}
+
+declare <8 x i16> @llvm.x86.sse41.pmovzxbw(<16 x i8>) nounwind readnone
+
+declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
diff --git a/test/CodeGen/X86/pr11985.ll b/test/CodeGen/X86/pr11985.ll
new file mode 100644
index 0000000000..fa378502f7
--- /dev/null
+++ b/test/CodeGen/X86/pr11985.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=prescott | FileCheck %s
+
+define float @foo(i8* nocapture %buf, float %a, float %b) nounwind uwtable {
+entry:
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %buf, i8* blockaddress(@foo, %out), i64 22, i32 1, i1 false)
+ br label %out
+
+out: ; preds = %entry
+ %add = fadd float %a, %b
+ ret float %add
+; CHECK: foo
+; CHECK: movw .L{{.*}}+20(%rip), %{{.*}}
+; CHECK: movl .L{{.*}}+16(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}+8(%rip), %{{.*}}
+; CHECK: movq .L{{.*}}(%rip), %{{.*}}
+; CHECK: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
index 84102f148b..087b8d7539 100644
--- a/test/CodeGen/X86/pr12312.ll
+++ b/test/CodeGen/X86/pr12312.ll
@@ -1,7 +1,7 @@
; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
-define i32 @veccond(<4 x i32> %input) {
+define i32 @veccond128(<4 x i32> %input) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
@@ -11,38 +11,145 @@ if-true-block: ; preds = %entry
ret i32 0
endif-block: ; preds = %entry,
ret i32 1
-; SSE41: veccond
+; SSE41: veccond128
; SSE41: ptest
; SSE41: ret
-; AVX: veccond
-; AVX: vptest
+; AVX: veccond128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
; AVX: ret
}
-define i32 @vectest(<4 x i32> %input) {
+define i32 @veccond256(<8 x i32> %input) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @veccond512(<16 x i32> %input) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry,
+ ret i32 1
+; SSE41: veccond512
+; SSE41: por
+; SSE41: por
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest128(<4 x i32> %input) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
%2 = zext i1 %1 to i32
ret i32 %2
-; SSE41: vectest
+; SSE41: vectest128
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest256(<8 x i32> %input) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vectest512(<16 x i32> %input) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest512
+; SSE41: por
+; SSE41: por
+; SSE41: por
; SSE41: ptest
; SSE41: ret
-; AVX: vectest
-; AVX: vptest
+; AVX: vectest512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
-define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
+define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
entry:
%0 = bitcast <4 x i32> %input to i128
%1 = icmp ne i128 %0, 0
%2 = select i1 %1, i32 %a, i32 %b
ret i32 %2
-; SSE41: vecsel
+; SSE41: vecsel128
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel128
+; AVX: vptest %xmm{{.*}}, %xmm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <8 x i32> %input to i256
+ %1 = icmp ne i256 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel256
+; SSE41: por
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel256
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
+; AVX: ret
+}
+
+define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <16 x i32> %input to i512
+ %1 = icmp ne i512 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel512
+; SSE41: por
+; SSE41: por
+; SSE41: por
; SSE41: ptest
; SSE41: ret
-; AVX: vecsel
-; AVX: vptest
+; AVX: vecsel512
+; AVX: vorps
+; AVX: vptest %ymm{{.*}}, %ymm{{.*}}
; AVX: ret
}
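
The renamed functions make the size split explicit: a whole-vector compare against zero folds to ptest/vptest, 128-bit inputs use xmm operands directly, 256-bit inputs take a single ymm vptest under AVX but need a por reduction first under SSE4.1, and 512-bit inputs reduce with por/vorps before the final test. The RUN line now pins AVX2 off so the AVX prefix keeps checking plain AVX lowering.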
diff --git a/test/CodeGen/X86/pr13458.ll b/test/CodeGen/X86/pr13458.ll
new file mode 100644
index 0000000000..55548b3c3b
--- /dev/null
+++ b/test/CodeGen/X86/pr13458.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-darwin11.4.2"
+
+%v8_uniform_Stats.0.2.4.10 = type { i64, i64, i32, i32, i32, i32, i32, i32, i32, i32, i64, i64, i64, i32, i32, i32, i32, i32, i32, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i64, i32, i64, [7 x i32], [7 x i64] }
+
+@globalStats = external global %v8_uniform_Stats.0.2.4.10
+
+define void @MergeStats() nounwind {
+allocas:
+ %r.i.i720 = atomicrmw max i64* getelementptr inbounds (%v8_uniform_Stats.0.2.4.10* @globalStats, i64 0, i32 30), i64 0 seq_cst
+ ret void
+}
diff --git a/test/CodeGen/X86/pr13859.ll b/test/CodeGen/X86/pr13859.ll
new file mode 100644
index 0000000000..719721dfd8
--- /dev/null
+++ b/test/CodeGen/X86/pr13859.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.7.0"
+
+define void @_Z17FilterYUVRows_MMXi(i32 %af) nounwind ssp {
+entry:
+ %aMyAlloca = alloca i32, align 32
+ %dest = alloca <1 x i64>, align 32
+
+ %a32 = load i32* %aMyAlloca, align 4
+ %aconv = trunc i32 %a32 to i16
+ %a36 = insertelement <4 x i16> undef, i16 %aconv, i32 0
+ %a37 = insertelement <4 x i16> %a36, i16 %aconv, i32 1
+ %a38 = insertelement <4 x i16> %a37, i16 %aconv, i32 2
+ %a39 = insertelement <4 x i16> %a38, i16 %aconv, i32 3
+ %a40 = bitcast <4 x i16> %a39 to x86_mmx
+ %a41 = bitcast x86_mmx %a40 to <1 x i64>
+
+ %a47 = trunc i32 %a32 to i1
+ br i1 %a47, label %a48, label %a49
+
+a48:
+ unreachable
+
+a49:
+ store <1 x i64> %a41, <1 x i64>* %dest, align 8 ; !!!
+ ret void
+}
diff --git a/test/CodeGen/X86/pr5145.ll b/test/CodeGen/X86/pr5145.ll
new file mode 100644
index 0000000000..d048db8a85
--- /dev/null
+++ b/test/CodeGen/X86/pr5145.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=x86-64 < %s | FileCheck %s
+@sc8 = external global i8
+
+define void @atomic_maxmin_i8() {
+; CHECK: atomic_maxmin_i8
+ %1 = atomicrmw max i8* @sc8, i8 5 acquire
+; CHECK: [[LABEL1:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovl
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL1]]
+ %2 = atomicrmw min i8* @sc8, i8 6 acquire
+; CHECK: [[LABEL3:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovg
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL3]]
+ %3 = atomicrmw umax i8* @sc8, i8 7 acquire
+; CHECK: [[LABEL5:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmovb
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL5]]
+ %4 = atomicrmw umin i8* @sc8, i8 8 acquire
+; CHECK: [[LABEL7:\.?LBB[0-9]+_[0-9]+]]:
+; CHECK: cmpb
+; CHECK: cmova
+; CHECK: lock
+; CHECK-NEXT: cmpxchgb
+; CHECK: jne [[LABEL7]]
+ ret void
+}
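
The bracketed [[LABEL1:...]] patterns capture each loop header emitted for the expanded byte-sized min/max and then require the matching backedge jne to target that same label, so the test verifies the loop structure rather than mere instruction presence; CHECK-NEXT additionally pins each cmpxchgb immediately after its lock prefix.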
diff --git a/test/CodeGen/X86/sincos.ll b/test/CodeGen/X86/sincos.ll
index 1479be1f56..734f48ae32 100644
--- a/test/CodeGen/X86/sincos.ll
+++ b/test/CodeGen/X86/sincos.ll
@@ -1,6 +1,7 @@
; Make sure this testcase codegens to the sin and cos instructions, not calls
; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=SIN
; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 -enable-unsafe-fp-math | FileCheck %s --check-prefix=COS
+; RUN: llc < %s -mtriple=i686-apple-macosx -mattr=-sse,-sse2,-sse3 | FileCheck %s --check-prefix=SAFE
declare float @sinf(float) readonly
@@ -17,6 +18,9 @@ define float @test1(float %X) {
; SIN-NOT: fsin
+; SAFE: test1
+; SAFE-NOT: fsin
+
; SIN: test2:
define double @test2(double %X) {
%Y = call double @sin(double %X) readonly
@@ -26,6 +30,9 @@ define double @test2(double %X) {
; SIN-NOT: fsin
+; SAFE: test2
+; SAFE-NOT: fsin
+
; SIN: test3:
define x86_fp80 @test3(x86_fp80 %X) {
%Y = call x86_fp80 @sinl(x86_fp80 %X) readonly
@@ -50,12 +57,18 @@ define float @test4(float %X) {
}
; COS: {{^[ \t]*fcos}}
+; SAFE: test4
+; SAFE-NOT: fcos
+
define double @test5(double %X) {
%Y = call double @cos(double %X) readonly
ret double %Y
}
; COS: {{^[ \t]*fcos}}
+; SAFE: test5
+; SAFE-NOT: fcos
+
define x86_fp80 @test6(x86_fp80 %X) {
%Y = call x86_fp80 @cosl(x86_fp80 %X) readonly
ret x86_fp80 %Y
diff --git a/test/CodeGen/X86/tailcall-64.ll b/test/CodeGen/X86/tailcall-64.ll
index 7030753415..adc8620060 100644
--- a/test/CodeGen/X86/tailcall-64.ll
+++ b/test/CodeGen/X86/tailcall-64.ll
@@ -93,4 +93,42 @@ define { i64, i64 } @crash(i8* %this) {
ret { i64, i64 } %mrv7
}
+; Check that we can fold an indexed load into a tail call instruction.
+; CHECK: fold_indexed_load
+; CHECK: leaq (%rsi,%rsi,4), %[[RAX:r..]]
+; CHECK: jmpq *16(%{{r..}},%[[RAX]],8) # TAILCALL
+%struct.funcs = type { i32 (i8*, i32*, i32)*, i32 (i8*)*, i32 (i8*)*, i32 (i8*, i32)*, i32 }
+@func_table = external global [0 x %struct.funcs]
+define void @fold_indexed_load(i8* %mbstr, i64 %idxprom) nounwind uwtable ssp {
+entry:
+ %dsplen = getelementptr inbounds [0 x %struct.funcs]* @func_table, i64 0, i64 %idxprom, i32 2
+ %x1 = load i32 (i8*)** %dsplen, align 8
+ %call = tail call i32 %x1(i8* %mbstr) nounwind
+ ret void
+}
+; <rdar://problem/12282281> Fold an indexed load into the tail call instruction.
+; Calling a varargs function with 6 arguments requires 7 registers (%al is the
+; vector count for varargs functions). This leaves %r11 as the only available
+; scratch register.
+;
+; It is not possible to fold an indexed load into TCRETURNmi64 in that case.
+;
+; typedef int (*funcptr)(void*, ...);
+; extern const funcptr funcs[];
+; int f(int n) {
+; return funcs[n](0, 0, 0, 0, 0, 0);
+; }
+;
+; CHECK: rdar12282281
+; CHECK: jmpq *%r11 # TAILCALL
+@funcs = external constant [0 x i32 (i8*, ...)*]
+
+define i32 @rdar12282281(i32 %n) nounwind uwtable ssp {
+entry:
+ %idxprom = sext i32 %n to i64
+ %arrayidx = getelementptr inbounds [0 x i32 (i8*, ...)*]* @funcs, i64 0, i64 %idxprom
+ %0 = load i32 (i8*, ...)** %arrayidx, align 8
+ %call = tail call i32 (i8*, ...)* %0(i8* null, i32 0, i32 0, i32 0, i32 0, i32 0) nounwind
+ ret i32 %call
+}
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
new file mode 100644
index 0000000000..486dafeb5a
--- /dev/null
+++ b/test/CodeGen/X86/xmulo.ll
@@ -0,0 +1,50 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+declare i32 @printf(i8*, ...)
+
+@.str = private unnamed_addr constant [10 x i8] c"%llx, %d\0A\00", align 1
+
+define i32 @t1() nounwind {
+; CHECK: t1:
+; CHECK: movl $0, 12(%esp)
+; CHECK: movl $0, 8(%esp)
+; CHECK: movl $72, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
+
+define i32 @t2() nounwind {
+; CHECK: t2:
+; CHECK: movl $0, 12(%esp)
+; CHECK: movl $0, 8(%esp)
+; CHECK: movl $0, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
+
+define i32 @t3() nounwind {
+; CHECK: t3:
+; CHECK: movl $1, 12(%esp)
+; CHECK: movl $-1, 8(%esp)
+; CHECK: movl $-9, 4(%esp)
+
+ %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
+ %2 = extractvalue {i64, i1} %1, 0
+ %3 = extractvalue {i64, i1} %1, 1
+ %4 = zext i1 %3 to i32
+ %5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
+ ret i32 0
+}
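
The t3 constants can be worked out from 9 * (2^64 - 1): the truncated product is 2^64 - 9, i.e. -9 (low half $-9 at 4(%esp), high half $-1 at 8(%esp)), and since the true product exceeds 64 bits the overflow bit pushed at 12(%esp) is $1.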
diff --git a/test/DebugInfo/2010-04-13-PubType.ll b/test/DebugInfo/X86/2010-04-13-PubType.ll
index 559f032cb3..559f032cb3 100644
--- a/test/DebugInfo/2010-04-13-PubType.ll
+++ b/test/DebugInfo/X86/2010-04-13-PubType.ll
diff --git a/test/DebugInfo/X86/DW_AT_object_pointer.ll b/test/DebugInfo/X86/DW_AT_object_pointer.ll
new file mode 100644
index 0000000000..163a1e7cec
--- /dev/null
+++ b/test/DebugInfo/X86/DW_AT_object_pointer.ll
@@ -0,0 +1,79 @@
+; RUN: llc -mtriple=x86_64-apple-darwin %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump %t | FileCheck %s
+
+; CHECK: DW_AT_object_pointer [DW_FORM_ref4] (cu + 0x00bf => {0x000000bf})
+; CHECK: 0x000000bf: DW_TAG_formal_parameter [12]
+; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( .debug_str[0x00000085] = "this")
+
+%class.A = type { i32 }
+
+define i32 @_Z3foov() nounwind uwtable ssp {
+entry:
+ %a = alloca %class.A, align 4
+ call void @llvm.dbg.declare(metadata !{%class.A* %a}, metadata !21), !dbg !23
+ call void @_ZN1AC1Ev(%class.A* %a), !dbg !24
+ %m_a = getelementptr inbounds %class.A* %a, i32 0, i32 0, !dbg !25
+ %0 = load i32* %m_a, align 4, !dbg !25
+ ret i32 %0, !dbg !25
+}
+
+declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone
+
+define linkonce_odr void @_ZN1AC1Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.A*, align 8
+ store %class.A* %this, %class.A** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !26), !dbg !28
+ %this1 = load %class.A** %this.addr
+ call void @_ZN1AC2Ev(%class.A* %this1), !dbg !29
+ ret void, !dbg !29
+}
+
+define linkonce_odr void @_ZN1AC2Ev(%class.A* %this) unnamed_addr nounwind uwtable ssp align 2 {
+entry:
+ %this.addr = alloca %class.A*, align 8
+ store %class.A* %this, %class.A** %this.addr, align 8
+ call void @llvm.dbg.declare(metadata !{%class.A** %this.addr}, metadata !30), !dbg !31
+ %this1 = load %class.A** %this.addr
+ %m_a = getelementptr inbounds %class.A* %this1, i32 0, i32 0, !dbg !32
+ store i32 0, i32* %m_a, align 4, !dbg !32
+ ret void, !dbg !34
+}
+
+!llvm.dbg.cu = !{!0}
+
+!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", metadata !"clang version 3.2 (trunk 163586) (llvm/trunk 163570)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/Users/echristo/debug-tests/bar.cpp] [DW_LANG_C_plus_plus]
+!1 = metadata !{metadata !2}
+!2 = metadata !{i32 0}
+!3 = metadata !{metadata !4}
+!4 = metadata !{metadata !5, metadata !10, metadata !20}
+!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"foo", metadata !"foo", metadata !"_Z3foov", metadata !6, i32 7, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 ()* @_Z3foov, null, null, metadata !1, i32 7} ; [ DW_TAG_subprogram ] [line 7] [def] [foo]
+!6 = metadata !{i32 786473, metadata !"bar.cpp", metadata !"/Users/echristo/debug-tests", null} ; [ DW_TAG_file_type ]
+!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!8 = metadata !{metadata !9}
+!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed]
+!10 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC1Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC1Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!11 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
+!12 = metadata !{null, metadata !13}
+!13 = metadata !{i32 786447, i32 0, metadata !"", i32 0, i32 0, i64 64, i64 64, i64 0, i32 1088, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!14 = metadata !{i32 786434, null, metadata !"A", metadata !6, i32 1, i64 32, i64 32, i32 0, i32 0, null, metadata !15, i32 0, null, null} ; [ DW_TAG_class_type ] [A] [line 1, size 32, align 32, offset 0] [from ]
+!15 = metadata !{metadata !16, metadata !17}
+!16 = metadata !{i32 786445, metadata !14, metadata !"m_a", metadata !6, i32 4, i64 32, i64 32, i64 0, i32 0, metadata !9} ; [ DW_TAG_member ] [m_a] [line 4, size 32, align 32, offset 0] [from int]
+!17 = metadata !{i32 786478, i32 0, metadata !14, metadata !"A", metadata !"A", metadata !"", metadata !6, i32 3, metadata !11, i1 false, i1 false, i32 0, i32 0, null, i32 256, i1 false, null, null, i32 0, metadata !18, i32 3} ; [ DW_TAG_subprogram ] [line 3] [A]
+!18 = metadata !{metadata !19}
+!19 = metadata !{i32 786468} ; [ DW_TAG_base_type ] [line 0, size 0, align 0, offset 0]
+!20 = metadata !{i32 786478, i32 0, null, metadata !"A", metadata !"A", metadata !"_ZN1AC2Ev", metadata !6, i32 3, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void (%class.A*)* @_ZN1AC2Ev, null, metadata !17, metadata !1, i32 3} ; [ DW_TAG_subprogram ] [line 3] [def] [A]
+!21 = metadata !{i32 786688, metadata !22, metadata !"a", metadata !6, i32 8, metadata !14, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [a] [line 8]
+!22 = metadata !{i32 786443, metadata !5, i32 7, i32 11, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!23 = metadata !{i32 8, i32 5, metadata !22, null}
+!24 = metadata !{i32 8, i32 6, metadata !22, null}
+!25 = metadata !{i32 9, i32 3, metadata !22, null}
+!26 = metadata !{i32 786689, metadata !10, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!27 = metadata !{i32 786447, null, metadata !"", null, i32 0, i64 64, i64 64, i64 0, i32 0, metadata !14} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from A]
+!28 = metadata !{i32 3, i32 3, metadata !10, null}
+!29 = metadata !{i32 3, i32 18, metadata !10, null}
+!30 = metadata !{i32 786689, metadata !20, metadata !"this", metadata !6, i32 16777219, metadata !27, i32 1088, i32 0} ; [ DW_TAG_arg_variable ] [this] [line 3]
+!31 = metadata !{i32 3, i32 3, metadata !20, null}
+!32 = metadata !{i32 3, i32 9, metadata !33, null}
+!33 = metadata !{i32 786443, metadata !20, i32 3, i32 7, metadata !6, i32 1} ; [ DW_TAG_lexical_block ] [/Users/echristo/debug-tests/bar.cpp]
+!34 = metadata !{i32 3, i32 18, metadata !33, null}
diff --git a/test/DebugInfo/linkage-name.ll b/test/DebugInfo/X86/linkage-name.ll
index b98492383a..b98492383a 100644
--- a/test/DebugInfo/linkage-name.ll
+++ b/test/DebugInfo/X86/linkage-name.ll
diff --git a/test/MC/ARM/arm-shift-encoding.s b/test/MC/ARM/arm-shift-encoding.s
new file mode 100644
index 0000000000..3c57b67f6e
--- /dev/null
+++ b/test/MC/ARM/arm-shift-encoding.s
@@ -0,0 +1,119 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple armv7 -show-encoding < %s | FileCheck %s
+
+ ldr r0, [r0, r0]
+ ldr r0, [r0, r0, lsr #32]
+ ldr r0, [r0, r0, lsr #16]
+ ldr r0, [r0, r0, lsl #0]
+ ldr r0, [r0, r0, lsl #16]
+ ldr r0, [r0, r0, asr #32]
+ ldr r0, [r0, r0, asr #16]
+ ldr r0, [r0, r0, rrx]
+ ldr r0, [r0, r0, ror #16]
+
+@ CHECK: ldr r0, [r0, r0] @ encoding: [0x00,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0] @ encoding: [0x00,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, rrx] @ encoding: [0x60,0x00,0x90,0xe7]
+@ CHECK: ldr r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x90,0xe7]
+
+ pld [r0, r0]
+ pld [r0, r0, lsr #32]
+ pld [r0, r0, lsr #16]
+ pld [r0, r0, lsl #0]
+ pld [r0, r0, lsl #16]
+ pld [r0, r0, asr #32]
+ pld [r0, r0, asr #16]
+ pld [r0, r0, rrx]
+ pld [r0, r0, ror #16]
+
+@ CHECK: [r0, r0] @ encoding: [0x00,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsr #32] @ encoding: [0x20,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsr #16] @ encoding: [0x20,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0] @ encoding: [0x00,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, lsl #16] @ encoding: [0x00,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0, asr #32] @ encoding: [0x40,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, asr #16] @ encoding: [0x40,0xf8,0xd0,0xf7]
+@ CHECK: [r0, r0, rrx] @ encoding: [0x60,0xf0,0xd0,0xf7]
+@ CHECK: [r0, r0, ror #16] @ encoding: [0x60,0xf8,0xd0,0xf7]
+
+ str r0, [r0, r0]
+ str r0, [r0, r0, lsr #32]
+ str r0, [r0, r0, lsr #16]
+ str r0, [r0, r0, lsl #0]
+ str r0, [r0, r0, lsl #16]
+ str r0, [r0, r0, asr #32]
+ str r0, [r0, r0, asr #16]
+ str r0, [r0, r0, rrx]
+ str r0, [r0, r0, ror #16]
+
+@ CHECK: str r0, [r0, r0] @ encoding: [0x00,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsr #32] @ encoding: [0x20,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsr #16] @ encoding: [0x20,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0] @ encoding: [0x00,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, lsl #16] @ encoding: [0x00,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, asr #32] @ encoding: [0x40,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, asr #16] @ encoding: [0x40,0x08,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, rrx] @ encoding: [0x60,0x00,0x80,0xe7]
+@ CHECK: str r0, [r0, r0, ror #16] @ encoding: [0x60,0x08,0x80,0xe7]
+
+@ Uses printAddrMode2OffsetOperand(), used by LDRBT_POST_IMM LDRBT_POST_REG
+@ LDRB_POST_IMM LDRB_POST_REG LDRT_POST_IMM LDRT_POST_REG LDR_POST_IMM
+@ LDR_POST_REG STRBT_POST_IMM STRBT_POST_REG STRB_POST_IMM STRB_POST_REG
+@ STRT_POST_IMM STRT_POST_REG STR_POST_IMM STR_POST_REG
+
+ ldr r0, [r1], r2, rrx
+ ldr r3, [r4], r5, ror #0
+ str r6, [r7], r8, lsl #0
+ str r9, [r10], r11
+
+@ CHECK: ldr r0, [r1], r2, rrx @ encoding: [0x62,0x00,0x91,0xe6]
+@ CHECK: ldr r3, [r4], r5 @ encoding: [0x05,0x30,0x94,0xe6]
+@ CHECK: str r6, [r7], r8 @ encoding: [0x08,0x60,0x87,0xe6]
+@ CHECK: str r9, [r10], r11 @ encoding: [0x0b,0x90,0x8a,0xe6]
+
+@ Uses printSORegImmOperand(), used by ADCrsi ADDrsi ANDrsi BICrsi EORrsi
+@ ORRrsi RSBrsi RSCrsi SBCrsi SUBrsi CMNzrsi CMPrsi MOVsi MVNsi TEQrsi TSTrsi
+
+ adc sp, lr, pc
+ adc r1, r8, r9, lsr #32
+ adc r2, r7, pc, lsr #16
+ adc r3, r6, r10, lsl #0
+ adc r4, r5, lr, lsl #16
+ adc r5, r4, r11, asr #32
+ adc r6, r3, sp, asr #16
+ adc r7, r2, r12, rrx
+ adc r8, r1, r0, ror #16
+
+@ CHECK: adc sp, lr, pc @ encoding: [0x0f,0xd0,0xae,0xe0]
+@ CHECK: adc r1, r8, r9, lsr #32 @ encoding: [0x29,0x10,0xa8,0xe0]
+@ CHECK: adc r2, r7, pc, lsr #16 @ encoding: [0x2f,0x28,0xa7,0xe0]
+@ CHECK: adc r3, r6, r10 @ encoding: [0x0a,0x30,0xa6,0xe0]
+@ CHECK: adc r4, r5, lr, lsl #16 @ encoding: [0x0e,0x48,0xa5,0xe0]
+@ CHECK: adc r5, r4, r11, asr #32 @ encoding: [0x4b,0x50,0xa4,0xe0]
+@ CHECK: adc r6, r3, sp, asr #16 @ encoding: [0x4d,0x68,0xa3,0xe0]
+@ CHECK: adc r7, r2, r12, rrx @ encoding: [0x6c,0x70,0xa2,0xe0]
+@ CHECK: adc r8, r1, r0, ror #16 @ encoding: [0x60,0x88,0xa1,0xe0]
+
+ cmp sp, lr
+ cmp r1, r8, lsr #32
+ cmp r2, r7, lsr #16
+ cmp r3, r6, lsl #0
+ cmp r4, r5, lsl #16
+ cmp r5, r4, asr #32
+ cmp r6, r3, asr #16
+ cmp r7, r2, rrx
+ cmp r8, r1, ror #16
+
+@ CHECK: cmp sp, lr @ encoding: [0x0e,0x00,0x5d,0xe1]
+@ CHECK: cmp r1, r8, lsr #32 @ encoding: [0x28,0x00,0x51,0xe1]
+@ CHECK: cmp r2, r7, lsr #16 @ encoding: [0x27,0x08,0x52,0xe1]
+@ CHECK: cmp r3, r6 @ encoding: [0x06,0x00,0x53,0xe1]
+@ CHECK: cmp r4, r5, lsl #16 @ encoding: [0x05,0x08,0x54,0xe1]
+@ CHECK: cmp r5, r4, asr #32 @ encoding: [0x44,0x00,0x55,0xe1]
+@ CHECK: cmp r6, r3, asr #16 @ encoding: [0x43,0x08,0x56,0xe1]
+@ CHECK: cmp r7, r2, rrx @ encoding: [0x62,0x00,0x57,0xe1]
+@ CHECK: cmp r8, r1, ror #16 @ encoding: [0x61,0x08,0x58,0xe1]
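
Across these encodings the degenerate shifts behave consistently: lsl #0 and ror #0 assemble to the plain unshifted-register form (hence the collapsed printing), lsr #32 and asr #32 are legal spellings that encode an immediate field of zero, and rrx occupies the ror-with-zero-immediate encoding.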
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index 499e055013..ad4d9ab9b5 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -47,7 +47,47 @@
@ CHECK-ERRORS: error: immediate shift value out of range
@ CHECK-ERRORS: adc r4, r5, r6, ror #32
+ @ Out of range shift immediate values for load/store.
+ str r1, [r2, r3, lsl #invalid]
+ ldr r4, [r5], r6, lsl #-1
+ pld r4, [r5, r6, lsl #32]
+ str r4, [r5], r6, lsr #-1
+ ldr r4, [r5, r6, lsr #33]
+ pld r4, [r5, r6, asr #-1]
+ str r4, [r5, r6, asr #33]
+ ldr r4, [r5, r6, ror #-1]
+ pld r4, [r5, r6, ror #32]
+ pld r4, [r5, r6, rrx #0]
+@ CHECK-ERRORS: error: shift amount must be an immediate
+@ CHECK-ERRORS: str r1, [r2, r3, lsl #invalid]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: ldr r4, [r5], r6, lsl #-1
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: pld r4, [r5, r6, lsl #32]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: str r4, [r5], r6, lsr #-1
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: ldr r4, [r5, r6, lsr #33]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: pld r4, [r5, r6, asr #-1]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: str r4, [r5, r6, asr #33]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: ldr r4, [r5, r6, ror #-1]
+@ CHECK-ERRORS: ^
+@ CHECK-ERRORS: error: immediate shift value out of range
+@ CHECK-ERRORS: pld r4, [r5, r6, ror #32]
+@ CHECK-ERRORS: error: ']' expected
+@ CHECK-ERRORS: pld r4, [r5, r6, rrx #0]
+
@ Out of range 16-bit immediate on BKPT
bkpt #65536
diff --git a/test/MC/ARM/thumb-shift-encoding.s b/test/MC/ARM/thumb-shift-encoding.s
new file mode 100644
index 0000000000..54284132b6
--- /dev/null
+++ b/test/MC/ARM/thumb-shift-encoding.s
@@ -0,0 +1,45 @@
+@ RUN: llvm-mc -mcpu=cortex-a8 -triple thumbv7 -show-encoding < %s | FileCheck %s
+
+@ Uses printT2SOOperand(), used by t2ADCrs t2ADDrs t2ANDrs t2BICrs t2EORrs
+@ t2ORNrs t2ORRrs t2RSBrs t2SBCrs t2SUBrs t2CMNzrs t2CMPrs t2MOVSsi t2MOVsi
+@ t2MVNs t2TEQrs t2TSTrs
+
+ sbc.w r12, lr, r0
+ sbc.w r1, r8, r9, lsr #32
+ sbc.w r2, r7, pc, lsr #16
+ sbc.w r3, r6, r10, lsl #0
+ sbc.w r4, r5, lr, lsl #16
+ sbc.w r5, r4, r11, asr #32
+ sbc.w r6, r3, sp, asr #16
+ sbc.w r7, r2, r12, rrx
+ sbc.w r8, r1, r0, ror #16
+
+@ CHECK: sbc.w r12, lr, r0 @ encoding: [0x6e,0xeb,0x00,0x0c]
+@ CHECK: sbc.w r1, r8, r9, lsr #32 @ encoding: [0x68,0xeb,0x19,0x01]
+@ CHECK: sbc.w r2, r7, pc, lsr #16 @ encoding: [0x67,0xeb,0x1f,0x42]
+@ CHECK: sbc.w r3, r6, r10 @ encoding: [0x66,0xeb,0x0a,0x03]
+@ CHECK: sbc.w r4, r5, lr, lsl #16 @ encoding: [0x65,0xeb,0x0e,0x44]
+@ CHECK: sbc.w r5, r4, r11, asr #32 @ encoding: [0x64,0xeb,0x2b,0x05]
+@ CHECK: sbc.w r6, r3, sp, asr #16 @ encoding: [0x63,0xeb,0x2d,0x46]
+@ CHECK: sbc.w r7, r2, r12, rrx @ encoding: [0x62,0xeb,0x3c,0x07]
+@ CHECK: sbc.w r8, r1, r0, ror #16 @ encoding: [0x61,0xeb,0x30,0x48]
+
+ and.w r12, lr, r0
+ and.w r1, r8, r9, lsr #32
+ and.w r2, r7, pc, lsr #16
+ and.w r3, r6, r10, lsl #0
+ and.w r4, r5, lr, lsl #16
+ and.w r5, r4, r11, asr #32
+ and.w r6, r3, sp, asr #16
+ and.w r7, r2, r12, rrx
+ and.w r8, r1, r0, ror #16
+
+@ CHECK: and.w r12, lr, r0 @ encoding: [0x0e,0xea,0x00,0x0c]
+@ CHECK: and.w r1, r8, r9, lsr #32 @ encoding: [0x08,0xea,0x19,0x01]
+@ CHECK: and.w r2, r7, pc, lsr #16 @ encoding: [0x07,0xea,0x1f,0x42]
+@ CHECK: and.w r3, r6, r10 @ encoding: [0x06,0xea,0x0a,0x03]
+@ CHECK: and.w r4, r5, lr, lsl #16 @ encoding: [0x05,0xea,0x0e,0x44]
+@ CHECK: and.w r5, r4, r11, asr #32 @ encoding: [0x04,0xea,0x2b,0x05]
+@ CHECK: and.w r6, r3, sp, asr #16 @ encoding: [0x03,0xea,0x2d,0x46]
+@ CHECK: and.w r7, r2, r12, rrx @ encoding: [0x02,0xea,0x3c,0x07]
+@ CHECK: and.w r8, r1, r0, ror #16 @ encoding: [0x01,0xea,0x30,0x48]
diff --git a/test/MC/AsmParser/macro-args.s b/test/MC/AsmParser/macro-args.s
index 6d084213e4..3269369be0 100644
--- a/test/MC/AsmParser/macro-args.s
+++ b/test/MC/AsmParser/macro-args.s
@@ -4,10 +4,18 @@
movl \var@GOTOFF(%ebx),\re2g
.endm
+.macro GET_DEFAULT var, re2g=%ebx, re3g=%ecx
+movl 2(\re2g, \re3g, 2), \var
+.endm
+
+GET is_sse, %eax
+// CHECK: movl is_sse@GOTOFF(%ebx), %eax
-GET is_sse, %eax
+GET_DEFAULT %ebx, , %edx
+// CHECK: movl 2(%ebx,%edx,2), %ebx
-// CHECK: movl is_sse@GOTOFF(%ebx), %eax
+GET_DEFAULT %ebx, %edx
+// CHECK: movl 2(%edx,%ecx,2), %ebx
.macro bar
.long $n
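
The new GET_DEFAULT cases pin down default-argument handling: an explicitly empty slot (%ebx, , %edx) falls back to the declared default %ebx for re2g, while omitting a trailing argument (%ebx, %edx) binds positionally and lets re3g take its default %ecx.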
diff --git a/test/MC/AsmParser/macro-rept-err1.s b/test/MC/AsmParser/macro-rept-err1.s
index db92856a1d..cfa66878d9 100644
--- a/test/MC/AsmParser/macro-rept-err1.s
+++ b/test/MC/AsmParser/macro-rept-err1.s
@@ -3,4 +3,4 @@
.endr
-// CHECK: unexpected '.endr' directive, no current .rept
+// CHECK: unmatched '.endr' directive
diff --git a/test/MC/AsmParser/macros-darwin.s b/test/MC/AsmParser/macros-darwin.s
new file mode 100644
index 0000000000..31b9edb378
--- /dev/null
+++ b/test/MC/AsmParser/macros-darwin.s
@@ -0,0 +1,9 @@
+// RUN: not llvm-mc -triple i386-apple-darwin10 %s 2> %t.err | FileCheck %s
+
+.macro test1
+.globl "$0 $1 $2 $$3 $n"
+.endmacro
+
+// CHECK: .globl "1 23 $3 2"
+test1 1, 2 3
+
diff --git a/test/MC/AsmParser/macros.s b/test/MC/AsmParser/macros.s
index dd2cc1f149..b1cb851fcd 100644
--- a/test/MC/AsmParser/macros.s
+++ b/test/MC/AsmParser/macros.s
@@ -1,4 +1,4 @@
-// RUN: not llvm-mc -triple x86_64-apple-darwin10 %s 2> %t.err | FileCheck %s
+// RUN: not llvm-mc -triple i386-unknown-unknown %s 2> %t.err | FileCheck %s
// RUN: FileCheck --check-prefix=CHECK-ERRORS %s < %t.err
.macro .test0
@@ -28,33 +28,66 @@ test2 10
.globl "$0 $1 $2 $$3 $n"
.endmacro
-// CHECK: .globl "1 23 $3 2"
-test3 1,2 3
+// CHECK: .globl "1 (23) $3 2"
+test3 1, (2 3)
+
+// CHECK: .globl "1 2 $3 2"
+test3 1 2
.macro test4
.globl "$0 -- $1"
.endmacro
-// CHECK: .globl "ab)(,) -- (cd)"
-test4 a b)(,),(cd)
+// CHECK: .globl "(ab)(,)) -- (cd)"
+test4 (a b)(,)),(cd)
+
+// CHECK: .globl "(ab)(,)) -- (cd)"
+test4 (a b)(,)),(cd)
.macro test5 _a
.globl "\_a"
.endm
-test5 zed1
// CHECK: .globl zed1
+test5 zed1
.macro test6 $a
.globl "\$a"
.endm
-test6 zed2
// CHECK: .globl zed2
+test6 zed2
.macro test7 .a
.globl "\.a"
.endm
-test7 zed3
// CHECK: .globl zed3
+test7 zed3
+
+.macro test8 _a, _b, _c
+.globl "\_a,\_b,\_c"
+.endmacro
+
+.macro test9 _a _b _c
+.globl "\_a \_b \_c"
+.endmacro
+
+// CHECK: .globl "a,b,c"
+test8 a, b, c
+// CHECK: .globl "%1,%2,%3"
+test8 %1 %2 %3 #a comment
+// CHECK: .globl "x-y,z,1"
+test8 x - y z 1
+// CHECK: .globl "1 2 3"
+test9 1, 2,3
+
+test8 1,2 3
+// CHECK-ERRORS: error: macro argument '_c' is missing
+// CHECK-ERRORS-NEXT: test8 1,2 3
+// CHECK-ERRORS-NEXT: ^
+
+test8 1 2, 3
+// CHECK-ERRORS: error: expected ' ' for macro argument separator
+// CHECK-ERRORS-NEXT:test8 1 2, 3
+// CHECK-ERRORS-NEXT: ^
diff --git a/test/MC/COFF/global_ctors.ll b/test/MC/COFF/global_ctors_dtors.ll
index 4d6b1c7d99..2a25219a77 100644
--- a/test/MC/COFF/global_ctors.ll
+++ b/test/MC/COFF/global_ctors_dtors.ll
@@ -1,14 +1,16 @@
; Test that global ctors are emitted into the proper COFF section for the
; target. Mingw uses .ctors, whereas MSVC uses .CRT$XC*.
-; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32
-; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32
-; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32
-; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32
+; RUN: llc < %s -mtriple i686-pc-win32 | FileCheck %s --check-prefix WIN32
+; RUN: llc < %s -mtriple x86_64-pc-win32 | FileCheck %s --check-prefix WIN32
+; RUN: llc < %s -mtriple i686-pc-mingw32 | FileCheck %s --check-prefix MINGW32
+; RUN: llc < %s -mtriple x86_64-pc-mingw32 | FileCheck %s --check-prefix MINGW32
@.str = private unnamed_addr constant [13 x i8] c"constructing\00", align 1
-@.str2 = private unnamed_addr constant [5 x i8] c"main\00", align 1
+@.str2 = private unnamed_addr constant [12 x i8] c"destructing\00", align 1
+@.str3 = private unnamed_addr constant [5 x i8] c"main\00", align 1
@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_ctor }]
+@llvm.global_dtors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @a_global_dtor }]
declare i32 @puts(i8*)
@@ -17,12 +19,21 @@ define void @a_global_ctor() nounwind {
ret void
}
+define void @a_global_dtor() nounwind {
+ %1 = call i32 @puts(i8* getelementptr inbounds ([12 x i8]* @.str2, i32 0, i32 0))
+ ret void
+}
+
define i32 @main() nounwind {
- %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str2, i32 0, i32 0))
+ %1 = call i32 @puts(i8* getelementptr inbounds ([5 x i8]* @.str3, i32 0, i32 0))
ret i32 0
}
; WIN32: .section .CRT$XCU,"r"
; WIN32: a_global_ctor
+; WIN32: .section .CRT$XTX,"r"
+; WIN32: a_global_dtor
; MINGW32: .section .ctors,"w"
; MINGW32: a_global_ctor
+; MINGW32: .section .dtors,"w"
+; MINGW32: a_global_dtor
diff --git a/test/MC/MachO/absolute.s b/test/MC/MachO/absolute.s
new file mode 100644
index 0000000000..784e32a7e4
--- /dev/null
+++ b/test/MC/MachO/absolute.s
@@ -0,0 +1,158 @@
+// RUN: llvm-mc -triple x86_64-apple-darwin10 %s -filetype=obj -o - | macho-dump | FileCheck %s
+
+_bar:
+ nop
+_foo:
+ nop
+
+ .set foo_set1, (_foo + 0xffff0000)
+ .set foo_set2, (_foo - _bar + 0xffff0000)
+
+foo_equals = (_foo + 0xffff0000)
+foo_equals2 = (_foo - _bar + 0xffff0000)
+
+ .globl foo_set1_global;
+ .set foo_set1_global, (_foo + 0xffff0000)
+
+ .globl foo_set2_global;
+ .set foo_set2_global, (_foo - _bar + 0xffff0000)
+
+// CHECK: ('cputype', 16777223)
+// CHECK: ('cpusubtype', 3)
+// CHECK: ('filetype', 1)
+// CHECK: ('num_load_commands', 3)
+// CHECK: ('load_commands_size', 256)
+// CHECK: ('flag', 0)
+// CHECK: ('reserved', 0)
+// CHECK: ('load_commands', [
+// CHECK: # Load Command 0
+// CHECK: (('command', 25)
+// CHECK: ('size', 152)
+// CHECK: ('segment_name', '\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK: ('vm_addr', 0)
+// CHECK: ('vm_size', 2)
+// CHECK: ('file_offset', 288)
+// CHECK: ('file_size', 2)
+// CHECK: ('maxprot', 7)
+// CHECK: ('initprot', 7)
+// CHECK: ('num_sections', 1)
+// CHECK: ('flags', 0)
+// CHECK: ('sections', [
+// CHECK: # Section 0
+// CHECK: (('section_name', '__text\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK: ('segment_name', '__TEXT\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00')
+// CHECK: ('address', 0)
+// CHECK: ('size', 2)
+// CHECK: ('offset', 288)
+// CHECK: ('alignment', 0)
+// CHECK: ('reloc_offset', 0)
+// CHECK: ('num_reloc', 0)
+// CHECK: ('flags', 0x80000400)
+// CHECK: ('reserved1', 0)
+// CHECK: ('reserved2', 0)
+// CHECK: ('reserved3', 0)
+// CHECK: ),
+// CHECK: ('_relocations', [
+// CHECK: ])
+// CHECK: ])
+// CHECK: ),
+// CHECK: # Load Command 1
+// CHECK: (('command', 2)
+// CHECK: ('size', 24)
+// CHECK: ('symoff', 292)
+// CHECK: ('nsyms', 8)
+// CHECK: ('stroff', 420)
+// CHECK: ('strsize', 84)
+// CHECK: ('_string_data', '\x00foo_set1_global\x00foo_set2_global\x00_bar\x00_foo\x00foo_set1\x00foo_set2\x00foo_equals\x00foo_equals2\x00')
+// CHECK: ('_symbols', [
+// CHECK: # Symbol 0
+// CHECK: (('n_strx', 33)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 0)
+// CHECK: ('_string', '_bar')
+// CHECK: ),
+// CHECK: # Symbol 1
+// CHECK: (('n_strx', 38)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 1)
+// CHECK: ('_string', '_foo')
+// CHECK: ),
+// CHECK: # Symbol 2
+// CHECK: (('n_strx', 43)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set1')
+// CHECK: ),
+// CHECK: # Symbol 3
+// CHECK: (('n_strx', 52)
+// CHECK: ('n_type', 0x2)
+// CHECK: ('n_sect', 0)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set2')
+// CHECK: ),
+// CHECK: # Symbol 4
+// CHECK: (('n_strx', 61)
+// CHECK: ('n_type', 0xe)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_equals')
+// CHECK: ),
+// CHECK: # Symbol 5
+// CHECK: (('n_strx', 72)
+// CHECK: ('n_type', 0x2)
+// CHECK: ('n_sect', 0)
+// CHECK: ('n_desc', 0)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_equals2')
+// CHECK: ),
+// CHECK: # Symbol 6
+// CHECK: (('n_strx', 1)
+// CHECK: ('n_type', 0xf)
+// CHECK: ('n_sect', 1)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set1_global')
+// CHECK: ),
+// CHECK: # Symbol 7
+// CHECK: (('n_strx', 17)
+// CHECK: ('n_type', 0x3)
+// CHECK: ('n_sect', 0)
+// CHECK: ('n_desc', 32)
+// CHECK: ('n_value', 4294901761)
+// CHECK: ('_string', 'foo_set2_global')
+// CHECK: ),
+// CHECK: ])
+// CHECK: ),
+// CHECK: # Load Command 2
+// CHECK: (('command', 11)
+// CHECK: ('size', 80)
+// CHECK: ('ilocalsym', 0)
+// CHECK: ('nlocalsym', 6)
+// CHECK: ('iextdefsym', 6)
+// CHECK: ('nextdefsym', 2)
+// CHECK: ('iundefsym', 8)
+// CHECK: ('nundefsym', 0)
+// CHECK: ('tocoff', 0)
+// CHECK: ('ntoc', 0)
+// CHECK: ('modtaboff', 0)
+// CHECK: ('nmodtab', 0)
+// CHECK: ('extrefsymoff', 0)
+// CHECK: ('nextrefsyms', 0)
+// CHECK: ('indirectsymoff', 0)
+// CHECK: ('nindirectsyms', 0)
+// CHECK: ('extreloff', 0)
+// CHECK: ('nextrel', 0)
+// CHECK: ('locreloff', 0)
+// CHECK: ('nlocrel', 0)
+// CHECK: ('_indirect_symbols', [
+// CHECK: ])
+// CHECK: ),
+// CHECK: ])
diff --git a/test/MC/X86/x86_nop.s b/test/MC/X86/x86_nop.s
new file mode 100644
index 0000000000..de0fc08834
--- /dev/null
+++ b/test/MC/X86/x86_nop.s
@@ -0,0 +1,7 @@
+# RUN: llvm-mc -filetype=obj -arch=x86 -mcpu=geode %s -o %t
+# RUN: llvm-objdump -disassemble %t | FileCheck %s
+
+# CHECK-NOT: nopw
+inc %eax
+.align 8
+inc %eax
diff --git a/test/Object/nm-shared-object.test b/test/Object/nm-shared-object.test
index b361df5355..a57b9401ad 100644
--- a/test/Object/nm-shared-object.test
+++ b/test/Object/nm-shared-object.test
@@ -1,15 +1,23 @@
RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-i386 \
-RUN: | FileCheck %s -check-prefix ELF
+RUN: | FileCheck %s -check-prefix ELF-32
RUN: llvm-nm -D %p/Inputs/shared-object-test.elf-x86-64 \
-RUN: | FileCheck %s -check-prefix ELF
+RUN: | FileCheck %s -check-prefix ELF-64
; Note: tls_sym should be 'D' (not '?'), but TLS is not
; yet recognized by ObjectFile.
-ELF: {{[0-9a-f]+}} A __bss_start
-ELF: {{[0-9a-f]+}} A _edata
-ELF: {{[0-9a-f]+}} A _end
-ELF: {{[0-9a-f]+}} B common_sym
-ELF: {{[0-9a-f]+}} D defined_sym
-ELF: {{[0-9a-f]+}} T global_func
-ELF: ? tls_sym
+ELF-32: 0012c8 A __bss_start
+ELF-32: 0012c8 A _edata
+ELF-32: 0012cc A _end
+ELF-32: 0012c8 B common_sym
+ELF-32: 0012c4 D defined_sym
+ELF-32: 0001f0 T global_func
+ELF-32: ? tls_sym
+
+ELF-64: 200454 A __bss_start
+ELF-64: 200454 A _edata
+ELF-64: 200458 A _end
+ELF-64: 200454 B common_sym
+ELF-64: 200450 D defined_sym
+ELF-64: 0002f0 T global_func
+ELF-64: ? tls_sym
diff --git a/test/Object/objdump-symbol-table.test b/test/Object/objdump-symbol-table.test
index 989ec04a8d..c94b077735 100644
--- a/test/Object/objdump-symbol-table.test
+++ b/test/Object/objdump-symbol-table.test
@@ -4,6 +4,8 @@ RUN: llvm-objdump -t %p/Inputs/trivial-object-test.elf-i386 \
RUN: | FileCheck %s -check-prefix ELF-i386
RUN: llvm-objdump -t %p/Inputs/trivial-object-test.macho-i386 \
RUN: | FileCheck %s -check-prefix macho-i386
+RUN: llvm-objdump -t %p/Inputs/shared-object-test.elf-i386 \
+RUN: | FileCheck %s -check-prefix ELF-shared
COFF-i386: file format
COFF-i386: SYMBOL TABLE:
@@ -31,3 +33,9 @@ macho-i386: SYMBOL TABLE:
macho-i386: 00000000 g F __TEXT,__text 00000024 _main
macho-i386: 00000000 *UND* 00000000 _SomeOtherFunction
macho-i386: 00000000 *UND* 00000000 _puts
+
+ELF-shared: shared-object-test.elf-i386: file format
+ELF-shared: SYMBOL TABLE:
+ELF-shared: 00000200 l F .text 00000003 local_func
+ELF-shared: 000012c4 g .data 00000004 defined_sym
+ELF-shared: 000001f0 g F .text 00000003 global_func
diff --git a/test/Other/FileCheck-space.txt b/test/Other/FileCheck-space.txt
new file mode 100644
index 0000000000..6bbe5bc05b
--- /dev/null
+++ b/test/Other/FileCheck-space.txt
@@ -0,0 +1,9 @@
+RUN: printf "a\nb" | FileCheck %s -check-prefix=TEST1
+RUN: echo oo | FileCheck %s -check-prefix=TEST2
+
+Check that CHECK-NEXT without a space after the colon works.
+TEST1:a
+TEST1-NEXT:b
+
+Check that CHECK-NOT without a space after the colon works.
+TEST2-NOT:foo
diff --git a/test/Other/Inputs/llvm-cov.gcda b/test/Other/Inputs/llvm-cov.gcda
new file mode 100644
index 0000000000..9ae2286ea2
--- /dev/null
+++ b/test/Other/Inputs/llvm-cov.gcda
Binary files differ
diff --git a/test/Other/Inputs/llvm-cov.gcno b/test/Other/Inputs/llvm-cov.gcno
new file mode 100644
index 0000000000..25e202386a
--- /dev/null
+++ b/test/Other/Inputs/llvm-cov.gcno
Binary files differ
diff --git a/test/Other/lit.local.cfg b/test/Other/lit.local.cfg
index 19eebc0ac7..2693077242 100644
--- a/test/Other/lit.local.cfg
+++ b/test/Other/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.ll', '.c', '.cpp']
+config.suffixes = ['.ll', '.c', '.cpp', '.txt']
diff --git a/test/Other/llvm-cov.test b/test/Other/llvm-cov.test
new file mode 100644
index 0000000000..c0aa203e2c
--- /dev/null
+++ b/test/Other/llvm-cov.test
@@ -0,0 +1,3 @@
+PR11760
+RUN: llvm-cov -gcda=%S/Inputs/llvm-cov.gcda -gcno=%S/Inputs/llvm-cov.gcno
+
diff --git a/test/Other/llvm-nm-without-aliases.ll b/test/Other/llvm-nm-without-aliases.ll
new file mode 100644
index 0000000000..9d9408c13b
--- /dev/null
+++ b/test/Other/llvm-nm-without-aliases.ll
@@ -0,0 +1,25 @@
+; RUN: llvm-as < %s > %t
+; RUN: llvm-nm -without-aliases < %t | FileCheck %s
+; RUN: llvm-nm < %t | FileCheck --check-prefix=WITH %s
+
+; CHECK-NOT: T a0bar
+; CHECK-NOT: T a0foo
+; CHECK: T bar
+; CHECK: T foo
+
+; WITH: T a0bar
+; WITH: T a0foo
+; WITH: T bar
+; WITH: T foo
+
+@a0foo = alias void ()* @foo
+
+define void @foo() {
+ ret void
+}
+
+@a0bar = alias void ()* @bar
+
+define void @bar() {
+ ret void
+}
diff --git a/test/Transforms/Inline/recursive.ll b/test/Transforms/Inline/recursive.ll
new file mode 100644
index 0000000000..5fe8d1639c
--- /dev/null
+++ b/test/Transforms/Inline/recursive.ll
@@ -0,0 +1,38 @@
+; RUN: opt %s -inline -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i386-apple-darwin10.0"
+
+; rdar://10853263
+
+; Make sure that the callee is still here.
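+; (Presumed rationale: inlining @callee's 100000-byte alloca into the
+; recursive @caller would replicate that stack allocation in every frame.)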
+; CHECK: define i32 @callee
+define i32 @callee(i32 %param) {
+ %yyy = alloca [100000 x i8]
+ %r = bitcast [100000 x i8]* %yyy to i8*
+ call void @foo2(i8* %r)
+ ret i32 4
+}
+
+; CHECK: define i32 @caller
+; CHECK-NEXT: entry:
+; CHECK-NOT: alloca
+; CHECK: ret
+define i32 @caller(i32 %param) {
+entry:
+ %t = call i32 @foo(i32 %param)
+ %cmp = icmp eq i32 %t, -1
+ br i1 %cmp, label %exit, label %cont
+
+cont:
+ %r = call i32 @caller(i32 %t)
+ %f = call i32 @callee(i32 %r)
+ br label %cont
+exit:
+ ret i32 4
+}
+
+declare void @foo2(i8* %in)
+
+declare i32 @foo(i32 %param)
+
diff --git a/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
new file mode 100644
index 0000000000..ba025e92b0
--- /dev/null
+++ b/test/Transforms/InstCombine/2012-09-17-ZeroSizedAlloca.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -instcombine < %s | FileCheck %s
+
+; When merging zero-sized allocas, check that the requested alignments of the
+; allocas are obeyed.
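+; (The merged alloca must keep the stricter of the two alignments, 1024, as
+; the CHECK lines below require.)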
+
+@x = global i8* null, align 8
+@y = global i8* null, align 8
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; CHECK: @f
+; CHECK-NEXT: alloca [0 x i8], align 1024
+; CHECK-NOT: alloca
+; CHECK: ret void
+define void @f() {
+ %1 = alloca [0 x i8], align 1
+ %2 = alloca [0 x i8], align 1024
+ %3 = getelementptr inbounds [0 x i8]* %1, i64 0, i64 0
+ %4 = getelementptr inbounds [0 x i8]* %2, i64 0, i64 0
+ store i8* %3, i8** @x, align 8
+ store i8* %4, i8** @y, align 8
+ ret void
+}
diff --git a/test/Transforms/InstCombine/div-shift.ll b/test/Transforms/InstCombine/div-shift.ll
index a07f3ea949..e0372ebac1 100644
--- a/test/Transforms/InstCombine/div-shift.ll
+++ b/test/Transforms/InstCombine/div-shift.ll
@@ -21,3 +21,17 @@ define i64 @t2(i64 %x, i32 %y) nounwind {
%3 = udiv i64 %x, %2
ret i64 %3
}
+
+; PR13250
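+; x udiv (4 << y) folds to x lshr (y + 2), since 4 << y == 1 << (y + 2); the
+; CHECK lines below verify the widened shift amount.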
+define i64 @t3(i64 %x, i32 %y) nounwind {
+; CHECK: t3
+; CHECK-NOT: udiv
+; CHECK-NEXT: %1 = add i32 %y, 2
+; CHECK-NEXT: %2 = zext i32 %1 to i64
+; CHECK-NEXT: %3 = lshr i64 %x, %2
+; CHECK-NEXT: ret i64 %3
+ %1 = shl i32 4, %y
+ %2 = zext i32 %1 to i64
+ %3 = udiv i64 %x, %2
+ ret i64 %3
+}
diff --git a/test/Transforms/InstCombine/struct-assign-tbaa.ll b/test/Transforms/InstCombine/struct-assign-tbaa.ll
new file mode 100644
index 0000000000..4fbdb0ab67
--- /dev/null
+++ b/test/Transforms/InstCombine/struct-assign-tbaa.ll
@@ -0,0 +1,26 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+%struct.foo = type { float }
+
+; Verify that instcombine preserves TBAA tags when converting a memcpy into
+; a scalar load and store.
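+; (The !tbaa.struct operand !3 describes a single field at offset 0 of size 4
+; with type !2, "float", which is what justifies the plain float !tbaa tag on
+; the scalar accesses checked below.)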
+
+; CHECK: %2 = load float* %0, align 4, !tbaa !0
+; CHECK: store float %2, float* %1, align 4, !tbaa !0
+; CHECK: !0 = metadata !{metadata !"float", metadata !1}
+define void @test(%struct.foo* nocapture %a, %struct.foo* nocapture %b) {
+entry:
+ %0 = bitcast %struct.foo* %a to i8*
+ %1 = bitcast %struct.foo* %b to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 4, i32 4, i1 false), !tbaa.struct !3
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
+!0 = metadata !{metadata !"Simple C/C++ TBAA"}
+!1 = metadata !{metadata !"omnipotent char", metadata !0}
+!2 = metadata !{metadata !"float", metadata !0}
+!3 = metadata !{i64 0, i64 4, metadata !2}
diff --git a/test/Transforms/LoopIdiom/non-canonical-loop.ll b/test/Transforms/LoopIdiom/non-canonical-loop.ll
new file mode 100644
index 0000000000..a6a4f9227f
--- /dev/null
+++ b/test/Transforms/LoopIdiom/non-canonical-loop.ll
@@ -0,0 +1,34 @@
+; RUN: opt -S -loop-idiom < %s
+; Don't crash
+; PR13892
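+; (The trigger is presumably the indirectbr edge into %for.bodyprime, which
+; cannot be split and so leaves the loop in a non-canonical form.)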
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test(i32* %currMB) nounwind uwtable {
+entry:
+ br i1 undef, label %start.exit, label %if.then.i
+
+if.then.i: ; preds = %entry
+ unreachable
+
+start.exit: ; preds = %entry
+ indirectbr i8* undef, [label %0, label %for.bodyprime]
+
+; <label>:0 ; preds = %start.exit
+ unreachable
+
+for.bodyprime: ; preds = %for.bodyprime, %start.exit
+ %i.057375 = phi i32 [ 0, %start.exit ], [ %1, %for.bodyprime ]
+ %arrayidx8prime = getelementptr inbounds i32* %currMB, i32 %i.057375
+ store i32 0, i32* %arrayidx8prime, align 4
+ %1 = add i32 %i.057375, 1
+ %cmp5prime = icmp slt i32 %1, 4
+ br i1 %cmp5prime, label %for.bodyprime, label %for.endprime
+
+for.endprime: ; preds = %for.bodyprime
+ br label %for.body23prime
+
+for.body23prime: ; preds = %for.body23prime, %for.endprime
+ br label %for.body23prime
+}
diff --git a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
index a6996a81fb..af3a53708b 100644
--- a/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
+++ b/test/Transforms/LoopStrengthReduce/2011-10-03-CritEdgeMerge.ll
@@ -1,15 +1,15 @@
; RUN: opt -loop-reduce -S < %s | FileCheck %s
;
; Test LSR's use of SplitCriticalEdge during phi rewriting.
-; Verify that identical edges are merged. rdar://problem/6453893
target triple = "x86-apple-darwin"
-; CHECK: @test
+; Verify that identical edges are merged. rdar://problem/6453893
+; CHECK: @test1
; CHECK: bb89:
; CHECK: phi i8* [ %lsr.iv.next1, %bbA.bb89_crit_edge ], [ %lsr.iv.next1, %bbB.bb89_crit_edge ]{{$}}
-define i8* @test() {
+define i8* @test1() {
entry:
br label %loop
@@ -41,3 +41,41 @@ bb89:
exit:
ret i8* %tmp75phi
}
+
+; Handle single-predecessor phis: PR13756
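+; The switch in %bbA names %bb89 three times, so the phi in %bb89 carries
+; three incoming entries for a single predecessor; LSR must rewrite them all.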
+; CHECK: @test2
+; CHECK: bb89:
+; CHECK: phi i8* [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ], [ %lsr.iv.next1, %bbA ]{{$}}
+define i8* @test2() {
+entry:
+ br label %loop
+
+loop:
+ %rec = phi i32 [ %next, %loop ], [ 0, %entry ]
+ %next = add i32 %rec, 1
+ %tmp75 = getelementptr i8* null, i32 %next
+ br i1 false, label %loop, label %loopexit
+
+loopexit:
+ br i1 false, label %bbA, label %bbB
+
+bbA:
+ switch i32 0, label %bb89 [
+ i32 47, label %bb89
+ i32 58, label %bb89
+ ]
+
+bbB:
+ switch i8 0, label %exit [
+ i8 47, label %exit
+ i8 58, label %exit
+ ]
+
+bb89:
+ %tmp75phi = phi i8* [ %tmp75, %bbA ], [ %tmp75, %bbA ], [ %tmp75, %bbA ]
+ br label %exit
+
+exit:
+ %result = phi i8* [ %tmp75phi, %bb89 ], [ %tmp75, %bbB ], [ %tmp75, %bbB ], [ %tmp75, %bbB ]
+ ret i8* %result
+}
diff --git a/test/Transforms/MemCpyOpt/form-memset.ll b/test/Transforms/MemCpyOpt/form-memset.ll
index 8832f897b0..f63b1dcfdd 100644
--- a/test/Transforms/MemCpyOpt/form-memset.ll
+++ b/test/Transforms/MemCpyOpt/form-memset.ll
@@ -248,3 +248,27 @@ entry:
; CHECK: @test8
; CHECK: store <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>, <4 x i32>* %0, align 16
}
+
+@test9buf = internal unnamed_addr global [16 x i64] zeroinitializer, align 16
+
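+; The sixteen consecutive i8 stores of -1 below should be merged into one
+; 16-byte memset; see the CHECK lines at the end of @test9.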
+define void @test9() nounwind {
+ store i8 -1, i8* bitcast ([16 x i64]* @test9buf to i8*), align 16
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 1), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 2), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 3), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 4), align 4
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 5), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 6), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 7), align 1
+ store i8 -1, i8* bitcast (i64* getelementptr inbounds ([16 x i64]* @test9buf, i64 0, i64 1) to i8*), align 8
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 9), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 10), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 11), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 12), align 4
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 13), align 1
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 14), align 2
+ store i8 -1, i8* getelementptr (i8* bitcast ([16 x i64]* @test9buf to i8*), i64 15), align 1
+ ret void
+; CHECK: @test9(
+; CHECK: call void @llvm.memset.p0i8.i64(i8* bitcast ([16 x i64]* @test9buf to i8*), i8 -1, i64 16, i32 16, i1 false)
+}
diff --git a/test/Transforms/MetaRenamer/lit.local.cfg b/test/Transforms/MetaRenamer/lit.local.cfg
index 19eebc0ac7..c6106e4746 100644
--- a/test/Transforms/MetaRenamer/lit.local.cfg
+++ b/test/Transforms/MetaRenamer/lit.local.cfg
@@ -1 +1 @@
-config.suffixes = ['.ll', '.c', '.cpp']
+config.suffixes = ['.ll']
diff --git a/test/Transforms/ObjCARC/path-overflow.ll b/test/Transforms/ObjCARC/path-overflow.ll
new file mode 100644
index 0000000000..e7866ed1b4
--- /dev/null
+++ b/test/Transforms/ObjCARC/path-overflow.ll
@@ -0,0 +1,329 @@
+; RUN: opt -objc-arc -S < %s
+; rdar://12277446
+
+; The total number of paths grows exponentially with the number of branches, and a
+; computation of this number can overflow any reasonable fixed-sized integer.
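+; (Illustration: a straight-line chain of N if/else diamonds yields 2^N
+; distinct paths, so N = 64 already overflows a 64-bit counter.)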
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.0.0"
+
+%struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768 = type { i32*, i32, i8*, i32 }
+
+@_unnamed_cfstring_591 = external constant %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768, section "__DATA,__cfstring"
+
+declare i8* @objc_retain(i8*) nonlazybind
+
+declare void @objc_release(i8*) nonlazybind
+
+define hidden void @foo() {
+entry:
+ br i1 undef, label %msgSend.nullinit, label %msgSend.call
+
+msgSend.call: ; preds = %entry
+ br label %msgSend.cont
+
+msgSend.nullinit: ; preds = %entry
+ br label %msgSend.cont
+
+msgSend.cont: ; preds = %msgSend.nullinit, %msgSend.call
+ %0 = bitcast %struct.NSConstantString.11.33.55.77.99.121.143.332.1130.1340.2768* @_unnamed_cfstring_591 to i8*
+ %1 = call i8* @objc_retain(i8* %0) nounwind
+ br i1 undef, label %msgSend.nullinit33, label %msgSend.call32
+
+msgSend.call32: ; preds = %if.end10
+ br label %msgSend.cont34
+
+msgSend.nullinit33: ; preds = %if.end10
+ br label %msgSend.cont34
+
+msgSend.cont34: ; preds = %msgSend.nullinit33, %msgSend.call32
+ br i1 undef, label %msgSend.nullinit38, label %msgSend.call37
+
+msgSend.call37: ; preds = %msgSend.cont34
+ br label %msgSend.cont39
+
+msgSend.nullinit38: ; preds = %msgSend.cont34
+ br label %msgSend.cont39
+
+msgSend.cont39: ; preds = %msgSend.nullinit38, %msgSend.call37
+ br i1 undef, label %msgSend.nullinit49, label %msgSend.call48
+
+msgSend.call48: ; preds = %msgSend.cont39
+ br label %msgSend.cont50
+
+msgSend.nullinit49: ; preds = %msgSend.cont39
+ br label %msgSend.cont50
+
+msgSend.cont50: ; preds = %msgSend.nullinit49, %msgSend.call48
+ br i1 undef, label %msgSend.nullinit61, label %msgSend.call60
+
+msgSend.call60: ; preds = %msgSend.cont50
+ br label %msgSend.cont62
+
+msgSend.nullinit61: ; preds = %msgSend.cont50
+ br label %msgSend.cont62
+
+msgSend.cont62: ; preds = %msgSend.nullinit61, %msgSend.call60
+ br i1 undef, label %msgSend.nullinit67, label %msgSend.call66
+
+msgSend.call66: ; preds = %msgSend.cont62
+ br label %msgSend.cont68
+
+msgSend.nullinit67: ; preds = %msgSend.cont62
+ br label %msgSend.cont68
+
+msgSend.cont68: ; preds = %msgSend.nullinit67, %msgSend.call66
+ br i1 undef, label %msgSend.nullinit84, label %msgSend.call83
+
+msgSend.call83: ; preds = %msgSend.cont68
+ br label %msgSend.cont85
+
+msgSend.nullinit84: ; preds = %msgSend.cont68
+ br label %msgSend.cont85
+
+msgSend.cont85: ; preds = %msgSend.nullinit84, %msgSend.call83
+ br i1 undef, label %msgSend.nullinit90, label %msgSend.call89
+
+msgSend.call89: ; preds = %msgSend.cont85
+ br label %msgSend.cont91
+
+msgSend.nullinit90: ; preds = %msgSend.cont85
+ br label %msgSend.cont91
+
+msgSend.cont91: ; preds = %msgSend.nullinit90, %msgSend.call89
+ br i1 undef, label %msgSend.nullinit104, label %msgSend.call103
+
+msgSend.call103: ; preds = %msgSend.cont91
+ br label %msgSend.cont105
+
+msgSend.nullinit104: ; preds = %msgSend.cont91
+ br label %msgSend.cont105
+
+msgSend.cont105: ; preds = %msgSend.nullinit104, %msgSend.call103
+ br i1 undef, label %land.lhs.true, label %if.end123
+
+land.lhs.true: ; preds = %msgSend.cont105
+ br i1 undef, label %if.then117, label %if.end123
+
+if.then117: ; preds = %land.lhs.true
+ br label %if.end123
+
+if.end123: ; preds = %if.then117, %land.lhs.true, %msgSend.cont105
+ br i1 undef, label %msgSend.nullinit132, label %msgSend.call131
+
+msgSend.call131: ; preds = %if.end123
+ br label %msgSend.cont133
+
+msgSend.nullinit132: ; preds = %if.end123
+ br label %msgSend.cont133
+
+msgSend.cont133: ; preds = %msgSend.nullinit132, %msgSend.call131
+ br i1 undef, label %msgSend.nullinit139, label %msgSend.call138
+
+msgSend.call138: ; preds = %msgSend.cont133
+ br label %msgSend.cont140
+
+msgSend.nullinit139: ; preds = %msgSend.cont133
+ br label %msgSend.cont140
+
+msgSend.cont140: ; preds = %msgSend.nullinit139, %msgSend.call138
+ br i1 undef, label %if.then151, label %if.end157
+
+if.then151: ; preds = %msgSend.cont140
+ br label %if.end157
+
+if.end157: ; preds = %if.then151, %msgSend.cont140
+ br i1 undef, label %msgSend.nullinit164, label %msgSend.call163
+
+msgSend.call163: ; preds = %if.end157
+ br label %msgSend.cont165
+
+msgSend.nullinit164: ; preds = %if.end157
+ br label %msgSend.cont165
+
+msgSend.cont165: ; preds = %msgSend.nullinit164, %msgSend.call163
+ br i1 undef, label %msgSend.nullinit176, label %msgSend.call175
+
+msgSend.call175: ; preds = %msgSend.cont165
+ br label %msgSend.cont177
+
+msgSend.nullinit176: ; preds = %msgSend.cont165
+ br label %msgSend.cont177
+
+msgSend.cont177: ; preds = %msgSend.nullinit176, %msgSend.call175
+ br i1 undef, label %land.lhs.true181, label %if.end202
+
+land.lhs.true181: ; preds = %msgSend.cont177
+ br i1 undef, label %if.then187, label %if.end202
+
+if.then187: ; preds = %land.lhs.true181
+ br i1 undef, label %msgSend.nullinit199, label %msgSend.call198
+
+msgSend.call198: ; preds = %if.then187
+ br label %msgSend.cont200
+
+msgSend.nullinit199: ; preds = %if.then187
+ br label %msgSend.cont200
+
+msgSend.cont200: ; preds = %msgSend.nullinit199, %msgSend.call198
+ br label %if.end202
+
+if.end202: ; preds = %msgSend.cont200, %land.lhs.true181, %msgSend.cont177
+ br i1 undef, label %msgSend.nullinit236, label %msgSend.call235
+
+msgSend.call235: ; preds = %if.end202
+ br label %msgSend.cont237
+
+msgSend.nullinit236: ; preds = %if.end202
+ br label %msgSend.cont237
+
+msgSend.cont237: ; preds = %msgSend.nullinit236, %msgSend.call235
+ br i1 undef, label %msgSend.nullinit254, label %msgSend.call253
+
+msgSend.call253: ; preds = %msgSend.cont237
+ br label %msgSend.cont255
+
+msgSend.nullinit254: ; preds = %msgSend.cont237
+ br label %msgSend.cont255
+
+msgSend.cont255: ; preds = %msgSend.nullinit254, %msgSend.call253
+ br i1 undef, label %msgSend.nullinit269, label %msgSend.call268
+
+msgSend.call268: ; preds = %msgSend.cont255
+ br label %msgSend.cont270
+
+msgSend.nullinit269: ; preds = %msgSend.cont255
+ br label %msgSend.cont270
+
+msgSend.cont270: ; preds = %msgSend.nullinit269, %msgSend.call268
+ br i1 undef, label %msgSend.nullinit281, label %msgSend.call280
+
+msgSend.call280: ; preds = %msgSend.cont270
+ br label %msgSend.cont282
+
+msgSend.nullinit281: ; preds = %msgSend.cont270
+ br label %msgSend.cont282
+
+msgSend.cont282: ; preds = %msgSend.nullinit281, %msgSend.call280
+ br i1 undef, label %msgSend.nullinit287, label %msgSend.call286
+
+msgSend.call286: ; preds = %msgSend.cont282
+ br label %msgSend.cont288
+
+msgSend.nullinit287: ; preds = %msgSend.cont282
+ br label %msgSend.cont288
+
+msgSend.cont288: ; preds = %msgSend.nullinit287, %msgSend.call286
+ br i1 undef, label %msgSend.nullinit303, label %msgSend.call302
+
+msgSend.call302: ; preds = %msgSend.cont288
+ br label %msgSend.cont304
+
+msgSend.nullinit303: ; preds = %msgSend.cont288
+ br label %msgSend.cont304
+
+msgSend.cont304: ; preds = %msgSend.nullinit303, %msgSend.call302
+ br i1 undef, label %msgSend.nullinit344, label %msgSend.call343
+
+msgSend.call343: ; preds = %msgSend.cont304
+ br label %msgSend.cont345
+
+msgSend.nullinit344: ; preds = %msgSend.cont304
+ br label %msgSend.cont345
+
+msgSend.cont345: ; preds = %msgSend.nullinit344, %msgSend.call343
+ br i1 undef, label %msgSend.nullinit350, label %msgSend.call349
+
+msgSend.call349: ; preds = %msgSend.cont345
+ br label %msgSend.cont351
+
+msgSend.nullinit350: ; preds = %msgSend.cont345
+ br label %msgSend.cont351
+
+msgSend.cont351: ; preds = %msgSend.nullinit350, %msgSend.call349
+ br i1 undef, label %msgSend.nullinit366, label %msgSend.call365
+
+msgSend.call365: ; preds = %msgSend.cont351
+ br label %msgSend.cont367
+
+msgSend.nullinit366: ; preds = %msgSend.cont351
+ br label %msgSend.cont367
+
+msgSend.cont367: ; preds = %msgSend.nullinit366, %msgSend.call365
+ br i1 undef, label %msgSend.nullinit376, label %msgSend.call375
+
+msgSend.call375: ; preds = %msgSend.cont367
+ br label %msgSend.cont377
+
+msgSend.nullinit376: ; preds = %msgSend.cont367
+ br label %msgSend.cont377
+
+msgSend.cont377: ; preds = %msgSend.nullinit376, %msgSend.call375
+ br i1 undef, label %if.then384, label %if.else401
+
+if.then384: ; preds = %msgSend.cont377
+ br i1 undef, label %msgSend.nullinit392, label %msgSend.call391
+
+msgSend.call391: ; preds = %if.then384
+ br label %msgSend.cont393
+
+msgSend.nullinit392: ; preds = %if.then384
+ br label %msgSend.cont393
+
+msgSend.cont393: ; preds = %msgSend.nullinit392, %msgSend.call391
+ br label %if.end418
+
+if.else401: ; preds = %msgSend.cont377
+ br i1 undef, label %msgSend.nullinit409, label %msgSend.call408
+
+msgSend.call408: ; preds = %if.else401
+ br label %msgSend.cont410
+
+msgSend.nullinit409: ; preds = %if.else401
+ br label %msgSend.cont410
+
+msgSend.cont410: ; preds = %msgSend.nullinit409, %msgSend.call408
+ br label %if.end418
+
+if.end418: ; preds = %msgSend.cont410, %msgSend.cont393
+ br i1 undef, label %msgSend.nullinit470, label %msgSend.call469
+
+msgSend.call469: ; preds = %if.end418
+ br label %msgSend.cont471
+
+msgSend.nullinit470: ; preds = %if.end418
+ br label %msgSend.cont471
+
+msgSend.cont471: ; preds = %msgSend.nullinit470, %msgSend.call469
+ br i1 undef, label %msgSend.nullinit484, label %msgSend.call483
+
+msgSend.call483: ; preds = %msgSend.cont471
+ br label %msgSend.cont485
+
+msgSend.nullinit484: ; preds = %msgSend.cont471
+ br label %msgSend.cont485
+
+msgSend.cont485: ; preds = %msgSend.nullinit484, %msgSend.call483
+ br i1 undef, label %msgSend.nullinit500, label %msgSend.call499
+
+msgSend.call499: ; preds = %msgSend.cont485
+ br label %msgSend.cont501
+
+msgSend.nullinit500: ; preds = %msgSend.cont485
+ br label %msgSend.cont501
+
+msgSend.cont501: ; preds = %msgSend.nullinit500, %msgSend.call499
+ br i1 undef, label %msgSend.nullinit506, label %msgSend.call505
+
+msgSend.call505: ; preds = %msgSend.cont501
+ br label %msgSend.cont507
+
+msgSend.nullinit506: ; preds = %msgSend.cont501
+ br label %msgSend.cont507
+
+msgSend.cont507: ; preds = %msgSend.nullinit506, %msgSend.call505
+ call void @objc_release(i8* %0) nounwind, !clang.imprecise_release !0
+ ret void
+}
+
+!0 = metadata !{}
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
new file mode 100644
index 0000000000..a61de05f45
--- /dev/null
+++ b/test/Transforms/SROA/basictest.ll
@@ -0,0 +1,857 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
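+; (The second RUN line presumably forces the SSAUpdater-based promotion path,
+; so both rewriting strategies are exercised against the same checks.)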
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
+
+define i32 @test0() {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: ret i32
+
+entry:
+ %a1 = alloca i32
+ %a2 = alloca float
+
+ %a1.i8 = bitcast i32* %a1 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %a1.i8)
+
+ store i32 0, i32* %a1
+ %v1 = load i32* %a1
+
+ call void @llvm.lifetime.end(i64 4, i8* %a1.i8)
+
+ %a2.i8 = bitcast float* %a2 to i8*
+ call void @llvm.lifetime.start(i64 4, i8* %a2.i8)
+
+ store float 0.0, float* %a2
+ %v2 = load float * %a2
+ %v2.int = bitcast float %v2 to i32
+ %sum1 = add i32 %v1, %v2.int
+
+ call void @llvm.lifetime.end(i64 4, i8* %a2.i8)
+
+ ret i32 %sum1
+}
+
+define i32 @test1() {
+; CHECK: @test1
+; CHECK-NOT: alloca
+; CHECK: ret i32 0
+
+entry:
+ %X = alloca { i32, float }
+ %Y = getelementptr { i32, float }* %X, i64 0, i32 0
+ store i32 0, i32* %Y
+ %Z = load i32* %Y
+ ret i32 %Z
+}
+
+define i64 @test2(i64 %X) {
+; CHECK: @test2
+; CHECK-NOT: alloca
+; CHECK: ret i64 %X
+
+entry:
+ %A = alloca [8 x i8]
+ %B = bitcast [8 x i8]* %A to i64*
+ store i64 %X, i64* %B
+ br label %L2
+
+L2:
+ %Z = load i64* %B
+ ret i64 %Z
+}
+
+define void @test3(i8* %dst, i8* %src) {
+; CHECK: @test3
+
+entry:
+ %a = alloca [300 x i8]
+; CHECK-NOT: alloca
+; CHECK: %[[test3_a1:.*]] = alloca [42 x i8]
+; CHECK-NEXT: %[[test3_a2:.*]] = alloca [99 x i8]
+; CHECK-NEXT: %[[test3_a3:.*]] = alloca [16 x i8]
+; CHECK-NEXT: %[[test3_a4:.*]] = alloca [42 x i8]
+; CHECK-NEXT: %[[test3_a5:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test3_a6:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test3_a7:.*]] = alloca [85 x i8]
+
+ %b = getelementptr [300 x i8]* %a, i64 0, i64 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 300, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 42
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42
+; CHECK-NEXT: %[[test3_r1:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 142
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 158
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 200
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 207
+; CHECK-NEXT: %[[test3_r2:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 208
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 215
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85
+
+  ; Clobber a single element of the array; this should be promotable.
+ %c = getelementptr [300 x i8]* %a, i64 0, i64 42
+ store i8 0, i8* %c
+
+ ; Make a sequence of overlapping stores to the array. These overlap both in
+ ; forward strides and in shrinking accesses.
+ %overlap.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 142
+ %overlap.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 143
+ %overlap.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 144
+ %overlap.4.i8 = getelementptr [300 x i8]* %a, i64 0, i64 145
+ %overlap.5.i8 = getelementptr [300 x i8]* %a, i64 0, i64 146
+ %overlap.6.i8 = getelementptr [300 x i8]* %a, i64 0, i64 147
+ %overlap.7.i8 = getelementptr [300 x i8]* %a, i64 0, i64 148
+ %overlap.8.i8 = getelementptr [300 x i8]* %a, i64 0, i64 149
+ %overlap.9.i8 = getelementptr [300 x i8]* %a, i64 0, i64 150
+ %overlap.1.i16 = bitcast i8* %overlap.1.i8 to i16*
+ %overlap.1.i32 = bitcast i8* %overlap.1.i8 to i32*
+ %overlap.1.i64 = bitcast i8* %overlap.1.i8 to i64*
+ %overlap.2.i64 = bitcast i8* %overlap.2.i8 to i64*
+ %overlap.3.i64 = bitcast i8* %overlap.3.i8 to i64*
+ %overlap.4.i64 = bitcast i8* %overlap.4.i8 to i64*
+ %overlap.5.i64 = bitcast i8* %overlap.5.i8 to i64*
+ %overlap.6.i64 = bitcast i8* %overlap.6.i8 to i64*
+ %overlap.7.i64 = bitcast i8* %overlap.7.i8 to i64*
+ %overlap.8.i64 = bitcast i8* %overlap.8.i8 to i64*
+ %overlap.9.i64 = bitcast i8* %overlap.9.i8 to i64*
+ store i8 1, i8* %overlap.1.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+ store i16 1, i16* %overlap.1.i16
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+ store i32 1, i32* %overlap.1.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i64 1, i64* %overlap.1.i64
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [16 x i8]* %[[test3_a3]] to i64*
+; CHECK-NEXT: store i64 1, i64* %[[bitcast]]
+ store i64 2, i64* %overlap.2.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 2, i64* %[[bitcast]]
+ store i64 3, i64* %overlap.3.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 3, i64* %[[bitcast]]
+ store i64 4, i64* %overlap.4.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 4, i64* %[[bitcast]]
+ store i64 5, i64* %overlap.5.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 4
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 5, i64* %[[bitcast]]
+ store i64 6, i64* %overlap.6.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 5
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 6, i64* %[[bitcast]]
+ store i64 7, i64* %overlap.7.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 6
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 7, i64* %[[bitcast]]
+ store i64 8, i64* %overlap.8.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 7
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 8, i64* %[[bitcast]]
+ store i64 9, i64* %overlap.9.i64
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 8
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i64*
+; CHECK-NEXT: store i64 9, i64* %[[bitcast]]
+
+ ; Make two sequences of overlapping stores with more gaps and irregularities.
+ %overlap2.1.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 200
+ %overlap2.1.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 201
+ %overlap2.1.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 202
+ %overlap2.1.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 203
+
+ %overlap2.2.0.i8 = getelementptr [300 x i8]* %a, i64 0, i64 208
+ %overlap2.2.1.i8 = getelementptr [300 x i8]* %a, i64 0, i64 209
+ %overlap2.2.2.i8 = getelementptr [300 x i8]* %a, i64 0, i64 210
+ %overlap2.2.3.i8 = getelementptr [300 x i8]* %a, i64 0, i64 211
+
+ %overlap2.1.0.i16 = bitcast i8* %overlap2.1.0.i8 to i16*
+ %overlap2.1.0.i32 = bitcast i8* %overlap2.1.0.i8 to i32*
+ %overlap2.1.1.i32 = bitcast i8* %overlap2.1.1.i8 to i32*
+ %overlap2.1.2.i32 = bitcast i8* %overlap2.1.2.i8 to i32*
+ %overlap2.1.3.i32 = bitcast i8* %overlap2.1.3.i8 to i32*
+ store i8 1, i8* %overlap2.1.0.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+ store i16 1, i16* %overlap2.1.0.i16
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+ store i32 1, i32* %overlap2.1.0.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a5]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i32 2, i32* %overlap2.1.1.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 2, i32* %[[bitcast]]
+ store i32 3, i32* %overlap2.1.2.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 3, i32* %[[bitcast]]
+ store i32 4, i32* %overlap2.1.3.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 4, i32* %[[bitcast]]
+
+ %overlap2.2.0.i32 = bitcast i8* %overlap2.2.0.i8 to i32*
+ %overlap2.2.1.i16 = bitcast i8* %overlap2.2.1.i8 to i16*
+ %overlap2.2.1.i32 = bitcast i8* %overlap2.2.1.i8 to i32*
+ %overlap2.2.2.i32 = bitcast i8* %overlap2.2.2.i8 to i32*
+ %overlap2.2.3.i32 = bitcast i8* %overlap2.2.3.i8 to i32*
+ store i32 1, i32* %overlap2.2.0.i32
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast [7 x i8]* %[[test3_a6]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i8 1, i8* %overlap2.2.1.i8
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: store i8 1, i8* %[[gep]]
+ store i16 1, i16* %overlap2.2.1.i16
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 1, i16* %[[bitcast]]
+ store i32 1, i32* %overlap2.2.1.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 1, i32* %[[bitcast]]
+ store i32 3, i32* %overlap2.2.2.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 3, i32* %[[bitcast]]
+ store i32 4, i32* %overlap2.2.3.i32
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 3
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i32*
+; CHECK-NEXT: store i32 4, i32* %[[bitcast]]
+
+ %overlap2.prefix = getelementptr i8* %overlap2.1.1.i8, i64 -4
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.prefix, i8* %src, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 39
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %src, i32 3
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 3
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 5
+
+  ; Bridge between the overlapping areas.
+ call void @llvm.memset.p0i8.i32(i8* %overlap2.1.2.i8, i8 42, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 2
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 5
+; ...promoted i8 store...
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[gep]], i8 42, i32 2
+
+ ; Entirely within the second overlap.
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.1.i8, i8* %src, i32 5, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 1
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5
+
+ ; Trailing past the second overlap.
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %overlap2.2.2.i8, i8* %src, i32 8, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 2
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 5
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 5
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 3
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 300, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 42
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42
+; CHECK-NEXT: store i8 0, i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [99 x i8]* %[[test3_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 99
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 142
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [16 x i8]* %[[test3_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 16
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 158
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [42 x i8]* %[[test3_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 42
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 200
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 207
+; CHECK-NEXT: store i8 42, i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 208
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test3_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 215
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [85 x i8]* %[[test3_a7]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 85
+
+ ret void
+}
+
+define void @test4(i8* %dst, i8* %src) {
+; CHECK: @test4
+
+entry:
+ %a = alloca [100 x i8]
+; CHECK-NOT: alloca
+; CHECK: %[[test4_a1:.*]] = alloca [20 x i8]
+; CHECK-NEXT: %[[test4_a2:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a3:.*]] = alloca [10 x i8]
+; CHECK-NEXT: %[[test4_a4:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a5:.*]] = alloca [7 x i8]
+; CHECK-NEXT: %[[test4_a6:.*]] = alloca [40 x i8]
+
+ %b = getelementptr [100 x i8]* %a, i64 0, i64 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %b, i8* %src, i32 100, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep]], i8* %src, i32 20
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 20
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r1:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 22
+; CHECK-NEXT: %[[test4_r2:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 23
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 30
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 40
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r3:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 42
+; CHECK-NEXT: %[[test4_r4:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 43
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 50
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: %[[test4_r5:.*]] = load i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %src, i64 52
+; CHECK-NEXT: %[[test4_r6:.*]] = load i8* %[[gep]]
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 53
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds i8* %src, i64 60
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40
+
+ %a.src.1 = getelementptr [100 x i8]* %a, i64 0, i64 20
+ %a.dst.1 = getelementptr [100 x i8]* %a, i64 0, i64 40
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.1, i32 10, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+
+  ; Clobber a single element of the array; this should be promotable and be deleted.
+ %c = getelementptr [100 x i8]* %a, i64 0, i64 42
+ store i8 0, i8* %c
+
+ %a.src.2 = getelementptr [100 x i8]* %a, i64 0, i64 50
+ call void @llvm.memmove.p0i8.p0i8.i32(i8* %a.dst.1, i8* %a.src.2, i32 10, i32 1, i1 false)
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %b, i32 100, i32 1, i1 false)
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds [20 x i8]* %[[test4_a1]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[gep]], i32 20
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 20
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r1]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 22
+; CHECK-NEXT: store i8 %[[test4_r2]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 23
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a2]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 30
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [10 x i8]* %[[test4_a3]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 10
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 40
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 42
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 43
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a4]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 50
+; CHECK-NEXT: %[[bitcast:.*]] = bitcast i8* %[[gep]] to i16*
+; CHECK-NEXT: store i16 %[[test4_r5]], i16* %[[bitcast]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds i8* %dst, i64 52
+; CHECK-NEXT: store i8 %[[test4_r6]], i8* %[[gep]]
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 53
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [7 x i8]* %[[test4_a5]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 7
+; CHECK-NEXT: %[[gep_dst:.*]] = getelementptr inbounds i8* %dst, i64 60
+; CHECK-NEXT: %[[gep_src:.*]] = getelementptr inbounds [40 x i8]* %[[test4_a6]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[gep_dst]], i8* %[[gep_src]], i32 40
+
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
+
+define i16 @test5() {
+; CHECK: @test5
+; CHECK: alloca float
+; CHECK: ret i16 %
+
+entry:
+ %a = alloca [4 x i8]
+ %fptr = bitcast [4 x i8]* %a to float*
+ store float 0.0, float* %fptr
+ %ptr = getelementptr [4 x i8]* %a, i32 0, i32 2
+ %iptr = bitcast i8* %ptr to i16*
+ %val = load i16* %iptr
+ ret i16 %val
+}
+
+define i32 @test6() {
+; CHECK: @test6
+; CHECK: alloca i32
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: load i32*
+; CHECK-NEXT: ret i32
+
+entry:
+ %a = alloca [4 x i8]
+ %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %ptr, i8 42, i32 4, i32 1, i1 true)
+ %iptr = bitcast i8* %ptr to i32*
+ %val = load i32* %iptr
+ ret i32 %val
+}
+
+define void @test7(i8* %src, i8* %dst) {
+; CHECK: @test7
+; CHECK: alloca i32
+; CHECK-NEXT: bitcast i8* %src to i32*
+; CHECK-NEXT: load volatile i32*
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: bitcast i8* %dst to i32*
+; CHECK-NEXT: load volatile i32*
+; CHECK-NEXT: store volatile i32
+; CHECK-NEXT: ret
+
+entry:
+ %a = alloca [4 x i8]
+ %ptr = getelementptr [4 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true)
+ ret void
+}
+
+
+%S1 = type { i32, i32, [16 x i8] }
+%S2 = type { %S1*, %S2* }
+
+define %S2 @test8(%S2* %s2) {
+; CHECK: @test8
+entry:
+ %new = alloca %S2
+; CHECK-NOT: alloca
+
+ %s2.next.ptr = getelementptr %S2* %s2, i64 0, i32 1
+ %s2.next = load %S2** %s2.next.ptr
+; CHECK: %[[gep:.*]] = getelementptr %S2* %s2, i64 0, i32 1
+; CHECK-NEXT: %[[next:.*]] = load %S2** %[[gep]]
+
+ %s2.next.s1.ptr = getelementptr %S2* %s2.next, i64 0, i32 0
+ %s2.next.s1 = load %S1** %s2.next.s1.ptr
+ %new.s1.ptr = getelementptr %S2* %new, i64 0, i32 0
+ store %S1* %s2.next.s1, %S1** %new.s1.ptr
+ %s2.next.next.ptr = getelementptr %S2* %s2.next, i64 0, i32 1
+ %s2.next.next = load %S2** %s2.next.next.ptr
+ %new.next.ptr = getelementptr %S2* %new, i64 0, i32 1
+ store %S2* %s2.next.next, %S2** %new.next.ptr
+; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 0
+; CHECK-NEXT: %[[next_s1:.*]] = load %S1** %[[gep]]
+; CHECK-NEXT: %[[gep:.*]] = getelementptr %S2* %[[next]], i64 0, i32 1
+; CHECK-NEXT: %[[next_next:.*]] = load %S2** %[[gep]]
+
+ %new.s1 = load %S1** %new.s1.ptr
+ %result1 = insertvalue %S2 undef, %S1* %new.s1, 0
+; CHECK-NEXT: %[[result1:.*]] = insertvalue %S2 undef, %S1* %[[next_s1]], 0
+ %new.next = load %S2** %new.next.ptr
+ %result2 = insertvalue %S2 %result1, %S2* %new.next, 1
+; CHECK-NEXT: %[[result2:.*]] = insertvalue %S2 %[[result1]], %S2* %[[next_next]], 1
+ ret %S2 %result2
+; CHECK-NEXT: ret %S2 %[[result2]]
+}
+
+define i64 @test9() {
+; Ensure we can handle loads off the end of an alloca even when wrapped in
+; weird bit casts and types. The result is undef, but this shouldn't crash
+; anything.
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK: ret i64 undef
+
+entry:
+ %a = alloca { [3 x i8] }
+ %gep1 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 0
+ store i8 0, i8* %gep1, align 1
+ %gep2 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 1
+ store i8 0, i8* %gep2, align 1
+ %gep3 = getelementptr inbounds { [3 x i8] }* %a, i32 0, i32 0, i32 2
+ store i8 26, i8* %gep3, align 1
+ %cast = bitcast { [3 x i8] }* %a to { i64 }*
+ %elt = getelementptr inbounds { i64 }* %cast, i32 0, i32 0
+ %result = load i64* %elt
+ ret i64 %result
+}
+
+define %S2* @test10() {
+; CHECK: @test10
+; CHECK-NOT: alloca %S2*
+; CHECK: ret %S2* null
+
+entry:
+ %a = alloca [8 x i8]
+ %ptr = getelementptr [8 x i8]* %a, i32 0, i32 0
+ call void @llvm.memset.p0i8.i32(i8* %ptr, i8 0, i32 8, i32 1, i1 false)
+ %s2ptrptr = bitcast i8* %ptr to %S2**
+ %s2ptr = load %S2** %s2ptrptr
+ ret %S2* %s2ptr
+}
+
+define i32 @test11() {
+; CHECK: @test11
+; CHECK-NOT: alloca
+; CHECK: ret i32 0
+
+entry:
+ %X = alloca i32
+ br i1 undef, label %good, label %bad
+
+good:
+ %Y = getelementptr i32* %X, i64 0
+ store i32 0, i32* %Y
+ %Z = load i32* %Y
+ ret i32 %Z
+
+bad:
+ %Y2 = getelementptr i32* %X, i64 1
+ store i32 0, i32* %Y2
+ %Z2 = load i32* %Y2
+ ret i32 %Z2
+}
+
+define i8 @test12() {
+; We fully promote these to the i24 load or store size, resulting in just masks
+; and other operations that instcombine will fold, but no alloca.
+;
+; CHECK: @test12
+
+entry:
+ %a = alloca [3 x i8]
+ %b = alloca [3 x i8]
+; CHECK-NOT: alloca
+
+ %a0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+ store i8 0, i8* %a0ptr
+ %a1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+ store i8 0, i8* %a1ptr
+ %a2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+ store i8 0, i8* %a2ptr
+ %aiptr = bitcast [3 x i8]* %a to i24*
+ %ai = load i24* %aiptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[mask0:.*]] = and i24 undef, -256
+; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[mask0]], -65281
+; CHECK-NEXT: %[[mask2:.*]] = and i24 %[[mask1]], 65535
+
+ %biptr = bitcast [3 x i8]* %b to i24*
+ store i24 %ai, i24* %biptr
+ %b0ptr = getelementptr [3 x i8]* %b, i64 0, i32 0
+ %b0 = load i8* %b0ptr
+ %b1ptr = getelementptr [3 x i8]* %b, i64 0, i32 1
+ %b1 = load i8* %b1ptr
+ %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2
+ %b2 = load i8* %b2ptr
+; CHECK-NOT: store
+; CHECK-NOT: load
+; CHECK: %[[trunc0:.*]] = trunc i24 %[[mask2]] to i8
+; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[mask2]], 8
+; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8
+; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[mask2]], 16
+; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8
+
+ %bsum0 = add i8 %b0, %b1
+ %bsum1 = add i8 %bsum0, %b2
+ ret i8 %bsum1
+; CHECK: %[[sum0:.*]] = add i8 %[[trunc0]], %[[trunc1]]
+; CHECK-NEXT: %[[sum1:.*]] = add i8 %[[sum0]], %[[trunc2]]
+; CHECK-NEXT: ret i8 %[[sum1]]
+}
+
+define i32 @test13() {
+; Ensure we handle undefined loads that straddle the end of the allocation
+; without crashing.
+; CHECK: @test13
+; CHECK: %[[ret:.*]] = zext i16 undef to i32
+; CHECK: ret i32 %[[ret]]
+
+entry:
+ %a = alloca [3 x i8]
+ %b0ptr = getelementptr [3 x i8]* %a, i64 0, i32 0
+ store i8 0, i8* %b0ptr
+ %b1ptr = getelementptr [3 x i8]* %a, i64 0, i32 1
+ store i8 0, i8* %b1ptr
+ %b2ptr = getelementptr [3 x i8]* %a, i64 0, i32 2
+ store i8 0, i8* %b2ptr
+ %iptrcast = bitcast [3 x i8]* %a to i16*
+ %iptrgep = getelementptr i16* %iptrcast, i64 1
+ %i = load i16* %iptrgep
+ %ret = zext i16 %i to i32
+ ret i32 %ret
+}
+
+%test14.struct = type { [3 x i32] }
+
+define void @test14(...) nounwind uwtable {
+; This is a strange case where we split allocas into promotable partitions, but
+; also gain enough data to prove they must be dead allocas due to GEPs that walk
+; across two adjacent allocas. Test that we don't try to promote or otherwise
+; do bad things to these dead allocas; they should just be removed.
+; CHECK: @test14
+; CHECK-NEXT: entry:
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca %test14.struct
+ %p = alloca %test14.struct*
+ %0 = bitcast %test14.struct* %a to i8*
+ %1 = getelementptr i8* %0, i64 12
+ %2 = bitcast i8* %1 to %test14.struct*
+ %3 = getelementptr inbounds %test14.struct* %2, i32 0, i32 0
+ %4 = getelementptr inbounds %test14.struct* %a, i32 0, i32 0
+ %5 = bitcast [3 x i32]* %3 to i32*
+ %6 = bitcast [3 x i32]* %4 to i32*
+ %7 = load i32* %6, align 4
+ store i32 %7, i32* %5, align 4
+ %8 = getelementptr inbounds i32* %5, i32 1
+ %9 = getelementptr inbounds i32* %6, i32 1
+ %10 = load i32* %9, align 4
+ store i32 %10, i32* %8, align 4
+ %11 = getelementptr inbounds i32* %5, i32 2
+ %12 = getelementptr inbounds i32* %6, i32 2
+ %13 = load i32* %12, align 4
+ store i32 %13, i32* %11, align 4
+ ret void
+}
+
+define i32 @test15(i1 %flag) nounwind uwtable {
+; Ensure that when there are dead instructions using an alloca that are not
+; loads or stores, we still delete them during partitioning and rewriting.
+; Otherwise we'll go on to promote them while they still have unpromotable
+; uses.
+; CHECK: @test15
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label %loop
+; CHECK: loop:
+; CHECK-NEXT: br label %loop
+
+entry:
+ %l0 = alloca i64
+ %l1 = alloca i64
+ %l2 = alloca i64
+ %l3 = alloca i64
+ br label %loop
+
+loop:
+ %dead3 = phi i8* [ %gep3, %loop ], [ null, %entry ]
+
+ store i64 1879048192, i64* %l0, align 8
+ %bc0 = bitcast i64* %l0 to i8*
+ %gep0 = getelementptr i8* %bc0, i64 3
+ %dead0 = bitcast i8* %gep0 to i64*
+
+ store i64 1879048192, i64* %l1, align 8
+ %bc1 = bitcast i64* %l1 to i8*
+ %gep1 = getelementptr i8* %bc1, i64 3
+ %dead1 = getelementptr i8* %gep1, i64 1
+
+ store i64 1879048192, i64* %l2, align 8
+ %bc2 = bitcast i64* %l2 to i8*
+ %gep2.1 = getelementptr i8* %bc2, i64 1
+ %gep2.2 = getelementptr i8* %bc2, i64 3
+ ; Note that this select should get visited multiple times due to using two
+ ; different GEPs off the same alloca. We should only delete it once.
+ %dead2 = select i1 %flag, i8* %gep2.1, i8* %gep2.2
+
+ store i64 1879048192, i64* %l3, align 8
+ %bc3 = bitcast i64* %l3 to i8*
+ %gep3 = getelementptr i8* %bc3, i64 3
+
+ br label %loop
+}
+
+define void @test16(i8* %src, i8* %dst) {
+; Ensure that we can promote an alloca of [3 x i8] to an i24 SSA value.
+; CHECK: @test16
+; CHECK-NOT: alloca
+; CHECK: %[[srccast:.*]] = bitcast i8* %src to i24*
+; CHECK-NEXT: load i24* %[[srccast]]
+; CHECK-NEXT: %[[dstcast:.*]] = bitcast i8* %dst to i24*
+; CHECK-NEXT: store i24 0, i24* %[[dstcast]]
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca [3 x i8]
+ %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 false)
+ %cast = bitcast i8* %ptr to i24*
+ store i24 0, i24* %cast
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 false)
+ ret void
+}
+
+define void @test17(i8* %src, i8* %dst) {
+; Ensure that we can rewrite unpromotable memcpys which extend past the end of
+; the alloca.
+; CHECK: @test17
+; CHECK: %[[a:.*]] = alloca [3 x i8]
+; CHECK-NEXT: %[[ptr:.*]] = getelementptr [3 x i8]* %[[a]], i32 0, i32 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[ptr]], i8* %src,
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[ptr]],
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca [3 x i8]
+ %ptr = getelementptr [3 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 4, i32 1, i1 true)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 4, i32 1, i1 true)
+ ret void
+}
+
+define void @test18(i8* %src, i8* %dst, i32 %size) {
+; Preserve transfer intrinsics with a variable size, even if they overlap with
+; fixed-size operations. Further, continue to split and promote allocas
+; preceding the variable-sized intrinsic.
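+; (Concretely: the 8 fixed leading bytes are promoted to two i32 values while
+; the remaining 34 bytes survive as an alloca serving the variable-sized
+; transfers; see the CHECK lines below.)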
+; CHECK: @test18
+; CHECK: %[[a:.*]] = alloca [34 x i8]
+; CHECK: %[[srcgep1:.*]] = getelementptr inbounds i8* %src, i64 4
+; CHECK-NEXT: %[[srccast1:.*]] = bitcast i8* %[[srcgep1]] to i32*
+; CHECK-NEXT: %[[srcload:.*]] = load i32* %[[srccast1]]
+; CHECK-NEXT: %[[agep1:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %[[agep1]], i8* %src, i32 %size,
+; CHECK-NEXT: %[[agep2:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memset.p0i8.i32(i8* %[[agep2]], i8 42, i32 %size,
+; CHECK-NEXT: %[[dstcast1:.*]] = bitcast i8* %dst to i32*
+; CHECK-NEXT: store i32 42, i32* %[[dstcast1]]
+; CHECK-NEXT: %[[dstgep1:.*]] = getelementptr inbounds i8* %dst, i64 4
+; CHECK-NEXT: %[[dstcast2:.*]] = bitcast i8* %[[dstgep1]] to i32*
+; CHECK-NEXT: store i32 %[[srcload]], i32* %[[dstcast2]]
+; CHECK-NEXT: %[[agep3:.*]] = getelementptr inbounds [34 x i8]* %[[a]], i64 0, i64 0
+; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %[[agep3]], i32 %size,
+; CHECK-NEXT: ret void
+
+entry:
+ %a = alloca [42 x i8]
+ %ptr = getelementptr [42 x i8]* %a, i32 0, i32 0
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr, i8* %src, i32 8, i32 1, i1 false)
+ %ptr2 = getelementptr [42 x i8]* %a, i32 0, i32 8
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %ptr2, i8* %src, i32 %size, i32 1, i1 false)
+ call void @llvm.memset.p0i8.i32(i8* %ptr2, i8 42, i32 %size, i32 1, i1 false)
+ %cast = bitcast i8* %ptr to i32*
+ store i32 42, i32* %cast
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr, i32 8, i32 1, i1 false)
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dst, i8* %ptr2, i32 %size, i32 1, i1 false)
+ ret void
+}
+
+%opaque = type opaque
+
+define i32 @test19(%opaque* %x) {
+; This input will cause us to try to compute a natural GEP when rewriting
+; pointers in such a way that we try to GEP through the opaque type. Previously,
+; a check for an unsized type was missing and this crashed. Ensure it behaves
+; reasonably now.
+; CHECK: @test19
+; CHECK-NOT: alloca
+; CHECK: ret i32 undef
+
+entry:
+ %a = alloca { i64, i8* }
+ %cast1 = bitcast %opaque* %x to i8*
+ %cast2 = bitcast { i64, i8* }* %a to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %cast2, i8* %cast1, i32 16, i32 1, i1 false)
+ %gep = getelementptr inbounds { i64, i8* }* %a, i32 0, i32 0
+ %val = load i64* %gep
+ ret i32 undef
+}
+
+define i32 @test20() {
+; Ensure we can track negative offsets (before the beginning of the alloca) and
+; negative relative offsets from offsets starting past the end of the alloca.
+; CHECK: @test20
+; CHECK-NOT: alloca
+; CHECK: %[[sum1:.*]] = add i32 1, 2
+; CHECK: %[[sum2:.*]] = add i32 %[[sum1]], 3
+; CHECK: ret i32 %[[sum2]]
+
+entry:
+ %a = alloca [3 x i32]
+ %gep1 = getelementptr [3 x i32]* %a, i32 0, i32 0
+ store i32 1, i32* %gep1
+ %gep2.1 = getelementptr [3 x i32]* %a, i32 0, i32 -2
+ %gep2.2 = getelementptr i32* %gep2.1, i32 3
+ store i32 2, i32* %gep2.2
+ %gep3.1 = getelementptr [3 x i32]* %a, i32 0, i32 14
+ %gep3.2 = getelementptr i32* %gep3.1, i32 -12
+ store i32 3, i32* %gep3.2
+
+ %load1 = load i32* %gep1
+ %load2 = load i32* %gep2.2
+ %load3 = load i32* %gep3.2
+ %sum1 = add i32 %load1, %load2
+ %sum2 = add i32 %sum1, %load3
+ ret i32 %sum2
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+define i8 @test21() {
+; Test allocations and offsets which border on overflow of the int64_t used
+; internally. This is awkward to test thoroughly, as LLVM doesn't cleanly
+; support such extreme constructs.
+; CHECK: @test21
+; CHECK-NOT: alloca
+; CHECK: or i8 -1, -1
+
+entry:
+ %a = alloca [2305843009213693951 x i8]
+ %gep0 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 2305843009213693949
+ store i8 255, i8* %gep0
+ %gep1 = getelementptr [2305843009213693951 x i8]* %a, i64 0, i64 -9223372036854775807
+ %gep2 = getelementptr i8* %gep1, i64 -1
+ call void @llvm.memset.p0i8.i64(i8* %gep2, i8 0, i64 18446744073709551615, i32 1, i1 false)
+ %gep3 = getelementptr i8* %gep1, i64 9223372036854775807
+ %gep4 = getelementptr i8* %gep3, i64 9223372036854775807
+ %gep5 = getelementptr i8* %gep4, i64 -6917529027641081857
+ store i8 255, i8* %gep5
+ %cast1 = bitcast i8* %gep4 to i32*
+ store i32 0, i32* %cast1
+ %load = load i8* %gep0
+ %gep6 = getelementptr i8* %gep0, i32 1
+ %load2 = load i8* %gep6
+ %result = or i8 %load, %load2
+ ret i8 %result
+}
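All of the tests above exercise SROA (scalar replacement of aggregates), which rewrites an alloca whose pieces are only ever accessed individually into independent SSA values. A hedged C++-level analogy of the end result (the pass operates on IR; the function names below are purely illustrative):

    // Before SROA: a stack aggregate accessed field by field.
    struct Pair { int a, b; };
    int before(int x, int y) {
      Pair p;            // %p = alloca { i32, i32 }
      p.a = x;           // store through a GEP
      p.b = y;           // store through a GEP
      return p.a + p.b;  // loads through GEPs
    }

    // After SROA the alloca is gone and only SSA values remain:
    int after(int x, int y) { return x + y; }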
diff --git a/test/Transforms/SROA/fca.ll b/test/Transforms/SROA/fca.ll
new file mode 100644
index 0000000000..6ddaed2f30
--- /dev/null
+++ b/test/Transforms/SROA/fca.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+; RUN: opt < %s -sroa -force-ssa-updater -S | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define { i32, i32 } @test0(i32 %x, i32 %y) {
+; CHECK: @test0
+; CHECK-NOT: alloca
+; CHECK: insertvalue { i32, i32 }
+; CHECK: insertvalue { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+ %a = alloca { i32, i32 }
+
+ store { i32, i32 } undef, { i32, i32 }* %a
+
+ %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+ store i32 %x, i32* %gep1
+ %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+ store i32 %y, i32* %gep2
+
+ %result = load { i32, i32 }* %a
+ ret { i32, i32 } %result
+}
+
+define { i32, i32 } @test1(i32 %x, i32 %y) {
+; FIXME: This may be too conservative. Duncan argues that we are allowed to
+; split the volatile load and store here but must produce volatile scalar loads
+; and stores from them.
+; CHECK: @test1
+; CHECK: alloca
+; CHECK: alloca
+; CHECK: load volatile { i32, i32 }*
+; CHECK: store volatile { i32, i32 }
+; CHECK: ret { i32, i32 }
+
+entry:
+ %a = alloca { i32, i32 }
+ %b = alloca { i32, i32 }
+
+ %gep1 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 0
+ store i32 %x, i32* %gep1
+ %gep2 = getelementptr inbounds { i32, i32 }* %a, i32 0, i32 1
+ store i32 %y, i32* %gep2
+
+ %result = load volatile { i32, i32 }* %a
+ store volatile { i32, i32 } %result, { i32, i32 }* %b
+ ret { i32, i32 } %result
+}
diff --git a/test/Transforms/SROA/lit.local.cfg b/test/Transforms/SROA/lit.local.cfg
new file mode 100644
index 0000000000..c6106e4746
--- /dev/null
+++ b/test/Transforms/SROA/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.ll']
diff --git a/test/Transforms/SROA/phi-and-select.ll b/test/Transforms/SROA/phi-and-select.ll
new file mode 100644
index 0000000000..ad0c55748d
--- /dev/null
+++ b/test/Transforms/SROA/phi-and-select.ll
@@ -0,0 +1,329 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+define i32 @test1() {
+; CHECK: @test1
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+ %v0 = load i32* %a0
+ %v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %cond = icmp sle i32 %v0, %v1
+ br i1 %cond, label %then, label %exit
+
+then:
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %a1, %then ], [ %a0, %entry ]
+; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ]
+
+ %result = load i32* %phi
+ ret i32 %result
+}
+
+define i32 @test2() {
+; CHECK: @test2
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+ %v0 = load i32* %a0
+ %v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %cond = icmp sle i32 %v0, %v1
+ %select = select i1 %cond, i32* %a1, i32* %a0
+; CHECK: select i1 %{{.*}}, i32 1, i32 0
+
+ %result = load i32* %select
+ ret i32 %result
+}
+
+define i32 @test3(i32 %x) {
+; CHECK: @test3
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+; CHECK-NOT: store
+
+ switch i32 %x, label %bb0 [ i32 1, label %bb1
+ i32 2, label %bb2
+ i32 3, label %bb3 ]
+
+bb0:
+ br label %exit
+bb1:
+ br label %exit
+bb2:
+ br label %exit
+bb3:
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %a1, %bb0 ], [ %a0, %bb1 ], [ %a0, %bb2 ], [ %a1, %bb3 ]
+; CHECK: phi i32 [ 1, %{{.*}} ], [ 0, %{{.*}} ], [ 0, %{{.*}} ], [ 1, %{{.*}} ]
+
+ %result = load i32* %phi
+ ret i32 %result
+}
+
+define i32 @test4() {
+; CHECK: @test4
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a0 = getelementptr [2 x i32]* %a, i64 0, i32 0
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 0, i32* %a0
+ store i32 1, i32* %a1
+ %v0 = load i32* %a0
+ %v1 = load i32* %a1
+; CHECK-NOT: store
+; CHECK-NOT: load
+
+ %cond = icmp sle i32 %v0, %v1
+ %select = select i1 %cond, i32* %a0, i32* %a0
+; CHECK-NOT: select
+
+ %result = load i32* %select
+ ret i32 %result
+; CHECK: ret i32 0
+}
+
+define i32 @test5(i32* %b) {
+; CHECK: @test5
+entry:
+ %a = alloca [2 x i32]
+; CHECK-NOT: alloca
+
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 1, i32* %a1
+; CHECK-NOT: store
+
+ %select = select i1 true, i32* %a1, i32* %b
+; CHECK-NOT: select
+
+ %result = load i32* %select
+; CHECK-NOT: load
+
+ ret i32 %result
+; CHECK: ret i32 1
+}
+
+declare void @f(i32*, i32*)
+
+define i32 @test6(i32* %b) {
+; CHECK: @test6
+entry:
+ %a = alloca [2 x i32]
+ %c = alloca i32
+; CHECK-NOT: alloca
+
+ %a1 = getelementptr [2 x i32]* %a, i64 0, i32 1
+ store i32 1, i32* %a1
+
+ %select = select i1 true, i32* %a1, i32* %b
+ %select2 = select i1 false, i32* %a1, i32* %b
+ %select3 = select i1 false, i32* %c, i32* %b
+; CHECK: %[[select2:.*]] = select i1 false, i32* undef, i32* %b
+; CHECK: %[[select3:.*]] = select i1 false, i32* undef, i32* %b
+
+  ; Note that this would potentially escape the alloca pointer except for the
+ ; constant folding of the select.
+ call void @f(i32* %select2, i32* %select3)
+; CHECK: call void @f(i32* %[[select2]], i32* %[[select3]])
+
+
+ %result = load i32* %select
+; CHECK-NOT: load
+
+ %dead = load i32* %c
+
+ ret i32 %result
+; CHECK: ret i32 1
+}
+
+define i32 @test7() {
+; CHECK: @test7
+; CHECK-NOT: alloca
+
+entry:
+ %X = alloca i32
+ br i1 undef, label %good, label %bad
+
+good:
+ %Y1 = getelementptr i32* %X, i64 0
+ store i32 0, i32* %Y1
+ br label %exit
+
+bad:
+ %Y2 = getelementptr i32* %X, i64 1
+ store i32 0, i32* %Y2
+ br label %exit
+
+exit:
+ %P = phi i32* [ %Y1, %good ], [ %Y2, %bad ]
+; CHECK: %[[phi:.*]] = phi i32 [ 0, %good ],
+ %Z2 = load i32* %P
+ ret i32 %Z2
+; CHECK: ret i32 %[[phi]]
+}
+
+define i32 @test8(i32 %b, i32* %ptr) {
+; Ensure that we rewrite allocas to the used type when that use is hidden by
+; a PHI that can be speculated.
+; CHECK: @test8
+; CHECK-NOT: alloca
+; CHECK-NOT: load
+; CHECK: %[[value:.*]] = load i32* %ptr
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = phi i32 [ undef, %else ], [ %[[value]], %then ]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+ %f = alloca float
+ %test = icmp ne i32 %b, 0
+ br i1 %test, label %then, label %else
+
+then:
+ br label %exit
+
+else:
+ %bitcast = bitcast float* %f to i32*
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ]
+ %loaded = load i32* %phi, align 4
+ ret i32 %loaded
+}
+
+define i32 @test9(i32 %b, i32* %ptr) {
+; Same as @test8 but for a select rather than a PHI node.
+; CHECK: @test9
+; CHECK-NOT: alloca
+; CHECK-NOT: load
+; CHECK: %[[value:.*]] = load i32* %ptr
+; CHECK-NOT: load
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 undef, i32 %[[value]]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+ %f = alloca float
+ store i32 0, i32* %ptr
+ %test = icmp ne i32 %b, 0
+ %bitcast = bitcast float* %f to i32*
+ %select = select i1 %test, i32* %bitcast, i32* %ptr
+ %loaded = load i32* %select, align 4
+ ret i32 %loaded
+}
+
+define i32 @test10(i32 %b, i32* %ptr) {
+; Don't try to promote allocas which are not eligible for it even after
+; rewriting, due to the necessity of inserting bitcasts when speculating a PHI
+; node.
+; CHECK: @test10
+; CHECK: %[[alloca:.*]] = alloca
+; CHECK: %[[argvalue:.*]] = load i32* %ptr
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to i32*
+; CHECK: %[[allocavalue:.*]] = load i32* %[[cast]]
+; CHECK: %[[result:.*]] = phi i32 [ %[[allocavalue]], %else ], [ %[[argvalue]], %then ]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+ %f = alloca double
+ store double 0.0, double* %f
+ %test = icmp ne i32 %b, 0
+ br i1 %test, label %then, label %else
+
+then:
+ br label %exit
+
+else:
+ %bitcast = bitcast double* %f to i32*
+ br label %exit
+
+exit:
+ %phi = phi i32* [ %bitcast, %else ], [ %ptr, %then ]
+ %loaded = load i32* %phi, align 4
+ ret i32 %loaded
+}
+
+define i32 @test11(i32 %b, i32* %ptr) {
+; Same as @test10 but for a select rather than a PHI node.
+; CHECK: @test11
+; CHECK: %[[alloca:.*]] = alloca
+; CHECK: %[[cast:.*]] = bitcast double* %[[alloca]] to i32*
+; CHECK: %[[allocavalue:.*]] = load i32* %[[cast]]
+; CHECK: %[[argvalue:.*]] = load i32* %ptr
+; CHECK: %[[result:.*]] = select i1 %{{.*}}, i32 %[[allocavalue]], i32 %[[argvalue]]
+; CHECK-NEXT: ret i32 %[[result]]
+
+entry:
+ %f = alloca double
+ store double 0.0, double* %f
+ store i32 0, i32* %ptr
+ %test = icmp ne i32 %b, 0
+ %bitcast = bitcast double* %f to i32*
+ %select = select i1 %test, i32* %bitcast, i32* %ptr
+ %loaded = load i32* %select, align 4
+ ret i32 %loaded
+}
+
+define i32 @test12(i32 %x, i32* %p) {
+; Ensure we don't crash or fail to nuke dead selects of allocas if no load is
+; ever found.
+; CHECK: @test12
+; CHECK-NOT: alloca
+; CHECK-NOT: select
+; CHECK: ret i32 %x
+
+entry:
+ %a = alloca i32
+ store i32 %x, i32* %a
+ %dead = select i1 undef, i32* %a, i32* %p
+ %load = load i32* %a
+ ret i32 %load
+}
+
+define i32 @test13(i32 %x, i32* %p) {
+; Ensure we don't crash or fail to nuke dead phis of allocas if no load is ever
+; found.
+; CHECK: @test13
+; CHECK-NOT: alloca
+; CHECK-NOT: phi
+; CHECK: ret i32 %x
+
+entry:
+ %a = alloca i32
+ store i32 %x, i32* %a
+ br label %loop
+
+loop:
+ %phi = phi i32* [ %p, %entry ], [ %a, %loop ]
+ br i1 undef, label %loop, label %exit
+
+exit:
+ %load = load i32* %a
+ ret i32 %load
+}
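The common thread in the tests above: SROA can only promote an alloca whose address feeds a PHI or select if it can speculate the loads, replacing a select of pointers with a select of the loaded values (and analogously for PHIs). A hedged C++ analogy of that rewrite (illustrative names, not the pass's code):

    // Before: a select of pointers blocks promotion of %a and %b.
    int before(bool c, int a, int b) {
      int *p = c ? &a : &b;  // select i1 %c, i32* %a, i32* %b
      return *p;             // load through the select
    }

    // After speculation: load both arms, then select the values.
    int after(bool c, int a, int b) {
      return c ? a : b;      // select i1 %c, i32 %a, i32 %b
    }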
diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll
new file mode 100644
index 0000000000..9cbab385c2
--- /dev/null
+++ b/test/Transforms/SROA/vector-promotion.ll
@@ -0,0 +1,191 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
+
+%S1 = type { i64, [42 x float] }
+
+define i32 @test1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test1
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK: extractelement <4 x i32> %x, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test2
+; FIXME: This should be handled!
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK: alloca <4 x i32>
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %a.tmp3.cast = bitcast i32* %a.tmp3 to <2 x i32>*
+ %tmp3.vec = load <2 x i32>* %a.tmp3.cast
+ %tmp3 = extractelement <2 x i32> %tmp3.vec, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+}
+
+define i32 @test3(<4 x i32> %x, <4 x i32> %y) {
+; CHECK: @test3
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.y.cast, i8 0, i32 16, i32 1, i1 false)
+; CHECK-NOT: memset
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+ call void @llvm.memset.p0i8.i32(i8* %a.tmp1.cast, i8 -1, i32 4, i32 1, i1 false)
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: load
+; CHECK: %[[insert:.*]] = insertelement <4 x i32> %x, i32 -1, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 3
+; CHECK-NEXT: extractelement <4 x i32> zeroinitializer, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test4(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
+; CHECK: @test4
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+ %z.cast = bitcast <4 x i32>* %z to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.y.cast, i8* %z.cast, i32 16, i32 1, i1 false)
+; CHECK-NOT: memcpy
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+ %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+ %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.tmp1.cast, i8* %z.tmp1.cast, i32 4, i32 1, i1 false)
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: memcpy
+; CHECK: %[[load:.*]] = load <4 x i32>* %z
+; CHECK-NEXT: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+; CHECK-NEXT: %[[element_load:.*]] = load i32* %[[gep]]
+; CHECK-NEXT: %[[insert:.*]] = insertelement <4 x i32> %x, i32 %[[element_load]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[insert]], i32 2
+; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 3
+; CHECK-NEXT: extractelement <4 x i32> %[[load]], i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+define i32 @test5(<4 x i32> %x, <4 x i32> %y, <4 x i32>* %z) {
+; CHECK: @test5
+; The same as the above, but with reversed source and destination for the
+; element memcpy, and a self copy.
+entry:
+ %a = alloca [2 x <4 x i32>]
+; CHECK-NOT: alloca
+
+ %a.x = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0
+ store <4 x i32> %x, <4 x i32>* %a.x
+ %a.y = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1
+ store <4 x i32> %y, <4 x i32>* %a.y
+; CHECK-NOT: store
+
+ %a.y.cast = bitcast <4 x i32>* %a.y to i8*
+ %a.x.cast = bitcast <4 x i32>* %a.x to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.x.cast, i8* %a.y.cast, i32 16, i32 1, i1 false)
+; CHECK-NOT: memcpy
+
+ %a.tmp1 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 0, i64 2
+ %a.tmp1.cast = bitcast i32* %a.tmp1 to i8*
+ %z.tmp1 = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+ %z.tmp1.cast = bitcast i32* %z.tmp1 to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i32(i8* %z.tmp1.cast, i8* %a.tmp1.cast, i32 4, i32 1, i1 false)
+ %tmp1 = load i32* %a.tmp1
+ %a.tmp2 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 3
+ %tmp2 = load i32* %a.tmp2
+ %a.tmp3 = getelementptr inbounds [2 x <4 x i32>]* %a, i64 0, i64 1, i64 0
+ %tmp3 = load i32* %a.tmp3
+; CHECK-NOT: memcpy
+; CHECK: %[[gep:.*]] = getelementptr inbounds <4 x i32>* %z, i64 0, i64 2
+; CHECK-NEXT: %[[extract:.*]] = extractelement <4 x i32> %y, i32 2
+; CHECK-NEXT: store i32 %[[extract]], i32* %[[gep]]
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 2
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 3
+; CHECK-NEXT: extractelement <4 x i32> %y, i32 0
+
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+; CHECK-NEXT: add
+; CHECK-NEXT: add
+; CHECK-NEXT: ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
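In all five tests the promoted form replaces memory traffic on the [2 x <4 x i32>] alloca with lane operations on the stored vector values. A hedged analogy of @test1's three loads, using Clang/GCC vector extensions purely for illustration:

    typedef int v4i __attribute__((vector_size(16)));

    int sum_lanes(v4i x, v4i y) {
      // Loads of individual i32 slots become extractelement:
      return (x[2] + y[3]) + y[0];
    }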
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
index 75f5f06daa..941f5ad9d5 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights-switch-create.ll
@@ -84,9 +84,57 @@ sw.epilog:
ret void
}
+;; test3 - Merge two switches where PredDefault != BB.
+define void @test3(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ %cmp = icmp sgt i32 %M, 2
+ br i1 %cmp, label %sw1, label %sw2
+
+sw1:
+ switch i32 %N, label %sw.bb [
+ i32 2, label %sw2
+ i32 3, label %sw2
+ i32 1, label %sw.bb1
+ ], !prof !4
+; CHECK: test3
+; CHECK: switch i32 %N, label %sw.bb
+; CHECK: i32 1, label %sw.bb1
+; CHECK: i32 3, label %sw.bb4
+; CHECK: i32 2, label %sw.epilog
+; CHECK: ], !prof !3
+
+sw.bb:
+ call void @func2(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.bb1:
+ call void @func4(i32 %N) nounwind
+ br label %sw.epilog
+
+sw2:
+ switch i32 %N, label %sw.epilog [
+ i32 3, label %sw.bb4
+ i32 4, label %sw.bb5
+ ], !prof !5
+
+sw.bb4:
+ call void @func6(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.bb5:
+ call void @func8(i32 %N) nounwind
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
!0 = metadata !{metadata !"branch_weights", i32 64, i32 4}
!1 = metadata !{metadata !"branch_weights", i32 4, i32 64}
; CHECK: !0 = metadata !{metadata !"branch_weights", i32 256, i32 4352, i32 16}
!2 = metadata !{metadata !"branch_weights", i32 4, i32 4, i32 8}
!3 = metadata !{metadata !"branch_weights", i32 8, i32 8, i32 4}
; CHECK: !1 = metadata !{metadata !"branch_weights", i32 32, i32 48, i32 96, i32 16}
+!4 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 3}
+!5 = metadata !{metadata !"branch_weights", i32 17, i32 13, i32 9}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 7, i32 3, i32 4, i32 6}
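The expected !3 weights follow directly from !4: both switches test the same %N, so each case of the first switch has a fully determined final target and is simply rerouted with its original weight. !4 = {7, 6, 4, 3} covers (default sw.bb, case 2, case 3, case 1); reordered to the merged case order 1, 3, 2 shown above, that is {7, 3, 4, 6}.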
diff --git a/test/Transforms/SimplifyCFG/preserve-branchweights.ll b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
index c7917857ee..beef527008 100644
--- a/test/Transforms/SimplifyCFG/preserve-branchweights.ll
+++ b/test/Transforms/SimplifyCFG/preserve-branchweights.ll
@@ -79,10 +79,238 @@ Z:
ret void
}
+;; test5 - The case that jumps to the default target will be removed.
+define void @test5(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ switch i32 %N, label %sw2 [
+ i32 1, label %sw2
+ i32 2, label %sw.bb
+ i32 3, label %sw.bb1
+ ], !prof !3
+; CHECK: test5
+; CHECK: switch i32 %N, label %sw2 [
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 2, label %sw.bb
+; CHECK: ], !prof !2
+
+sw.bb:
+ call void @helper(i32 0)
+ br label %sw.epilog
+
+sw.bb1:
+ call void @helper(i32 1)
+ br label %sw.epilog
+
+sw2:
+ call void @helper(i32 2)
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
+;; test6 - Some cases of the second switch are pruned during optimization.
+;; Then the second switch will be converted to a branch; finally, the first
+;; switch and the branch will be merged into a single switch.
+define void @test6(i32 %M, i32 %N) nounwind uwtable {
+entry:
+ switch i32 %N, label %sw2 [
+ i32 1, label %sw2
+ i32 2, label %sw.bb
+ i32 3, label %sw.bb1
+ ], !prof !4
+; CHECK: test6
+; CHECK: switch i32 %N, label %sw.epilog
+; CHECK: i32 3, label %sw.bb1
+; CHECK: i32 2, label %sw.bb
+; CHECK: i32 4, label %sw.bb5
+; CHECK: ], !prof !3
+
+sw.bb:
+ call void @helper(i32 0)
+ br label %sw.epilog
+
+sw.bb1:
+ call void @helper(i32 1)
+ br label %sw.epilog
+
+sw2:
+;; Here "case 2" is invalidated since the default case of the first switch
+;; does not include "case 2".
+ switch i32 %N, label %sw.epilog [
+ i32 2, label %sw.bb4
+ i32 4, label %sw.bb5
+ ], !prof !5
+
+sw.bb4:
+ call void @helper(i32 2)
+ br label %sw.epilog
+
+sw.bb5:
+ call void @helper(i32 3)
+ br label %sw.epilog
+
+sw.epilog:
+ ret void
+}
+
+;; This test is based on test1 but swaps the targets of the second branch.
+define void @test1_swap(i1 %a, i1 %b) {
+; CHECK: @test1_swap
+entry:
+ br i1 %a, label %Y, label %X, !prof !0
+; CHECK: br i1 %or.cond, label %Y, label %Z, !prof !4
+
+X:
+ %c = or i1 %b, false
+ br i1 %c, label %Y, label %Z, !prof !1
+
+Y:
+ call void @helper(i32 0)
+ ret void
+
+Z:
+ call void @helper(i32 1)
+ ret void
+}
+
+define void @test7(i1 %a, i1 %b) {
+; CHECK: @test7
+entry:
+ %c = or i1 %b, false
+ br i1 %a, label %Y, label %X, !prof !0
+; CHECK: br i1 %brmerge, label %Y, label %Z, !prof !5
+
+X:
+ br i1 %c, label %Y, label %Z, !prof !6
+
+Y:
+ call void @helper(i32 0)
+ ret void
+
+Z:
+ call void @helper(i32 1)
+ ret void
+}
+
+; Test basic folding to a conditional branch.
+define void @test8(i64 %x, i64 %y) nounwind {
+; CHECK: @test8
+entry:
+ %lt = icmp slt i64 %x, %y
+; CHECK: br i1 %lt, label %a, label %b, !prof !6
+ %qux = select i1 %lt, i32 0, i32 2
+ switch i32 %qux, label %bees [
+ i32 0, label %a
+ i32 1, label %b
+ i32 2, label %b
+ ], !prof !7
+a:
+ call void @helper(i32 0) nounwind
+ ret void
+b:
+ call void @helper(i32 1) nounwind
+ ret void
+bees:
+ call void @helper(i32 2) nounwind
+ ret void
+}
+
+; Test edge splitting when the default target has an icmp and an
+; unconditional branch.
+define i1 @test9(i32 %x, i32 %y) nounwind {
+; CHECK: @test9
+entry:
+ switch i32 %x, label %bees [
+ i32 0, label %a
+ i32 1, label %end
+ i32 2, label %end
+ ], !prof !7
+; CHECK: switch i32 %x, label %bees [
+; CHECK: i32 0, label %a
+; CHECK: i32 1, label %end
+; CHECK: i32 2, label %end
+; CHECK: i32 92, label %end
+; CHECK: ], !prof !7
+
+a:
+ call void @helper(i32 0) nounwind
+ %reta = icmp slt i32 %x, %y
+ ret i1 %reta
+
+bees:
+ %tmp = icmp eq i32 %x, 92
+ br label %end
+
+end:
+; CHECK: end:
+; CHECK: %ret = phi i1 [ true, %entry ], [ false, %bees ], [ true, %entry ], [ true, %entry ]
+ %ret = phi i1 [ true, %entry ], [%tmp, %bees], [true, %entry]
+ call void @helper(i32 2) nounwind
+ ret i1 %ret
+}
+
+define void @test10(i32 %x) nounwind readnone ssp noredzone {
+entry:
+ switch i32 %x, label %lor.rhs [
+ i32 2, label %lor.end
+ i32 1, label %lor.end
+ i32 3, label %lor.end
+ ], !prof !7
+
+lor.rhs:
+ call void @helper(i32 1) nounwind
+ ret void
+
+lor.end:
+ call void @helper(i32 0) nounwind
+ ret void
+
+; CHECK: test10
+; CHECK: %x.off = add i32 %x, -1
+; CHECK: %switch = icmp ult i32 %x.off, 3
+; CHECK: br i1 %switch, label %lor.end, label %lor.rhs, !prof !8
+}
+
+; Remove dead cases from the switch.
+define void @test11(i32 %x) nounwind {
+ %i = shl i32 %x, 1
+ switch i32 %i, label %a [
+ i32 21, label %b
+ i32 24, label %c
+ ], !prof !8
+; CHECK: %cond = icmp eq i32 %i, 24
+; CHECK: br i1 %cond, label %c, label %a, !prof !9
+
+a:
+ call void @helper(i32 0) nounwind
+ ret void
+b:
+ call void @helper(i32 1) nounwind
+ ret void
+c:
+ call void @helper(i32 2) nounwind
+ ret void
+}
+
!0 = metadata !{metadata !"branch_weights", i32 3, i32 5}
!1 = metadata !{metadata !"branch_weights", i32 1, i32 1}
!2 = metadata !{metadata !"branch_weights", i32 1, i32 2}
+!3 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!4 = metadata !{metadata !"branch_weights", i32 4, i32 3, i32 2, i32 1}
+!5 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 5}
+!6 = metadata !{metadata !"branch_weights", i32 1, i32 3}
+!7 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8, i32 7}
+!8 = metadata !{metadata !"branch_weights", i32 33, i32 9, i32 8}
; CHECK: !0 = metadata !{metadata !"branch_weights", i32 5, i32 11}
; CHECK: !1 = metadata !{metadata !"branch_weights", i32 1, i32 5}
-; CHECK-NOT: !2
+; CHECK: !2 = metadata !{metadata !"branch_weights", i32 7, i32 1, i32 2}
+; CHECK: !3 = metadata !{metadata !"branch_weights", i32 49, i32 12, i32 24, i32 35}
+; CHECK: !4 = metadata !{metadata !"branch_weights", i32 11, i32 5}
+; CHECK: !5 = metadata !{metadata !"branch_weights", i32 17, i32 15}
+; CHECK: !6 = metadata !{metadata !"branch_weights", i32 9, i32 7}
+; CHECK: !7 = metadata !{metadata !"branch_weights", i32 17, i32 9, i32 8, i32 7, i32 17}
+; CHECK: !8 = metadata !{metadata !"branch_weights", i32 24, i32 33}
+; CHECK: !9 = metadata !{metadata !"branch_weights", i32 8, i32 33}
+; CHECK-NOT: !9
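The rescaled weights in the CHECK lines follow from preserving branch probabilities when two conditional branches fold into one. A hedged sketch of the arithmetic (it reproduces the expected values above, but is not SimplifyCFG's exact code): if the outer branch reaches Y with weight wY and X with weight wX, and X branches to Y with weight vY and to Z with weight vZ, then P(Y) = wY/(wY+wX) + wX/(wY+wX) * vY/(vY+vZ), and clearing denominators gives integer weights:

    // Folded weights for the merged branch to %Y / %Z.
    uint64_t mergedY = wY * (vY + vZ) + wX * vY;
    uint64_t mergedZ = wX * vZ;

For @test1_swap (!0 = {3, 5}, !1 = {1, 1}) this gives mergedY = 3*(1+1) + 5*1 = 11 and mergedZ = 5*1 = 5, matching !4 = {11, 5}; for @test7 (!0 = {3, 5}, !6 = {1, 3}) it gives 3*4 + 5*1 = 17 and 5*3 = 15, matching !5 = {17, 15}.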
diff --git a/test/Transforms/SimplifyCFG/sink-common-code.ll b/test/Transforms/SimplifyCFG/sink-common-code.ll
new file mode 100644
index 0000000000..28d7279382
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/sink-common-code.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+define zeroext i1 @test1(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+ br i1 %flag, label %if.then, label %if.else
+
+; CHECK: test1
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+ %cmp = icmp uge i32 %blksA, %nblks
+ %frombool1 = zext i1 %cmp to i8
+ br label %if.end
+
+if.else:
+ %add = add i32 %nblks, %blksB
+ %cmp2 = icmp ule i32 %add, %blksA
+ %frombool3 = zext i1 %cmp2 to i8
+ br label %if.end
+
+if.end:
+ %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+ %tobool4 = icmp ne i8 %obeys.0, 0
+ ret i1 %tobool4
+}
+
+define zeroext i1 @test2(i1 zeroext %flag, i32 %blksA, i32 %blksB, i32 %nblks) {
+entry:
+ br i1 %flag, label %if.then, label %if.else
+
+; CHECK: test2
+; CHECK: add
+; CHECK: select
+; CHECK: icmp
+; CHECK-NOT: br
+if.then:
+ %cmp = icmp uge i32 %blksA, %nblks
+ %frombool1 = zext i1 %cmp to i8
+ br label %if.end
+
+if.else:
+ %add = add i32 %nblks, %blksB
+ %cmp2 = icmp uge i32 %blksA, %add
+ %frombool3 = zext i1 %cmp2 to i8
+ br label %if.end
+
+if.end:
+ %obeys.0 = phi i8 [ %frombool1, %if.then ], [ %frombool3, %if.else ]
+ %tobool4 = icmp ne i8 %obeys.0, 0
+ ret i1 %tobool4
+}
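Both tests check the same sinking pattern: the two arms differ only in the bound being compared, so the comparison can be shared once a select supplies the differing operand. A hedged C++ view of what the CHECK lines (add, select, icmp, no br) require for @test1 (names are illustrative):

    // Before: each arm compares blksA against a different bound.
    bool before(bool flag, unsigned blksA, unsigned blksB, unsigned nblks) {
      if (flag)
        return blksA >= nblks;
      return blksA >= nblks + blksB;
    }

    // After sinking: one add, one select, one compare, no branch.
    bool after(bool flag, unsigned blksA, unsigned blksB, unsigned nblks) {
      unsigned bound = flag ? nblks : nblks + blksB;
      return blksA >= bound;
    }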
diff --git a/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll
index 414da93976..f0bd688050 100644
--- a/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/switch_to_lookup_table.ll
@@ -15,6 +15,9 @@ target triple = "x86_64-unknown-linux-gnu"
; The table for @foostring
; CHECK: @switch.table3 = private unnamed_addr constant [4 x i8*] [i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str1, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str2, i64 0, i64 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i64 0, i64 0)]
+; The table for @earlyreturncrash
+; CHECK: @switch.table4 = private unnamed_addr constant [4 x i32] [i32 42, i32 9, i32 88, i32 5]
+
; A simple int-to-int selection switch.
; It is dense enough to be replaced by table lookup.
; The result is returned directly by a ret from an otherwise empty bb,
@@ -138,3 +141,34 @@ return:
; CHECK-NEXT: %switch.load = load i8** %switch.gep
; CHECK-NEXT: ret i8* %switch.load
}
+
+; Switch used to initialize two values. The first value is returned; the second
+; is not used. This used to make the transformation generate illegal code.
+
+define i32 @earlyreturncrash(i32 %x) {
+entry:
+ switch i32 %x, label %sw.default [
+ i32 0, label %sw.epilog
+ i32 1, label %sw.bb1
+ i32 2, label %sw.bb2
+ i32 3, label %sw.bb3
+ ]
+
+sw.bb1: br label %sw.epilog
+sw.bb2: br label %sw.epilog
+sw.bb3: br label %sw.epilog
+sw.default: br label %sw.epilog
+
+sw.epilog:
+ %a.0 = phi i32 [ 7, %sw.default ], [ 5, %sw.bb3 ], [ 88, %sw.bb2 ], [ 9, %sw.bb1 ], [ 42, %entry ]
+ %b.0 = phi i32 [ 10, %sw.default ], [ 5, %sw.bb3 ], [ 1, %sw.bb2 ], [ 4, %sw.bb1 ], [ 3, %entry ]
+ ret i32 %a.0
+
+; CHECK: @earlyreturncrash
+; CHECK: switch.lookup:
+; CHECK-NEXT: %switch.gep = getelementptr inbounds [4 x i32]* @switch.table4, i32 0, i32 %switch.tableidx
+; CHECK-NEXT: %switch.load = load i32* %switch.gep
+; CHECK-NEXT: ret i32 %switch.load
+; CHECK: sw.epilog:
+; CHECK-NEXT: ret i32 7
+}
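The lookup-table shape being verified is easiest to see in source form. A hedged C++ sketch of what @earlyreturncrash lowers to (kTable is illustrative; the real table is the @switch.table4 constant checked above):

    static const int kTable[4] = {42, 9, 88, 5};  // values for x == 0..3

    int earlyreturncrash_lowered(unsigned x) {
      if (x < 4)           // the %switch.tableidx bounds check
        return kTable[x];  // %switch.gep + %switch.load
      return 7;            // the sw.default path; the second phi is dropped
    }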
diff --git a/tools/lli/RemoteTarget.cpp b/tools/lli/RemoteTarget.cpp
index 918f1572e3..66743225db 100644
--- a/tools/lli/RemoteTarget.cpp
+++ b/tools/lli/RemoteTarget.cpp
@@ -13,9 +13,9 @@
//===----------------------------------------------------------------------===//
#include "RemoteTarget.h"
-#include <llvm/ADT/StringRef.h>
-#include <llvm/Support/DataTypes.h>
-#include <llvm/Support/Memory.h>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
#include <stdlib.h>
#include <string>
using namespace llvm;
diff --git a/tools/lli/RemoteTarget.h b/tools/lli/RemoteTarget.h
index c5845266d6..d05d3c6f45 100644
--- a/tools/lli/RemoteTarget.h
+++ b/tools/lli/RemoteTarget.h
@@ -15,10 +15,10 @@
#ifndef REMOTEPROCESS_H
#define REMOTEPROCESS_H
-#include <llvm/ADT/StringRef.h>
-#include <llvm/ADT/SmallVector.h>
-#include <llvm/Support/DataTypes.h>
-#include <llvm/Support/Memory.h>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Memory.h"
#include <stdlib.h>
#include <string>
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 4004b6c4d4..c5645ec601 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -656,6 +656,9 @@ int main(int argc, char **argv, char * const *envp) {
Target.stop();
} else {
+ // Trigger compilation separately so code regions that need to be
+ // invalidated will be known.
+ (void)EE->getPointerToFunction(EntryFn);
// Clear instruction cache before code will be executed.
if (JMM)
static_cast<LLIMCJITMemoryManager*>(JMM)->invalidateInstructionCache();
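The ordering this change enforces matters on targets without coherent instruction caches: code must be fully emitted before the cache is flushed, and flushed before it runs. A hedged sketch of the resulting sequence (condensed from the surrounding context, not the full lli logic):

    // 1. Force compilation so every emitted code region is recorded.
    (void)EE->getPointerToFunction(EntryFn);
    // 2. Drop stale icache lines covering those regions.
    if (JMM)
      static_cast<LLIMCJITMemoryManager*>(JMM)->invalidateInstructionCache();
    // 3. Only now is it safe to jump into the JITed code.
    int Result = EE->runFunctionAsMain(EntryFn, InputArgv, envp);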
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 3bceb14624..756221b79a 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -409,7 +409,7 @@ int main(int argc, char **argv) {
MCAsmBackend *MAB = 0;
if (ShowEncoding) {
CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
- MAB = TheTarget->createMCAsmBackend(TripleName);
+ MAB = TheTarget->createMCAsmBackend(TripleName, MCPU);
}
Str.reset(TheTarget->createAsmStreamer(Ctx, FOS, /*asmverbose*/true,
/*useLoc*/ true,
@@ -422,7 +422,7 @@ int main(int argc, char **argv) {
} else {
assert(FileType == OFT_ObjectFile && "Invalid file type!");
MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, *STI, Ctx);
- MCAsmBackend *MAB = TheTarget->createMCAsmBackend(TripleName);
+ MCAsmBackend *MAB = TheTarget->createMCAsmBackend(TripleName, MCPU);
Str.reset(TheTarget->createMCObjectStreamer(TripleName, Ctx, *MAB,
FOS, CE, RelaxAll,
NoExecStack));
diff --git a/tools/llvm-nm/llvm-nm.cpp b/tools/llvm-nm/llvm-nm.cpp
index eb52acc85d..0543e83f9c 100644
--- a/tools/llvm-nm/llvm-nm.cpp
+++ b/tools/llvm-nm/llvm-nm.cpp
@@ -110,6 +110,9 @@ namespace {
cl::opt<bool> SizeSort("size-sort", cl::desc("Sort symbols by size"));
+ cl::opt<bool> WithoutAliases("without-aliases", cl::Hidden,
+ cl::desc("Exclude aliases from output"));
+
bool PrintAddress = true;
bool MultipleFiles = false;
@@ -275,8 +278,9 @@ static void DumpSymbolNamesFromModule(Module *M) {
std::for_each (M->begin(), M->end(), DumpSymbolNameForGlobalValue);
std::for_each (M->global_begin(), M->global_end(),
DumpSymbolNameForGlobalValue);
- std::for_each (M->alias_begin(), M->alias_end(),
- DumpSymbolNameForGlobalValue);
+ if (!WithoutAliases)
+ std::for_each (M->alias_begin(), M->alias_end(),
+ DumpSymbolNameForGlobalValue);
SortAndPrintSymbolList();
}
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 31252dd7f7..8473d94731 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -126,6 +126,10 @@ public:
/// C'tor
Modifier(BasicBlock *Block, PieceTable *PT, Random *R):
BB(Block),PT(PT),Ran(R),Context(BB->getContext()) {}
+
+ /// virtual D'tor to silence warnings.
+ virtual ~Modifier() {}
+
/// Add a new instruction.
virtual void Act() = 0;
/// Add N new instructions,
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 7a35f521a1..ada5f6db83 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -8,7 +8,7 @@
//===----------------------------------------------------------------------===//
#include "gtest/gtest.h"
-#include <llvm/ADT/DenseSet.h>
+#include "llvm/ADT/DenseSet.h"
using namespace llvm;
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index ea5aeb38b0..c30492a5f0 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -7,14 +7,14 @@
//
//===----------------------------------------------------------------------===//
-#include <llvm/Analysis/ScalarEvolutionExpressions.h>
-#include <llvm/Analysis/LoopInfo.h>
-#include <llvm/GlobalVariable.h>
-#include <llvm/Constants.h>
-#include <llvm/LLVMContext.h>
-#include <llvm/Module.h>
-#include <llvm/PassManager.h>
-#include <llvm/ADT/SmallVector.h>
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Constants.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
#include "gtest/gtest.h"
namespace llvm {
diff --git a/unittests/Support/AlignOfTest.cpp b/unittests/Support/AlignOfTest.cpp
index 6f576681a3..f2d11708a2 100644
--- a/unittests/Support/AlignOfTest.cpp
+++ b/unittests/Support/AlignOfTest.cpp
@@ -1,4 +1,4 @@
-//===- llvm/unittest/Support/AlignOfTest.cpp - Alignment utility tests ----===//
+//=== - llvm/unittest/Support/AlignOfTest.cpp - Alignment utility tests ----===//
//
// The LLVM Compiler Infrastructure
//
@@ -23,31 +23,25 @@ namespace {
#endif
// Define some fixed alignment types to use in these tests.
-#if __cplusplus == 201103L || __has_feature(cxx_alignas)
-typedef char alignas(1) A1;
-typedef char alignas(2) A2;
-typedef char alignas(4) A4;
-typedef char alignas(8) A8;
-#elif defined(__clang__) || defined(__GNUC__)
-typedef char A1 __attribute__((aligned(1)));
-typedef char A2 __attribute__((aligned(2)));
-typedef char A4 __attribute__((aligned(4)));
-typedef char A8 __attribute__((aligned(8)));
+#if __has_feature(cxx_alignas)
+struct alignas(1) A1 { };
+struct alignas(2) A2 { };
+struct alignas(4) A4 { };
+struct alignas(8) A8 { };
+#elif defined(__GNUC__)
+struct A1 { } __attribute__((aligned(1)));
+struct A2 { } __attribute__((aligned(2)));
+struct A4 { } __attribute__((aligned(4)));
+struct A8 { } __attribute__((aligned(8)));
#elif defined(_MSC_VER)
-typedef __declspec(align(1)) char A1;
-typedef __declspec(align(2)) char A2;
-typedef __declspec(align(4)) char A4;
-typedef __declspec(align(8)) char A8;
+__declspec(align(1)) struct A1 { };
+__declspec(align(2)) struct A2 { };
+__declspec(align(4)) struct A4 { };
+__declspec(align(8)) struct A8 { };
#else
# error No supported align as directive.
#endif
-// Wrap the forced aligned types in structs to hack around compiler bugs.
-struct SA1 { A1 a; };
-struct SA2 { A2 a; };
-struct SA4 { A4 a; };
-struct SA8 { A8 a; };
-
struct S1 {};
struct S2 { char a; };
struct S3 { int x; };
@@ -90,11 +84,7 @@ char LLVM_ATTRIBUTE_UNUSED test_arr2
[AlignOf<A1>::Alignment > 0]
[AlignOf<A2>::Alignment > 0]
[AlignOf<A4>::Alignment > 0]
- [AlignOf<A8>::Alignment > 0]
- [AlignOf<SA1>::Alignment > 0]
- [AlignOf<SA2>::Alignment > 0]
- [AlignOf<SA4>::Alignment > 0]
- [AlignOf<SA8>::Alignment > 0];
+ [AlignOf<A8>::Alignment > 0];
char LLVM_ATTRIBUTE_UNUSED test_arr3
[AlignOf<S1>::Alignment > 0]
[AlignOf<S2>::Alignment > 0]
@@ -123,20 +113,10 @@ char LLVM_ATTRIBUTE_UNUSED test_arr5
[AlignOf<V8>::Alignment > 0];
TEST(AlignOfTest, BasicAlignmentInvariants) {
- // For a very strange reason, many compilers do not support this. Both Clang
- // and GCC fail to align these properly.
- EXPECT_EQ(1u, alignOf<A1>());
-#if 0
- EXPECT_EQ(2u, alignOf<A2>());
- EXPECT_EQ(4u, alignOf<A4>());
- EXPECT_EQ(8u, alignOf<A8>());
-#endif
-
- // But once wrapped in structs, the alignment is correctly managed.
- EXPECT_LE(1u, alignOf<SA1>());
- EXPECT_LE(2u, alignOf<SA2>());
- EXPECT_LE(4u, alignOf<SA4>());
- EXPECT_LE(8u, alignOf<SA8>());
+ EXPECT_LE(1u, alignOf<A1>());
+ EXPECT_LE(2u, alignOf<A2>());
+ EXPECT_LE(4u, alignOf<A4>());
+ EXPECT_LE(8u, alignOf<A8>());
EXPECT_EQ(1u, alignOf<char>());
EXPECT_LE(alignOf<char>(), alignOf<short>());
@@ -174,42 +154,38 @@ TEST(AlignOfTest, BasicAlignmentInvariants) {
}
TEST(AlignOfTest, BasicAlignedArray) {
- // Note: this code exclusively uses the struct-wrapped arbitrarily aligned
- // types because of the bugs mentioned above where GCC and Clang both
- // disregard the arbitrary alignment specifier until the type is used to
- // declare a member of a struct.
- EXPECT_LE(1u, alignOf<AlignedCharArrayUnion<SA1> >());
- EXPECT_LE(2u, alignOf<AlignedCharArrayUnion<SA2> >());
- EXPECT_LE(4u, alignOf<AlignedCharArrayUnion<SA4> >());
- EXPECT_LE(8u, alignOf<AlignedCharArrayUnion<SA8> >());
+ EXPECT_LE(1u, alignOf<AlignedCharArrayUnion<A1> >());
+ EXPECT_LE(2u, alignOf<AlignedCharArrayUnion<A2> >());
+ EXPECT_LE(4u, alignOf<AlignedCharArrayUnion<A4> >());
+ EXPECT_LE(8u, alignOf<AlignedCharArrayUnion<A8> >());
- EXPECT_LE(1u, sizeof(AlignedCharArrayUnion<SA1>));
- EXPECT_LE(2u, sizeof(AlignedCharArrayUnion<SA2>));
- EXPECT_LE(4u, sizeof(AlignedCharArrayUnion<SA4>));
- EXPECT_LE(8u, sizeof(AlignedCharArrayUnion<SA8>));
+ EXPECT_LE(1u, sizeof(AlignedCharArrayUnion<A1>));
+ EXPECT_LE(2u, sizeof(AlignedCharArrayUnion<A2>));
+ EXPECT_LE(4u, sizeof(AlignedCharArrayUnion<A4>));
+ EXPECT_LE(8u, sizeof(AlignedCharArrayUnion<A8>));
- EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<SA1> >()));
- EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<SA1, SA2> >()));
- EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<SA1, SA2, SA4> >()));
- EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<SA1, SA2, SA4, SA8> >()));
+ EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<A1> >()));
+ EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<A1, A2> >()));
+ EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<A1, A2, A4> >()));
+ EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<A1, A2, A4, A8> >()));
- EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<SA1>));
- EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<SA1, SA2>));
- EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<SA1, SA2, SA4>));
- EXPECT_EQ(8u, sizeof(AlignedCharArrayUnion<SA1, SA2, SA4, SA8>));
+ EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<A1>));
+ EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<A1, A2>));
+ EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<A1, A2, A4>));
+ EXPECT_EQ(8u, sizeof(AlignedCharArrayUnion<A1, A2, A4, A8>));
- EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<SA1[1]> >()));
- EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<SA1[2], SA2[1]> >()));
- EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<SA1[42], SA2[55],
- SA4[13]> >()));
- EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<SA1[2], SA2[1],
- SA4, SA8> >()));
+ EXPECT_EQ(1u, (alignOf<AlignedCharArrayUnion<A1[1]> >()));
+ EXPECT_EQ(2u, (alignOf<AlignedCharArrayUnion<A1[2], A2[1]> >()));
+ EXPECT_EQ(4u, (alignOf<AlignedCharArrayUnion<A1[42], A2[55],
+ A4[13]> >()));
+ EXPECT_EQ(8u, (alignOf<AlignedCharArrayUnion<A1[2], A2[1],
+ A4, A8> >()));
- EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<SA1[1]>));
- EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<SA1[2], SA2[1]>));
- EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<SA1[3], SA2[2], SA4>));
- EXPECT_EQ(16u, sizeof(AlignedCharArrayUnion<SA1, SA2[3],
- SA4[3], SA8>));
+ EXPECT_EQ(1u, sizeof(AlignedCharArrayUnion<A1[1]>));
+ EXPECT_EQ(2u, sizeof(AlignedCharArrayUnion<A1[2], A2[1]>));
+ EXPECT_EQ(4u, sizeof(AlignedCharArrayUnion<A1[3], A2[2], A4>));
+ EXPECT_EQ(16u, sizeof(AlignedCharArrayUnion<A1, A2[3],
+ A4[3], A8>));
  // For other tests we simply assert that the alignment of the union matches
// that of the fundamental type and hope that we have any weird type
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 3b9bf84370..e562fbb18d 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -17,11 +17,13 @@ add_llvm_unittest(SupportTests
LeakDetectorTest.cpp
ManagedStatic.cpp
MathExtrasTest.cpp
+ MemoryTest.cpp
Path.cpp
- raw_ostream_test.cpp
RegexTest.cpp
SwapByteOrderTest.cpp
TimeValue.cpp
ValueHandleTest.cpp
YAMLParserTest.cpp
+ formatted_raw_ostream_test.cpp
+ raw_ostream_test.cpp
)
diff --git a/unittests/Support/MemoryTest.cpp b/unittests/Support/MemoryTest.cpp
new file mode 100644
index 0000000000..21cb27eaf0
--- /dev/null
+++ b/unittests/Support/MemoryTest.cpp
@@ -0,0 +1,356 @@
+//===- llvm/unittest/Support/MemoryTest.cpp - Mapped memory tests ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/Memory.h"
+#include "llvm/Support/Process.h"
+
+#include "gtest/gtest.h"
+#include <cstdlib>
+
+using namespace llvm;
+using namespace sys;
+
+namespace {
+
+class MappedMemoryTest : public ::testing::TestWithParam<unsigned> {
+public:
+ MappedMemoryTest() {
+ Flags = GetParam();
+ PageSize = sys::Process::GetPageSize();
+ }
+
+protected:
+ // Adds RW flags to permit testing of the resulting memory
+ unsigned getTestableEquivalent(unsigned RequestedFlags) {
+ switch (RequestedFlags) {
+ case Memory::MF_READ:
+ case Memory::MF_WRITE:
+ case Memory::MF_READ|Memory::MF_WRITE:
+ return Memory::MF_READ|Memory::MF_WRITE;
+ case Memory::MF_READ|Memory::MF_EXEC:
+ case Memory::MF_READ|Memory::MF_WRITE|Memory::MF_EXEC:
+ case Memory::MF_EXEC:
+ return Memory::MF_READ|Memory::MF_WRITE|Memory::MF_EXEC;
+ }
+ // Default in case values are added to the enum, as required by some compilers
+ return Memory::MF_READ|Memory::MF_WRITE;
+ }
+
+ // Returns true if the memory blocks overlap
+ bool doesOverlap(MemoryBlock M1, MemoryBlock M2) {
+ if (M1.base() == M2.base())
+ return true;
+
+ if (M1.base() > M2.base())
+ return (unsigned char *)M2.base() + M2.size() > M1.base();
+
+ return (unsigned char *)M1.base() + M1.size() > M2.base();
+ }
+
+ unsigned Flags;
+ size_t PageSize;
+};
+
+TEST_P(MappedMemoryTest, AllocAndRelease) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(sizeof(int), M1.size());
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+}
+
+TEST_P(MappedMemoryTest, MultipleAllocAndRelease) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ MemoryBlock M4 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ EXPECT_NE((void*)0, M4.base());
+ EXPECT_LE(16U, M4.size());
+ EXPECT_FALSE(Memory::releaseMappedMemory(M4));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, BasicWrite) {
+ // This test applies only to writeable combinations
+ if (Flags && !(Flags & Memory::MF_WRITE))
+ return;
+
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(sizeof(int), M1.size());
+
+ int *a = (int*)M1.base();
+ *a = 1;
+ EXPECT_EQ(1, *a);
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+}
+
+TEST_P(MappedMemoryTest, MultipleWrite) {
+ // This test applies only to writeable combinations
+ if (Flags && !(Flags & Memory::MF_WRITE))
+ return;
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(1U * sizeof(int), M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(8U * sizeof(int), M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(4U * sizeof(int), M3.size());
+
+ int *x = (int*)M1.base();
+ *x = 1;
+
+ int *y = (int*)M2.base();
+ for (int i = 0; i < 8; i++) {
+ y[i] = i;
+ }
+
+ int *z = (int*)M3.base();
+ *z = 42;
+
+ EXPECT_EQ(1, *x);
+ EXPECT_EQ(7, y[7]);
+ EXPECT_EQ(42, *z);
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+
+ MemoryBlock M4 = Memory::allocateMappedMemory(64 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ EXPECT_NE((void*)0, M4.base());
+ EXPECT_LE(64U * sizeof(int), M4.size());
+ x = (int*)M4.base();
+ *x = 4;
+ EXPECT_EQ(4, *x);
+ EXPECT_FALSE(Memory::releaseMappedMemory(M4));
+
+ // Verify that M2 remains unaffected by other activity
+ for (int i = 0; i < 8; i++) {
+ EXPECT_EQ(i, y[i]);
+ }
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, EnabledWrite) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(2 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(8 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(4 * sizeof(int), 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(2U * sizeof(int), M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(8U * sizeof(int), M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(4U * sizeof(int), M3.size());
+
+ EXPECT_FALSE(Memory::protectMappedMemory(M1, getTestableEquivalent(Flags)));
+ EXPECT_FALSE(Memory::protectMappedMemory(M2, getTestableEquivalent(Flags)));
+ EXPECT_FALSE(Memory::protectMappedMemory(M3, getTestableEquivalent(Flags)));
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ int *x = (int*)M1.base();
+ *x = 1;
+ int *y = (int*)M2.base();
+ for (unsigned int i = 0; i < 8; i++) {
+ y[i] = i;
+ }
+ int *z = (int*)M3.base();
+ *z = 42;
+
+ EXPECT_EQ(1, *x);
+ EXPECT_EQ(7, y[7]);
+ EXPECT_EQ(42, *z);
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_EQ(6, y[6]);
+
+ MemoryBlock M4 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ EXPECT_NE((void*)0, M4.base());
+ EXPECT_LE(16U, M4.size());
+ EXPECT_EQ(error_code::success(), Memory::protectMappedMemory(M4, getTestableEquivalent(Flags)));
+ x = (int*)M4.base();
+ *x = 4;
+ EXPECT_EQ(4, *x);
+ EXPECT_FALSE(Memory::releaseMappedMemory(M4));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, SuccessiveNear) {
+ error_code EC;
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, 0, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &M1, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &M2, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, DuplicateNear) {
+ error_code EC;
+ MemoryBlock Near((void*)(3*PageSize), 16);
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, ZeroNear) {
+ error_code EC;
+ MemoryBlock Near(0, 0);
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, ZeroSizeNear) {
+ error_code EC;
+ MemoryBlock Near((void*)(4*PageSize), 0);
+ MemoryBlock M1 = Memory::allocateMappedMemory(16, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M2 = Memory::allocateMappedMemory(64, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+ MemoryBlock M3 = Memory::allocateMappedMemory(32, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(16U, M1.size());
+ EXPECT_NE((void*)0, M2.base());
+ EXPECT_LE(64U, M2.size());
+ EXPECT_NE((void*)0, M3.base());
+ EXPECT_LE(32U, M3.size());
+
+ EXPECT_FALSE(doesOverlap(M1, M2));
+ EXPECT_FALSE(doesOverlap(M2, M3));
+ EXPECT_FALSE(doesOverlap(M1, M3));
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M3));
+ EXPECT_FALSE(Memory::releaseMappedMemory(M2));
+}
+
+TEST_P(MappedMemoryTest, UnalignedNear) {
+ error_code EC;
+ MemoryBlock Near((void*)(2*PageSize+5), 0);
+ MemoryBlock M1 = Memory::allocateMappedMemory(15, &Near, Flags, EC);
+ EXPECT_EQ(error_code::success(), EC);
+
+ EXPECT_NE((void*)0, M1.base());
+ EXPECT_LE(sizeof(int), M1.size());
+
+ EXPECT_FALSE(Memory::releaseMappedMemory(M1));
+}
+
+// Note that Memory::MF_WRITE alone is not supported across all operating
+// systems and architectures, and may imply MF_READ|MF_WRITE.
+unsigned MemoryFlags[] = {
+ Memory::MF_READ,
+ Memory::MF_WRITE,
+ Memory::MF_READ|Memory::MF_WRITE,
+ Memory::MF_EXEC,
+ Memory::MF_READ|Memory::MF_EXEC,
+ Memory::MF_READ|Memory::MF_WRITE|Memory::MF_EXEC
+ };
+
+INSTANTIATE_TEST_CASE_P(AllocationTests,
+ MappedMemoryTest,
+ ::testing::ValuesIn(MemoryFlags));
+
+} // anonymous namespace
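Since the whole file relies on gtest's value-parameterized tests, note that every TEST_P above runs once per entry in MemoryFlags, so each test body executes six times. A minimal sketch of the mechanism, with illustrative names:

    class FlagTest : public ::testing::TestWithParam<unsigned> {};

    TEST_P(FlagTest, FitsInByte) {
      EXPECT_LT(GetParam(), 256u);  // body runs once per parameter value
    }

    INSTANTIATE_TEST_CASE_P(Flags, FlagTest,
                            ::testing::Values(1u, 2u, 4u));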
diff --git a/unittests/Support/formatted_raw_ostream_test.cpp b/unittests/Support/formatted_raw_ostream_test.cpp
new file mode 100644
index 0000000000..4725cedc21
--- /dev/null
+++ b/unittests/Support/formatted_raw_ostream_test.cpp
@@ -0,0 +1,33 @@
+//===- llvm/unittest/Support/formatted_raw_ostream_test.cpp ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "gtest/gtest.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+
+namespace {
+
+TEST(formatted_raw_ostreamTest, Test_Tell) {
+ // Check offset when underlying stream has buffer contents.
+ SmallString<128> A;
+ raw_svector_ostream B(A);
+ formatted_raw_ostream C(B);
+ char tmp[100] = "";
+
+ for (unsigned i = 0; i != 3; ++i) {
+ C.write(tmp, 100);
+
+ EXPECT_EQ(100*(i+1), (unsigned) C.tell());
+ }
+}
+
+}
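What the test checks: formatted_raw_ostream buffers on top of raw_svector_ostream, so tell() has to add its own buffered byte count to the underlying stream's position. A sketch of the same property outside gtest, using only headers the test already pulls in:

  #include "llvm/ADT/SmallString.h"
  #include "llvm/Support/FormattedStream.h"
  #include "llvm/Support/raw_ostream.h"
  #include <cassert>
  using namespace llvm;

  static void checkTell() {
    SmallString<128> Storage;
    raw_svector_ostream OS(Storage);
    formatted_raw_ostream FOS(OS);
    FOS << "hello";             // bytes may still sit in FOS's buffer
    assert(FOS.tell() == 5 && "tell() must count buffered bytes too");
  }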
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 33f04ce647..e79162867e 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -470,7 +470,7 @@ static MemoryBuffer *CanonicalizeInputFile(MemoryBuffer *MB) {
continue;
}
- // If C is not a horizontal whitespace, skip it.
+ // If current char is not a horizontal whitespace, dump it to output as is.
if (*Ptr != ' ' && *Ptr != '\t') {
NewFile.push_back(*Ptr);
continue;
@@ -537,11 +537,11 @@ static bool ReadCheckFile(SourceMgr &SM,
Buffer = Buffer.substr(CheckPrefix.size()+1);
} else if (Buffer.size() > CheckPrefix.size()+6 &&
memcmp(Buffer.data()+CheckPrefix.size(), "-NEXT:", 6) == 0) {
- Buffer = Buffer.substr(CheckPrefix.size()+7);
+ Buffer = Buffer.substr(CheckPrefix.size()+6);
IsCheckNext = true;
} else if (Buffer.size() > CheckPrefix.size()+5 &&
memcmp(Buffer.data()+CheckPrefix.size(), "-NOT:", 5) == 0) {
- Buffer = Buffer.substr(CheckPrefix.size()+6);
+ Buffer = Buffer.substr(CheckPrefix.size()+5);
IsCheckNot = true;
} else {
Buffer = Buffer.substr(1);
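The substr changes above fix an off-by-one: a "-NEXT:" or "-NOT:" directive is exactly CheckPrefix.size()+6 (or +5) bytes long, and the old code skipped one byte more, eating the first pattern character whenever no space followed the colon. A worked example with hypothetical input:

  #include "llvm/ADT/StringRef.h"
  using llvm::StringRef;

  static void demoDirectiveSkip() {
    StringRef Buffer = "CHECK-NEXT:add r0, r1";     // hypothetical check line
    StringRef CheckPrefix = "CHECK";                // size() == 5
    // "-NEXT:" is 6 bytes, so the directive spans size()+6 == 11 bytes.
    Buffer = Buffer.substr(CheckPrefix.size() + 6); // "add r0, r1"
    // The old substr(size()+7) would have left "dd r0, r1".
  }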
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 78eb641899..593de698a9 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -77,7 +77,7 @@
//
// Some targets need a custom way to parse operands, some specific instructions
// can contain arguments that can represent processor flags and other kinds of
-// identifiers that need to be mapped to specific valeus in the final encoded
+// identifiers that need to be mapped to specific values in the final encoded
// instructions. The target specific custom operand parsing works in the
// following way:
//
@@ -199,7 +199,7 @@ public:
return Kind >= UserClass0;
}
- /// isRelatedTo - Check whether this class is "related" to \arg RHS. Classes
+ /// isRelatedTo - Check whether this class is "related" to \p RHS. Classes
/// are related if they are in the same class hierarchy.
bool isRelatedTo(const ClassInfo &RHS) const {
// Tokens are only related to tokens.
@@ -238,7 +238,7 @@ public:
return Root == RHSRoot;
}
- /// isSubsetOf - Test whether this class is a subset of \arg RHS;
+ /// isSubsetOf - Test whether this class is a subset of \p RHS.
bool isSubsetOf(const ClassInfo &RHS) const {
// This is a subset of RHS if it is the same class...
if (this == &RHS)
@@ -279,6 +279,15 @@ public:
}
};
+namespace {
+/// Sort ClassInfo pointers independently of pointer value.
+struct LessClassInfoPtr {
+ bool operator()(const ClassInfo *LHS, const ClassInfo *RHS) const {
+ return *LHS < *RHS;
+ }
+};
+}
+
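LessClassInfoPtr exists because std::map<ClassInfo*, T> with the default std::less orders keys by heap address, which varies between runs and would make the emitted tables nondeterministic. A standalone illustration of the idiom, with Info as a hypothetical stand-in for ClassInfo:

  #include <map>

  struct Info {
    int Rank;
    bool operator<(const Info &RHS) const { return Rank < RHS.Rank; }
  };
  struct LessInfoPtr {
    bool operator()(const Info *LHS, const Info *RHS) const {
      return *LHS < *RHS;   // compare pointees, not addresses
    }
  };
  // Iteration now follows Rank regardless of where the keys were allocated.
  typedef std::map<Info*, unsigned, LessInfoPtr> StableMap;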
/// MatchableInfo - Helper class for storing the necessary information for an
/// instruction or alias which is capable of being matched.
struct MatchableInfo {
@@ -501,7 +510,7 @@ struct MatchableInfo {
}
/// couldMatchAmbiguouslyWith - Check whether this matchable could
- /// ambiguously match the same set of operands as \arg RHS (without being a
+ /// ambiguously match the same set of operands as \p RHS (without being a
/// strictly superior match).
bool couldMatchAmbiguouslyWith(const MatchableInfo &RHS) {
// The primary comparator is the instruction mnemonic.
@@ -599,7 +608,8 @@ public:
std::vector<OperandMatchEntry> OperandMatchInfo;
/// Map of Register records to their class information.
- std::map<Record*, ClassInfo*> RegisterClasses;
+ typedef std::map<Record*, ClassInfo*, LessRecordByID> RegisterClassesTy;
+ RegisterClassesTy RegisterClasses;
/// Map of Predicate records to their subtarget information.
std::map<Record*, SubtargetFeatureInfo*> SubtargetFeatures;
@@ -1020,7 +1030,9 @@ AsmMatcherInfo::getOperandClass(Record *Rec, int SubOpIdx) {
throw TGError(Rec->getLoc(), "register class has no class info!");
}
- assert(Rec->isSubClassOf("Operand") && "Unexpected operand!");
+ if (!Rec->isSubClassOf("Operand"))
+ throw TGError(Rec->getLoc(), "Operand `" + Rec->getName() +
+ "' does not derive from class Operand!\n");
Record *MatchClass = Rec->getValueAsDef("ParserMatchClass");
if (ClassInfo *CI = AsmOperandClasses[MatchClass])
return CI;
@@ -1237,7 +1249,8 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
/// Map containing a mask with all operands indices that can be found for
/// that class inside a instruction.
- std::map<ClassInfo*, unsigned> OpClassMask;
+ typedef std::map<ClassInfo*, unsigned, LessClassInfoPtr> OpClassMaskTy;
+ OpClassMaskTy OpClassMask;
for (std::vector<MatchableInfo*>::const_iterator it =
Matchables.begin(), ie = Matchables.end();
@@ -1256,7 +1269,7 @@ void AsmMatcherInfo::buildOperandMatchInfo() {
}
// Generate operand match info for each mnemonic/operand class pair.
- for (std::map<ClassInfo*, unsigned>::iterator iit = OpClassMask.begin(),
+ for (OpClassMaskTy::iterator iit = OpClassMask.begin(),
iie = OpClassMask.end(); iit != iie; ++iit) {
unsigned OpMask = iit->second;
ClassInfo *CI = iit->first;
@@ -1684,9 +1697,9 @@ static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
<< " const SmallVectorImpl<MCParsedAsmOperand*"
<< "> &Operands) {\n"
<< " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
- << " uint8_t *Converter = ConversionTable[Kind];\n"
+ << " const uint8_t *Converter = ConversionTable[Kind];\n"
<< " Inst.setOpcode(Opcode);\n"
- << " for (uint8_t *p = Converter; *p; p+= 2) {\n"
+ << " for (const uint8_t *p = Converter; *p; p+= 2) {\n"
<< " switch (*p) {\n"
<< " default: llvm_unreachable(\"invalid conversion entry!\");\n"
<< " case CVT_Reg:\n"
@@ -1708,8 +1721,8 @@ static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
<< " assert(Kind < CVT_NUM_SIGNATURES && \"Invalid signature!\");\n"
<< " NumMCOperands = 0;\n"
<< " unsigned MCOperandNum = 0;\n"
- << " uint8_t *Converter = ConversionTable[Kind];\n"
- << " for (uint8_t *p = Converter; *p; p+= 2) {\n"
+ << " const uint8_t *Converter = ConversionTable[Kind];\n"
+ << " for (const uint8_t *p = Converter; *p; p+= 2) {\n"
<< " if (*(p + 1) > OperandNum) continue;\n"
<< " switch (*p) {\n"
<< " default: llvm_unreachable(\"invalid conversion entry!\");\n"
@@ -1945,7 +1958,7 @@ static void emitConvertToMCInst(CodeGenTarget &Target, StringRef ClassName,
OS << "} // end anonymous namespace\n\n";
// Output the conversion table.
- OS << "static uint8_t ConversionTable[CVT_NUM_SIGNATURES]["
+ OS << "static const uint8_t ConversionTable[CVT_NUM_SIGNATURES]["
<< MaxRowLength << "] = {\n";
for (unsigned Row = 0, ERow = ConversionTable.size(); Row != ERow; ++Row) {
@@ -2041,7 +2054,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
OS << " MatchClassKind OpKind;\n";
OS << " switch (Operand.getReg()) {\n";
OS << " default: OpKind = InvalidMatchClass; break;\n";
- for (std::map<Record*, ClassInfo*>::iterator
+ for (AsmMatcherInfo::RegisterClassesTy::iterator
it = Info.RegisterClasses.begin(), ie = Info.RegisterClasses.end();
it != ie; ++it)
OS << " case " << Info.Target.getName() << "::"
@@ -2062,7 +2075,7 @@ static void emitValidateOperandClass(AsmMatcherInfo &Info,
static void emitIsSubclass(CodeGenTarget &Target,
std::vector<ClassInfo*> &Infos,
raw_ostream &OS) {
- OS << "/// isSubclass - Compute whether \\arg A is a subclass of \\arg B.\n";
+ OS << "/// isSubclass - Compute whether \\p A is a subclass of \\p B.\n";
OS << "static bool isSubclass(MatchClassKind A, MatchClassKind B) {\n";
OS << " if (A == B)\n";
OS << " return true;\n\n";
@@ -2377,17 +2390,27 @@ static const char *getMinimalTypeForRange(uint64_t Range) {
}
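getMinimalTypeForRange (closed just above) returns the name of the narrowest unsigned type able to hold the given range; the calls below use it to shrink each field of OperandMatchEntry. A hypothetical reconstruction of its behavior, with the thresholds assumed rather than copied from the source:

  #include <stdint.h>

  static const char *minimalType(uint64_t Range) {
    if (Range > 0xFFFF) return "uint32_t";  // needs more than 16 bits
    if (Range > 0xFF)   return "uint16_t";  // needs more than 8 bits
    return "uint8_t";
  }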
static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
- const AsmMatcherInfo &Info, StringRef ClassName) {
+ const AsmMatcherInfo &Info, StringRef ClassName,
+ StringToOffsetTable &StringTable,
+ unsigned MaxMnemonicIndex) {
+ unsigned MaxMask = 0;
+ for (std::vector<OperandMatchEntry>::const_iterator it =
+ Info.OperandMatchInfo.begin(), ie = Info.OperandMatchInfo.end();
+ it != ie; ++it) {
+ MaxMask |= it->OperandMask;
+ }
+
   // Emit the static custom operand parsing table.
OS << "namespace {\n";
OS << " struct OperandMatchEntry {\n";
- OS << " static const char *const MnemonicTable;\n";
- OS << " uint32_t OperandMask;\n";
- OS << " uint32_t Mnemonic;\n";
OS << " " << getMinimalTypeForRange(1ULL << Info.SubtargetFeatures.size())
<< " RequiredFeatures;\n";
+ OS << " " << getMinimalTypeForRange(MaxMnemonicIndex)
+ << " Mnemonic;\n";
OS << " " << getMinimalTypeForRange(Info.Classes.size())
- << " Class;\n\n";
+ << " Class;\n";
+ OS << " " << getMinimalTypeForRange(MaxMask)
+ << " OperandMask;\n\n";
OS << " StringRef getMnemonic() const {\n";
OS << " return StringRef(MnemonicTable + Mnemonic + 1,\n";
OS << " MnemonicTable[Mnemonic]);\n";
@@ -2410,8 +2433,6 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
OS << "} // end anonymous namespace.\n\n";
- StringToOffsetTable StringTable;
-
OS << "static const OperandMatchEntry OperandMatchTable["
<< Info.OperandMatchInfo.size() << "] = {\n";
@@ -2422,8 +2443,25 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
const OperandMatchEntry &OMI = *it;
const MatchableInfo &II = *OMI.MI;
- OS << " { " << OMI.OperandMask;
+ OS << " { ";
+
+ // Write the required features mask.
+ if (!II.RequiredFeatures.empty()) {
+ for (unsigned i = 0, e = II.RequiredFeatures.size(); i != e; ++i) {
+ if (i) OS << "|";
+ OS << II.RequiredFeatures[i]->getEnumName();
+ }
+ } else
+ OS << "0";
+
+ // Store a pascal-style length byte in the mnemonic.
+ std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
+ OS << ", " << StringTable.GetOrAddStringOffset(LenMnemonic, false)
+ << " /* " << II.Mnemonic << " */, ";
+
+ OS << OMI.CI->Name;
+ OS << ", " << OMI.OperandMask;
OS << " /* ";
bool printComma = false;
for (int i = 0, e = 31; i !=e; ++i)
@@ -2435,30 +2473,10 @@ static void emitCustomOperandParsing(raw_ostream &OS, CodeGenTarget &Target,
}
OS << " */";
- // Store a pascal-style length byte in the mnemonic.
- std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
- OS << ", " << StringTable.GetOrAddStringOffset(LenMnemonic, false)
- << " /* " << II.Mnemonic << " */, ";
-
- // Write the required features mask.
- if (!II.RequiredFeatures.empty()) {
- for (unsigned i = 0, e = II.RequiredFeatures.size(); i != e; ++i) {
- if (i) OS << "|";
- OS << II.RequiredFeatures[i]->getEnumName();
- }
- } else
- OS << "0";
-
- OS << ", " << OMI.CI->Name;
-
OS << " },\n";
}
OS << "};\n\n";
- OS << "const char *const OperandMatchEntry::MnemonicTable =\n";
- StringTable.EmitString(OS);
- OS << ";\n\n";
-
// Emit the operand class switch to call the correct custom parser for
// the found operand class.
OS << Target.getName() << ClassName << "::OperandMatchResultTy "
@@ -2599,11 +2617,11 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
<< "unsigned Opcode,\n"
<< " const SmallVectorImpl<MCParsedAsmOperand*> "
<< "&Operands);\n";
- OS << " unsigned getMCInstOperandNumImpl(unsigned Kind, MCInst &Inst,\n "
- << " const "
+ OS << " unsigned getMCInstOperandNumImpl(unsigned Kind, MCInst &Inst,\n"
+ << " const "
<< "SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n "
<< " unsigned OperandNum, unsigned &NumMCOperands);\n";
- OS << " bool MnemonicIsValid(StringRef Mnemonic);\n";
+ OS << " bool mnemonicIsValidImpl(StringRef Mnemonic);\n";
OS << " unsigned MatchInstructionImpl(\n"
<< " const SmallVectorImpl<MCParsedAsmOperand*> &Operands,\n"
<< " unsigned &Kind, MCInst &Inst, "
@@ -2679,11 +2697,25 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
emitComputeAvailableFeatures(Info, OS);
+ StringToOffsetTable StringTable;
+
size_t MaxNumOperands = 0;
+ unsigned MaxMnemonicIndex = 0;
for (std::vector<MatchableInfo*>::const_iterator it =
Info.Matchables.begin(), ie = Info.Matchables.end();
- it != ie; ++it)
- MaxNumOperands = std::max(MaxNumOperands, (*it)->AsmOperands.size());
+ it != ie; ++it) {
+ MatchableInfo &II = **it;
+ MaxNumOperands = std::max(MaxNumOperands, II.AsmOperands.size());
+
+ // Store a pascal-style length byte in the mnemonic.
+ std::string LenMnemonic = char(II.Mnemonic.size()) + II.Mnemonic.str();
+ MaxMnemonicIndex = std::max(MaxMnemonicIndex,
+ StringTable.GetOrAddStringOffset(LenMnemonic, false));
+ }
+
+ OS << "static const char *const MnemonicTable =\n";
+ StringTable.EmitString(OS);
+ OS << ";\n\n";
   // Emit the static match table; unused classes get initialized to 0 which is
// guaranteed to be InvalidMatchClass.
@@ -2697,8 +2729,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
// following the mnemonic.
OS << "namespace {\n";
OS << " struct MatchEntry {\n";
- OS << " static const char *const MnemonicTable;\n";
- OS << " uint32_t Mnemonic;\n";
+ OS << " " << getMinimalTypeForRange(MaxMnemonicIndex)
+ << " Mnemonic;\n";
OS << " uint16_t Opcode;\n";
OS << " " << getMinimalTypeForRange(Info.Matchables.size())
<< " ConvertFn;\n";
@@ -2728,8 +2760,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "} // end anonymous namespace.\n\n";
- StringToOffsetTable StringTable;
-
OS << "static const MatchEntry MatchTable["
<< Info.Matchables.size() << "] = {\n";
@@ -2768,13 +2798,9 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "};\n\n";
- OS << "const char *const MatchEntry::MnemonicTable =\n";
- StringTable.EmitString(OS);
- OS << ";\n\n";
-
// A method to determine if a mnemonic is in the list.
OS << "bool " << Target.getName() << ClassName << "::\n"
- << "MnemonicIsValid(StringRef Mnemonic) {\n";
+ << "mnemonicIsValidImpl(StringRef Mnemonic) {\n";
OS << " // Search the table.\n";
OS << " std::pair<const MatchEntry*, const MatchEntry*> MnemonicRange =\n";
OS << " std::equal_range(MatchTable, MatchTable+"
@@ -2918,7 +2944,8 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
OS << "}\n\n";
if (Info.OperandMatchInfo.size())
- emitCustomOperandParsing(OS, Target, Info, ClassName);
+ emitCustomOperandParsing(OS, Target, Info, ClassName, StringTable,
+ MaxMnemonicIndex);
OS << "#endif // GET_MATCHER_IMPLEMENTATION\n\n";
}
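A recurring idea in the hunks above is the shared pascal-style MnemonicTable: each entry is a length byte followed by the characters, and the match entries store only the offset of the length byte. A minimal sketch of the decode side with hypothetical table contents:

  #include "llvm/ADT/StringRef.h"

  static const char MnemonicTable[] = "\003add\003sub\004addi";

  static llvm::StringRef getMnemonic(unsigned Offset) {
    // Length byte at Offset, then that many characters.
    return llvm::StringRef(MnemonicTable + Offset + 1,
                           (unsigned char)MnemonicTable[Offset]);
  }
  // getMnemonic(0) == "add", getMnemonic(4) == "sub", getMnemonic(8) == "addi"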
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index 57979b3e6d..d0cd057cd3 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -313,7 +313,9 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
/// OpcodeInfo - This encodes the index of the string to use for the first
/// chunk of the output as well as indices used for operand printing.
- std::vector<unsigned> OpcodeInfo;
+  /// To reduce the number of unhandled cases, we expand the encoding from
+  /// 32 bits to 64 bits, emitting a second table only when the bits beyond
+  /// 32 are actually used.
+ std::vector<uint64_t> OpcodeInfo;
// Add all strings to the string table upfront so it can generate an optimized
// representation.
@@ -362,7 +364,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
// To reduce code size, we compactify common instructions into a few bits
// in the opcode-indexed table.
- unsigned BitsLeft = 32-AsmStrBits;
+ unsigned BitsLeft = 64-AsmStrBits;
std::vector<std::vector<std::string> > TableDrivenOperandPrinters;
@@ -388,10 +390,11 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
}
// Otherwise, we can include this in the initial lookup table. Add it in.
- BitsLeft -= NumBits;
for (unsigned i = 0, e = InstIdxs.size(); i != e; ++i)
- if (InstIdxs[i] != ~0U)
- OpcodeInfo[i] |= InstIdxs[i] << (BitsLeft+AsmStrBits);
+ if (InstIdxs[i] != ~0U) {
+ OpcodeInfo[i] |= (uint64_t)InstIdxs[i] << (64-BitsLeft);
+ }
+ BitsLeft -= NumBits;
// Remove the info about this operand.
for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
@@ -410,16 +413,32 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
}
-
- O<<" static const unsigned OpInfo[] = {\n";
+ // We always emit at least one 32-bit table. A second table is emitted if
+ // more bits are needed.
+ O<<" static const uint32_t OpInfo[] = {\n";
for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
- O << " " << OpcodeInfo[i] << "U,\t// "
+ O << " " << (OpcodeInfo[i] & 0xffffffff) << "U,\t// "
<< NumberedInstructions[i]->TheDef->getName() << "\n";
}
// Add a dummy entry so the array init doesn't end with a comma.
O << " 0U\n";
O << " };\n\n";
+ if (BitsLeft < 32) {
+ // Add a second OpInfo table only when it is necessary.
+ // Adjust the type of the second table based on the number of bits needed.
+ O << " static const uint"
+ << ((BitsLeft < 16) ? "32" : (BitsLeft < 24) ? "16" : "8")
+ << "_t OpInfo2[] = {\n";
+ for (unsigned i = 0, e = NumberedInstructions.size(); i != e; ++i) {
+ O << " " << (OpcodeInfo[i] >> 32) << "U,\t// "
+ << NumberedInstructions[i]->TheDef->getName() << "\n";
+ }
+ // Add a dummy entry so the array init doesn't end with a comma.
+ O << " 0U\n";
+ O << " };\n\n";
+ }
+
// Emit the string itself.
O << " const char AsmStrs[] = {\n";
StringTable.emit(O, printChar);
@@ -427,13 +446,22 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
O << " O << \"\\t\";\n\n";
- O << " // Emit the opcode for the instruction.\n"
- << " unsigned Bits = OpInfo[MI->getOpcode()];\n"
- << " assert(Bits != 0 && \"Cannot print this instruction.\");\n"
+ O << " // Emit the opcode for the instruction.\n";
+ if (BitsLeft < 32) {
+ // If we have two tables then we need to perform two lookups and combine
+ // the results into a single 64-bit value.
+ O << " uint64_t Bits1 = OpInfo[MI->getOpcode()];\n"
+ << " uint64_t Bits2 = OpInfo2[MI->getOpcode()];\n"
+ << " uint64_t Bits = (Bits2 << 32) | Bits1;\n";
+ } else {
+ // If only one table is used we just need to perform a single lookup.
+ O << " uint32_t Bits = OpInfo[MI->getOpcode()];\n";
+ }
+ O << " assert(Bits != 0 && \"Cannot print this instruction.\");\n"
<< " O << AsmStrs+(Bits & " << (1 << AsmStrBits)-1 << ")-1;\n\n";
// Output the table driven operand information.
- BitsLeft = 32-AsmStrBits;
+ BitsLeft = 64-AsmStrBits;
for (unsigned i = 0, e = TableDrivenOperandPrinters.size(); i != e; ++i) {
std::vector<std::string> &Commands = TableDrivenOperandPrinters[i];
@@ -443,14 +471,13 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
assert(NumBits <= BitsLeft && "consistency error");
// Emit code to extract this field from Bits.
- BitsLeft -= NumBits;
-
O << "\n // Fragment " << i << " encoded into " << NumBits
<< " bits for " << Commands.size() << " unique commands.\n";
if (Commands.size() == 2) {
     // Emit two possibilities with if/else.
- O << " if ((Bits >> " << (BitsLeft+AsmStrBits) << ") & "
+ O << " if ((Bits >> "
+ << (64-BitsLeft) << ") & "
<< ((1 << NumBits)-1) << ") {\n"
<< Commands[1]
<< " } else {\n"
@@ -460,7 +487,8 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
// Emit a single possibility.
O << Commands[0] << "\n\n";
} else {
- O << " switch ((Bits >> " << (BitsLeft+AsmStrBits) << ") & "
+ O << " switch ((Bits >> "
+ << (64-BitsLeft) << ") & "
<< ((1 << NumBits)-1) << ") {\n"
<< " default: // unreachable.\n";
@@ -472,6 +500,7 @@ void AsmWriterEmitter::EmitPrintInstruction(raw_ostream &O) {
}
O << " }\n\n";
}
+ BitsLeft -= NumBits;
}
// Okay, delete instructions with no operand info left.
@@ -551,7 +580,7 @@ emitRegisterNameString(raw_ostream &O, StringRef AltName,
StringTable.emit(O, printChar);
O << " };\n\n";
- O << " static const unsigned RegAsmOffset" << AltName << "[] = {";
+ O << " static const uint32_t RegAsmOffset" << AltName << "[] = {";
for (unsigned i = 0, e = Registers.size(); i != e; ++i) {
if ((i % 14) == 0)
O << "\n ";
@@ -590,7 +619,7 @@ void AsmWriterEmitter::EmitGetRegisterName(raw_ostream &O) {
emitRegisterNameString(O, "", Registers);
if (hasAltNames) {
- O << " const unsigned *RegAsmOffset;\n"
+ O << " const uint32_t *RegAsmOffset;\n"
<< " const char *AsmStrs;\n"
<< " switch(AltIdx) {\n"
<< " default: llvm_unreachable(\"Invalid register alt name index!\");\n";
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 8713a56916..c91ec95e2e 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -574,10 +574,6 @@ bool EEVT::TypeSet::EnforceVectorSubVectorTypeIs(EEVT::TypeSet &VTOperand,
//===----------------------------------------------------------------------===//
// Helpers for working with extended types.
-bool RecordPtrCmp::operator()(const Record *LHS, const Record *RHS) const {
- return LHS->getID() < RHS->getID();
-}
-
/// Dependent variable map for CodeGenDAGPattern variant generation
typedef std::map<std::string, int> DepVarMap;
@@ -1535,7 +1531,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
const CodeGenRegisterClass &RC =
CDP.getTargetInfo().getRegisterClass(RegClass);
MadeChange |= UpdateNodeType(ResNo, RC.getValueTypes(), TP);
- } else if (ResultNode->getName() == "unknown") {
+ } else if (ResultNode->isSubClassOf("unknown_class")) {
// Nothing to do.
} else {
assert(ResultNode->isSubClassOf("RegisterClass") &&
@@ -1602,7 +1598,7 @@ bool TreePatternNode::ApplyTypeConstraints(TreePattern &TP, bool NotRegisters) {
MadeChange |= Child->UpdateNodeType(ChildResNo, VT, TP);
} else if (OperandNode->isSubClassOf("PointerLikeRegClass")) {
MadeChange |= Child->UpdateNodeType(ChildResNo, MVT::iPTR, TP);
- } else if (OperandNode->getName() == "unknown") {
+ } else if (OperandNode->isSubClassOf("unknown_class")) {
// Nothing to do.
} else
llvm_unreachable("Unknown operand type!");
@@ -2748,7 +2744,7 @@ void CodeGenDAGPatterns::ParseInstructions() {
}
// If we can, convert the instructions to be patterns that are matched!
- for (std::map<Record*, DAGInstruction, RecordPtrCmp>::iterator II =
+ for (std::map<Record*, DAGInstruction, LessRecordByID>::iterator II =
Instructions.begin(),
E = Instructions.end(); II != E; ++II) {
DAGInstruction &TheInst = II->second;
@@ -2804,8 +2800,11 @@ void CodeGenDAGPatterns::AddPatternToMatch(const TreePattern *Pattern,
const PatternToMatch &PTM) {
// Do some sanity checking on the pattern we're about to match.
std::string Reason;
- if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this))
- Pattern->error("Pattern can never match: " + Reason);
+ if (!PTM.getSrcPattern()->canPatternMatch(Reason, *this)) {
+ PrintWarning(Pattern->getRecord()->getLoc(),
+ Twine("Pattern can never match: ") + Reason);
+ return;
+ }
// If the source pattern's root is a complex pattern, that complex pattern
// must specify the nodes it can potentially match.
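The deleted RecordPtrCmp is not lost: its body (compare Record::getID(), visible in the removal above) matches the shared LessRecordByID comparator these maps now use, presumably declared alongside LessRecord in llvm/TableGen/Record.h. For reference, the semantics restated as a sketch:

  #include "llvm/TableGen/Record.h"

  // Identical to the deleted RecordPtrCmp body above.
  struct LessRecordByIDSketch {
    bool operator()(const llvm::Record *LHS, const llvm::Record *RHS) const {
      return LHS->getID() < RHS->getID();  // stable, allocation-independent
    }
  };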
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 25a0e4bb10..66f77eae1a 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -661,23 +661,18 @@ public:
unsigned getPatternComplexity(const CodeGenDAGPatterns &CGP) const;
};
-// Deterministic comparison of Record*.
-struct RecordPtrCmp {
- bool operator()(const Record *LHS, const Record *RHS) const;
-};
-
class CodeGenDAGPatterns {
RecordKeeper &Records;
CodeGenTarget Target;
std::vector<CodeGenIntrinsic> Intrinsics;
std::vector<CodeGenIntrinsic> TgtIntrinsics;
- std::map<Record*, SDNodeInfo, RecordPtrCmp> SDNodes;
- std::map<Record*, std::pair<Record*, std::string>, RecordPtrCmp> SDNodeXForms;
- std::map<Record*, ComplexPattern, RecordPtrCmp> ComplexPatterns;
- std::map<Record*, TreePattern*, RecordPtrCmp> PatternFragments;
- std::map<Record*, DAGDefaultOperand, RecordPtrCmp> DefaultOperands;
- std::map<Record*, DAGInstruction, RecordPtrCmp> Instructions;
+ std::map<Record*, SDNodeInfo, LessRecordByID> SDNodes;
+ std::map<Record*, std::pair<Record*, std::string>, LessRecordByID> SDNodeXForms;
+ std::map<Record*, ComplexPattern, LessRecordByID> ComplexPatterns;
+ std::map<Record*, TreePattern*, LessRecordByID> PatternFragments;
+ std::map<Record*, DAGDefaultOperand, LessRecordByID> DefaultOperands;
+ std::map<Record*, DAGInstruction, LessRecordByID> Instructions;
// Specific SDNode definitions:
Record *intrinsic_void_sdnode;
@@ -708,7 +703,7 @@ public:
return SDNodeXForms.find(R)->second;
}
- typedef std::map<Record*, NodeXForm, RecordPtrCmp>::const_iterator
+ typedef std::map<Record*, NodeXForm, LessRecordByID>::const_iterator
nx_iterator;
nx_iterator nx_begin() const { return SDNodeXForms.begin(); }
nx_iterator nx_end() const { return SDNodeXForms.end(); }
@@ -758,7 +753,7 @@ public:
return PatternFragments.find(R)->second;
}
- typedef std::map<Record*, TreePattern*, RecordPtrCmp>::const_iterator
+ typedef std::map<Record*, TreePattern*, LessRecordByID>::const_iterator
pf_iterator;
pf_iterator pf_begin() const { return PatternFragments.begin(); }
pf_iterator pf_end() const { return PatternFragments.end(); }
diff --git a/utils/TableGen/CodeGenInstruction.cpp b/utils/TableGen/CodeGenInstruction.cpp
index 38e2b832f2..836279b89a 100644
--- a/utils/TableGen/CodeGenInstruction.cpp
+++ b/utils/TableGen/CodeGenInstruction.cpp
@@ -101,7 +101,7 @@ CGIOperandList::CGIOperandList(Record *R) : TheDef(R) {
} else if (Rec->isSubClassOf("RegisterClass")) {
OperandType = "OPERAND_REGISTER";
} else if (!Rec->isSubClassOf("PointerLikeRegClass") &&
- Rec->getName() != "unknown")
+ !Rec->isSubClassOf("unknown_class"))
throw "Unknown operand class '" + Rec->getName() +
"' in '" + R->getName() + "' instruction!";
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index 931807ae44..f195b4e3fa 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -993,6 +993,12 @@ CodeGenRegBank::CodeGenRegBank(RecordKeeper &Records) {
for (unsigned i = 0, e = Registers.size(); i != e; ++i)
Registers[i]->buildObjectGraph(*this);
+ // Compute register name map.
+ for (unsigned i = 0, e = Registers.size(); i != e; ++i)
+ RegistersByName.GetOrCreateValue(
+ Registers[i]->TheDef->getValueAsString("AsmName"),
+ Registers[i]);
+
// Precompute all sub-register maps.
// This will create Composite entries for all inferred sub-register indices.
for (unsigned i = 0, e = Registers.size(); i != e; ++i)
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index bd748ca62e..e411074156 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -440,6 +440,7 @@ namespace llvm {
// Registers.
std::vector<CodeGenRegister*> Registers;
+ StringMap<CodeGenRegister*> RegistersByName;
DenseMap<Record*, CodeGenRegister*> Def2Reg;
unsigned NumNativeRegUnits;
@@ -522,6 +523,9 @@ namespace llvm {
}
const std::vector<CodeGenRegister*> &getRegisters() { return Registers; }
+ const StringMap<CodeGenRegister*> &getRegistersByName() {
+ return RegistersByName;
+ }
// Find a register from its Record def.
CodeGenRegister *getReg(Record*);
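Taken together, the two register hunks add a name-keyed index over the register set. A sketch of how a consumer might use the new accessor; lookupRegByAsmName is hypothetical, not part of this patch:

  #include "CodeGenRegisters.h"
  using namespace llvm;

  static Record *lookupRegByAsmName(CodeGenRegBank &RegBank, StringRef Name) {
    const StringMap<CodeGenRegister*> &ByName = RegBank.getRegistersByName();
    StringMap<CodeGenRegister*>::const_iterator I = ByName.find(Name);
    return I == ByName.end() ? 0 : I->second->TheDef; // the tablegen record
  }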
diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp
index f57fd182ea..e54202e3f2 100644
--- a/utils/TableGen/CodeGenSchedule.cpp
+++ b/utils/TableGen/CodeGenSchedule.cpp
@@ -16,41 +16,409 @@
#include "CodeGenSchedule.h"
#include "CodeGenTarget.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
-// CodeGenModels ctor interprets machine model records and populates maps.
+#ifndef NDEBUG
+static void dumpIdxVec(const IdxVec &V) {
+ for (unsigned i = 0, e = V.size(); i < e; ++i) {
+ dbgs() << V[i] << ", ";
+ }
+}
+static void dumpIdxVec(const SmallVectorImpl<unsigned> &V) {
+ for (unsigned i = 0, e = V.size(); i < e; ++i) {
+ dbgs() << V[i] << ", ";
+ }
+}
+#endif
+
+/// CodeGenModels ctor interprets machine model records and populates maps.
CodeGenSchedModels::CodeGenSchedModels(RecordKeeper &RK,
const CodeGenTarget &TGT):
- Records(RK), Target(TGT), NumItineraryClasses(0), HasProcItineraries(false) {
+ Records(RK), Target(TGT), NumItineraryClasses(0) {
+
+ // Instantiate a CodeGenProcModel for each SchedMachineModel with the values
+ // that are explicitly referenced in tablegen records. Resources associated
+ // with each processor will be derived later. Populate ProcModelMap with the
+ // CodeGenProcModel instances.
+ collectProcModels();
+
+ // Instantiate a CodeGenSchedRW for each SchedReadWrite record explicitly
+ // defined, and populate SchedReads and SchedWrites vectors. Implicit
+  // SchedReadWrites that represent sequences derived from expanded variants
+  // will be inferred later.
+ collectSchedRW();
+
+ // Instantiate a CodeGenSchedClass for each unique SchedRW signature directly
+ // required by an instruction definition, and populate SchedClassIdxMap. Set
+ // NumItineraryClasses to the number of explicit itinerary classes referenced
+ // by instructions. Set NumInstrSchedClasses to the number of itinerary
+ // classes plus any classes implied by instructions that derive from class
+ // Sched and provide SchedRW list. This does not infer any new classes from
+ // SchedVariant.
+ collectSchedClasses();
+
+ // Find instruction itineraries for each processor. Sort and populate
+ // CodeGenProcModel::ItinDefList. (Cycle-to-cycle itineraries). This requires
+ // all itinerary classes to be discovered.
+ collectProcItins();
+
+ // Find ItinRW records for each processor and itinerary class.
+ // (For per-operand resources mapped to itinerary classes).
+ collectProcItinRW();
+
+ // Infer new SchedClasses from SchedVariant.
+ inferSchedClasses();
+
+ DEBUG(for (unsigned i = 0; i < SchedClasses.size(); ++i)
+ SchedClasses[i].dump(this));
+
+ // Populate each CodeGenProcModel's WriteResDefs, ReadAdvanceDefs, and
+ // ProcResourceDefs.
+ collectProcResources();
+}
+
+/// Gather all processor models.
+void CodeGenSchedModels::collectProcModels() {
+ RecVec ProcRecords = Records.getAllDerivedDefinitions("Processor");
+ std::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
+
+ // Reserve space because we can. Reallocation would be ok.
+ ProcModels.reserve(ProcRecords.size()+1);
- // Populate SchedClassIdxMap and set NumItineraryClasses.
- CollectSchedClasses();
+ // Use idx=0 for NoModel/NoItineraries.
+ Record *NoModelDef = Records.getDef("NoSchedModel");
+ Record *NoItinsDef = Records.getDef("NoItineraries");
+ ProcModels.push_back(CodeGenProcModel(0, "NoSchedModel",
+ NoModelDef, NoItinsDef));
+ ProcModelMap[NoModelDef] = 0;
- // Populate ProcModelMap.
- CollectProcModels();
+ // For each processor, find a unique machine model.
+ for (unsigned i = 0, N = ProcRecords.size(); i < N; ++i)
+ addProcModel(ProcRecords[i]);
}
-// Visit all the instruction definitions for this target to gather and enumerate
-// the itinerary classes. These are the explicitly specified SchedClasses. More
-// SchedClasses may be inferred.
-void CodeGenSchedModels::CollectSchedClasses() {
+/// Get a unique processor model based on the defined MachineModel and
+/// ProcessorItineraries.
+void CodeGenSchedModels::addProcModel(Record *ProcDef) {
+ Record *ModelKey = getModelOrItinDef(ProcDef);
+ if (!ProcModelMap.insert(std::make_pair(ModelKey, ProcModels.size())).second)
+ return;
+
+ std::string Name = ModelKey->getName();
+ if (ModelKey->isSubClassOf("SchedMachineModel")) {
+ Record *ItinsDef = ModelKey->getValueAsDef("Itineraries");
+ ProcModels.push_back(
+ CodeGenProcModel(ProcModels.size(), Name, ModelKey, ItinsDef));
+ }
+ else {
+ // An itinerary is defined without a machine model. Infer a new model.
+ if (!ModelKey->getValueAsListOfDefs("IID").empty())
+ Name = Name + "Model";
+ ProcModels.push_back(
+ CodeGenProcModel(ProcModels.size(), Name,
+ ProcDef->getValueAsDef("SchedModel"), ModelKey));
+ }
+ DEBUG(ProcModels.back().dump());
+}
+
+// Recursively find all reachable SchedReadWrite records.
+static void scanSchedRW(Record *RWDef, RecVec &RWDefs,
+ SmallPtrSet<Record*, 16> &RWSet) {
+ if (!RWSet.insert(RWDef))
+ return;
+ RWDefs.push_back(RWDef);
+  // Reads don't currently have sequence records, but they can be added later.
+ if (RWDef->isSubClassOf("WriteSequence")) {
+ RecVec Seq = RWDef->getValueAsListOfDefs("Writes");
+ for (RecIter I = Seq.begin(), E = Seq.end(); I != E; ++I)
+ scanSchedRW(*I, RWDefs, RWSet);
+ }
+ else if (RWDef->isSubClassOf("SchedVariant")) {
+ // Visit each variant (guarded by a different predicate).
+ RecVec Vars = RWDef->getValueAsListOfDefs("Variants");
+ for (RecIter VI = Vars.begin(), VE = Vars.end(); VI != VE; ++VI) {
+ // Visit each RW in the sequence selected by the current variant.
+ RecVec Selected = (*VI)->getValueAsListOfDefs("Selected");
+ for (RecIter I = Selected.begin(), E = Selected.end(); I != E; ++I)
+ scanSchedRW(*I, RWDefs, RWSet);
+ }
+ }
+}
+
+// Collect and sort all SchedReadWrites reachable via tablegen records.
+// More may be inferred later when inferring new SchedClasses from variants.
+void CodeGenSchedModels::collectSchedRW() {
+ // Reserve idx=0 for invalid writes/reads.
+ SchedWrites.resize(1);
+ SchedReads.resize(1);
+
+ SmallPtrSet<Record*, 16> RWSet;
+
+ // Find all SchedReadWrites referenced by instruction defs.
+ RecVec SWDefs, SRDefs;
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ Record *SchedDef = (*I)->TheDef;
+ if (!SchedDef->isSubClassOf("Sched"))
+ continue;
+ RecVec RWs = SchedDef->getValueAsListOfDefs("SchedRW");
+ for (RecIter RWI = RWs.begin(), RWE = RWs.end(); RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ scanSchedRW(*RWI, SWDefs, RWSet);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ scanSchedRW(*RWI, SRDefs, RWSet);
+ }
+ }
+ }
+ // Find all ReadWrites referenced by InstRW.
+ RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
+ for (RecIter OI = InstRWDefs.begin(), OE = InstRWDefs.end(); OI != OE; ++OI) {
+ // For all OperandReadWrites.
+ RecVec RWDefs = (*OI)->getValueAsListOfDefs("OperandReadWrites");
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end();
+ RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ scanSchedRW(*RWI, SWDefs, RWSet);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ scanSchedRW(*RWI, SRDefs, RWSet);
+ }
+ }
+ }
+ // Find all ReadWrites referenced by ItinRW.
+ RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW");
+ for (RecIter II = ItinRWDefs.begin(), IE = ItinRWDefs.end(); II != IE; ++II) {
+ // For all OperandReadWrites.
+ RecVec RWDefs = (*II)->getValueAsListOfDefs("OperandReadWrites");
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end();
+ RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ scanSchedRW(*RWI, SWDefs, RWSet);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ scanSchedRW(*RWI, SRDefs, RWSet);
+ }
+ }
+ }
+ // Find all ReadWrites referenced by SchedAlias. AliasDefs needs to be sorted
+ // for the loop below that initializes Alias vectors.
+ RecVec AliasDefs = Records.getAllDerivedDefinitions("SchedAlias");
+ std::sort(AliasDefs.begin(), AliasDefs.end(), LessRecord());
+ for (RecIter AI = AliasDefs.begin(), AE = AliasDefs.end(); AI != AE; ++AI) {
+ Record *MatchDef = (*AI)->getValueAsDef("MatchRW");
+ Record *AliasDef = (*AI)->getValueAsDef("AliasRW");
+ if (MatchDef->isSubClassOf("SchedWrite")) {
+ if (!AliasDef->isSubClassOf("SchedWrite"))
+ throw TGError((*AI)->getLoc(), "SchedWrite Alias must be SchedWrite");
+ scanSchedRW(AliasDef, SWDefs, RWSet);
+ }
+ else {
+ assert(MatchDef->isSubClassOf("SchedRead") && "Unknown SchedReadWrite");
+ if (!AliasDef->isSubClassOf("SchedRead"))
+ throw TGError((*AI)->getLoc(), "SchedRead Alias must be SchedRead");
+ scanSchedRW(AliasDef, SRDefs, RWSet);
+ }
+ }
+ // Sort and add the SchedReadWrites directly referenced by instructions or
+ // itinerary resources. Index reads and writes in separate domains.
+ std::sort(SWDefs.begin(), SWDefs.end(), LessRecord());
+ for (RecIter SWI = SWDefs.begin(), SWE = SWDefs.end(); SWI != SWE; ++SWI) {
+ assert(!getSchedRWIdx(*SWI, /*IsRead=*/false) && "duplicate SchedWrite");
+ SchedWrites.push_back(CodeGenSchedRW(*SWI));
+ }
+ std::sort(SRDefs.begin(), SRDefs.end(), LessRecord());
+ for (RecIter SRI = SRDefs.begin(), SRE = SRDefs.end(); SRI != SRE; ++SRI) {
+    assert(!getSchedRWIdx(*SRI, /*IsRead=*/true) && "duplicate SchedRead");
+ SchedReads.push_back(CodeGenSchedRW(*SRI));
+ }
+ // Initialize WriteSequence vectors.
+ for (std::vector<CodeGenSchedRW>::iterator WI = SchedWrites.begin(),
+ WE = SchedWrites.end(); WI != WE; ++WI) {
+ if (!WI->IsSequence)
+ continue;
+ findRWs(WI->TheDef->getValueAsListOfDefs("Writes"), WI->Sequence,
+ /*IsRead=*/false);
+ }
+ // Initialize Aliases vectors.
+ for (RecIter AI = AliasDefs.begin(), AE = AliasDefs.end(); AI != AE; ++AI) {
+ Record *AliasDef = (*AI)->getValueAsDef("AliasRW");
+ getSchedRW(AliasDef).IsAlias = true;
+ Record *MatchDef = (*AI)->getValueAsDef("MatchRW");
+ CodeGenSchedRW &RW = getSchedRW(MatchDef);
+ if (RW.IsAlias)
+ throw TGError((*AI)->getLoc(), "Cannot Alias an Alias");
+ RW.Aliases.push_back(*AI);
+ }
+ DEBUG(
+ for (unsigned WIdx = 0, WEnd = SchedWrites.size(); WIdx != WEnd; ++WIdx) {
+ dbgs() << WIdx << ": ";
+ SchedWrites[WIdx].dump();
+ dbgs() << '\n';
+ }
+ for (unsigned RIdx = 0, REnd = SchedReads.size(); RIdx != REnd; ++RIdx) {
+ dbgs() << RIdx << ": ";
+ SchedReads[RIdx].dump();
+ dbgs() << '\n';
+ }
+ RecVec RWDefs = Records.getAllDerivedDefinitions("SchedReadWrite");
+ for (RecIter RI = RWDefs.begin(), RE = RWDefs.end();
+ RI != RE; ++RI) {
+ if (!getSchedRWIdx(*RI, (*RI)->isSubClassOf("SchedRead"))) {
+ const std::string &Name = (*RI)->getName();
+ if (Name != "NoWrite" && Name != "ReadDefault")
+ dbgs() << "Unused SchedReadWrite " << (*RI)->getName() << '\n';
+ }
+ });
+}
+
+/// Compute a SchedWrite name from a sequence of writes.
+std::string CodeGenSchedModels::genRWName(const IdxVec& Seq, bool IsRead) {
+ std::string Name("(");
+ for (IdxIter I = Seq.begin(), E = Seq.end(); I != E; ++I) {
+ if (I != Seq.begin())
+ Name += '_';
+ Name += getSchedRW(*I, IsRead).Name;
+ }
+ Name += ')';
+ return Name;
+}
+
+unsigned CodeGenSchedModels::getSchedRWIdx(Record *Def, bool IsRead,
+ unsigned After) const {
+ const std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
+ assert(After < RWVec.size() && "start position out of bounds");
+ for (std::vector<CodeGenSchedRW>::const_iterator I = RWVec.begin() + After,
+ E = RWVec.end(); I != E; ++I) {
+ if (I->TheDef == Def)
+ return I - RWVec.begin();
+ }
+ return 0;
+}
+
+bool CodeGenSchedModels::hasReadOfWrite(Record *WriteDef) const {
+ for (unsigned i = 0, e = SchedReads.size(); i < e; ++i) {
+ Record *ReadDef = SchedReads[i].TheDef;
+ if (!ReadDef || !ReadDef->isSubClassOf("ProcReadAdvance"))
+ continue;
+
+ RecVec ValidWrites = ReadDef->getValueAsListOfDefs("ValidWrites");
+ if (std::find(ValidWrites.begin(), ValidWrites.end(), WriteDef)
+ != ValidWrites.end()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+namespace llvm {
+void splitSchedReadWrites(const RecVec &RWDefs,
+ RecVec &WriteDefs, RecVec &ReadDefs) {
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end(); RWI != RWE; ++RWI) {
+ if ((*RWI)->isSubClassOf("SchedWrite"))
+ WriteDefs.push_back(*RWI);
+ else {
+ assert((*RWI)->isSubClassOf("SchedRead") && "unknown SchedReadWrite");
+ ReadDefs.push_back(*RWI);
+ }
+ }
+}
+} // namespace llvm
- // NoItinerary is always the first class at Index=0
+// Split the SchedReadWrites defs and call findRWs for each list.
+void CodeGenSchedModels::findRWs(const RecVec &RWDefs,
+ IdxVec &Writes, IdxVec &Reads) const {
+ RecVec WriteDefs;
+ RecVec ReadDefs;
+ splitSchedReadWrites(RWDefs, WriteDefs, ReadDefs);
+ findRWs(WriteDefs, Writes, false);
+ findRWs(ReadDefs, Reads, true);
+}
+
+// Call getSchedRWIdx for all elements in a sequence of SchedRW defs.
+void CodeGenSchedModels::findRWs(const RecVec &RWDefs, IdxVec &RWs,
+ bool IsRead) const {
+ for (RecIter RI = RWDefs.begin(), RE = RWDefs.end(); RI != RE; ++RI) {
+ unsigned Idx = getSchedRWIdx(*RI, IsRead);
+ assert(Idx && "failed to collect SchedReadWrite");
+ RWs.push_back(Idx);
+ }
+}
+
+void CodeGenSchedModels::expandRWSequence(unsigned RWIdx, IdxVec &RWSeq,
+ bool IsRead) const {
+ const CodeGenSchedRW &SchedRW = getSchedRW(RWIdx, IsRead);
+ if (!SchedRW.IsSequence) {
+ RWSeq.push_back(RWIdx);
+ return;
+ }
+ int Repeat =
+ SchedRW.TheDef ? SchedRW.TheDef->getValueAsInt("Repeat") : 1;
+ for (int i = 0; i < Repeat; ++i) {
+ for (IdxIter I = SchedRW.Sequence.begin(), E = SchedRW.Sequence.end();
+ I != E; ++I) {
+ expandRWSequence(*I, RWSeq, IsRead);
+ }
+ }
+}
+
+// Find the existing SchedWrite that models this sequence of writes.
+unsigned CodeGenSchedModels::findRWForSequence(const IdxVec &Seq,
+ bool IsRead) {
+ std::vector<CodeGenSchedRW> &RWVec = IsRead ? SchedReads : SchedWrites;
+
+ for (std::vector<CodeGenSchedRW>::iterator I = RWVec.begin(), E = RWVec.end();
+ I != E; ++I) {
+ if (I->Sequence == Seq)
+ return I - RWVec.begin();
+ }
+ // Index zero reserved for invalid RW.
+ return 0;
+}
+
+/// Add this ReadWrite if it doesn't already exist.
+unsigned CodeGenSchedModels::findOrInsertRW(ArrayRef<unsigned> Seq,
+ bool IsRead) {
+ assert(!Seq.empty() && "cannot insert empty sequence");
+ if (Seq.size() == 1)
+ return Seq.back();
+
+ unsigned Idx = findRWForSequence(Seq, IsRead);
+ if (Idx)
+ return Idx;
+
+ CodeGenSchedRW SchedRW(Seq, genRWName(Seq, IsRead));
+ if (IsRead) {
+ SchedReads.push_back(SchedRW);
+ return SchedReads.size() - 1;
+ }
+ SchedWrites.push_back(SchedRW);
+ return SchedWrites.size() - 1;
+}
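findOrInsertRW follows a convention used throughout this file: slot 0 of SchedWrites/SchedReads is reserved as the invalid entry, so every lookup can report "not found" by returning 0 and callers never need a separate flag. A minimal standalone version of the idiom; Names is a hypothetical table:

  #include <string>
  #include <vector>

  static std::vector<std::string> Names(1);  // slot 0 is the invalid sentinel

  static unsigned findOrInsert(const std::string &N) {
    for (unsigned i = 1, e = Names.size(); i != e; ++i)
      if (Names[i] == N)
        return i;                  // existing entry, always nonzero
    Names.push_back(N);
    return Names.size() - 1;       // new entry, also nonzero
  }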
+
+/// Visit all the instruction definitions for this target to gather and
+/// enumerate the itinerary classes. These are the explicitly specified
+/// SchedClasses. More SchedClasses may be inferred.
+void CodeGenSchedModels::collectSchedClasses() {
+
+ // NoItinerary is always the first class at Idx=0
SchedClasses.resize(1);
SchedClasses.back().Name = "NoItinerary";
+ SchedClasses.back().ProcIndices.push_back(0);
SchedClassIdxMap[SchedClasses.back().Name] = 0;
// Gather and sort all itinerary classes used by instruction descriptions.
- std::vector<Record*> ItinClassList;
+ RecVec ItinClassList;
for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
E = Target.inst_end(); I != E; ++I) {
- Record *SchedDef = (*I)->TheDef->getValueAsDef("Itinerary");
+ Record *ItinDef = (*I)->TheDef->getValueAsDef("Itinerary");
// Map a new SchedClass with no index.
- if (!SchedClassIdxMap.count(SchedDef->getName())) {
- SchedClassIdxMap[SchedDef->getName()] = 0;
- ItinClassList.push_back(SchedDef);
+ if (!SchedClassIdxMap.count(ItinDef->getName())) {
+ SchedClassIdxMap[ItinDef->getName()] = 0;
+ ItinClassList.push_back(ItinDef);
}
}
// Assign each itinerary class unique number, skipping NoItinerary==0
@@ -61,91 +429,1073 @@ void CodeGenSchedModels::CollectSchedClasses() {
SchedClassIdxMap[ItinDef->getName()] = SchedClasses.size();
SchedClasses.push_back(CodeGenSchedClass(ItinDef));
}
+ // Infer classes from SchedReadWrite resources listed for each
+ // instruction definition that inherits from class Sched.
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ if (!(*I)->TheDef->isSubClassOf("Sched"))
+ continue;
+ IdxVec Writes, Reads;
+ findRWs((*I)->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
+ // ProcIdx == 0 indicates the class applies to all processors.
+ IdxVec ProcIndices(1, 0);
+ addSchedClass(Writes, Reads, ProcIndices);
+ }
+ // Create classes for InstRW defs.
+ RecVec InstRWDefs = Records.getAllDerivedDefinitions("InstRW");
+ std::sort(InstRWDefs.begin(), InstRWDefs.end(), LessRecord());
+ for (RecIter OI = InstRWDefs.begin(), OE = InstRWDefs.end(); OI != OE; ++OI)
+ createInstRWClass(*OI);
- // TODO: Infer classes from non-itinerary scheduler resources.
+ NumInstrSchedClasses = SchedClasses.size();
+
+ bool EnableDump = false;
+ DEBUG(EnableDump = true);
+ if (!EnableDump)
+ return;
+ for (CodeGenTarget::inst_iterator I = Target.inst_begin(),
+ E = Target.inst_end(); I != E; ++I) {
+ Record *SchedDef = (*I)->TheDef;
+ std::string InstName = (*I)->TheDef->getName();
+ if (SchedDef->isSubClassOf("Sched")) {
+ IdxVec Writes;
+ IdxVec Reads;
+ findRWs((*I)->TheDef->getValueAsListOfDefs("SchedRW"), Writes, Reads);
+ dbgs() << "SchedRW machine model for " << InstName;
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI)
+ dbgs() << " " << SchedWrites[*WI].Name;
+ for (IdxIter RI = Reads.begin(), RE = Reads.end(); RI != RE; ++RI)
+ dbgs() << " " << SchedReads[*RI].Name;
+ dbgs() << '\n';
+ }
+ unsigned SCIdx = InstrClassMap.lookup((*I)->TheDef);
+ if (SCIdx) {
+ const RecVec &RWDefs = SchedClasses[SCIdx].InstRWs;
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end();
+ RWI != RWE; ++RWI) {
+ const CodeGenProcModel &ProcModel =
+ getProcModel((*RWI)->getValueAsDef("SchedModel"));
+ dbgs() << "InstrRW on " << ProcModel.ModelName << " for " << InstName;
+ IdxVec Writes;
+ IdxVec Reads;
+ findRWs((*RWI)->getValueAsListOfDefs("OperandReadWrites"),
+ Writes, Reads);
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI)
+ dbgs() << " " << SchedWrites[*WI].Name;
+ for (IdxIter RI = Reads.begin(), RE = Reads.end(); RI != RE; ++RI)
+ dbgs() << " " << SchedReads[*RI].Name;
+ dbgs() << '\n';
+ }
+ continue;
+ }
+ if (!SchedDef->isSubClassOf("Sched")
+ && (SchedDef->getValueAsDef("Itinerary")->getName() == "NoItinerary")) {
+ dbgs() << "No machine model for " << (*I)->TheDef->getName() << '\n';
+ }
+ }
}
-// Gather all processor models.
-void CodeGenSchedModels::CollectProcModels() {
- std::vector<Record*> ProcRecords =
- Records.getAllDerivedDefinitions("Processor");
- std::sort(ProcRecords.begin(), ProcRecords.end(), LessRecordFieldName());
+unsigned CodeGenSchedModels::getSchedClassIdx(
+ const RecVec &RWDefs) const {
- // Reserve space because we can. Reallocation would be ok.
- ProcModels.reserve(ProcRecords.size());
+ IdxVec Writes, Reads;
+ findRWs(RWDefs, Writes, Reads);
+ return findSchedClassIdx(Writes, Reads);
+}
- // For each processor, find a unique machine model.
- for (unsigned i = 0, N = ProcRecords.size(); i < N; ++i)
- addProcModel(ProcRecords[i]);
+/// Find a SchedClass that has been inferred from a per-operand list of
+/// SchedWrites and SchedReads.
+unsigned CodeGenSchedModels::findSchedClassIdx(const IdxVec &Writes,
+ const IdxVec &Reads) const {
+ for (SchedClassIter I = schedClassBegin(), E = schedClassEnd(); I != E; ++I) {
+ // Classes with InstRWs may have the same Writes/Reads as a class originally
+ // produced by a SchedRW definition. We need to be able to recover the
+ // original class index for processors that don't match any InstRWs.
+ if (I->ItinClassDef || !I->InstRWs.empty())
+ continue;
+
+ if (I->Writes == Writes && I->Reads == Reads) {
+ return I - schedClassBegin();
+ }
+ }
+ return 0;
}
-// Get a unique processor model based on the defined MachineModel and
-// ProcessorItineraries.
-void CodeGenSchedModels::addProcModel(Record *ProcDef) {
- unsigned Idx = getProcModelIdx(ProcDef);
- if (Idx < ProcModels.size())
- return;
+// Get the SchedClass index for an instruction.
+unsigned CodeGenSchedModels::getSchedClassIdx(
+ const CodeGenInstruction &Inst) const {
+
+ unsigned SCIdx = InstrClassMap.lookup(Inst.TheDef);
+ if (SCIdx)
+ return SCIdx;
+
+  // If this opcode isn't mapped by the subtarget, fall back to the
+  // instruction definition's SchedRW or ItinDef values.
+ if (Inst.TheDef->isSubClassOf("Sched")) {
+ RecVec RWs = Inst.TheDef->getValueAsListOfDefs("SchedRW");
+ return getSchedClassIdx(RWs);
+ }
+ Record *ItinDef = Inst.TheDef->getValueAsDef("Itinerary");
+ assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
+ unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
+ assert(Idx <= NumItineraryClasses && "bad ItinClass index");
+ return Idx;
+}
+
+std::string CodeGenSchedModels::createSchedClassName(
+ const IdxVec &OperWrites, const IdxVec &OperReads) {
+
+ std::string Name;
+ for (IdxIter WI = OperWrites.begin(), WE = OperWrites.end(); WI != WE; ++WI) {
+ if (WI != OperWrites.begin())
+ Name += '_';
+ Name += SchedWrites[*WI].Name;
+ }
+ for (IdxIter RI = OperReads.begin(), RE = OperReads.end(); RI != RE; ++RI) {
+ Name += '_';
+ Name += SchedReads[*RI].Name;
+ }
+ return Name;
+}
+
+std::string CodeGenSchedModels::createSchedClassName(const RecVec &InstDefs) {
+
+ std::string Name;
+ for (RecIter I = InstDefs.begin(), E = InstDefs.end(); I != E; ++I) {
+ if (I != InstDefs.begin())
+ Name += '_';
+ Name += (*I)->getName();
+ }
+ return Name;
+}
+
+/// Add an inferred sched class from a per-operand list of SchedWrites and
+/// SchedReads. ProcIndices contains the set of IDs of processors that may
+/// utilize this class.
+unsigned CodeGenSchedModels::addSchedClass(const IdxVec &OperWrites,
+ const IdxVec &OperReads,
+ const IdxVec &ProcIndices)
+{
+ assert(!ProcIndices.empty() && "expect at least one ProcIdx");
+
+ unsigned Idx = findSchedClassIdx(OperWrites, OperReads);
+ if (Idx) {
+ IdxVec PI;
+ std::set_union(SchedClasses[Idx].ProcIndices.begin(),
+ SchedClasses[Idx].ProcIndices.end(),
+ ProcIndices.begin(), ProcIndices.end(),
+ std::back_inserter(PI));
+ SchedClasses[Idx].ProcIndices.swap(PI);
+ return Idx;
+ }
+ Idx = SchedClasses.size();
+ SchedClasses.resize(Idx+1);
+ CodeGenSchedClass &SC = SchedClasses.back();
+ SC.Name = createSchedClassName(OperWrites, OperReads);
+ SC.Writes = OperWrites;
+ SC.Reads = OperReads;
+ SC.ProcIndices = ProcIndices;
+
+ return Idx;
+}
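The ProcIndices merge in addSchedClass relies on both index vectors staying sorted, so std::set_union produces a sorted, duplicate-free union. The same operation in standalone form:

  #include <algorithm>
  #include <iterator>
  #include <vector>

  static std::vector<unsigned> mergeIndices(const std::vector<unsigned> &A,
                                            const std::vector<unsigned> &B) {
    std::vector<unsigned> Out;
    std::set_union(A.begin(), A.end(), B.begin(), B.end(),
                   std::back_inserter(Out)); // sorted inputs -> sorted union
    return Out;
  }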
+
+// Create classes for each set of opcodes that are in the same InstReadWrite
+// definition across all processors.
+void CodeGenSchedModels::createInstRWClass(Record *InstRWDef) {
+ // ClassInstrs will hold an entry for each subset of Instrs in InstRWDef that
+ // intersects with an existing class via a previous InstRWDef. Instrs that do
+ // not intersect with an existing class refer back to their former class as
+ // determined from ItinDef or SchedRW.
+ SmallVector<std::pair<unsigned, SmallVector<Record *, 8> >, 4> ClassInstrs;
+ // Sort Instrs into sets.
+ RecVec InstDefs = InstRWDef->getValueAsListOfDefs("Instrs");
+ std::sort(InstDefs.begin(), InstDefs.end(), LessRecord());
+ for (RecIter I = InstDefs.begin(), E = InstDefs.end(); I != E; ++I) {
+ unsigned SCIdx = 0;
+ InstClassMapTy::const_iterator Pos = InstrClassMap.find(*I);
+ if (Pos != InstrClassMap.end())
+ SCIdx = Pos->second;
+ else {
+ // This instruction has not been mapped yet. Get the original class. All
+ // instructions in the same InstrRW class must be from the same original
+ // class because that is the fall-back class for other processors.
+ Record *ItinDef = (*I)->getValueAsDef("Itinerary");
+ SCIdx = SchedClassIdxMap.lookup(ItinDef->getName());
+ if (!SCIdx && (*I)->isSubClassOf("Sched"))
+ SCIdx = getSchedClassIdx((*I)->getValueAsListOfDefs("SchedRW"));
+ }
+ unsigned CIdx = 0, CEnd = ClassInstrs.size();
+ for (; CIdx != CEnd; ++CIdx) {
+ if (ClassInstrs[CIdx].first == SCIdx)
+ break;
+ }
+ if (CIdx == CEnd) {
+ ClassInstrs.resize(CEnd + 1);
+ ClassInstrs[CIdx].first = SCIdx;
+ }
+ ClassInstrs[CIdx].second.push_back(*I);
+ }
+ // For each set of Instrs, create a new class if necessary, and map or remap
+ // the Instrs to it.
+ unsigned CIdx = 0, CEnd = ClassInstrs.size();
+ for (; CIdx != CEnd; ++CIdx) {
+ unsigned OldSCIdx = ClassInstrs[CIdx].first;
+ ArrayRef<Record*> InstDefs = ClassInstrs[CIdx].second;
+    // If all instrs in the current class are accounted for, then leave
+ // them mapped to their old class.
+ if (SchedClasses[OldSCIdx].InstRWs.size() == InstDefs.size()) {
+ assert(SchedClasses[OldSCIdx].ProcIndices[0] == 0 &&
+ "expected a generic SchedClass");
+ continue;
+ }
+ unsigned SCIdx = SchedClasses.size();
+ SchedClasses.resize(SCIdx+1);
+ CodeGenSchedClass &SC = SchedClasses.back();
+ SC.Name = createSchedClassName(InstDefs);
+ // Preserve ItinDef and Writes/Reads for processors without an InstRW entry.
+ SC.ItinClassDef = SchedClasses[OldSCIdx].ItinClassDef;
+ SC.Writes = SchedClasses[OldSCIdx].Writes;
+ SC.Reads = SchedClasses[OldSCIdx].Reads;
+ SC.ProcIndices.push_back(0);
+ // Map each Instr to this new class.
+ // Note that InstDefs may be a smaller list than InstRWDef's "Instrs".
+ for (ArrayRef<Record*>::const_iterator
+ II = InstDefs.begin(), IE = InstDefs.end(); II != IE; ++II) {
+ unsigned OldSCIdx = InstrClassMap[*II];
+ if (OldSCIdx) {
+ SC.InstRWs.insert(SC.InstRWs.end(),
+ SchedClasses[OldSCIdx].InstRWs.begin(),
+ SchedClasses[OldSCIdx].InstRWs.end());
+ }
+ InstrClassMap[*II] = SCIdx;
+ }
+ SC.InstRWs.push_back(InstRWDef);
+ }
+}
+
+// Gather the processor itineraries.
+void CodeGenSchedModels::collectProcItins() {
+ for (std::vector<CodeGenProcModel>::iterator PI = ProcModels.begin(),
+ PE = ProcModels.end(); PI != PE; ++PI) {
+ CodeGenProcModel &ProcModel = *PI;
+ RecVec ItinRecords = ProcModel.ItinsDef->getValueAsListOfDefs("IID");
+ // Skip empty itinerary.
+ if (ItinRecords.empty())
+ continue;
+
+ ProcModel.ItinDefList.resize(NumItineraryClasses+1);
- Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
- Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
+ // Insert each itinerary data record in the correct position within
+ // the processor model's ItinDefList.
+ for (unsigned i = 0, N = ItinRecords.size(); i < N; i++) {
+ Record *ItinData = ItinRecords[i];
+ Record *ItinDef = ItinData->getValueAsDef("TheClass");
+ if (!SchedClassIdxMap.count(ItinDef->getName())) {
+ DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " has unused itinerary class " << ItinDef->getName() << '\n');
+ continue;
+ }
+ assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
+ unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
+ assert(Idx <= NumItineraryClasses && "bad ItinClass index");
+ ProcModel.ItinDefList[Idx] = ItinData;
+ }
+ // Check for missing itinerary entries.
+ assert(!ProcModel.ItinDefList[0] && "NoItinerary class can't have rec");
+ DEBUG(
+ for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
+ if (!ProcModel.ItinDefList[i])
+ dbgs() << ProcModel.ItinsDef->getName()
+ << " missing itinerary for class "
+ << SchedClasses[i].Name << '\n';
+ });
+ }
+}
- std::string ModelName = ModelDef->getName();
- const std::string &ItinName = ItinsDef->getName();
+// Gather the read/write types for each itinerary class.
+void CodeGenSchedModels::collectProcItinRW() {
+ RecVec ItinRWDefs = Records.getAllDerivedDefinitions("ItinRW");
+ std::sort(ItinRWDefs.begin(), ItinRWDefs.end(), LessRecord());
+ for (RecIter II = ItinRWDefs.begin(), IE = ItinRWDefs.end(); II != IE; ++II) {
+ if (!(*II)->getValueInit("SchedModel")->isComplete())
+ throw TGError((*II)->getLoc(), "SchedModel is undefined");
+ Record *ModelDef = (*II)->getValueAsDef("SchedModel");
+ ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
+ if (I == ProcModelMap.end()) {
+ throw TGError((*II)->getLoc(), "Undefined SchedMachineModel "
+ + ModelDef->getName());
+ }
+ ProcModels[I->second].ItinRWDefs.push_back(*II);
+ }
+}
+
+/// Infer new classes from existing classes. In the process, this may create new
+/// SchedWrites from sequences of existing SchedWrites.
+void CodeGenSchedModels::inferSchedClasses() {
+ // Visit all existing classes and newly created classes.
+ for (unsigned Idx = 0; Idx != SchedClasses.size(); ++Idx) {
+ if (SchedClasses[Idx].ItinClassDef)
+ inferFromItinClass(SchedClasses[Idx].ItinClassDef, Idx);
+ else if (!SchedClasses[Idx].InstRWs.empty())
+ inferFromInstRWs(Idx);
+ else {
+ inferFromRW(SchedClasses[Idx].Writes, SchedClasses[Idx].Reads,
+ Idx, SchedClasses[Idx].ProcIndices);
+ }
+ assert(SchedClasses.size() < (NumInstrSchedClasses*6) &&
+ "too many SchedVariants");
+ }
+}
+
+/// Infer classes from per-processor itinerary resources.
+void CodeGenSchedModels::inferFromItinClass(Record *ItinClassDef,
+ unsigned FromClassIdx) {
+ for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
+ const CodeGenProcModel &PM = ProcModels[PIdx];
+ // For all ItinRW entries.
+ bool HasMatch = false;
+ for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end();
+ II != IE; ++II) {
+ RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ if (!std::count(Matched.begin(), Matched.end(), ItinClassDef))
+ continue;
+ if (HasMatch)
+ throw TGError((*II)->getLoc(), "Duplicate itinerary class "
+ + ItinClassDef->getName()
+ + " in ItinResources for " + PM.ModelName);
+ HasMatch = true;
+ IdxVec Writes, Reads;
+ findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ IdxVec ProcIndices(1, PIdx);
+ inferFromRW(Writes, Reads, FromClassIdx, ProcIndices);
+ }
+ }
+}
- bool NoModel = ModelDef->getValueAsBit("NoModel");
- bool hasTopLevelItin = !ItinsDef->getValueAsListOfDefs("IID").empty();
- if (NoModel) {
- // If an itinerary is defined without a machine model, infer a new model.
- if (NoModel && hasTopLevelItin) {
- ModelName = ItinName + "Model";
- ModelDef = NULL;
+/// Infer classes from per-processor InstReadWrite definitions.
+void CodeGenSchedModels::inferFromInstRWs(unsigned SCIdx) {
+ const RecVec &RWDefs = SchedClasses[SCIdx].InstRWs;
+ for (RecIter RWI = RWDefs.begin(), RWE = RWDefs.end(); RWI != RWE; ++RWI) {
+ RecVec Instrs = (*RWI)->getValueAsListOfDefs("Instrs");
+ RecIter II = Instrs.begin(), IE = Instrs.end();
+ for (; II != IE; ++II) {
+ if (InstrClassMap[*II] == SCIdx)
+ break;
}
+ // If this class no longer has any instructions mapped to it, it has become
+ // irrelevant.
+ if (II == IE)
+ continue;
+ IdxVec Writes, Reads;
+ findRWs((*RWI)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ unsigned PIdx = getProcModel((*RWI)->getValueAsDef("SchedModel")).Index;
+ IdxVec ProcIndices(1, PIdx);
+ inferFromRW(Writes, Reads, SCIdx, ProcIndices);
+ }
+}
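+
+// Note: the InstrClassMap check above matches the comment on
+// CodeGenSchedClass::InstRWs in CodeGenSchedule.h -- an InstRW record may
+// still name instructions that createInstRWClass() has since split off to a
+// different class, and such stale references must not trigger inference here.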
+
+namespace {
+// Helper for substituteVariantOperand.
+struct TransVariant {
+ Record *VariantDef;
+ unsigned RWIdx; // Index of this variant's matched type.
+ unsigned ProcIdx; // Processor model index or zero for any.
+ unsigned TransVecIdx; // Index into PredTransitions::TransVec.
+
+ TransVariant(Record *def, unsigned rwi, unsigned pi, unsigned ti):
+ VariantDef(def), RWIdx(rwi), ProcIdx(pi), TransVecIdx(ti) {}
+};
+
+// Associate a predicate with the SchedReadWrite that it guards.
+// RWIdx is the index of the read/write variant.
+struct PredCheck {
+ bool IsRead;
+ unsigned RWIdx;
+ Record *Predicate;
+
+ PredCheck(bool r, unsigned w, Record *p): IsRead(r), RWIdx(w), Predicate(p) {}
+};
+
+// A Predicate transition is a list of RW sequences guarded by a PredTerm.
+struct PredTransition {
+ // A predicate term is a conjunction of PredChecks.
+ SmallVector<PredCheck, 4> PredTerm;
+ SmallVector<SmallVector<unsigned,4>, 16> WriteSequences;
+ SmallVector<SmallVector<unsigned,4>, 16> ReadSequences;
+ SmallVector<unsigned, 4> ProcIndices;
+};
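+
+// Schematically (hypothetical names), one PredTransition might be:
+//   PredTerm:       PredA && PredB                      -- a conjunction
+//   WriteSequences: ((WriteALU), (WriteLd, WriteAdj))   -- one per operand
+// meaning: when every check in PredTerm holds, the class's operands resolve
+// to these expanded write (and read) sequences on ProcIndices.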
+
+// Encapsulate a set of partially constructed transitions.
+// The results are built by repeated calls to substituteVariants.
+class PredTransitions {
+ CodeGenSchedModels &SchedModels;
+
+public:
+ std::vector<PredTransition> TransVec;
+
+ PredTransitions(CodeGenSchedModels &sm): SchedModels(sm) {}
+
+ void substituteVariantOperand(const SmallVectorImpl<unsigned> &RWSeq,
+ bool IsRead, unsigned StartIdx);
+
+ void substituteVariants(const PredTransition &Trans);
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+
+private:
+ bool mutuallyExclusive(Record *PredDef, ArrayRef<PredCheck> Term);
+ void pushVariant(const TransVariant &VInfo, bool IsRead);
+};
+} // end anonymous namespace
+
+// Return true if this predicate is mutually exclusive with a PredTerm. This
+// degenerates into checking if the predicate is mutually exclusive with any
+// predicate in the Term's conjunction.
+//
+// All predicates associated with a given SchedRW are considered mutually
+// exclusive. This should work even if the conditions expressed by the
+// predicates are not exclusive because the predicates for a given SchedWrite
+// are always checked in the order they are defined in the .td file. Later
+// conditions implicitly negate any prior condition.
+bool PredTransitions::mutuallyExclusive(Record *PredDef,
+ ArrayRef<PredCheck> Term) {
+
+ for (ArrayRef<PredCheck>::iterator I = Term.begin(), E = Term.end();
+ I != E; ++I) {
+ if (I->Predicate == PredDef)
+ return false;
+
+ const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(I->RWIdx, I->IsRead);
+ assert(SchedRW.HasVariants && "PredCheck must refer to a SchedVariant");
+ RecVec Variants = SchedRW.TheDef->getValueAsListOfDefs("Variants");
+ for (RecIter VI = Variants.begin(), VE = Variants.end(); VI != VE; ++VI) {
+ if ((*VI)->getValueAsDef("Predicate") == PredDef)
+ return true;
+ }
+ }
+ return false;
+}
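+
+// Example (hypothetical predicates): if the current term holds a PredCheck
+// for PredA on write W, and W's Variants list also contains a variant guarded
+// by PredB, then PredB is exclusive with the term: the variants of a given
+// SchedRW are tested in order, so a path that already selected PredA's
+// variant can never also satisfy PredB's.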
+
+// Push the Reads/Writes selected by this variant onto the PredTransition
+// specified by VInfo.
+void PredTransitions::
+pushVariant(const TransVariant &VInfo, bool IsRead) {
+
+ PredTransition &Trans = TransVec[VInfo.TransVecIdx];
+
+ Record *PredDef = VInfo.VariantDef->getValueAsDef("Predicate");
+ Trans.PredTerm.push_back(PredCheck(IsRead, VInfo.RWIdx, PredDef));
+
+ // If this operand transition is reached through a processor-specific alias,
+ // then the whole transition is specific to this processor.
+ if (VInfo.ProcIdx != 0)
+ Trans.ProcIndices.assign(1, VInfo.ProcIdx);
+
+ RecVec SelectedDefs = VInfo.VariantDef->getValueAsListOfDefs("Selected");
+ IdxVec SelectedRWs;
+ SchedModels.findRWs(SelectedDefs, SelectedRWs, IsRead);
+
+ const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(VInfo.RWIdx, IsRead);
+
+ SmallVectorImpl<SmallVector<unsigned,4> > &RWSequences = IsRead
+ ? Trans.ReadSequences : Trans.WriteSequences;
+ if (SchedRW.IsVariadic) {
+ unsigned OperIdx = RWSequences.size()-1;
+ // Make N-1 copies of this transition's last sequence.
+ for (unsigned i = 1, e = SelectedRWs.size(); i != e; ++i) {
+ RWSequences.push_back(RWSequences[OperIdx]);
+ }
+ // Push each of the N elements of the SelectedRWs onto a copy of the last
+ // sequence (split the current operand into N operands).
+ // Note that write sequences should be expanded within this loop--the entire
+ // sequence belongs to a single operand.
+ for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end();
+ RWI != RWE; ++RWI, ++OperIdx) {
+ IdxVec ExpandedRWs;
+ if (IsRead)
+ ExpandedRWs.push_back(*RWI);
+ else
+ SchedModels.expandRWSequence(*RWI, ExpandedRWs, IsRead);
+ RWSequences[OperIdx].insert(RWSequences[OperIdx].end(),
+ ExpandedRWs.begin(), ExpandedRWs.end());
+ }
+ assert(OperIdx == RWSequences.size() && "missed a sequence");
}
else {
- // If a machine model is defined, the itinerary must be defined within it
- // rather than in the Processor definition itself.
- assert(!hasTopLevelItin && "Itinerary must be defined in SchedModel");
- ItinsDef = ModelDef->getValueAsDef("Itineraries");
+ // Push this transition's expanded sequence onto this transition's last
+ // sequence (add to the current operand's sequence).
+ SmallVectorImpl<unsigned> &Seq = RWSequences.back();
+ IdxVec ExpandedRWs;
+ for (IdxIter RWI = SelectedRWs.begin(), RWE = SelectedRWs.end();
+ RWI != RWE; ++RWI) {
+ if (IsRead)
+ ExpandedRWs.push_back(*RWI);
+ else
+ SchedModels.expandRWSequence(*RWI, ExpandedRWs, IsRead);
+ }
+ Seq.insert(Seq.end(), ExpandedRWs.begin(), ExpandedRWs.end());
}
+}
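+
+// Variadic sketch (hypothetical names): if the current operand's partial
+// sequence is (W) and a variadic variant selects two writes [A, B], the code
+// above splits it into two operand sequences (W, A) and (W, B), expanding
+// each selected write through any WriteSequence records first.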
- ProcModelMap[getProcModelKey(ProcDef)]= ProcModels.size();
+static bool hasAliasedVariants(const CodeGenSchedRW &RW,
+ CodeGenSchedModels &SchedModels) {
+ if (RW.HasVariants)
+ return true;
- ProcModels.push_back(CodeGenProcModel(ModelName, ModelDef, ItinsDef));
+ for (RecIter I = RW.Aliases.begin(), E = RW.Aliases.end(); I != E; ++I) {
+ if (SchedModels.getSchedRW((*I)->getValueAsDef("AliasRW")).HasVariants)
+ return true;
+ }
+ return false;
+}
- std::vector<Record*> ItinRecords = ItinsDef->getValueAsListOfDefs("IID");
- CollectProcItin(ProcModels.back(), ItinRecords);
+static bool hasVariant(ArrayRef<PredTransition> Transitions,
+ CodeGenSchedModels &SchedModels) {
+ for (ArrayRef<PredTransition>::iterator
+ PTI = Transitions.begin(), PTE = Transitions.end();
+ PTI != PTE; ++PTI) {
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = PTI->WriteSequences.begin(), WSE = PTI->WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ for (SmallVectorImpl<unsigned>::const_iterator
+ WI = WSI->begin(), WE = WSI->end(); WI != WE; ++WI) {
+ if (hasAliasedVariants(SchedModels.getSchedWrite(*WI), SchedModels))
+ return true;
+ }
+ }
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ RSI = PTI->ReadSequences.begin(), RSE = PTI->ReadSequences.end();
+ RSI != RSE; ++RSI) {
+ for (SmallVectorImpl<unsigned>::const_iterator
+ RI = RSI->begin(), RE = RSI->end(); RI != RE; ++RI) {
+ if (hasAliasedVariants(SchedModels.getSchedRead(*RI), SchedModels))
+ return true;
+ }
+ }
+ }
+ return false;
}
-// Gather the processor itineraries.
-void CodeGenSchedModels::CollectProcItin(CodeGenProcModel &ProcModel,
- std::vector<Record*> ItinRecords) {
- // Skip empty itinerary.
- if (ItinRecords.empty())
+// RWSeq is a sequence of all Reads or all Writes for the next read or write
+// operand. StartIdx is an index into TransVec where partial results
+// start. RWSeq must be applied to all transitions between StartIdx and the end
+// of TransVec.
+void PredTransitions::substituteVariantOperand(
+ const SmallVectorImpl<unsigned> &RWSeq, bool IsRead, unsigned StartIdx) {
+
+ // Visit each original RW within the current sequence.
+ for (SmallVectorImpl<unsigned>::const_iterator
+ RWI = RWSeq.begin(), RWE = RWSeq.end(); RWI != RWE; ++RWI) {
+ const CodeGenSchedRW &SchedRW = SchedModels.getSchedRW(*RWI, IsRead);
+ // Push this RW on all partial PredTransitions or distribute variants.
+ // New PredTransitions may be pushed within this loop which should not be
+ // revisited (TransEnd must be loop invariant).
+ for (unsigned TransIdx = StartIdx, TransEnd = TransVec.size();
+ TransIdx != TransEnd; ++TransIdx) {
+ // In the common case, push RW onto the current operand's sequence.
+ if (!hasAliasedVariants(SchedRW, SchedModels)) {
+ if (IsRead)
+ TransVec[TransIdx].ReadSequences.back().push_back(*RWI);
+ else
+ TransVec[TransIdx].WriteSequences.back().push_back(*RWI);
+ continue;
+ }
+ // Distribute this partial PredTransition across intersecting variants.
+ RecVec Variants;
+ if (SchedRW.HasVariants)
+ Variants = SchedRW.TheDef->getValueAsListOfDefs("Variants");
+ IdxVec VarRWIds(Variants.size(), *RWI);
+ IdxVec VarProcModels(Variants.size(), 0);
+ for (RecIter AI = SchedRW.Aliases.begin(), AE = SchedRW.Aliases.end();
+ AI != AE; ++AI) {
+ unsigned AIdx;
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"), AIdx);
+ if (!AliasRW.HasVariants)
+ continue;
+
+ RecVec AliasVars = AliasRW.TheDef->getValueAsListOfDefs("Variants");
+ Variants.insert(Variants.end(), AliasVars.begin(), AliasVars.end());
+
+ VarRWIds.resize(Variants.size(), AIdx);
+
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ VarProcModels.resize(Variants.size(),
+ SchedModels.getProcModel(ModelDef).Index);
+ }
+ std::vector<TransVariant> IntersectingVariants;
+ for (unsigned VIdx = 0, VEnd = Variants.size(); VIdx != VEnd; ++VIdx) {
+ Record *PredDef = Variants[VIdx]->getValueAsDef("Predicate");
+
+ // Don't expand variants if the processor models don't intersect.
+ // A zero processor index means any processor.
+ SmallVector<unsigned, 4> &ProcIndices = TransVec[TransIdx].ProcIndices;
+ if (ProcIndices[0] != 0 && VarProcModels[VIdx] != 0) {
+ unsigned Cnt = std::count(ProcIndices.begin(), ProcIndices.end(),
+ VarProcModels[VIdx]);
+ if (!Cnt)
+ continue;
+ if (Cnt > 1) {
+ const CodeGenProcModel &PM =
+ *(SchedModels.procModelBegin() + VarProcModels[VIdx]);
+ throw TGError(Variants[VIdx]->getLoc(), "Multiple variants defined "
+ "for processor " + PM.ModelName +
+ " Ensure only one SchedAlias exists per RW.");
+ }
+ }
+ if (mutuallyExclusive(PredDef, TransVec[TransIdx].PredTerm))
+ continue;
+ if (IntersectingVariants.empty()) {
+ // The first variant builds on the existing transition.
+ IntersectingVariants.push_back(
+ TransVariant(Variants[VIdx], VarRWIds[VIdx], VarProcModels[VIdx],
+ TransIdx));
+ }
+ else {
+ // Push another copy of the current transition for more variants.
+ IntersectingVariants.push_back(
+ TransVariant(Variants[VIdx], VarRWIds[VIdx], VarProcModels[VIdx],
+ TransVec.size()));
+ TransVec.push_back(TransVec[TransIdx]);
+ }
+ }
+ if (IntersectingVariants.empty())
+ throw TGError(SchedRW.TheDef->getLoc(), "No variant of this type has a "
+ "matching predicate on any processor ");
+ // Now expand each variant on top of its copy of the transition.
+ for (std::vector<TransVariant>::const_iterator
+ IVI = IntersectingVariants.begin(),
+ IVE = IntersectingVariants.end();
+ IVI != IVE; ++IVI) {
+ pushVariant(*IVI, IsRead);
+ }
+ }
+ }
+}
+
+// For each variant of a Read/Write in Trans, substitute the sequence of
+// Read/Writes guarded by the variant. This is exponential in the number of
+// variant Read/Writes, but in practice detection of mutually exclusive
+// predicates should result in linear growth in the total number of variants.
+//
+// This is one step in a breadth-first search of nested variants.
+void PredTransitions::substituteVariants(const PredTransition &Trans) {
+ // Build up a set of partial results starting at the back of
+ // PredTransitions. Remember the first new transition.
+ unsigned StartIdx = TransVec.size();
+ TransVec.resize(TransVec.size() + 1);
+ TransVec.back().PredTerm = Trans.PredTerm;
+ TransVec.back().ProcIndices = Trans.ProcIndices;
+
+ // Visit each original write sequence.
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = Trans.WriteSequences.begin(), WSE = Trans.WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ // Push a new (empty) write sequence onto all partial Transitions.
+ for (std::vector<PredTransition>::iterator I =
+ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
+ I->WriteSequences.resize(I->WriteSequences.size() + 1);
+ }
+ substituteVariantOperand(*WSI, /*IsRead=*/false, StartIdx);
+ }
+ // Visit each original read sequence.
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ RSI = Trans.ReadSequences.begin(), RSE = Trans.ReadSequences.end();
+ RSI != RSE; ++RSI) {
+ // Push a new (empty) read sequence onto all partial Transitions.
+ for (std::vector<PredTransition>::iterator I =
+ TransVec.begin() + StartIdx, E = TransVec.end(); I != E; ++I) {
+ I->ReadSequences.resize(I->ReadSequences.size() + 1);
+ }
+ substituteVariantOperand(*RSI, /*IsRead=*/true, StartIdx);
+ }
+}
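+
+// Growth sketch: a transition containing two variant writes, each with two
+// variants under unrelated predicates, expands to 2 x 2 = 4 transitions in
+// one pass; when the predicates are mutually exclusive, the combinations
+// collapse and the total stays near linear, as noted above.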
+
+// Create a new SchedClass for each variant found by inferFromRW.
+static void inferFromTransitions(ArrayRef<PredTransition> LastTransitions,
+ unsigned FromClassIdx,
+ CodeGenSchedModels &SchedModels) {
+ // For each PredTransition, create a new CodeGenSchedTransition, which usually
+ // requires creating a new SchedClass.
+ for (ArrayRef<PredTransition>::iterator
+ I = LastTransitions.begin(), E = LastTransitions.end(); I != E; ++I) {
+ IdxVec OperWritesVariant;
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = I->WriteSequences.begin(), WSE = I->WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ // Create a new write representing the expanded sequence.
+ OperWritesVariant.push_back(
+ SchedModels.findOrInsertRW(*WSI, /*IsRead=*/false));
+ }
+ IdxVec OperReadsVariant;
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ RSI = I->ReadSequences.begin(), RSE = I->ReadSequences.end();
+ RSI != RSE; ++RSI) {
+ // Create a new read representing the expanded sequence.
+ OperReadsVariant.push_back(
+ SchedModels.findOrInsertRW(*RSI, /*IsRead=*/true));
+ }
+ IdxVec ProcIndices(I->ProcIndices.begin(), I->ProcIndices.end());
+ CodeGenSchedTransition SCTrans;
+ SCTrans.ToClassIdx =
+ SchedModels.addSchedClass(OperWritesVariant, OperReadsVariant,
+ ProcIndices);
+ SCTrans.ProcIndices = ProcIndices;
+ // The final PredTerm is the unique set of predicates guarding the transition.
+ RecVec Preds;
+ for (SmallVectorImpl<PredCheck>::const_iterator
+ PI = I->PredTerm.begin(), PE = I->PredTerm.end(); PI != PE; ++PI) {
+ Preds.push_back(PI->Predicate);
+ }
+ RecIter PredsEnd = std::unique(Preds.begin(), Preds.end());
+ Preds.resize(PredsEnd - Preds.begin());
+ SCTrans.PredTerm = Preds;
+ SchedModels.getSchedClass(FromClassIdx).Transitions.push_back(SCTrans);
+ }
+}
+
+// Create new SchedClasses for the given ReadWrite list. If any of the
+// ReadWrites refers to a SchedVariant, create a new SchedClass for each variant
+// of the ReadWrite list, following Aliases if necessary.
+void CodeGenSchedModels::inferFromRW(const IdxVec &OperWrites,
+ const IdxVec &OperReads,
+ unsigned FromClassIdx,
+ const IdxVec &ProcIndices) {
+ DEBUG(dbgs() << "INFER RW: ");
+
+ // Create a seed transition with an empty PredTerm and the expanded sequences
+ // of SchedWrites for the current SchedClass.
+ std::vector<PredTransition> LastTransitions;
+ LastTransitions.resize(1);
+ LastTransitions.back().ProcIndices.append(ProcIndices.begin(),
+ ProcIndices.end());
+
+ for (IdxIter I = OperWrites.begin(), E = OperWrites.end(); I != E; ++I) {
+ IdxVec WriteSeq;
+ expandRWSequence(*I, WriteSeq, /*IsRead=*/false);
+ unsigned Idx = LastTransitions[0].WriteSequences.size();
+ LastTransitions[0].WriteSequences.resize(Idx + 1);
+ SmallVectorImpl<unsigned> &Seq = LastTransitions[0].WriteSequences[Idx];
+ for (IdxIter WI = WriteSeq.begin(), WE = WriteSeq.end(); WI != WE; ++WI)
+ Seq.push_back(*WI);
+ DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
+ }
+ DEBUG(dbgs() << " Reads: ");
+ for (IdxIter I = OperReads.begin(), E = OperReads.end(); I != E; ++I) {
+ IdxVec ReadSeq;
+ expandRWSequence(*I, ReadSeq, /*IsRead=*/true);
+ unsigned Idx = LastTransitions[0].ReadSequences.size();
+ LastTransitions[0].ReadSequences.resize(Idx + 1);
+ SmallVectorImpl<unsigned> &Seq = LastTransitions[0].ReadSequences[Idx];
+ for (IdxIter RI = ReadSeq.begin(), RE = ReadSeq.end(); RI != RE; ++RI)
+ Seq.push_back(*RI);
+ DEBUG(dbgs() << "("; dumpIdxVec(Seq); dbgs() << ") ");
+ }
+ DEBUG(dbgs() << '\n');
+
+ // Collect all PredTransitions for individual operands.
+ // Iterate until no variant writes remain.
+ while (hasVariant(LastTransitions, *this)) {
+ PredTransitions Transitions(*this);
+ for (std::vector<PredTransition>::const_iterator
+ I = LastTransitions.begin(), E = LastTransitions.end();
+ I != E; ++I) {
+ Transitions.substituteVariants(*I);
+ }
+ DEBUG(Transitions.dump());
+ LastTransitions.swap(Transitions.TransVec);
+ }
+ // If the first transition has no variants, nothing to do.
+ if (LastTransitions[0].PredTerm.empty())
return;
- HasProcItineraries = true;
+ // WARNING: We are about to mutate the SchedClasses vector. Do not refer to
+ // OperWrites, OperReads, or ProcIndices after calling inferFromTransitions.
+ inferFromTransitions(LastTransitions, FromClassIdx, *this);
+}
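+
+// Worked sketch (hypothetical names): if OperWrites = [WriteVar] and WriteVar
+// has variants guarded by PredFast -> [WriteFast] and PredSlow -> [WriteSlow],
+// the loop above yields two PredTransitions, and inferFromTransitions then
+// adds two SchedClasses reachable from FromClassIdx under those predicates.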
- ProcModel.ItinDefList.resize(NumItineraryClasses+1);
+// Collect and sort WriteRes, ReadAdvance, and ProcResources.
+void CodeGenSchedModels::collectProcResources() {
+ // Add any subtarget-specific SchedReadWrites that are directly associated
+ // with processor resources. Refer to the parent SchedClass's ProcIndices to
+ // determine which processors they apply to.
+ for (SchedClassIter SCI = schedClassBegin(), SCE = schedClassEnd();
+ SCI != SCE; ++SCI) {
+ if (SCI->ItinClassDef)
+ collectItinProcResources(SCI->ItinClassDef);
+ else
+ collectRWResources(SCI->Writes, SCI->Reads, SCI->ProcIndices);
+ }
+ // Add resources separately defined by each subtarget.
+ RecVec WRDefs = Records.getAllDerivedDefinitions("WriteRes");
+ for (RecIter WRI = WRDefs.begin(), WRE = WRDefs.end(); WRI != WRE; ++WRI) {
+ Record *ModelDef = (*WRI)->getValueAsDef("SchedModel");
+ addWriteRes(*WRI, getProcModel(ModelDef).Index);
+ }
+ RecVec RADefs = Records.getAllDerivedDefinitions("ReadAdvance");
+ for (RecIter RAI = RADefs.begin(), RAE = RADefs.end(); RAI != RAE; ++RAI) {
+ Record *ModelDef = (*RAI)->getValueAsDef("SchedModel");
+ addReadAdvance(*RAI, getProcModel(ModelDef).Index);
+ }
+ // Finalize each ProcModel by sorting the record arrays.
+ for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
+ CodeGenProcModel &PM = ProcModels[PIdx];
+ std::sort(PM.WriteResDefs.begin(), PM.WriteResDefs.end(),
+ LessRecord());
+ std::sort(PM.ReadAdvanceDefs.begin(), PM.ReadAdvanceDefs.end(),
+ LessRecord());
+ std::sort(PM.ProcResourceDefs.begin(), PM.ProcResourceDefs.end(),
+ LessRecord());
+ DEBUG(
+ PM.dump();
+ dbgs() << "WriteResDefs: ";
+ for (RecIter RI = PM.WriteResDefs.begin(),
+ RE = PM.WriteResDefs.end(); RI != RE; ++RI) {
+ if ((*RI)->isSubClassOf("WriteRes"))
+ dbgs() << (*RI)->getValueAsDef("WriteType")->getName() << " ";
+ else
+ dbgs() << (*RI)->getName() << " ";
+ }
+ dbgs() << "\nReadAdvanceDefs: ";
+ for (RecIter RI = PM.ReadAdvanceDefs.begin(),
+ RE = PM.ReadAdvanceDefs.end(); RI != RE; ++RI) {
+ if ((*RI)->isSubClassOf("ReadAdvance"))
+ dbgs() << (*RI)->getValueAsDef("ReadType")->getName() << " ";
+ else
+ dbgs() << (*RI)->getName() << " ";
+ }
+ dbgs() << "\nProcResourceDefs: ";
+ for (RecIter RI = PM.ProcResourceDefs.begin(),
+ RE = PM.ProcResourceDefs.end(); RI != RE; ++RI) {
+ dbgs() << (*RI)->getName() << " ";
+ }
+ dbgs() << '\n');
+ }
+}
- // Insert each itinerary data record in the correct position within
- // the processor model's ItinDefList.
- for (unsigned i = 0, N = ItinRecords.size(); i < N; i++) {
- Record *ItinData = ItinRecords[i];
- Record *ItinDef = ItinData->getValueAsDef("TheClass");
- if (!SchedClassIdxMap.count(ItinDef->getName())) {
- DEBUG(dbgs() << ProcModel.ItinsDef->getName()
- << " has unused itinerary class " << ItinDef->getName() << '\n');
- continue;
+// Collect itinerary class resources for each processor.
+void CodeGenSchedModels::collectItinProcResources(Record *ItinClassDef) {
+ for (unsigned PIdx = 0, PEnd = ProcModels.size(); PIdx != PEnd; ++PIdx) {
+ const CodeGenProcModel &PM = ProcModels[PIdx];
+ // For all ItinRW entries.
+ bool HasMatch = false;
+ for (RecIter II = PM.ItinRWDefs.begin(), IE = PM.ItinRWDefs.end();
+ II != IE; ++II) {
+ RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ if (!std::count(Matched.begin(), Matched.end(), ItinClassDef))
+ continue;
+ if (HasMatch)
+ throw TGError((*II)->getLoc(), "Duplicate itinerary class "
+ + ItinClassDef->getName()
+ + " in ItinResources for " + PM.ModelName);
+ HasMatch = true;
+ IdxVec Writes, Reads;
+ findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"), Writes, Reads);
+ IdxVec ProcIndices(1, PIdx);
+ collectRWResources(Writes, Reads, ProcIndices);
+ }
+ }
+}
+
+
+// Collect resources for a set of read/write types and processor indices.
+void CodeGenSchedModels::collectRWResources(const IdxVec &Writes,
+ const IdxVec &Reads,
+ const IdxVec &ProcIndices) {
+
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI) {
+ const CodeGenSchedRW &SchedRW = getSchedRW(*WI, /*IsRead=*/false);
+ if (SchedRW.TheDef && SchedRW.TheDef->isSubClassOf("SchedWriteRes")) {
+ for (IdxIter PI = ProcIndices.begin(), PE = ProcIndices.end();
+ PI != PE; ++PI) {
+ addWriteRes(SchedRW.TheDef, *PI);
+ }
+ }
+ for (RecIter AI = SchedRW.Aliases.begin(), AE = SchedRW.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW =
+ getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ if (AliasRW.TheDef && AliasRW.TheDef->isSubClassOf("SchedWriteRes")) {
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ addWriteRes(AliasRW.TheDef, getProcModel(ModelDef).Index);
+ }
+ }
+ }
+ for (IdxIter RI = Reads.begin(), RE = Reads.end(); RI != RE; ++RI) {
+ const CodeGenSchedRW &SchedRW = getSchedRW(*RI, /*IsRead=*/true);
+ if (SchedRW.TheDef && SchedRW.TheDef->isSubClassOf("SchedReadAdvance")) {
+ for (IdxIter PI = ProcIndices.begin(), PE = ProcIndices.end();
+ PI != PE; ++PI) {
+ addReadAdvance(SchedRW.TheDef, *PI);
+ }
+ }
+ for (RecIter AI = SchedRW.Aliases.begin(), AE = SchedRW.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW =
+ getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ if (AliasRW.TheDef && AliasRW.TheDef->isSubClassOf("SchedReadAdvance")) {
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ addReadAdvance(AliasRW.TheDef, getProcModel(ModelDef).Index);
+ }
+ }
+ }
+}
+
+// Find the processor's resource units for this kind of resource.
+Record *CodeGenSchedModels::findProcResUnits(Record *ProcResKind,
+ const CodeGenProcModel &PM) const {
+ if (ProcResKind->isSubClassOf("ProcResourceUnits"))
+ return ProcResKind;
+
+ Record *ProcUnitDef = 0;
+ RecVec ProcResourceDefs =
+ Records.getAllDerivedDefinitions("ProcResourceUnits");
+
+ for (RecIter RI = ProcResourceDefs.begin(), RE = ProcResourceDefs.end();
+ RI != RE; ++RI) {
+
+ if ((*RI)->getValueAsDef("Kind") == ProcResKind
+ && (*RI)->getValueAsDef("SchedModel") == PM.ModelDef) {
+ if (ProcUnitDef) {
+ throw TGError((*RI)->getLoc(),
+ "Multiple ProcessorResourceUnits associated with "
+ + ProcResKind->getName());
+ }
+ ProcUnitDef = *RI;
}
- ProcModel.ItinDefList[getItinClassIdx(ItinDef)] = ItinData;
}
+ if (!ProcUnitDef) {
+ throw TGError(ProcResKind->getLoc(),
+ "No ProcessorResources associated with "
+ + ProcResKind->getName());
+ }
+ return ProcUnitDef;
+}
+
+// Iteratively add a resource and its super resources.
+void CodeGenSchedModels::addProcResource(Record *ProcResKind,
+ CodeGenProcModel &PM) {
+ for (;;) {
+ Record *ProcResUnits = findProcResUnits(ProcResKind, PM);
+
+ // See if this ProcResource is already associated with this processor.
+ RecIter I = std::find(PM.ProcResourceDefs.begin(),
+ PM.ProcResourceDefs.end(), ProcResUnits);
+ if (I != PM.ProcResourceDefs.end())
+ return;
+
+ PM.ProcResourceDefs.push_back(ProcResUnits);
+ if (!ProcResUnits->getValueInit("Super")->isComplete())
+ return;
+
+ ProcResKind = ProcResUnits->getValueAsDef("Super");
+ }
+}
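+
+// Super-chain sketch (hypothetical units): if UnitP0's record sets
+// Super = UnitP01, then addProcResource(UnitP0, PM) records both UnitP0 and
+// UnitP01 (and so on up the chain), stopping as soon as a resource is already
+// present or has no complete Super field.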
+
+// Add resources for a SchedWrite to this processor if they don't exist.
+void CodeGenSchedModels::addWriteRes(Record *ProcWriteResDef, unsigned PIdx) {
+ assert(PIdx && "don't add resources to an invalid Processor model");
+
+ RecVec &WRDefs = ProcModels[PIdx].WriteResDefs;
+ RecIter WRI = std::find(WRDefs.begin(), WRDefs.end(), ProcWriteResDef);
+ if (WRI != WRDefs.end())
+ return;
+ WRDefs.push_back(ProcWriteResDef);
+
+ // Visit ProcResourceKinds referenced by the newly discovered WriteRes.
+ RecVec ProcResDefs = ProcWriteResDef->getValueAsListOfDefs("ProcResources");
+ for (RecIter WritePRI = ProcResDefs.begin(), WritePRE = ProcResDefs.end();
+ WritePRI != WritePRE; ++WritePRI) {
+ addProcResource(*WritePRI, ProcModels[PIdx]);
+ }
+}
+
+// Add resources for a ReadAdvance to this processor if they don't exist.
+void CodeGenSchedModels::addReadAdvance(Record *ProcReadAdvanceDef,
+ unsigned PIdx) {
+ RecVec &RADefs = ProcModels[PIdx].ReadAdvanceDefs;
+ RecIter I = std::find(RADefs.begin(), RADefs.end(), ProcReadAdvanceDef);
+ if (I != RADefs.end())
+ return;
+ RADefs.push_back(ProcReadAdvanceDef);
+}
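+
+// Schematic subtarget records feeding the two functions above (names
+// hypothetical; the fields WriteType/ReadType, ProcResources, and SchedModel
+// are the ones read in collectProcResources):
+//   a WriteRes def:    WriteType = WriteALU, ProcResources = [UnitALU]
+//   a ReadAdvance def: ReadType = ReadALU, plus a cycle count
+// each bound to one processor through its SchedModel field.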
+
+unsigned CodeGenProcModel::getProcResourceIdx(Record *PRDef) const {
+ RecIter PRPos = std::find(ProcResourceDefs.begin(), ProcResourceDefs.end(),
+ PRDef);
+ if (PRPos == ProcResourceDefs.end())
+ throw TGError(PRDef->getLoc(), "ProcResource def is not included in "
+ "the ProcResources list for " + ModelName);
+ // Idx=0 is reserved for invalid.
+ return 1 + PRPos - ProcResourceDefs.begin();
+}
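+
+// Usage note: index 0 denotes an invalid resource, so the first real
+// ProcResource gets index 1; the emitted tables reserve slot 0 to match (see
+// the SchedClassTables constructor in SubtargetEmitter.cpp below).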
+
#ifndef NDEBUG
- // Check for missing itinerary entries.
- assert(!ProcModel.ItinDefList[0] && "NoItinerary class can't have rec");
- for (unsigned i = 1, N = ProcModel.ItinDefList.size(); i < N; ++i) {
- if (!ProcModel.ItinDefList[i])
- DEBUG(dbgs() << ProcModel.ItinsDef->getName()
- << " missing itinerary for class " << SchedClasses[i].Name << '\n');
+void CodeGenProcModel::dump() const {
+ dbgs() << Index << ": " << ModelName << " "
+ << (ModelDef ? ModelDef->getName() : "inferred") << " "
+ << (ItinsDef ? ItinsDef->getName() : "no itinerary") << '\n';
+}
+
+void CodeGenSchedRW::dump() const {
+ dbgs() << Name << (IsVariadic ? " (V) " : " ");
+ if (IsSequence) {
+ dbgs() << "(";
+ dumpIdxVec(Sequence);
+ dbgs() << ")";
+ }
+}
+
+void CodeGenSchedClass::dump(const CodeGenSchedModels* SchedModels) const {
+ dbgs() << "SCHEDCLASS " << Name << '\n'
+ << " Writes: ";
+ for (unsigned i = 0, N = Writes.size(); i < N; ++i) {
+ SchedModels->getSchedWrite(Writes[i]).dump();
+ if (i < N-1) {
+ dbgs() << '\n';
+ dbgs().indent(10);
+ }
+ }
+ dbgs() << "\n Reads: ";
+ for (unsigned i = 0, N = Reads.size(); i < N; ++i) {
+ SchedModels->getSchedRead(Reads[i]).dump();
+ if (i < N-1) {
+ dbgs() << '\n';
+ dbgs().indent(10);
+ }
+ }
+ dbgs() << "\n ProcIdx: "; dumpIdxVec(ProcIndices); dbgs() << '\n';
+}
+
+void PredTransitions::dump() const {
+ dbgs() << "Expanded Variants:\n";
+ for (std::vector<PredTransition>::const_iterator
+ TI = TransVec.begin(), TE = TransVec.end(); TI != TE; ++TI) {
+ dbgs() << "{";
+ for (SmallVectorImpl<PredCheck>::const_iterator
+ PCI = TI->PredTerm.begin(), PCE = TI->PredTerm.end();
+ PCI != PCE; ++PCI) {
+ if (PCI != TI->PredTerm.begin())
+ dbgs() << ", ";
+ dbgs() << SchedModels.getSchedRW(PCI->RWIdx, PCI->IsRead).Name
+ << ":" << PCI->Predicate->getName();
+ }
+ dbgs() << "},\n => {";
+ for (SmallVectorImpl<SmallVector<unsigned,4> >::const_iterator
+ WSI = TI->WriteSequences.begin(), WSE = TI->WriteSequences.end();
+ WSI != WSE; ++WSI) {
+ dbgs() << "(";
+ for (SmallVectorImpl<unsigned>::const_iterator
+ WI = WSI->begin(), WE = WSI->end(); WI != WE; ++WI) {
+ if (WI != WSI->begin())
+ dbgs() << ", ";
+ dbgs() << SchedModels.getSchedWrite(*WI).Name;
+ }
+ dbgs() << "),";
+ }
+ dbgs() << "}\n";
}
-#endif
}
+#endif // NDEBUG
diff --git a/utils/TableGen/CodeGenSchedule.h b/utils/TableGen/CodeGenSchedule.h
index 9da0145732..226c3aed5c 100644
--- a/utils/TableGen/CodeGenSchedule.h
+++ b/utils/TableGen/CodeGenSchedule.h
@@ -23,21 +23,126 @@
namespace llvm {
class CodeGenTarget;
+class CodeGenSchedModels;
+class CodeGenInstruction;
-// Scheduling class.
-//
-// Each instruction description will be mapped to a scheduling class. It may be
-// an explicitly defined itinerary class, or an inferred class in which case
-// ItinClassDef == NULL.
+typedef std::vector<Record*> RecVec;
+typedef std::vector<Record*>::const_iterator RecIter;
+
+typedef std::vector<unsigned> IdxVec;
+typedef std::vector<unsigned>::const_iterator IdxIter;
+
+void splitSchedReadWrites(const RecVec &RWDefs,
+ RecVec &WriteDefs, RecVec &ReadDefs);
+
+/// There are two kinds of SchedReadWrites: explicitly defined and inferred
+/// sequences. TheDef is nonnull for explicit SchedWrites, but Sequence may or
+/// may not be empty. TheDef is null for inferred sequences, and Sequence must
+/// be nonempty.
+///
+/// IsVariadic controls whether the variants are expanded into multiple operands
+/// or a sequence of writes on one operand.
+struct CodeGenSchedRW {
+ std::string Name;
+ Record *TheDef;
+ bool IsAlias;
+ bool HasVariants;
+ bool IsVariadic;
+ bool IsSequence;
+ IdxVec Sequence;
+ RecVec Aliases;
+
+ CodeGenSchedRW(): TheDef(0), IsAlias(false), HasVariants(false),
+ IsVariadic(false), IsSequence(false) {}
+ CodeGenSchedRW(Record *Def): TheDef(Def), IsAlias(false), IsVariadic(false) {
+ Name = Def->getName();
+ HasVariants = Def->isSubClassOf("SchedVariant");
+ if (HasVariants)
+ IsVariadic = Def->getValueAsBit("Variadic");
+
+ // Read records don't currently have sequences, but this could easily be
+ // added. Note that implicit Reads (from ReadVariant) may have a Sequence
+ // (but no record).
+ IsSequence = Def->isSubClassOf("WriteSequence");
+ }
+
+ CodeGenSchedRW(const IdxVec &Seq, const std::string &Name):
+ Name(Name), TheDef(0), IsAlias(false), HasVariants(false),
+ IsVariadic(false), IsSequence(true), Sequence(Seq) {
+ assert(Sequence.size() > 1 && "implied sequence needs >1 RWs");
+ }
+
+ bool isValid() const {
+ assert((!HasVariants || TheDef) && "Variant write needs record def");
+ assert((!IsVariadic || HasVariants) && "Variadic write needs variants");
+ assert((!IsSequence || !HasVariants) && "Sequence can't have variant");
+ assert((!IsSequence || !Sequence.empty()) && "Sequence should be nonempty");
+ assert((!IsAlias || Aliases.empty()) && "Alias cannot have aliases");
+ return TheDef || !Sequence.empty();
+ }
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
+};
+
+/// Represent a transition between SchedClasses induced by SchedVariant.
+struct CodeGenSchedTransition {
+ unsigned ToClassIdx;
+ IdxVec ProcIndices;
+ RecVec PredTerm;
+};
+
+/// Scheduling class.
+///
+/// Each instruction description will be mapped to a scheduling class. There are
+/// four types of classes:
+///
+/// 1) An explicitly defined itinerary class with ItinClassDef set.
+/// Writes and Reads are empty. ProcIndices contains 0 for any processor.
+///
+/// 2) An implied class with a list of SchedWrites and SchedReads that are
+/// defined in an instruction definition and which are common across all
+/// subtargets. ProcIndices contains 0 for any processor.
+///
+/// 3) An implied class with a list of InstRW records that map instructions to
+/// SchedWrites and SchedReads per-processor. InstrClassMap should map the same
+/// instructions to this class. ProcIndices contains all the processors that
+/// provided InstRW records for this class. ItinClassDef or Writes/Reads may
+/// still be defined for processors with no InstRW entry.
+///
+/// 4) An inferred class represents a variant of another class that may be
+/// resolved at runtime. ProcIndices contains the set of processors that may
+/// require the class. ProcIndices are propagated through SchedClasses as
+/// variants are expanded. Multiple SchedClasses may be inferred from an
+/// itinerary class. Each inherits the processor index from the ItinRW record
+/// that mapped the itinerary class to the variant Writes or Reads.
struct CodeGenSchedClass {
std::string Name;
- unsigned Index;
Record *ItinClassDef;
- CodeGenSchedClass(): Index(0), ItinClassDef(0) {}
- CodeGenSchedClass(Record *rec): Index(0), ItinClassDef(rec) {
+ IdxVec Writes;
+ IdxVec Reads;
+ // Sorted list of ProcIdx, where ProcIdx==0 implies any processor.
+ IdxVec ProcIndices;
+
+ std::vector<CodeGenSchedTransition> Transitions;
+
+ // InstRW records associated with this class. These records may refer to an
+ // Instruction no longer mapped to this class by InstrClassMap. These
+ // Instructions should be ignored by this class because they have been split
+ // off to join another inferred class.
+ RecVec InstRWs;
+
+ CodeGenSchedClass(): ItinClassDef(0) {}
+ CodeGenSchedClass(Record *rec): ItinClassDef(rec) {
Name = rec->getName();
+ ProcIndices.push_back(0);
}
+
+#ifndef NDEBUG
+ void dump(const CodeGenSchedModels *SchedModels) const;
+#endif
};
// Processor model.
@@ -55,28 +160,66 @@ struct CodeGenSchedClass {
//
// ItinDefList orders this processor's InstrItinData records by SchedClass idx.
struct CodeGenProcModel {
+ unsigned Index;
std::string ModelName;
Record *ModelDef;
Record *ItinsDef;
- // Array of InstrItinData records indexed by CodeGenSchedClass::Index.
- // The list is empty if the subtarget has no itineraries.
- std::vector<Record *> ItinDefList;
+ // Derived members...
- CodeGenProcModel(const std::string &Name, Record *MDef, Record *IDef):
- ModelName(Name), ModelDef(MDef), ItinsDef(IDef) {}
+ // Array of InstrItinData records indexed by a CodeGenSchedClass index.
+ // This list is empty if the Processor has no value for Itineraries.
+ // Initialized by collectProcItins().
+ RecVec ItinDefList;
+
+ // Map itinerary classes to per-operand resources.
+ // This list is empty if no ItinRW refers to this Processor.
+ RecVec ItinRWDefs;
+
+ // All read/write resources associated with this processor.
+ RecVec WriteResDefs;
+ RecVec ReadAdvanceDefs;
+
+ // Per-operand machine model resources associated with this processor.
+ RecVec ProcResourceDefs;
+
+ CodeGenProcModel(unsigned Idx, const std::string &Name, Record *MDef,
+ Record *IDef) :
+ Index(Idx), ModelName(Name), ModelDef(MDef), ItinsDef(IDef) {}
+
+ bool hasInstrSchedModel() const {
+ return !WriteResDefs.empty() || !ItinRWDefs.empty();
+ }
+
+ unsigned getProcResourceIdx(Record *PRDef) const;
+
+#ifndef NDEBUG
+ void dump() const;
+#endif
};
-// Top level container for machine model data.
+/// Top level container for machine model data.
class CodeGenSchedModels {
RecordKeeper &Records;
const CodeGenTarget &Target;
+ // List of unique processor models.
+ std::vector<CodeGenProcModel> ProcModels;
+
+ // Map Processor's MachineModel or ProcItin to a CodeGenProcModel index.
+ typedef DenseMap<Record*, unsigned> ProcModelMapTy;
+ ProcModelMapTy ProcModelMap;
+
+ // Per-operand SchedReadWrite types.
+ std::vector<CodeGenSchedRW> SchedWrites;
+ std::vector<CodeGenSchedRW> SchedReads;
+
// List of unique SchedClasses.
std::vector<CodeGenSchedClass> SchedClasses;
// Map SchedClass name to itinerary index.
- // These are either explicit itinerary classes or inferred classes.
+ // These are either explicit itinerary classes or classes implied by
+ // instruction definitions with SchedReadWrite lists.
StringMap<unsigned> SchedClassIdxMap;
// SchedClass indices 1 up to and including NumItineraryClasses identify
@@ -84,22 +227,81 @@ class CodeGenSchedModels {
// definitions. NoItinerary always has index 0 regardless of whether it is
// explicitly referenced.
//
- // Any inferred SchedClass have a index greater than NumItineraryClasses.
+ // Any implied SchedClass has an index greater than NumItineraryClasses.
unsigned NumItineraryClasses;
- // List of unique processor models.
- std::vector<CodeGenProcModel> ProcModels;
-
- // Map Processor's MachineModel + ProcItin fields to a CodeGenProcModel index.
- typedef DenseMap<std::pair<Record*, Record*>, unsigned> ProcModelMapTy;
- ProcModelMapTy ProcModelMap;
+ // Any inferred SchedClass has an index greater than NumInstrSchedClasses.
+ unsigned NumInstrSchedClasses;
- // True if any processors have nonempty itineraries.
- bool HasProcItineraries;
+ // Map Instruction to SchedClass index. Only for Instructions mentioned in
+ // InstRW records.
+ typedef DenseMap<Record*, unsigned> InstClassMapTy;
+ InstClassMapTy InstrClassMap;
public:
CodeGenSchedModels(RecordKeeper& RK, const CodeGenTarget &TGT);
+ Record *getModelOrItinDef(Record *ProcDef) const {
+ Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
+ Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
+ if (!ItinsDef->getValueAsListOfDefs("IID").empty()) {
+ assert(ModelDef->getValueAsBit("NoModel")
+ && "Itineraries must be defined within SchedMachineModel");
+ return ItinsDef;
+ }
+ return ModelDef;
+ }
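+
+  // Two styles of processor definition are supported (schematic): a
+  // Processor whose ProcItin has a nonempty IID list is keyed off the
+  // itinerary record (its SchedModel must then have NoModel set, per the
+  // assert above), while one with a real SchedMachineModel is keyed off
+  // ModelDef.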
+
+ const CodeGenProcModel &getModelForProc(Record *ProcDef) const {
+ Record *ModelDef = getModelOrItinDef(ProcDef);
+ ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
+ assert(I != ProcModelMap.end() && "missing machine model");
+ return ProcModels[I->second];
+ }
+
+ const CodeGenProcModel &getProcModel(Record *ModelDef) const {
+ ProcModelMapTy::const_iterator I = ProcModelMap.find(ModelDef);
+ assert(I != ProcModelMap.end() && "missing machine model");
+ return ProcModels[I->second];
+ }
+
+ // Iterate over the unique processor models.
+ typedef std::vector<CodeGenProcModel>::const_iterator ProcIter;
+ ProcIter procModelBegin() const { return ProcModels.begin(); }
+ ProcIter procModelEnd() const { return ProcModels.end(); }
+
+ // Get a SchedWrite from its index.
+ const CodeGenSchedRW &getSchedWrite(unsigned Idx) const {
+ assert(Idx < SchedWrites.size() && "bad SchedWrite index");
+ assert(SchedWrites[Idx].isValid() && "invalid SchedWrite");
+ return SchedWrites[Idx];
+ }
+ // Get a SchedWrite from its index.
+ const CodeGenSchedRW &getSchedRead(unsigned Idx) const {
+ assert(Idx < SchedReads.size() && "bad SchedRead index");
+ assert(SchedReads[Idx].isValid() && "invalid SchedRead");
+ return SchedReads[Idx];
+ }
+
+ const CodeGenSchedRW &getSchedRW(unsigned Idx, bool IsRead) const {
+ return IsRead ? getSchedRead(Idx) : getSchedWrite(Idx);
+ }
+ CodeGenSchedRW &getSchedRW(Record *Def, unsigned &Idx) {
+ bool IsRead = Def->isSubClassOf("SchedRead");
+ Idx = getSchedRWIdx(Def, IsRead);
+ return const_cast<CodeGenSchedRW&>(
+ IsRead ? getSchedRead(Idx) : getSchedWrite(Idx));
+ }
+ CodeGenSchedRW &getSchedRW(Record *Def) {
+ unsigned Idx;
+ return getSchedRW(Def, Idx);
+ }
+
+ unsigned getSchedRWIdx(Record *Def, bool IsRead, unsigned After = 0) const;
+
+ // Return true if the given write record is referenced by a ReadAdvance.
+ bool hasReadOfWrite(Record *WriteDef) const;
+
// Check if any instructions are assigned to an explicit itinerary class other
// than NoItinerary.
bool hasItineraryClasses() const { return NumItineraryClasses > 0; }
@@ -111,60 +313,85 @@ public:
}
// Get a SchedClass from its index.
- const CodeGenSchedClass &getSchedClass(unsigned Idx) {
+ CodeGenSchedClass &getSchedClass(unsigned Idx) {
assert(Idx < SchedClasses.size() && "bad SchedClass index");
return SchedClasses[Idx];
}
-
- // Get an itinerary class's index. Value indices are '0' for NoItinerary up to
- // and including numItineraryClasses().
- unsigned getItinClassIdx(Record *ItinDef) const {
- assert(SchedClassIdxMap.count(ItinDef->getName()) && "missing ItinClass");
- unsigned Idx = SchedClassIdxMap.lookup(ItinDef->getName());
- assert(Idx <= NumItineraryClasses && "bad ItinClass index");
- return Idx;
+ const CodeGenSchedClass &getSchedClass(unsigned Idx) const {
+ assert(Idx < SchedClasses.size() && "bad SchedClass index");
+ return SchedClasses[Idx];
}
- bool hasProcessorItineraries() const {
- return HasProcItineraries;
- }
+ // Get the SchedClass index for an instruction. Instructions with no
+ // itinerary, no SchedReadWrites, and no InstRW references return 0
+ // for NoItinerary.
+ unsigned getSchedClassIdx(const CodeGenInstruction &Inst) const;
+
+ unsigned getSchedClassIdx(const RecVec &RWDefs) const;
- // Get an existing machine model for a processor definition.
- const CodeGenProcModel &getProcModel(Record *ProcDef) const {
- unsigned idx = getProcModelIdx(ProcDef);
- assert(idx < ProcModels.size() && "missing machine model");
- return ProcModels[idx];
+ unsigned getSchedClassIdxForItin(const Record *ItinDef) {
+ return SchedClassIdxMap[ItinDef->getName()];
}
- // Iterate over the unique processor models.
- typedef std::vector<CodeGenProcModel>::const_iterator ProcIter;
- ProcIter procModelBegin() const { return ProcModels.begin(); }
- ProcIter procModelEnd() const { return ProcModels.end(); }
+ typedef std::vector<CodeGenSchedClass>::const_iterator SchedClassIter;
+ SchedClassIter schedClassBegin() const { return SchedClasses.begin(); }
+ SchedClassIter schedClassEnd() const { return SchedClasses.end(); }
-private:
- // Get a key that can uniquely identify a machine model.
- ProcModelMapTy::key_type getProcModelKey(Record *ProcDef) const {
- Record *ModelDef = ProcDef->getValueAsDef("SchedModel");
- Record *ItinsDef = ProcDef->getValueAsDef("ProcItin");
- return std::make_pair(ModelDef, ItinsDef);
- }
+ void findRWs(const RecVec &RWDefs, IdxVec &Writes, IdxVec &Reads) const;
+ void findRWs(const RecVec &RWDefs, IdxVec &RWs, bool IsRead) const;
+ void expandRWSequence(unsigned RWIdx, IdxVec &RWSeq, bool IsRead) const;
- // Get the unique index of a machine model.
- unsigned getProcModelIdx(Record *ProcDef) const {
- ProcModelMapTy::const_iterator I =
- ProcModelMap.find(getProcModelKey(ProcDef));
- if (I == ProcModelMap.end())
- return ProcModels.size();
- return I->second;
- }
+ unsigned addSchedClass(const IdxVec &OperWrites, const IdxVec &OperReads,
+ const IdxVec &ProcIndices);
+
+ unsigned findOrInsertRW(ArrayRef<unsigned> Seq, bool IsRead);
+
+ unsigned findSchedClassIdx(const IdxVec &Writes, const IdxVec &Reads) const;
+
+ Record *findProcResUnits(Record *ProcResKind,
+ const CodeGenProcModel &PM) const;
+
+private:
+ void collectProcModels();
// Initialize a new processor model if it is unique.
void addProcModel(Record *ProcDef);
- void CollectSchedClasses();
- void CollectProcModels();
- void CollectProcItin(CodeGenProcModel &ProcModel,
- std::vector<Record*> ItinRecords);
+ void collectSchedRW();
+
+ std::string genRWName(const IdxVec& Seq, bool IsRead);
+ unsigned findRWForSequence(const IdxVec &Seq, bool IsRead);
+
+ void collectSchedClasses();
+
+ std::string createSchedClassName(const IdxVec &OperWrites,
+ const IdxVec &OperReads);
+ std::string createSchedClassName(const RecVec &InstDefs);
+ void createInstRWClass(Record *InstRWDef);
+
+ void collectProcItins();
+
+ void collectProcItinRW();
+
+ void inferSchedClasses();
+
+ void inferFromRW(const IdxVec &OperWrites, const IdxVec &OperReads,
+ unsigned FromClassIdx, const IdxVec &ProcIndices);
+ void inferFromItinClass(Record *ItinClassDef, unsigned FromClassIdx);
+ void inferFromInstRWs(unsigned SCIdx);
+
+ void collectProcResources();
+
+ void collectItinProcResources(Record *ItinClassDef);
+
+ void collectRWResources(const IdxVec &Writes, const IdxVec &Reads,
+ const IdxVec &ProcIndices);
+
+ void addProcResource(Record *ProcResourceKind, CodeGenProcModel &PM);
+
+ void addWriteRes(Record *ProcWriteResDef, unsigned PIdx);
+
+ void addReadAdvance(Record *ProcReadAdvanceDef, unsigned PIdx);
};
} // namespace llvm
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index d5c615a1de..42c7a59702 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -68,22 +68,30 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
case MVT::x86mmx: return "MVT::x86mmx";
case MVT::Glue: return "MVT::Glue";
case MVT::isVoid: return "MVT::isVoid";
+ case MVT::v2i1: return "MVT::v2i1";
+ case MVT::v4i1: return "MVT::v4i1";
+ case MVT::v8i1: return "MVT::v8i1";
+ case MVT::v16i1: return "MVT::v16i1";
case MVT::v2i8: return "MVT::v2i8";
case MVT::v4i8: return "MVT::v4i8";
case MVT::v8i8: return "MVT::v8i8";
case MVT::v16i8: return "MVT::v16i8";
case MVT::v32i8: return "MVT::v32i8";
+ case MVT::v1i16: return "MVT::v1i16";
case MVT::v2i16: return "MVT::v2i16";
case MVT::v4i16: return "MVT::v4i16";
case MVT::v8i16: return "MVT::v8i16";
case MVT::v16i16: return "MVT::v16i16";
+ case MVT::v1i32: return "MVT::v1i32";
case MVT::v2i32: return "MVT::v2i32";
case MVT::v4i32: return "MVT::v4i32";
case MVT::v8i32: return "MVT::v8i32";
+ case MVT::v16i32: return "MVT::v16i32";
case MVT::v1i64: return "MVT::v1i64";
case MVT::v2i64: return "MVT::v2i64";
case MVT::v4i64: return "MVT::v4i64";
case MVT::v8i64: return "MVT::v8i64";
+ case MVT::v16i64: return "MVT::v16i64";
case MVT::v2f16: return "MVT::v2f16";
case MVT::v2f32: return "MVT::v2f32";
case MVT::v4f32: return "MVT::v4f32";
@@ -199,12 +207,11 @@ void CodeGenTarget::ReadRegAltNameIndices() const {
/// getRegisterByName - If there is a register with the specific AsmName,
/// return it.
const CodeGenRegister *CodeGenTarget::getRegisterByName(StringRef Name) const {
- const std::vector<CodeGenRegister*> &Regs = getRegBank().getRegisters();
- for (unsigned i = 0, e = Regs.size(); i != e; ++i)
- if (Regs[i]->TheDef->getValueAsString("AsmName") == Name)
- return Regs[i];
-
- return 0;
+ const StringMap<CodeGenRegister*> &Regs = getRegBank().getRegistersByName();
+ StringMap<CodeGenRegister*>::const_iterator I = Regs.find(Name);
+ if (I == Regs.end())
+ return 0;
+ return I->second;
}
std::vector<MVT::SimpleValueType> CodeGenTarget::
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index 1445edbe72..713f1743c1 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -598,7 +598,7 @@ EmitMatcherList(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
// Emit pattern predicates.
if (!PatternPredicates.empty()) {
- OS << "bool CheckPatternPredicate(unsigned PredNo) const {\n";
+ OS << "virtual bool CheckPatternPredicate(unsigned PredNo) const {\n";
OS << " switch (PredNo) {\n";
OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n";
for (unsigned i = 0, e = PatternPredicates.size(); i != e; ++i)
@@ -616,7 +616,8 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
PFsByName[I->first->getName()] = I->second;
if (!NodePredicates.empty()) {
- OS << "bool CheckNodePredicate(SDNode *Node, unsigned PredNo) const {\n";
+ OS << "virtual bool CheckNodePredicate(SDNode *Node,\n";
+ OS << " unsigned PredNo) const {\n";
OS << " switch (PredNo) {\n";
OS << " default: llvm_unreachable(\"Invalid predicate in table?\");\n";
for (unsigned i = 0, e = NodePredicates.size(); i != e; ++i) {
@@ -635,8 +636,8 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
// Emit CompletePattern matchers.
// FIXME: This should be const.
if (!ComplexPatterns.empty()) {
- OS << "bool CheckComplexPattern(SDNode *Root, SDNode *Parent, SDValue N,\n";
- OS << " unsigned PatternNo,\n";
+ OS << "virtual bool CheckComplexPattern(SDNode *Root, SDNode *Parent,\n";
+ OS << " SDValue N, unsigned PatternNo,\n";
OS << " SmallVectorImpl<std::pair<SDValue, SDNode*> > &Result) {\n";
OS << " unsigned NextRes = Result.size();\n";
OS << " switch (PatternNo) {\n";
@@ -676,7 +677,7 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
// Emit SDNodeXForm handlers.
// FIXME: This should be const.
if (!NodeXForms.empty()) {
- OS << "SDValue RunSDNodeXForm(SDValue V, unsigned XFormNo) {\n";
+ OS << "virtual SDValue RunSDNodeXForm(SDValue V, unsigned XFormNo) {\n";
OS << " switch (XFormNo) {\n";
OS << " default: llvm_unreachable(\"Invalid xform # in table?\");\n";
diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp
index 6990da3331..2b3c20a2ce 100644
--- a/utils/TableGen/EDEmitter.cpp
+++ b/utils/TableGen/EDEmitter.cpp
@@ -359,8 +359,8 @@ static int X86TypeFromOpName(LiteralConstantEmitter *type,
/// X86PopulateOperands - Handles all the operands in an X86 instruction, adding
/// the appropriate flags to their descriptors
///
-/// @operandFlags - A reference the array of operand flag objects
-/// @inst - The instruction to use as a source of information
+/// \param operandTypes A reference to the array of operand type objects
+/// \param inst The instruction to use as a source of information
static void X86PopulateOperands(
LiteralConstantEmitter *(&operandTypes)[EDIS_MAX_OPERANDS],
const CodeGenInstruction &inst) {
@@ -386,11 +386,12 @@ static void X86PopulateOperands(
/// decorate1 - Decorates a named operand with a new flag
///
-/// @operandFlags - The array of operand flag objects, which don't have names
-/// @inst - The CodeGenInstruction, which provides a way to translate
-/// between names and operand indices
-/// @opName - The name of the operand
-/// @flag - The name of the flag to add
+/// \param operandFlags The array of operand flag objects, which don't have
+/// names
+/// \param inst The CodeGenInstruction, which provides a way to
+/// translate between names and operand indices
+/// \param opName The name of the operand
+/// \param opFlag The name of the flag to add
static inline void decorate1(
FlagsConstantEmitter *(&operandFlags)[EDIS_MAX_OPERANDS],
const CodeGenInstruction &inst,
@@ -439,9 +440,9 @@ static inline void decorate1(
/// instruction to determine what sort of an instruction it is and then adds
/// the appropriate flags to the instruction and its operands
///
-/// @arg instType - A reference to the type for the instruction as a whole
-/// @arg operandFlags - A reference to the array of operand flag object pointers
-/// @arg inst - A reference to the original instruction
+/// \param instType A reference to the type for the instruction as a whole
+/// \param operandFlags A reference to the array of operand flag object pointers
+/// \param inst A reference to the original instruction
static void X86ExtractSemantics(
LiteralConstantEmitter &instType,
FlagsConstantEmitter *(&operandFlags)[EDIS_MAX_OPERANDS],
@@ -568,8 +569,8 @@ static void X86ExtractSemantics(
/// ARMFlagFromOpName - Processes the name of a single ARM operand (which is
/// actually its type) and translates it into an operand type
///
-/// @arg type - The type object to set
-/// @arg name - The name of the operand
+/// \param type The type object to set
+/// \param name The name of the operand
static int ARMFlagFromOpName(LiteralConstantEmitter *type,
const std::string &name) {
REG("GPR");
@@ -751,8 +752,8 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
/// ARMPopulateOperands - Handles all the operands in an ARM instruction, adding
/// the appropriate flags to their descriptors
///
-/// @operandFlags - A reference the array of operand flag objects
-/// @inst - The instruction to use as a source of information
+/// \param operandTypes A reference to the array of operand type objects
+/// \param inst The instruction to use as a source of information
static void ARMPopulateOperands(
LiteralConstantEmitter *(&operandTypes)[EDIS_MAX_OPERANDS],
const CodeGenInstruction &inst) {
@@ -791,10 +792,10 @@ static void ARMPopulateOperands(
/// instruction to determine what sort of an instruction it is and then adds
/// the appropriate flags to the instruction and its operands
///
-/// @arg instType - A reference to the type for the instruction as a whole
-/// @arg operandTypes - A reference to the array of operand type object pointers
-/// @arg operandFlags - A reference to the array of operand flag object pointers
-/// @arg inst - A reference to the original instruction
+/// \param instType A reference to the type for the instruction as a whole
+/// \param operandTypes A reference to the array of operand type object pointers
+/// \param operandFlags A reference to the array of operand flag object pointers
+/// \param inst A reference to the original instruction
static void ARMExtractSemantics(
LiteralConstantEmitter &instType,
LiteralConstantEmitter *(&operandTypes)[EDIS_MAX_OPERANDS],
@@ -832,8 +833,8 @@ static void ARMExtractSemantics(
/// populateInstInfo - Fills an array of InstInfos with information about each
/// instruction in a target
///
-/// @arg infoArray - The array of InstInfo objects to populate
-/// @arg target - The CodeGenTarget to use as a source of instructions
+/// \param infoArray The array of InstInfo objects to populate
+/// \param target The CodeGenTarget to use as a source of instructions
static void populateInstInfo(CompoundConstantEmitter &infoArray,
CodeGenTarget &target) {
const std::vector<const CodeGenInstruction*> &numberedInstructions =
diff --git a/utils/TableGen/FixedLenDecoderEmitter.cpp b/utils/TableGen/FixedLenDecoderEmitter.cpp
index aa6d7962a0..e755c1ce9c 100644
--- a/utils/TableGen/FixedLenDecoderEmitter.cpp
+++ b/utils/TableGen/FixedLenDecoderEmitter.cpp
@@ -1882,7 +1882,7 @@ static void emitDecodeInstruction(formatted_raw_ostream &OS) {
<< " uint64_t Bits = STI.getFeatureBits();\n"
<< "\n"
<< " const uint8_t *Ptr = DecodeTable;\n"
- << " uint32_t CurFieldValue;\n"
+ << " uint32_t CurFieldValue = 0;\n"
<< " DecodeStatus S = MCDisassembler::Success;\n"
<< " for (;;) {\n"
<< " ptrdiff_t Loc = Ptr - DecodeTable;\n"
diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp
index b41ad94aca..79602da92b 100644
--- a/utils/TableGen/InstrInfoEmitter.cpp
+++ b/utils/TableGen/InstrInfoEmitter.cpp
@@ -304,11 +304,10 @@ void InstrInfoEmitter::emitRecord(const CodeGenInstruction &Inst, unsigned Num,
MinOperands = Inst.Operands.back().MIOperandNo +
Inst.Operands.back().MINumOperands;
- Record *ItinDef = Inst.TheDef->getValueAsDef("Itinerary");
OS << " { ";
OS << Num << ",\t" << MinOperands << ",\t"
<< Inst.Operands.NumDefs << ",\t"
- << SchedModels.getItinClassIdx(ItinDef) << ",\t"
+ << SchedModels.getSchedClassIdx(Inst) << ",\t"
<< Inst.TheDef->getValueAsInt("Size") << ",\t0";
// Emit all of the target indepedent flags...
diff --git a/utils/TableGen/PseudoLoweringEmitter.cpp b/utils/TableGen/PseudoLoweringEmitter.cpp
index 8e28180ae8..1896a7baae 100644
--- a/utils/TableGen/PseudoLoweringEmitter.cpp
+++ b/utils/TableGen/PseudoLoweringEmitter.cpp
@@ -267,7 +267,7 @@ void PseudoLoweringEmitter::emitLoweringEmitter(raw_ostream &o) {
void PseudoLoweringEmitter::run(raw_ostream &o) {
Record *ExpansionClass = Records.getClass("PseudoInstExpansion");
- Record *InstructionClass = Records.getClass("PseudoInstExpansion");
+ Record *InstructionClass = Records.getClass("Instruction");
assert(ExpansionClass && "PseudoInstExpansion class definition missing!");
assert(InstructionClass && "Instruction class definition missing!");
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index 42b213c04a..87624665cb 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -802,16 +802,16 @@ RegisterInfoEmitter::runTargetHeader(raw_ostream &OS, CodeGenTarget &Target,
<< " virtual bool needsStackRealignment(const MachineFunction &) const\n"
<< " { return false; }\n";
if (!RegBank.getSubRegIndices().empty()) {
- OS << " unsigned composeSubRegIndices(unsigned, unsigned) const;\n"
- << " const TargetRegisterClass *"
+ OS << " virtual unsigned composeSubRegIndices(unsigned, unsigned) const;\n"
+ << " virtual const TargetRegisterClass *"
"getSubClassWithSubReg(const TargetRegisterClass*, unsigned) const;\n";
}
- OS << " const RegClassWeight &getRegClassWeight("
+ OS << " virtual const RegClassWeight &getRegClassWeight("
<< "const TargetRegisterClass *RC) const;\n"
- << " unsigned getNumRegPressureSets() const;\n"
- << " const char *getRegPressureSetName(unsigned Idx) const;\n"
- << " unsigned getRegPressureSetLimit(unsigned Idx) const;\n"
- << " const int *getRegClassPressureSets("
+ << " virtual unsigned getNumRegPressureSets() const;\n"
+ << " virtual const char *getRegPressureSetName(unsigned Idx) const;\n"
+ << " virtual unsigned getRegPressureSetLimit(unsigned Idx) const;\n"
+ << " virtual const int *getRegClassPressureSets("
<< "const TargetRegisterClass *RC) const;\n"
<< "};\n\n";
diff --git a/utils/TableGen/SequenceToOffsetTable.h b/utils/TableGen/SequenceToOffsetTable.h
index 60202b5ade..d4db152a96 100644
--- a/utils/TableGen/SequenceToOffsetTable.h
+++ b/utils/TableGen/SequenceToOffsetTable.h
@@ -82,7 +82,7 @@ public:
}
bool empty() const { return Seqs.empty(); }
-
+
/// layout - Computes the final table layout.
void layout() {
assert(Entries == 0 && "Can only call layout() once");
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index 5dfd716d89..4127b379b1 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -14,10 +14,13 @@
#include "CodeGenTarget.h"
#include "CodeGenSchedule.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/MC/MCInstrItineraries.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include "llvm/TableGen/TableGenBackend.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include <algorithm>
#include <map>
#include <string>
@@ -26,6 +29,32 @@ using namespace llvm;
namespace {
class SubtargetEmitter {
+ // Each processor has a SchedClassDesc table with an entry for each SchedClass.
+ // The SchedClassDesc table indexes into a global write resource table, write
+ // latency table, and read advance table.
+ struct SchedClassTables {
+ std::vector<std::vector<MCSchedClassDesc> > ProcSchedClasses;
+ std::vector<MCWriteProcResEntry> WriteProcResources;
+ std::vector<MCWriteLatencyEntry> WriteLatencies;
+ std::vector<std::string> WriterNames;
+ std::vector<MCReadAdvanceEntry> ReadAdvanceEntries;
+
+ // Reserve an invalid entry at index 0
+ SchedClassTables() {
+ ProcSchedClasses.resize(1);
+ WriteProcResources.resize(1);
+ WriteLatencies.resize(1);
+ WriterNames.push_back("InvalidWrite");
+ ReadAdvanceEntries.resize(1);
+ }
+ };
+
+ struct LessWriteProcResources {
+ bool operator()(const MCWriteProcResEntry &LHS,
+ const MCWriteProcResEntry &RHS) {
+ return LHS.ProcResourceIdx < RHS.ProcResourceIdx;
+ }
+ };
RecordKeeper &Records;
CodeGenSchedModels &SchedModels;
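A minimal sketch of why each global table reserves an invalid entry at index 0, using the field names from this patch; the stand-in types and lookup helper below are illustrative only, not LLVM API:

    struct WriteLatencyEntryStandIn { unsigned Cycles, WriteResourceID; };

    static const WriteLatencyEntryStandIn WriteLatencyTable[] = {
      {0, 0},         // index 0 reserved: invalid / "no entries"
      {1, 0}, {3, 2}, // real entries start at #1
    };

    struct SchedClassDescStandIn {
      unsigned WriteLatencyIdx;        // 0 means the class defines no entries
      unsigned NumWriteLatencyEntries; // run length starting at WriteLatencyIdx
    };

    // Fetch the latency entry for def operand DefIdx, or null if none exists.
    static const WriteLatencyEntryStandIn *
    getLatencyEntryStandIn(const SchedClassDescStandIn &SC, unsigned DefIdx) {
      if (!SC.WriteLatencyIdx || DefIdx >= SC.NumWriteLatencyEntries)
        return 0; // the reserved slot makes "unset index" a plain zero test
      return &WriteLatencyTable[SC.WriteLatencyIdx + DefIdx];
    }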
@@ -50,8 +79,18 @@ class SubtargetEmitter {
&ProcItinLists);
void EmitProcessorProp(raw_ostream &OS, const Record *R, const char *Name,
char Separator);
+ void EmitProcessorResources(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS);
+ Record *FindWriteResources(const CodeGenSchedRW &SchedWrite,
+ const CodeGenProcModel &ProcModel);
+ Record *FindReadAdvance(const CodeGenSchedRW &SchedRead,
+ const CodeGenProcModel &ProcModel);
+ void GenSchedClassTables(const CodeGenProcModel &ProcModel,
+ SchedClassTables &SchedTables);
+ void EmitSchedClassTables(SchedClassTables &SchedTables, raw_ostream &OS);
void EmitProcessorModels(raw_ostream &OS);
void EmitProcessorLookup(raw_ostream &OS);
+ void EmitSchedModelHelpers(std::string ClassName, raw_ostream &OS);
void EmitSchedModel(raw_ostream &OS);
void ParseFeaturesFunction(raw_ostream &OS, unsigned NumFeatures,
unsigned NumProcs);
@@ -521,7 +560,7 @@ EmitItineraries(raw_ostream &OS,
std::vector<std::vector<InstrItinerary> >::iterator
ProcItinListsIter = ProcItinLists.begin();
for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
- PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI, ++ProcItinListsIter) {
Record *ItinsDef = PI->ItinsDef;
if (!ItinsDefSet.insert(ItinsDef))
@@ -532,7 +571,7 @@ EmitItineraries(raw_ostream &OS,
// Get the itinerary list for the processor.
assert(ProcItinListsIter != ProcItinLists.end() && "bad iterator");
- std::vector<InstrItinerary> &ItinList = *ProcItinListsIter++;
+ std::vector<InstrItinerary> &ItinList = *ProcItinListsIter;
OS << "\n";
OS << "static const llvm::InstrItinerary ";
@@ -578,11 +617,480 @@ void SubtargetEmitter::EmitProcessorProp(raw_ostream &OS, const Record *R,
OS << '\n';
}
+void SubtargetEmitter::EmitProcessorResources(const CodeGenProcModel &ProcModel,
+ raw_ostream &OS) {
+ char Sep = ProcModel.ProcResourceDefs.empty() ? ' ' : ',';
+
+ OS << "\n// {Name, NumUnits, SuperIdx}\n";
+ OS << "static const llvm::MCProcResourceDesc "
+ << ProcModel.ModelName << "ProcResources" << "[] = {\n"
+ << " {DBGFIELD(\"InvalidUnit\") 0, 0}" << Sep << "\n";
+
+ for (unsigned i = 0, e = ProcModel.ProcResourceDefs.size(); i < e; ++i) {
+ Record *PRDef = ProcModel.ProcResourceDefs[i];
+
+ // Find the SuperIdx
+ unsigned SuperIdx = 0;
+ Record *SuperDef = 0;
+ if (PRDef->getValueInit("Super")->isComplete()) {
+ SuperDef =
+ SchedModels.findProcResUnits(PRDef->getValueAsDef("Super"), ProcModel);
+ SuperIdx = ProcModel.getProcResourceIdx(SuperDef);
+ }
+ // Emit the ProcResourceDesc
+ if (i+1 == e)
+ Sep = ' ';
+ OS << " {DBGFIELD(\"" << PRDef->getName() << "\") ";
+ if (PRDef->getName().size() < 15)
+ OS.indent(15 - PRDef->getName().size());
+ OS << PRDef->getValueAsInt("NumUnits") << ", " << SuperIdx
+ << "}" << Sep << " // #" << i+1;
+ if (SuperDef)
+ OS << ", Super=" << SuperDef->getName();
+ OS << "\n";
+ }
+ OS << "};\n";
+}
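For orientation, the table this routine emits comes out roughly like the following; the stand-in struct, the DBGFIELD definition, and all model and unit names are invented for illustration:

    #ifndef NDEBUG
    #define DBGFIELD(x) x,
    #else
    #define DBGFIELD(x)
    #endif

    struct ProcResourceDescStandIn {
    #ifndef NDEBUG
      const char *Name; // present only in asserts builds
    #endif
      unsigned NumUnits;
      unsigned SuperIdx;
    };

    // {Name, NumUnits, SuperIdx}
    static const ProcResourceDescStandIn MyModelProcResources[] = {
      {DBGFIELD("InvalidUnit")  0, 0},
      {DBGFIELD("MyALU")        2, 0}, // #1
      {DBGFIELD("MyDivider")    1, 1}  // #2, Super=MyALU
    };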
+
+// Find the WriteRes Record that defines processor resources for this
+// SchedWrite.
+Record *SubtargetEmitter::FindWriteResources(
+ const CodeGenSchedRW &SchedWrite, const CodeGenProcModel &ProcModel) {
+
+ // Check if the SchedWrite is already subtarget-specific and directly
+ // specifies a set of processor resources.
+ if (SchedWrite.TheDef->isSubClassOf("SchedWriteRes"))
+ return SchedWrite.TheDef;
+
+ // Check this processor's list of aliases for SchedWrite.
+ Record *AliasDef = 0;
+ for (RecIter AI = SchedWrite.Aliases.begin(), AE = SchedWrite.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ if (&SchedModels.getProcModel(ModelDef) != &ProcModel)
+ continue;
+ if (AliasDef)
+ throw TGError(AliasRW.TheDef->getLoc(), "Multiple aliases "
+ "defined for processor " + ProcModel.ModelName +
+ ". Ensure only one SchedAlias exists per RW.");
+ AliasDef = AliasRW.TheDef;
+ }
+ if (AliasDef && AliasDef->isSubClassOf("SchedWriteRes"))
+ return AliasDef;
+
+ // Check this processor's list of write resources.
+ Record *ResDef = 0;
+ for (RecIter WRI = ProcModel.WriteResDefs.begin(),
+ WRE = ProcModel.WriteResDefs.end(); WRI != WRE; ++WRI) {
+ if (!(*WRI)->isSubClassOf("WriteRes"))
+ continue;
+ if (AliasDef == (*WRI)->getValueAsDef("WriteType")
+ || SchedWrite.TheDef == (*WRI)->getValueAsDef("WriteType")) {
+ if (ResDef) {
+ throw TGError((*WRI)->getLoc(), "Resources are defined for both "
+ "SchedWrite and its alias on processor " +
+ ProcModel.ModelName);
+ }
+ ResDef = *WRI;
+ }
+ }
+ // TODO: If ProcModel has a base model (previous generation processor),
+ // then call FindWriteResources recursively with that model here.
+ if (!ResDef) {
+ throw TGError(ProcModel.ModelDef->getLoc(),
+ std::string("Processor does not define resources for ")
+ + SchedWrite.TheDef->getName());
+ }
+ return ResDef;
+}
+
+/// Find the ReadAdvance record for the given SchedRead on this processor or
+/// return NULL.
+Record *SubtargetEmitter::FindReadAdvance(const CodeGenSchedRW &SchedRead,
+ const CodeGenProcModel &ProcModel) {
+ // Check for SchedReads that directly specify a ReadAdvance.
+ if (SchedRead.TheDef->isSubClassOf("SchedReadAdvance"))
+ return SchedRead.TheDef;
+
+ // Check this processor's list of aliases for SchedRead.
+ Record *AliasDef = 0;
+ for (RecIter AI = SchedRead.Aliases.begin(), AE = SchedRead.Aliases.end();
+ AI != AE; ++AI) {
+ const CodeGenSchedRW &AliasRW =
+ SchedModels.getSchedRW((*AI)->getValueAsDef("AliasRW"));
+ Record *ModelDef = AliasRW.TheDef->getValueAsDef("SchedModel");
+ if (&SchedModels.getProcModel(ModelDef) != &ProcModel)
+ continue;
+ if (AliasDef)
+ throw TGError(AliasRW.TheDef->getLoc(), "Multiple aliases "
+ "defined for processor " + ProcModel.ModelName +
+ ". Ensure only one SchedAlias exists per RW.");
+ AliasDef = AliasRW.TheDef;
+ }
+ if (AliasDef && AliasDef->isSubClassOf("SchedReadAdvance"))
+ return AliasDef;
+
+ // Check this processor's ReadAdvanceList.
+ Record *ResDef = 0;
+ for (RecIter RAI = ProcModel.ReadAdvanceDefs.begin(),
+ RAE = ProcModel.ReadAdvanceDefs.end(); RAI != RAE; ++RAI) {
+ if (!(*RAI)->isSubClassOf("ReadAdvance"))
+ continue;
+ if (AliasDef == (*RAI)->getValueAsDef("ReadType")
+ || SchedRead.TheDef == (*RAI)->getValueAsDef("ReadType")) {
+ if (ResDef) {
+ throw TGError((*RAI)->getLoc(), "Resources are defined for both "
+ "SchedRead and its alias on processor " +
+ ProcModel.ModelName);
+ }
+ ResDef = *RAI;
+ }
+ }
+ // TODO: If ProcModel has a base model (previous generation processor),
+ // then call FindReadAdvance recursively with that model here.
+ if (!ResDef && SchedRead.TheDef->getName() != "ReadDefault") {
+ throw TGError(ProcModel.ModelDef->getLoc(),
+ std::string("Processor does not define resources for ")
+ + SchedRead.TheDef->getName());
+ }
+ return ResDef;
+}
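As a rough model of what a ReadAdvance buys, on my reading of the Cycles field (the helper below is illustrative, not LLVM API): a use advanced by N cycles, for example through a forwarding path, sees the def's latency shortened by N.

    // Effective def-to-use latency once a ReadAdvance applies, clamped at zero.
    static int effectiveLatencyStandIn(int WriteLatency, int ReadAdvanceCycles) {
      int Latency = WriteLatency - ReadAdvanceCycles;
      return Latency > 0 ? Latency : 0;
    }
    // e.g. a 4-cycle multiply feeding an accumulate with ReadAdvance 2 behaves
    // like a 2-cycle dependence: effectiveLatencyStandIn(4, 2) == 2.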
+
+// Generate the SchedClass table for this processor and update global
+// tables. Must be called for each processor in order.
+void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
+ SchedClassTables &SchedTables) {
+ SchedTables.ProcSchedClasses.resize(SchedTables.ProcSchedClasses.size() + 1);
+ if (!ProcModel.hasInstrSchedModel())
+ return;
+
+ std::vector<MCSchedClassDesc> &SCTab = SchedTables.ProcSchedClasses.back();
+ for (CodeGenSchedModels::SchedClassIter SCI = SchedModels.schedClassBegin(),
+ SCE = SchedModels.schedClassEnd(); SCI != SCE; ++SCI) {
+ SCTab.resize(SCTab.size() + 1);
+ MCSchedClassDesc &SCDesc = SCTab.back();
+ // SCDesc.Name is guarded by NDEBUG
+ SCDesc.NumMicroOps = 0;
+ SCDesc.BeginGroup = false;
+ SCDesc.EndGroup = false;
+ SCDesc.WriteProcResIdx = 0;
+ SCDesc.WriteLatencyIdx = 0;
+ SCDesc.ReadAdvanceIdx = 0;
+
+ // A Variant SchedClass has no resources of its own.
+ if (!SCI->Transitions.empty()) {
+ SCDesc.NumMicroOps = MCSchedClassDesc::VariantNumMicroOps;
+ continue;
+ }
+
+ // Determine if the SchedClass is actually reachable on this processor. If
+ // not, don't try to locate the processor resources; the lookup would fail.
+ // If ProcIndices contains 0, this class applies to all processors.
+ assert(!SCI->ProcIndices.empty() && "expect at least one procidx");
+ if (SCI->ProcIndices[0] != 0) {
+ IdxIter PIPos = std::find(SCI->ProcIndices.begin(),
+ SCI->ProcIndices.end(), ProcModel.Index);
+ if (PIPos == SCI->ProcIndices.end())
+ continue;
+ }
+ IdxVec Writes = SCI->Writes;
+ IdxVec Reads = SCI->Reads;
+ if (SCI->ItinClassDef) {
+ assert(SCI->InstRWs.empty() && "ItinClass should not have InstRWs");
+ // Check this processor's itinerary class resources.
+ for (RecIter II = ProcModel.ItinRWDefs.begin(),
+ IE = ProcModel.ItinRWDefs.end(); II != IE; ++II) {
+ RecVec Matched = (*II)->getValueAsListOfDefs("MatchedItinClasses");
+ if (std::find(Matched.begin(), Matched.end(), SCI->ItinClassDef)
+ != Matched.end()) {
+ SchedModels.findRWs((*II)->getValueAsListOfDefs("OperandReadWrites"),
+ Writes, Reads);
+ break;
+ }
+ }
+ if (Writes.empty()) {
+ DEBUG(dbgs() << ProcModel.ItinsDef->getName()
+ << " does not have resources for itinerary class "
+ << SCI->ItinClassDef->getName() << '\n');
+ }
+ }
+ else if (!SCI->InstRWs.empty()) {
+ assert(SCI->Writes.empty() && SCI->Reads.empty() &&
+ "InstRW class should not have its own ReadWrites");
+ Record *RWDef = 0;
+ for (RecIter RWI = SCI->InstRWs.begin(), RWE = SCI->InstRWs.end();
+ RWI != RWE; ++RWI) {
+ Record *RWModelDef = (*RWI)->getValueAsDef("SchedModel");
+ if (&ProcModel == &SchedModels.getProcModel(RWModelDef)) {
+ RWDef = *RWI;
+ break;
+ }
+ }
+ if (RWDef) {
+ SchedModels.findRWs(RWDef->getValueAsListOfDefs("OperandReadWrites"),
+ Writes, Reads);
+ }
+ }
+ // Sum resources across all operand writes.
+ std::vector<MCWriteProcResEntry> WriteProcResources;
+ std::vector<MCWriteLatencyEntry> WriteLatencies;
+ std::vector<std::string> WriterNames;
+ std::vector<MCReadAdvanceEntry> ReadAdvanceEntries;
+ for (IdxIter WI = Writes.begin(), WE = Writes.end(); WI != WE; ++WI) {
+ IdxVec WriteSeq;
+ SchedModels.expandRWSequence(*WI, WriteSeq, /*IsRead=*/false);
+
+ // For each operand, create a latency entry.
+ MCWriteLatencyEntry WLEntry;
+ WLEntry.Cycles = 0;
+ unsigned WriteID = WriteSeq.back();
+ WriterNames.push_back(SchedModels.getSchedWrite(WriteID).Name);
+ // If this Write is not referenced by a ReadAdvance, don't distinguish it
+ // from other WriteLatency entries.
+ if (!SchedModels.hasReadOfWrite(SchedModels.getSchedWrite(WriteID).TheDef)) {
+ WriteID = 0;
+ }
+ WLEntry.WriteResourceID = WriteID;
+
+ for (IdxIter WSI = WriteSeq.begin(), WSE = WriteSeq.end();
+ WSI != WSE; ++WSI) {
+
+ Record *WriteRes =
+ FindWriteResources(SchedModels.getSchedWrite(*WSI), ProcModel);
+
+ // Mark the parent class as invalid for unsupported write types.
+ if (WriteRes->getValueAsBit("Unsupported")) {
+ SCDesc.NumMicroOps = MCSchedClassDesc::InvalidNumMicroOps;
+ break;
+ }
+ WLEntry.Cycles += WriteRes->getValueAsInt("Latency");
+ SCDesc.NumMicroOps += WriteRes->getValueAsInt("NumMicroOps");
+ SCDesc.BeginGroup |= WriteRes->getValueAsBit("BeginGroup");
+ SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup");
+
+ // Create an entry for each ProcResource listed in WriteRes.
+ RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources");
+ std::vector<int64_t> Cycles =
+ WriteRes->getValueAsListOfInts("ResourceCycles");
+ for (unsigned PRIdx = 0, PREnd = PRVec.size();
+ PRIdx != PREnd; ++PRIdx) {
+ MCWriteProcResEntry WPREntry;
+ WPREntry.ProcResourceIdx = ProcModel.getProcResourceIdx(PRVec[PRIdx]);
+ assert(WPREntry.ProcResourceIdx && "Bad ProcResourceIdx");
+ if (Cycles.size() > PRIdx)
+ WPREntry.Cycles = Cycles[PRIdx];
+ else
+ WPREntry.Cycles = 1;
+ WriteProcResources.push_back(WPREntry);
+ }
+ }
+ WriteLatencies.push_back(WLEntry);
+ }
+ // Create an entry for each operand Read in this SchedClass.
+ // Entries must be sorted first by UseIdx then by WriteResourceID.
+ for (unsigned UseIdx = 0, EndIdx = Reads.size();
+ UseIdx != EndIdx; ++UseIdx) {
+ Record *ReadAdvance =
+ FindReadAdvance(SchedModels.getSchedRead(Reads[UseIdx]), ProcModel);
+ if (!ReadAdvance)
+ continue;
+
+ // Mark the parent class as invalid for unsupported read types.
+ if (ReadAdvance->getValueAsBit("Unsupported")) {
+ SCDesc.NumMicroOps = MCSchedClassDesc::InvalidNumMicroOps;
+ break;
+ }
+ RecVec ValidWrites = ReadAdvance->getValueAsListOfDefs("ValidWrites");
+ IdxVec WriteIDs;
+ if (ValidWrites.empty())
+ WriteIDs.push_back(0);
+ else {
+ for (RecIter VWI = ValidWrites.begin(), VWE = ValidWrites.end();
+ VWI != VWE; ++VWI) {
+ WriteIDs.push_back(SchedModels.getSchedRWIdx(*VWI, /*IsRead=*/false));
+ }
+ }
+ std::sort(WriteIDs.begin(), WriteIDs.end());
+ for (IdxIter WI = WriteIDs.begin(), WE = WriteIDs.end(); WI != WE; ++WI) {
+ MCReadAdvanceEntry RAEntry;
+ RAEntry.UseIdx = UseIdx;
+ RAEntry.WriteResourceID = *WI;
+ RAEntry.Cycles = ReadAdvance->getValueAsInt("Cycles");
+ ReadAdvanceEntries.push_back(RAEntry);
+ }
+ }
+ if (SCDesc.NumMicroOps == MCSchedClassDesc::InvalidNumMicroOps) {
+ WriteProcResources.clear();
+ WriteLatencies.clear();
+ ReadAdvanceEntries.clear();
+ }
+ // Add the information for this SchedClass to the global tables using basic
+ // compression.
+ //
+ // WriteProcRes entries are sorted by ProcResourceIdx.
+ std::sort(WriteProcResources.begin(), WriteProcResources.end(),
+ LessWriteProcResources());
+
+ SCDesc.NumWriteProcResEntries = WriteProcResources.size();
+ std::vector<MCWriteProcResEntry>::iterator WPRPos =
+ std::search(SchedTables.WriteProcResources.begin(),
+ SchedTables.WriteProcResources.end(),
+ WriteProcResources.begin(), WriteProcResources.end());
+ if (WPRPos != SchedTables.WriteProcResources.end())
+ SCDesc.WriteProcResIdx = WPRPos - SchedTables.WriteProcResources.begin();
+ else {
+ SCDesc.WriteProcResIdx = SchedTables.WriteProcResources.size();
+ SchedTables.WriteProcResources.insert(WPRPos, WriteProcResources.begin(),
+ WriteProcResources.end());
+ }
+ // Latency entries must remain in operand order.
+ SCDesc.NumWriteLatencyEntries = WriteLatencies.size();
+ std::vector<MCWriteLatencyEntry>::iterator WLPos =
+ std::search(SchedTables.WriteLatencies.begin(),
+ SchedTables.WriteLatencies.end(),
+ WriteLatencies.begin(), WriteLatencies.end());
+ if (WLPos != SchedTables.WriteLatencies.end()) {
+ unsigned idx = WLPos - SchedTables.WriteLatencies.begin();
+ SCDesc.WriteLatencyIdx = idx;
+ for (unsigned i = 0, e = WriteLatencies.size(); i < e; ++i)
+ if (SchedTables.WriterNames[idx + i].find(WriterNames[i]) ==
+ std::string::npos) {
+ SchedTables.WriterNames[idx + i] += std::string("_") + WriterNames[i];
+ }
+ }
+ else {
+ SCDesc.WriteLatencyIdx = SchedTables.WriteLatencies.size();
+ SchedTables.WriteLatencies.insert(SchedTables.WriteLatencies.end(),
+ WriteLatencies.begin(),
+ WriteLatencies.end());
+ SchedTables.WriterNames.insert(SchedTables.WriterNames.end(),
+ WriterNames.begin(), WriterNames.end());
+ }
+ // ReadAdvanceEntries must remain in operand order.
+ SCDesc.NumReadAdvanceEntries = ReadAdvanceEntries.size();
+ std::vector<MCReadAdvanceEntry>::iterator RAPos =
+ std::search(SchedTables.ReadAdvanceEntries.begin(),
+ SchedTables.ReadAdvanceEntries.end(),
+ ReadAdvanceEntries.begin(), ReadAdvanceEntries.end());
+ if (RAPos != SchedTables.ReadAdvanceEntries.end())
+ SCDesc.ReadAdvanceIdx = RAPos - SchedTables.ReadAdvanceEntries.begin();
+ else {
+ SCDesc.ReadAdvanceIdx = SchedTables.ReadAdvanceEntries.size();
+ SchedTables.ReadAdvanceEntries.insert(RAPos, ReadAdvanceEntries.begin(),
+ ReadAdvanceEntries.end());
+ }
+ }
+}
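The compression applied above amounts to interning each per-class run of entries in a growing global vector with std::search; a self-contained sketch of the same idea on plain integers (the helper name is invented):

    #include <algorithm>
    #include <vector>

    // Return the index of Seq inside Table, appending it only when no identical
    // run already exists; this mirrors the reuse GenSchedClassTables applies to
    // WriteProcResources, WriteLatencies, and ReadAdvanceEntries.
    static unsigned internRun(std::vector<int> &Table,
                              const std::vector<int> &Seq) {
      std::vector<int>::iterator Pos =
          std::search(Table.begin(), Table.end(), Seq.begin(), Seq.end());
      if (Pos != Table.end())
        return Pos - Table.begin(); // share the existing run
      unsigned Idx = Table.size();
      Table.insert(Table.end(), Seq.begin(), Seq.end());
      return Idx;
    }

Each run is later addressed only by a start index and a length, so entries keep their positional meaning inside a run; that is why the latency and read-advance runs must stay in operand order while WriteProcRes entries may be sorted first.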
+
+// Emit SchedClass tables for all processors and associated global tables.
+void SubtargetEmitter::EmitSchedClassTables(SchedClassTables &SchedTables,
+ raw_ostream &OS) {
+ // Emit global WriteProcResTable.
+ OS << "\n// {ProcResourceIdx, Cycles}\n"
+ << "extern const llvm::MCWriteProcResEntry "
+ << Target << "WriteProcResTable[] = {\n"
+ << " { 0, 0}, // Invalid\n";
+ for (unsigned WPRIdx = 1, WPREnd = SchedTables.WriteProcResources.size();
+ WPRIdx != WPREnd; ++WPRIdx) {
+ MCWriteProcResEntry &WPREntry = SchedTables.WriteProcResources[WPRIdx];
+ OS << " {" << format("%2d", WPREntry.ProcResourceIdx) << ", "
+ << format("%2d", WPREntry.Cycles) << "}";
+ if (WPRIdx + 1 < WPREnd)
+ OS << ',';
+ OS << " // #" << WPRIdx << '\n';
+ }
+ OS << "}; // " << Target << "WriteProcResTable\n";
+
+ // Emit global WriteLatencyTable.
+ OS << "\n// {Cycles, WriteResourceID}\n"
+ << "extern const llvm::MCWriteLatencyEntry "
+ << Target << "WriteLatencyTable[] = {\n"
+ << " { 0, 0}, // Invalid\n";
+ for (unsigned WLIdx = 1, WLEnd = SchedTables.WriteLatencies.size();
+ WLIdx != WLEnd; ++WLIdx) {
+ MCWriteLatencyEntry &WLEntry = SchedTables.WriteLatencies[WLIdx];
+ OS << " {" << format("%2d", WLEntry.Cycles) << ", "
+ << format("%2d", WLEntry.WriteResourceID) << "}";
+ if (WLIdx + 1 < WLEnd)
+ OS << ',';
+ OS << " // #" << WLIdx << " " << SchedTables.WriterNames[WLIdx] << '\n';
+ }
+ OS << "}; // " << Target << "WriteLatencyTable\n";
+
+ // Emit global ReadAdvanceTable.
+ OS << "\n// {UseIdx, WriteResourceID, Cycles}\n"
+ << "extern const llvm::MCReadAdvanceEntry "
+ << Target << "ReadAdvanceTable[] = {\n"
+ << " {0, 0, 0}, // Invalid\n";
+ for (unsigned RAIdx = 1, RAEnd = SchedTables.ReadAdvanceEntries.size();
+ RAIdx != RAEnd; ++RAIdx) {
+ MCReadAdvanceEntry &RAEntry = SchedTables.ReadAdvanceEntries[RAIdx];
+ OS << " {" << RAEntry.UseIdx << ", "
+ << format("%2d", RAEntry.WriteResourceID) << ", "
+ << format("%2d", RAEntry.Cycles) << "}";
+ if (RAIdx + 1 < RAEnd)
+ OS << ',';
+ OS << " // #" << RAIdx << '\n';
+ }
+ OS << "}; // " << Target << "ReadAdvanceTable\n";
+
+ // Emit a SchedClass table for each processor.
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ if (!PI->hasInstrSchedModel())
+ continue;
+
+ std::vector<MCSchedClassDesc> &SCTab =
+ SchedTables.ProcSchedClasses[1 + PI - SchedModels.procModelBegin()];
+
+ OS << "\n// {Name, NumMicroOps, BeginGroup, EndGroup,"
+ << " WriteProcResIdx,#, WriteLatencyIdx,#, ReadAdvanceIdx,#}\n";
+ OS << "static const llvm::MCSchedClassDesc "
+ << PI->ModelName << "SchedClasses[] = {\n";
+
+ // The first class is always invalid. We have no way to distinguish it
+ // except by name and position.
+ assert(SchedModels.getSchedClass(0).Name == "NoItinerary"
+ && "invalid class not first");
+ OS << " {DBGFIELD(\"InvalidSchedClass\") "
+ << MCSchedClassDesc::InvalidNumMicroOps
+ << ", 0, 0, 0, 0, 0, 0, 0, 0},\n";
+
+ for (unsigned SCIdx = 1, SCEnd = SCTab.size(); SCIdx != SCEnd; ++SCIdx) {
+ MCSchedClassDesc &MCDesc = SCTab[SCIdx];
+ const CodeGenSchedClass &SchedClass = SchedModels.getSchedClass(SCIdx);
+ OS << " {DBGFIELD(\"" << SchedClass.Name << "\") ";
+ if (SchedClass.Name.size() < 18)
+ OS.indent(18 - SchedClass.Name.size());
+ OS << MCDesc.NumMicroOps
+ << ", " << MCDesc.BeginGroup << ", " << MCDesc.EndGroup
+ << ", " << format("%2d", MCDesc.WriteProcResIdx)
+ << ", " << MCDesc.NumWriteProcResEntries
+ << ", " << format("%2d", MCDesc.WriteLatencyIdx)
+ << ", " << MCDesc.NumWriteLatencyEntries
+ << ", " << format("%2d", MCDesc.ReadAdvanceIdx)
+ << ", " << MCDesc.NumReadAdvanceEntries << "}";
+ if (SCIdx + 1 < SCEnd)
+ OS << ',';
+ OS << " // #" << SCIdx << '\n';
+ }
+ OS << "}; // " << PI->ModelName << "SchedClasses\n";
+ }
+}
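Concretely, the emitted global tables come out shaped like this (stand-in structs; every entry is invented for illustration):

    struct WriteProcResEntryStandIn { unsigned ProcResourceIdx, Cycles; };
    struct WriteLatencyEntryStandIn { unsigned Cycles, WriteResourceID; };

    // {ProcResourceIdx, Cycles} -- sliced per class via WriteProcResIdx and #.
    static const WriteProcResEntryStandIn MyTargetWriteProcResTable[] = {
      { 0,  0}, // Invalid
      { 1,  1}, // #1
      { 2,  3}  // #2
    };

    // {Cycles, WriteResourceID} -- sliced per class via WriteLatencyIdx and #,
    // one entry per def operand, kept in operand order.
    static const WriteLatencyEntryStandIn MyTargetWriteLatencyTable[] = {
      { 0,  0}, // Invalid
      { 3,  0}  // #1 MyWriteALU
    };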
+
void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
// For each processor model.
for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
- // Skip default
+ // Emit processor resource table.
+ if (PI->hasInstrSchedModel())
+ EmitProcessorResources(*PI, OS);
+ else if (!PI->ProcResourceDefs.empty())
+ throw TGError(PI->ModelDef->getLoc(), "SchedMachineModel defines "
+ "ProcResources without defining WriteRes/SchedWriteRes");
+
// Begin processor itinerary properties
OS << "\n";
OS << "static const llvm::MCSchedModel " << PI->ModelName << "(\n";
@@ -591,11 +1099,19 @@ void SubtargetEmitter::EmitProcessorModels(raw_ostream &OS) {
EmitProcessorProp(OS, PI->ModelDef, "LoadLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "HighLatency", ',');
EmitProcessorProp(OS, PI->ModelDef, "MispredictPenalty", ',');
+ OS << " " << PI->Index << ", // Processor ID\n";
+ if (PI->hasInstrSchedModel())
+ OS << " " << PI->ModelName << "ProcResources" << ",\n"
+ << " " << PI->ModelName << "SchedClasses" << ",\n"
+ << " " << PI->ProcResourceDefs.size()+1 << ",\n"
+ << " " << (SchedModels.schedClassEnd()
+ - SchedModels.schedClassBegin()) << ",\n";
+ else
+ OS << " 0, 0, 0, 0, // No instruction-level machine model.\n";
if (SchedModels.hasItineraryClasses())
- OS << " " << PI->ItinsDef->getName();
+ OS << " " << PI->ItinsDef->getName() << ");\n";
else
- OS << " 0";
- OS << ");\n";
+ OS << " 0); // No Itinerary\n";
}
}
@@ -621,14 +1137,10 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
const std::string &Name = Processor->getValueAsString("Name");
const std::string &ProcModelName =
- SchedModels.getProcModel(Processor).ModelName;
+ SchedModels.getModelForProc(Processor).ModelName;
// Emit as { "cpu", procinit },
- OS << " { "
- << "\"" << Name << "\", "
- << "(const void *)&" << ProcModelName;
-
- OS << " }";
+ OS << " { \"" << Name << "\", (const void *)&" << ProcModelName << " }";
// Emit a comma if more items remain in the list.
if (++i < N) OS << ",";
@@ -644,16 +1156,116 @@ void SubtargetEmitter::EmitProcessorLookup(raw_ostream &OS) {
// EmitSchedModel - Emits all scheduling model tables, folding common patterns.
//
void SubtargetEmitter::EmitSchedModel(raw_ostream &OS) {
+ OS << "#ifdef DBGFIELD\n"
+ << "#error \"<target>GenSubtargetInfo.inc requires a DBGFIELD macro\"\n"
+ << "#endif\n"
+ << "#ifndef NDEBUG\n"
+ << "#define DBGFIELD(x) x,\n"
+ << "#else\n"
+ << "#define DBGFIELD(x)\n"
+ << "#endif\n";
+
if (SchedModels.hasItineraryClasses()) {
std::vector<std::vector<InstrItinerary> > ProcItinLists;
// Emit the stage data
EmitStageAndOperandCycleData(OS, ProcItinLists);
EmitItineraries(OS, ProcItinLists);
}
+ OS << "\n// ===============================================================\n"
+ << "// Data tables for the new per-operand machine model.\n";
+
+ SchedClassTables SchedTables;
+ for (CodeGenSchedModels::ProcIter PI = SchedModels.procModelBegin(),
+ PE = SchedModels.procModelEnd(); PI != PE; ++PI) {
+ GenSchedClassTables(*PI, SchedTables);
+ }
+ EmitSchedClassTables(SchedTables, OS);
+
// Emit the processor machine model
EmitProcessorModels(OS);
// Emit the processor lookup data
EmitProcessorLookup(OS);
+
+ OS << "#undef DBGFIELD";
+}
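The DBGFIELD prologue emitted here lets the debug-only name fields compile away in release builds; a sketch of how the guard behaves (stand-in struct, invented names):

    // What the generated prologue expands to; pre-defining DBGFIELD before
    // including the .inc file is rejected because the generated file owns it.
    #ifndef NDEBUG
    #define DBGFIELD(x) x, // asserts builds keep the name for dumps
    #else
    #define DBGFIELD(x)    // release builds drop the field entirely
    #endif

    struct SchedClassDescStandIn {
    #ifndef NDEBUG
      const char *Name;
    #endif
      unsigned NumMicroOps;
    };

    // The same initializer compiles in both modes:
    static const SchedClassDescStandIn SC = {DBGFIELD("MyWriteALU") 1};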
+
+void SubtargetEmitter::EmitSchedModelHelpers(std::string ClassName,
+ raw_ostream &OS) {
+ OS << "unsigned " << ClassName
+ << "\n::resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,"
+ << " const TargetSchedModel *SchedModel) const {\n";
+
+ std::vector<Record*> Prologs = Records.getAllDerivedDefinitions("PredicateProlog");
+ std::sort(Prologs.begin(), Prologs.end(), LessRecord());
+ for (std::vector<Record*>::const_iterator
+ PI = Prologs.begin(), PE = Prologs.end(); PI != PE; ++PI) {
+ OS << (*PI)->getValueAsString("Code") << '\n';
+ }
+ IdxVec VariantClasses;
+ for (CodeGenSchedModels::SchedClassIter SCI = SchedModels.schedClassBegin(),
+ SCE = SchedModels.schedClassEnd(); SCI != SCE; ++SCI) {
+ if (SCI->Transitions.empty())
+ continue;
+ VariantClasses.push_back(SCI - SchedModels.schedClassBegin());
+ }
+ if (!VariantClasses.empty()) {
+ OS << " switch (SchedClass) {\n";
+ for (IdxIter VCI = VariantClasses.begin(), VCE = VariantClasses.end();
+ VCI != VCE; ++VCI) {
+ const CodeGenSchedClass &SC = SchedModels.getSchedClass(*VCI);
+ OS << " case " << *VCI << ": // " << SC.Name << '\n';
+ IdxVec ProcIndices;
+ for (std::vector<CodeGenSchedTransition>::const_iterator
+ TI = SC.Transitions.begin(), TE = SC.Transitions.end();
+ TI != TE; ++TI) {
+ IdxVec PI;
+ std::set_union(TI->ProcIndices.begin(), TI->ProcIndices.end(),
+ ProcIndices.begin(), ProcIndices.end(),
+ std::back_inserter(PI));
+ ProcIndices.swap(PI);
+ }
+ for (IdxIter PI = ProcIndices.begin(), PE = ProcIndices.end();
+ PI != PE; ++PI) {
+ OS << " ";
+ if (*PI != 0)
+ OS << "if (SchedModel->getProcessorID() == " << *PI << ") ";
+ OS << "{ // " << (SchedModels.procModelBegin() + *PI)->ModelName
+ << '\n';
+ for (std::vector<CodeGenSchedTransition>::const_iterator
+ TI = SC.Transitions.begin(), TE = SC.Transitions.end();
+ TI != TE; ++TI) {
+ OS << " if (";
+ if (*PI != 0 && !std::count(TI->ProcIndices.begin(),
+ TI->ProcIndices.end(), *PI)) {
+ continue;
+ }
+ for (RecIter RI = TI->PredTerm.begin(), RE = TI->PredTerm.end();
+ RI != RE; ++RI) {
+ if (RI != TI->PredTerm.begin())
+ OS << "\n && ";
+ OS << "(" << (*RI)->getValueAsString("Predicate") << ")";
+ }
+ OS << ")\n"
+ << " return " << TI->ToClassIdx << "; // "
+ << SchedModels.getSchedClass(TI->ToClassIdx).Name << '\n';
+ }
+ OS << " }\n";
+ if (*PI == 0)
+ break;
+ }
+ unsigned SCIdx = 0;
+ if (SC.ItinClassDef)
+ SCIdx = SchedModels.getSchedClassIdxForItin(SC.ItinClassDef);
+ else
+ SCIdx = SchedModels.findSchedClassIdx(SC.Writes, SC.Reads);
+ if (SCIdx != *VCI)
+ OS << " return " << SCIdx << ";\n";
+ OS << " break;\n";
+ }
+ OS << " };\n";
+ }
+ OS << " report_fatal_error(\"Expected a variant SchedClass\");\n"
+ << "} // " << ClassName << "::resolveSchedClass\n";
}
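Reconstructed from the emission logic above, the generated helper takes roughly this shape; the target name, class indices, and the predicate are invented:

    // Hypothetical generated output:
    unsigned MyTargetGenSubtargetInfo
    ::resolveSchedClass(unsigned SchedClass, const MachineInstr *MI,
                        const TargetSchedModel *SchedModel) const {
      switch (SchedClass) {
      case 42: // MyWriteVariant
        if (SchedModel->getProcessorID() == 2) { // MyProcModel
          if ((MI->getOperand(2).isImm()))
            return 43; // MyWriteImm
        }
        break;
      };
      report_fatal_error("Expected a variant SchedClass");
    }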
//
@@ -680,7 +1292,8 @@ void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS,
return;
}
- OS << " uint64_t Bits = ReInitMCSubtargetInfo(CPU, FS);\n";
+ OS << " InitMCProcessorInfo(CPU, FS);\n"
+ << " uint64_t Bits = getFeatureBits();\n";
for (unsigned i = 0; i < Features.size(); i++) {
// Next record
@@ -747,13 +1360,18 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "0, ";
+ OS << '\n'; OS.indent(22);
+ OS << Target << "ProcSchedKV, "
+ << Target << "WriteProcResTable, "
+ << Target << "WriteLatencyTable, "
+ << Target << "ReadAdvanceTable, ";
if (SchedModels.hasItineraryClasses()) {
- OS << Target << "ProcSchedKV, "
- << Target << "Stages, "
+ OS << '\n'; OS.indent(22);
+ OS << Target << "Stages, "
<< Target << "OperandCycles, "
<< Target << "ForwardingPaths, ";
} else
- OS << "0, 0, 0, 0, ";
+ OS << "0, 0, 0, ";
OS << NumFeatures << ", " << NumProcs << ");\n}\n\n";
OS << "} // End llvm namespace \n";
@@ -780,6 +1398,8 @@ void SubtargetEmitter::run(raw_ostream &OS) {
<< " explicit " << ClassName << "(StringRef TT, StringRef CPU, "
<< "StringRef FS);\n"
<< "public:\n"
+ << " unsigned resolveSchedClass(unsigned SchedClass, const MachineInstr *DefMI,"
+ << " const TargetSchedModel *SchedModel) const;\n"
<< " DFAPacketizer *createDFAPacketizer(const InstrItineraryData *IID)"
<< " const;\n"
<< "};\n";
@@ -790,11 +1410,19 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << "\n#ifdef GET_SUBTARGETINFO_CTOR\n";
OS << "#undef GET_SUBTARGETINFO_CTOR\n";
+ OS << "#include \"llvm/CodeGen/TargetSchedule.h\"\n";
OS << "namespace llvm {\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "FeatureKV[];\n";
OS << "extern const llvm::SubtargetFeatureKV " << Target << "SubTypeKV[];\n";
+ OS << "extern const llvm::SubtargetInfoKV " << Target << "ProcSchedKV[];\n";
+ OS << "extern const llvm::MCWriteProcResEntry "
+ << Target << "WriteProcResTable[];\n";
+ OS << "extern const llvm::MCWriteLatencyEntry "
+ << Target << "WriteLatencyTable[];\n";
+ OS << "extern const llvm::MCReadAdvanceEntry "
+ << Target << "ReadAdvanceTable[];\n";
+
if (SchedModels.hasItineraryClasses()) {
- OS << "extern const llvm::SubtargetInfoKV " << Target << "ProcSchedKV[];\n";
OS << "extern const llvm::InstrStage " << Target << "Stages[];\n";
OS << "extern const unsigned " << Target << "OperandCycles[];\n";
OS << "extern const unsigned " << Target << "ForwardingPaths[];\n";
@@ -812,14 +1440,22 @@ void SubtargetEmitter::run(raw_ostream &OS) {
OS << Target << "SubTypeKV, ";
else
OS << "0, ";
+ OS << '\n'; OS.indent(22);
+ OS << Target << "ProcSchedKV, "
+ << Target << "WriteProcResTable, "
+ << Target << "WriteLatencyTable, "
+ << Target << "ReadAdvanceTable, ";
+ OS << '\n'; OS.indent(22);
if (SchedModels.hasItineraryClasses()) {
- OS << Target << "ProcSchedKV, "
- << Target << "Stages, "
+ OS << Target << "Stages, "
<< Target << "OperandCycles, "
<< Target << "ForwardingPaths, ";
} else
- OS << "0, 0, 0, 0, ";
+ OS << "0, 0, 0, ";
OS << NumFeatures << ", " << NumProcs << ");\n}\n\n";
+
+ EmitSchedModelHelpers(ClassName, OS);
+
OS << "} // End llvm namespace \n";
OS << "#endif // GET_SUBTARGETINFO_CTOR\n\n";
diff --git a/utils/TableGen/X86DisassemblerTables.cpp b/utils/TableGen/X86DisassemblerTables.cpp
index 8526621599..468a1f81c7 100644
--- a/utils/TableGen/X86DisassemblerTables.cpp
+++ b/utils/TableGen/X86DisassemblerTables.cpp
@@ -209,6 +209,7 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
bool satisfiesOneEntry = true;
bool satisfiesSplitRM = true;
bool satisfiesSplitReg = true;
+ bool satisfiesSplitMisc = true;
for (unsigned index = 0; index < 256; ++index) {
if (decision.instructionIDs[index] != decision.instructionIDs[0])
@@ -228,7 +229,7 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
if (((index & 0xc0) != 0xc0) &&
(decision.instructionIDs[index] != decision.instructionIDs[index&0x38]))
- satisfiesSplitReg = false;
+ satisfiesSplitMisc = false;
}
if (satisfiesOneEntry)
@@ -237,9 +238,12 @@ static ModRMDecisionType getDecisionType(ModRMDecision &decision) {
if (satisfiesSplitRM)
return MODRM_SPLITRM;
- if (satisfiesSplitReg)
+ if (satisfiesSplitReg && satisfiesSplitMisc)
return MODRM_SPLITREG;
+ if (satisfiesSplitMisc)
+ return MODRM_SPLITMISC;
+
return MODRM_FULL;
}
@@ -332,6 +336,12 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
for (unsigned index = 0xc0; index < 256; index += 8)
emitOneID(o1, i1, decision.instructionIDs[index], true);
break;
+ case MODRM_SPLITMISC:
+ for (unsigned index = 0; index < 64; index += 8)
+ emitOneID(o1, i1, decision.instructionIDs[index], true);
+ for (unsigned index = 0xc0; index < 256; ++index)
+ emitOneID(o1, i1, decision.instructionIDs[index], true);
+ break;
case MODRM_FULL:
for (unsigned index = 0; index < 256; ++index)
emitOneID(o1, i1, decision.instructionIDs[index], true);
@@ -361,6 +371,9 @@ void DisassemblerTables::emitModRMDecision(raw_ostream &o1, raw_ostream &o2,
case MODRM_SPLITREG:
sEntryNumber += 16;
break;
+ case MODRM_SPLITMISC:
+ sEntryNumber += 8 + 64;
+ break;
case MODRM_FULL:
sEntryNumber += 256;
break;
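The new MODRM_SPLITMISC case covers decisions whose memory forms depend only on the reg field: 8 entries keyed by reg for mod != 11, then 64 entries for the 0xc0..0xff register forms, 72 slots instead of the 256 a MODRM_FULL decision needs. A sketch of the index mapping implied by the emission order above (illustrative, not the actual decoder code):

    // Map a ModR/M byte to its slot in a MODRM_SPLITMISC table.
    static unsigned splitMiscIndex(unsigned ModRM) {
      if (ModRM < 0xc0)
        return (ModRM >> 3) & 0x7; // mem form: only the reg field matters
      return 8 + (ModRM - 0xc0);   // reg form: one slot per byte
    }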
diff --git a/utils/TableGen/X86ModRMFilters.h b/utils/TableGen/X86ModRMFilters.h
index 19fecbc3a0..2cbaf7985d 100644
--- a/utils/TableGen/X86ModRMFilters.h
+++ b/utils/TableGen/X86ModRMFilters.h
@@ -70,7 +70,7 @@ class ModFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @r - True if the mod bits of the ModR/M byte must be 11; false
+ /// \param r True if the mod bits of the ModR/M byte must be 11; false
/// otherwise. The name r derives from the fact that the mod
/// bits indicate whether the R/M bits [bits 2-0] signify a
/// register or a memory operand.
@@ -98,11 +98,12 @@ class EscapeFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @c0_ff - True if the ModR/M byte must fall between 0xc0 and 0xff;
- /// false otherwise.
- /// @nnn_or_modRM - If c0_ff is true, the required value of the entire ModR/M
- /// byte. If c0_ff is false, the required value of the nnn
- /// field.
+ /// \param c0_ff True if the ModR/M byte must fall between 0xc0 and 0xff;
+ /// false otherwise.
+ ///
+ /// \param nnn_or_modRM If c0_ff is true, the required value of the entire
+ /// ModR/M byte. If c0_ff is false, the required value
+ /// of the nnn field.
EscapeFilter(bool c0_ff, uint8_t nnn_or_modRM) :
ModRMFilter(),
C0_FF(c0_ff),
@@ -128,8 +129,8 @@ class AddRegEscapeFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @modRM - The value of the ModR/M byte when the register operand
- /// refers to the first register in the register set.
+ /// \param modRM The value of the ModR/M byte when the register operand
+ /// refers to the first register in the register set.
AddRegEscapeFilter(uint8_t modRM) : ModRM(modRM) {
}
@@ -150,9 +151,9 @@ class ExtendedFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @r - True if the mod field must be set to 11; false otherwise.
- /// The name is explained at ModFilter.
- /// @nnn - The required value of the nnn field.
+ /// \param r True if the mod field must be set to 11; false otherwise.
+ /// The name is explained at ModFilter.
+ /// \param nnn The required value of the nnn field.
ExtendedFilter(bool r, uint8_t nnn) :
ModRMFilter(),
R(r),
@@ -177,7 +178,7 @@ class ExactFilter : public ModRMFilter {
public:
/// Constructor
///
- /// @modRM - The required value of the full ModR/M byte.
+ /// \param modRM The required value of the full ModR/M byte.
ExactFilter(uint8_t modRM) :
ModRMFilter(),
ModRM(modRM) {
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 4b12279cdd..24155c0a5a 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -244,7 +244,7 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
IsSSE = (HasOpSizePrefix && (Name.find("16") == Name.npos)) ||
(Name.find("CRC32") != Name.npos);
HasFROperands = hasFROperands();
- HasVEX_LPrefix = has256BitOperands() || Rec->getValueAsBit("hasVEX_L");
+ HasVEX_LPrefix = Rec->getValueAsBit("hasVEX_L");
// Check for 64-bit inst which does not require REX
Is32Bit = false;
@@ -479,20 +479,6 @@ bool RecognizableInstr::hasFROperands() const {
return false;
}
-bool RecognizableInstr::has256BitOperands() const {
- const std::vector<CGIOperandList::OperandInfo> &OperandList = *Operands;
- unsigned numOperands = OperandList.size();
-
- for (unsigned operandIndex = 0; operandIndex < numOperands; ++operandIndex) {
- const std::string &recName = OperandList[operandIndex].Rec->getName();
-
- if (!recName.compare("VR256")) {
- return true;
- }
- }
- return false;
-}
-
void RecognizableInstr::handleOperand(bool optional, unsigned &operandIndex,
unsigned &physicalOperandIndex,
unsigned &numPhysicalOperands,
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index e0bb2e24e2..9feb3c3c7d 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -127,10 +127,7 @@ private:
/// hasFROperands - Returns true if any operand is a FR operand.
bool hasFROperands() const;
-
- /// has256BitOperands - Returns true if any operand is a 256-bit SSE operand.
- bool has256BitOperands() const;
-
+
/// typeFromString - Translates an operand type from the string provided in
/// the LLVM tables to an OperandType for use in the operand specifier.
///
@@ -225,23 +222,23 @@ private:
/// emitInstructionSpecifier - Loads the instruction specifier for the current
/// instruction into a DisassemblerTables.
///
- /// @arg tables - The DisassemblerTables to populate with the specifier for
+ /// \param tables The DisassemblerTables to populate with the specifier for
/// the current instruction.
void emitInstructionSpecifier(DisassemblerTables &tables);
/// emitDecodePath - Populates the proper fields in the decode tables
/// corresponding to the decode paths for this instruction.
///
- /// @arg tables - The DisassemblerTables to populate with the decode
+ /// \param tables The DisassemblerTables to populate with the
/// decode information for the current instruction.
void emitDecodePath(DisassemblerTables &tables) const;
/// Constructor - Initializes a RecognizableInstr with the appropriate fields
/// from a CodeGenInstruction.
///
- /// @arg tables - The DisassemblerTables that the specifier will be added to.
- /// @arg insn - The CodeGenInstruction to extract information from.
- /// @arg uid - The unique ID of the current instruction.
+ /// \param tables The DisassemblerTables that the specifier will be added to.
+ /// \param insn The CodeGenInstruction to extract information from.
+ /// \param uid The unique ID of the current instruction.
RecognizableInstr(DisassemblerTables &tables,
const CodeGenInstruction &insn,
InstrUID uid);
@@ -249,11 +246,11 @@ public:
/// processInstr - Accepts a CodeGenInstruction and loads decode information
/// for it into a DisassemblerTables if appropriate.
///
- /// @arg tables - The DiassemblerTables to be populated with decode
+ /// \param tables The DisassemblerTables to be populated with decode
/// information.
- /// @arg insn - The CodeGenInstruction to be used as a source for this
+ /// \param insn The CodeGenInstruction to be used as a source for this
/// information.
- /// @uid - The unique ID of the instruction.
+ /// \param uid The unique ID of the instruction.
static void processInstr(DisassemblerTables &tables,
const CodeGenInstruction &insn,
InstrUID uid);
diff --git a/utils/yaml2obj/yaml2obj.cpp b/utils/yaml2obj/yaml2obj.cpp
index c3b3e5499c..4fc620f4ea 100644
--- a/utils/yaml2obj/yaml2obj.cpp
+++ b/utils/yaml2obj/yaml2obj.cpp
@@ -148,7 +148,7 @@ struct COFFParser {
return false;
}
if (KeyValue == "Machine") {
- uint16_t Machine;
+ uint16_t Machine = COFF::MT_Invalid;
if (!getAs(Value, Machine)) {
// It's not a raw number, try matching the string.
StringRef ValueValue = Value->getValue(Storage);