444 files changed, 34096 insertions, 6390 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2ccdecd13..e9d8e8530a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -185,7 +185,7 @@ endif( LLVM_USE_INTEL_JITEVENTS )
 option(LLVM_USE_OPROFILE
   "Use opagent JIT interface to inform OProfile about JIT code" OFF)
 
-# If enabled, ierify we are on a platform that supports oprofile.
+# If enabled, verify we are on a platform that supports oprofile.
 if( LLVM_USE_OPROFILE )
   if( NOT CMAKE_SYSTEM_NAME MATCHES "Linux" )
     message(FATAL_ERROR "OProfile support is available on Linux only.")
@@ -426,7 +426,7 @@ if( LLVM_INCLUDE_TESTS )
   add_subdirectory(utils/unittest)
   add_subdirectory(unittests)
   if (MSVC)
-    # This utility is used to prevent chrashing tests from calling Dr. Watson on
+    # This utility is used to prevent crashing tests from calling Dr. Watson on
     # Windows.
     add_subdirectory(utils/KillTheDoctor)
   endif()
diff --git a/Makefile.config.in b/Makefile.config.in
index b4ecea631e..3c4f7b7a32 100644
--- a/Makefile.config.in
+++ b/Makefile.config.in
@@ -222,6 +222,15 @@ ENABLE_LIBCPP = @ENABLE_LIBCPP@
 # When ENABLE_CXX11 is enabled, LLVM uses c++11 mode by default to build.
 ENABLE_CXX11 = @ENABLE_CXX11@
 
+# When ENABLE_CLANG_ARCMT is enabled, clang will have ARCMigrationTool.
+ENABLE_CLANG_ARCMT = @ENABLE_CLANG_ARCMT@
+
+# When ENABLE_CLANG_REWRITER is enabled, clang will have Rewriter.
+ENABLE_CLANG_REWRITER = @ENABLE_CLANG_REWRITER@
+
+# When ENABLE_CLANG_STATIC_ANALYZER is enabled, clang will have StaticAnalyzer.
+ENABLE_CLANG_STATIC_ANALYZER = @ENABLE_CLANG_STATIC_ANALYZER@
+
 # When ENABLE_WERROR is enabled, we'll pass -Werror on the command line
 ENABLE_WERROR = @ENABLE_WERROR@
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index eec857f782..0395931659 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -475,6 +475,54 @@ case "$enableval" in
   *) AC_MSG_ERROR([Invalid setting for --enable-cxx11. Use "yes" or "no"]) ;;
 esac
 
+dnl --enable-clang-arcmt: check whether to enable clang arcmt
+clang_arcmt="yes"
+AC_ARG_ENABLE(clang-arcmt,
+              AS_HELP_STRING([--enable-clang-arcmt],
+                             [Enable building of clang ARCMT (default is YES)]),
+              clang_arcmt="$enableval",
+              enableval="yes")
+case "$enableval" in
+  yes) AC_SUBST(ENABLE_CLANG_ARCMT,[1]) ;;
+  no)  AC_SUBST(ENABLE_CLANG_ARCMT,[0]) ;;
+  default) AC_SUBST(ENABLE_CLANG_ARCMT,[1]);;
+  *) AC_MSG_ERROR([Invalid setting for --enable-clang-arcmt. Use "yes" or "no"]) ;;
+esac
+
+dnl --enable-clang-static-analyzer: check whether to enable static-analyzer
+clang_static_analyzer="yes"
+AC_ARG_ENABLE(clang-static-analyzer,
+              AS_HELP_STRING([--enable-clang-static-analyzer],
+                             [Enable building of clang Static Analyzer (default is YES)]),
+              clang_static_analyzer="$enableval",
+              enableval="yes")
+case "$enableval" in
+  yes) AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[1]) ;;
+  no)  AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[0]) ;;
+  default) AC_SUBST(ENABLE_CLANG_STATIC_ANALYZER,[1]);;
+  *) AC_MSG_ERROR([Invalid setting for --enable-clang-static-analyzer. Use "yes" or "no"]) ;;
+esac
+
+dnl --enable-clang-rewriter: check whether to enable clang rewriter
+AC_ARG_ENABLE(clang-rewriter,
+              AS_HELP_STRING([--enable-clang-rewriter],
+                             [Enable building of clang rewriter (default is YES)]),,
+              enableval="yes")
+case "$enableval" in
+  yes) AC_SUBST(ENABLE_CLANG_REWRITER,[1]) ;;
+  no)
+    if test clang_arcmt != "no" ; then
+      AC_MSG_ERROR([Cannot enable clang ARC Migration Tool while disabling rewriter.])
+    fi
+    if test clang_static_analyzer != "no" ; then
+      AC_MSG_ERROR([Cannot enable clang static analyzer while disabling rewriter.])
+    fi
+    AC_SUBST(ENABLE_CLANG_REWRITER,[0])
+    ;;
+  default) AC_SUBST(ENABLE_CLANG_REWRITER,[1]);;
+  *) AC_MSG_ERROR([Invalid setting for --enable-clang-rewriter. Use "yes" or "no"]) ;;
+esac
+
 dnl --enable-optimized : check whether they want to do an optimized build:
 AC_ARG_ENABLE(optimized, AS_HELP_STRING(
  --enable-optimized,[Compile with optimizations enabled (default is NO)]),,enableval=$optimize)
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index fcd5dd5566..274de31c9e 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -294,6 +294,11 @@ else()
   set(ENABLE_PIC 0)
 endif()
 
+find_package(LibXml2)
+if (LIBXML2_FOUND)
+  set(CLANG_HAVE_LIBXML 1)
+endif ()
+
 include(CheckCXXCompilerFlag)
 check_cxx_compiler_flag("-Wno-variadic-macros" SUPPORTS_NO_VARIADIC_MACROS_FLAG)
diff --git a/configure b/configure
--- a/configure
+++ b/configure
@@ -685,6 +685,9 @@ BUILD_CXX
 CVSBUILD
 ENABLE_LIBCPP
 ENABLE_CXX11
+ENABLE_CLANG_ARCMT
+ENABLE_CLANG_STATIC_ANALYZER
+ENABLE_CLANG_REWRITER
 ENABLE_OPTIMIZED
 ENABLE_PROFILING
 DISABLE_ASSERTIONS
@@ -1397,6 +1400,11 @@ Optional Features:
   --enable-polly          Use polly if available (default is YES)
   --enable-libcpp         Use libc++ if available (default is NO)
   --enable-cxx11          Use c++11 if available (default is NO)
+  --enable-clang-arcmt    Enable building of clang ARCMT (default is YES)
+  --enable-clang-static-analyzer
+                          Enable building of clang Static Analyzer (default is
+                          YES)
+  --enable-clang-rewriter Enable building of clang rewriter (default is YES)
   --enable-optimized      Compile with optimizations enabled (default is NO)
   --enable-profiling      Compile with profiling enabled (default is NO)
   --enable-assertions     Compile with assertion checks enabled (default is
@@ -5047,6 +5055,77 @@ echo "$as_me: error: Invalid setting for --enable-cxx11. Use \"yes\" or \"no\""
    { (exit 1); exit 1; }; } ;;
 esac
 
+clang_arcmt="yes"
+# Check whether --enable-clang-arcmt was given.
+if test "${enable_clang_arcmt+set}" = set; then
+  enableval=$enable_clang_arcmt; clang_arcmt="$enableval"
+else
+  enableval="yes"
+fi
+
+case "$enableval" in
+  yes) ENABLE_CLANG_ARCMT=1
+ ;;
+  no) ENABLE_CLANG_ARCMT=0
+ ;;
+  default) ENABLE_CLANG_ARCMT=1
+;;
+  *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-clang-arcmt. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-clang-arcmt. Use \"yes\" or \"no\"" >&2;}
+   { (exit 1); exit 1; }; } ;;
+esac
+
+clang_static_analyzer="yes"
+# Check whether --enable-clang-static-analyzer was given.
+if test "${enable_clang_static_analyzer+set}" = set; then
+  enableval=$enable_clang_static_analyzer; clang_static_analyzer="$enableval"
+else
+  enableval="yes"
+fi
+
+case "$enableval" in
+  yes) ENABLE_CLANG_STATIC_ANALYZER=1
+ ;;
+  no) ENABLE_CLANG_STATIC_ANALYZER=0
+ ;;
+  default) ENABLE_CLANG_STATIC_ANALYZER=1
+;;
+  *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-clang-static-analyzer. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-clang-static-analyzer. Use \"yes\" or \"no\"" >&2;}
+   { (exit 1); exit 1; }; } ;;
+esac
+
+# Check whether --enable-clang-rewriter was given.
+if test "${enable_clang_rewriter+set}" = set; then
+  enableval=$enable_clang_rewriter;
+else
+  enableval="yes"
+fi
+
+case "$enableval" in
+  yes) ENABLE_CLANG_REWRITER=1
+ ;;
+  no)
+    if test clang_arcmt != "no" ; then
+      { { echo "$as_me:$LINENO: error: Cannot enable clang ARC Migration Tool while disabling rewriter." >&5
+echo "$as_me: error: Cannot enable clang ARC Migration Tool while disabling rewriter." >&2;}
+   { (exit 1); exit 1; }; }
+    fi
+    if test clang_static_analyzer != "no" ; then
+      { { echo "$as_me:$LINENO: error: Cannot enable clang static analyzer while disabling rewriter." >&5
+echo "$as_me: error: Cannot enable clang static analyzer while disabling rewriter." >&2;}
+   { (exit 1); exit 1; }; }
+    fi
+    ENABLE_CLANG_REWRITER=0
+
+ ;;
+  default) ENABLE_CLANG_REWRITER=1
+;;
+  *) { { echo "$as_me:$LINENO: error: Invalid setting for --enable-clang-rewriter. Use \"yes\" or \"no\"" >&5
+echo "$as_me: error: Invalid setting for --enable-clang-rewriter. Use \"yes\" or \"no\"" >&2;}
+   { (exit 1); exit 1; }; } ;;
+esac
+
 # Check whether --enable-optimized was given.
 if test "${enable_optimized+set}" = set; then
   enableval=$enable_optimized;
@@ -10314,7 +10393,7 @@ else
   lt_dlunknown=0; lt_dlno_uscore=1; lt_dlneed_uscore=2
   lt_status=$lt_dlunknown
   cat > conftest.$ac_ext <<EOF
-#line 10317 "configure"
+#line 10396 "configure"
 #include "confdefs.h"
 
 #if HAVE_DLFCN_H
@@ -21910,6 +21989,9 @@ BUILD_CXX!$BUILD_CXX$ac_delim
 CVSBUILD!$CVSBUILD$ac_delim
 ENABLE_LIBCPP!$ENABLE_LIBCPP$ac_delim
 ENABLE_CXX11!$ENABLE_CXX11$ac_delim
+ENABLE_CLANG_ARCMT!$ENABLE_CLANG_ARCMT$ac_delim
+ENABLE_CLANG_STATIC_ANALYZER!$ENABLE_CLANG_STATIC_ANALYZER$ac_delim
+ENABLE_CLANG_REWRITER!$ENABLE_CLANG_REWRITER$ac_delim
 ENABLE_OPTIMIZED!$ENABLE_OPTIMIZED$ac_delim
 ENABLE_PROFILING!$ENABLE_PROFILING$ac_delim
 DISABLE_ASSERTIONS!$DISABLE_ASSERTIONS$ac_delim
@@ -21921,9 +22003,6 @@ DEBUG_SYMBOLS!$DEBUG_SYMBOLS$ac_delim
 KEEP_SYMBOLS!$KEEP_SYMBOLS$ac_delim
 JIT!$JIT$ac_delim
 TARGET_HAS_JIT!$TARGET_HAS_JIT$ac_delim
-ENABLE_DOCS!$ENABLE_DOCS$ac_delim
-ENABLE_DOXYGEN!$ENABLE_DOXYGEN$ac_delim
-LLVM_ENABLE_THREADS!$LLVM_ENABLE_THREADS$ac_delim
 _ACEOF
 
   if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
@@ -21965,6 +22044,9 @@ _ACEOF
 ac_delim='%!_!# '
 for ac_last_try in false false false false false :; do
   cat >conf$$subs.sed <<_ACEOF
+ENABLE_DOCS!$ENABLE_DOCS$ac_delim
+ENABLE_DOXYGEN!$ENABLE_DOXYGEN$ac_delim
+LLVM_ENABLE_THREADS!$LLVM_ENABLE_THREADS$ac_delim
 ENABLE_PTHREADS!$ENABLE_PTHREADS$ac_delim
 ENABLE_PIC!$ENABLE_PIC$ac_delim
 ENABLE_SHARED!$ENABLE_SHARED$ac_delim
@@ -22059,10 +22141,9 @@ RPATH!$RPATH$ac_delim
 RDYNAMIC!$RDYNAMIC$ac_delim
 program_prefix!$program_prefix$ac_delim
 LIBOBJS!$LIBOBJS$ac_delim
-LTLIBOBJS!$LTLIBOBJS$ac_delim
 _ACEOF
 
-  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 95; then
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 97; then
     break
   elif $ac_last_try; then
     { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
 echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
    { (exit 1); exit 1; }; }
@@ -22081,6 +22162,48 @@ fi
 cat >>$CONFIG_STATUS <<_ACEOF
 cat >"\$tmp/subs-2.sed" <<\CEOF$ac_eof
+/@[a-zA-Z_][a-zA-Z_0-9]*@/!b
 _ACEOF
 sed '
 s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g
 s/^/s,@/; s/!/@,|#_!!_#|/
 :n
 t n
 s/'"$ac_delim"'$/,g/; t
 s/$/\\/; p
 N; s/^.*\n//; s/[,\\&]/\\&/g; s/@/@|#_!!_#|/g; b n
 ' >>$CONFIG_STATUS <conf$$subs.sed
 rm -f conf$$subs.sed
 cat >>$CONFIG_STATUS <<_ACEOF
 CEOF$ac_eof
 _ACEOF
+
+
+ac_delim='%!_!# '
+for ac_last_try in false false false false false :; do
+  cat >conf$$subs.sed <<_ACEOF
+LTLIBOBJS!$LTLIBOBJS$ac_delim
+_ACEOF
+
+  if test `sed -n "s/.*$ac_delim\$/X/p" conf$$subs.sed | grep -c X` = 1; then
+    break
+  elif $ac_last_try; then
+    { { echo "$as_me:$LINENO: error: could not make $CONFIG_STATUS" >&5
+echo "$as_me: error: could not make $CONFIG_STATUS" >&2;}
+   { (exit 1); exit 1; }; }
+  else
+    ac_delim="$ac_delim!$ac_delim _$ac_delim!! "
+  fi
+done
+
+ac_eof=`sed -n '/^CEOF[0-9]*$/s/CEOF/0/p' conf$$subs.sed`
+if test -n "$ac_eof"; then
+  ac_eof=`echo "$ac_eof" | sort -nru | sed 1q`
+  ac_eof=`expr $ac_eof + 1`
+fi
+
+cat >>$CONFIG_STATUS <<_ACEOF
+cat >"\$tmp/subs-3.sed" <<\CEOF$ac_eof
 /@[a-zA-Z_][a-zA-Z_0-9]*@/!b end
 _ACEOF
 sed '
@@ -22343,7 +22466,7 @@ s&@abs_builddir@&$ac_abs_builddir&;t t
 s&@abs_top_builddir@&$ac_abs_top_builddir&;t t
 s&@INSTALL@&$ac_INSTALL&;t t
 $ac_datarootdir_hack
-" $ac_file_inputs | sed -f "$tmp/subs-1.sed" | sed -f "$tmp/subs-2.sed" >$tmp/out
+" $ac_file_inputs | sed -f "$tmp/subs-1.sed" | sed -f "$tmp/subs-2.sed" | sed -f "$tmp/subs-3.sed" >$tmp/out
 
 test -z "$ac_datarootdir_hack$ac_datarootdir_seen" &&
   { ac_out=`sed -n '/\${datarootdir}/p' "$tmp/out"`; test -n "$ac_out"; } &&
diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst
index fdaec89cdf..54b4a4a746 100644
--- a/docs/AliasAnalysis.rst
+++ b/docs/AliasAnalysis.rst
@@ -205,7 +205,7 @@ look at the `various alias analysis implementations`_ included with LLVM.
 Different Pass styles
 ---------------------
 
-The first step to determining what type of `LLVM pass <WritingAnLLVMPass.html>`_
+The first step to determining what type of :doc:`LLVM pass <WritingAnLLVMPass>`
 you need to use for your Alias Analysis. As is the case with most other
 analyses and transformations, the answer should be fairly obvious from what
 type of problem you are trying to solve:
@@ -253,25 +253,24 @@ Interfaces which may be specified
 All of the `AliasAnalysis
 <http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`__ virtual methods
-default to providing `chaining`_ to another alias analysis implementation, which
-ends up returning conservatively correct information (returning "May" Alias and
-"Mod/Ref" for alias and mod/ref queries respectively). Depending on the
-capabilities of the analysis you are implementing, you just override the
-interfaces you can improve.
+default to providing :ref:`chaining <aliasanalysis-chaining>` to another alias
+analysis implementation, which ends up returning conservatively correct
+information (returning "May" Alias and "Mod/Ref" for alias and mod/ref queries
+respectively). Depending on the capabilities of the analysis you are
+implementing, you just override the interfaces you can improve.
 
-.. _chaining:
-.. _chain:
+.. _aliasanalysis-chaining:
 
 ``AliasAnalysis`` chaining behavior
 -----------------------------------
 
-With only one special exception (the `no-aa`_ pass) every alias analysis pass
-chains to another alias analysis implementation (for example, the user can
-specify "``-basicaa -ds-aa -licm``" to get the maximum benefit from both alias
-analyses). The alias analysis class automatically takes care of most of this
-for methods that you don't override. For methods that you do override, in code
-paths that return a conservative MayAlias or Mod/Ref result, simply return
-whatever the superclass computes. For example:
+With only one special exception (the :ref:`-no-aa <aliasanalysis-no-aa>` pass)
+every alias analysis pass chains to another alias analysis implementation (for
+example, the user can specify "``-basicaa -ds-aa -licm``" to get the maximum
+benefit from both alias analyses). The alias analysis class automatically
+takes care of most of this for methods that you don't override. For methods
+that you do override, in code paths that return a conservative MayAlias or
+Mod/Ref result, simply return whatever the superclass computes. For example:
 
 .. code-block:: c++
 
@@ -504,11 +503,11 @@ Available ``AliasAnalysis`` implementations
 -------------------------------------------
 
 This section lists the various implementations of the ``AliasAnalysis``
-interface. With the exception of the `-no-aa`_ implementation, all of these
-`chain`_ to other alias analysis implementations.
+interface. With the exception of the :ref:`-no-aa <aliasanalysis-no-aa>`
+implementation, all of these :ref:`chain <aliasanalysis-chaining>` to other
+alias analysis implementations.
 
-.. _no-aa:
-.. _-no-aa:
+.. _aliasanalysis-no-aa:
 
 The ``-no-aa`` pass
 ^^^^^^^^^^^^^^^^^^^
diff --git a/docs/Bugpoint.rst b/docs/Bugpoint.rst
index 9ccf0cc2d9..047129f410 100644
--- a/docs/Bugpoint.rst
+++ b/docs/Bugpoint.rst
@@ -136,9 +136,9 @@ non-obvious ways. Here are some hints and tips:
   It is often useful to capture the output of the program to file. For example,
   in the C shell, you can run:
 
-  .. code-block:: bash
+  .. code-block:: console
 
-    bugpoint ... |& tee bugpoint.log
+    $ bugpoint ... |& tee bugpoint.log
 
   to get a copy of ``bugpoint``'s output in the file ``bugpoint.log``, as well
   as on your terminal.
diff --git a/docs/CMake.rst b/docs/CMake.rst
index 7f0420c446..f89578863c 100644
--- a/docs/CMake.rst
+++ b/docs/CMake.rst
@@ -36,7 +36,7 @@ We use here the command-line, non-interactive CMake interface.
 #. Create a directory for containing the build. It is not supported to build
    LLVM on the source directory. cd to this directory:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      $ mkdir mybuilddir
      $ cd mybuilddir
 
@@ -44,7 +44,7 @@ We use here the command-line, non-interactive CMake interface.
 #. Execute this command on the shell replacing `path/to/llvm/source/root` with
    the path to the root of your LLVM source tree:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      $ cmake path/to/llvm/source/root
 
@@ -80,14 +80,14 @@ the corresponding *Generator* for creating files for your build tool. You can
 explicitly specify the generator with the command line option ``-G "Name of
 the generator"``. For knowing the available generators on your platform,
 execute
 
-.. code-block:: bash
+.. code-block:: console
 
   $ cmake --help
 
 This will list the generator's names at the end of the help text. Generator's
 names are case-sensitive. Example:
 
-.. code-block:: bash
+.. code-block:: console
 
   $ cmake -G "Visual Studio 9 2008" path/to/llvm/source/root
 
@@ -110,14 +110,14 @@ Variables customize how the build will be generated. Options are boolean
 variables, with possible values ON/OFF. Options and variables are defined on
 the CMake command line like this:
 
-.. code-block:: bash
+.. code-block:: console
 
   $ cmake -DVARIABLE=value path/to/llvm/source
 
 You can set a variable after the initial CMake invocation for changing its
 value. You can also undefine a variable:
 
-.. code-block:: bash
+.. code-block:: console
 
   $ cmake -UVARIABLE path/to/llvm/source
 
@@ -127,7 +127,7 @@ on the root of the build directory. Do not hand-edit it.
 Variables are listed here appending its type after a colon. It is correct to
 write the variable and the type on the CMake command line:
 
-.. code-block:: bash
+.. code-block:: console
 
   $ cmake -DVARIABLE:TYPE=value path/to/llvm/source
 
@@ -280,7 +280,7 @@ Testing is performed when the *check* target is built. For instance, if you are
 using makefiles, execute this command while on the top level of your build
 directory:
 
-.. code-block:: bash
+.. code-block:: console
 
   $ make check
 
@@ -355,13 +355,15 @@ an equivalent variant of snippet shown above:
 
     target_link_libraries(mycompiler ${REQ_LLVM_LIBRARIES})
 
+.. _cmake-out-of-source-pass:
+
 Developing LLVM pass out of source
 ----------------------------------
 
 It is possible to develop LLVM passes against installed LLVM. An example of
 project layout provided below:
 
-.. code-block:: bash
+.. code-block:: none
 
   <project dir>/
     |
diff --git a/docs/CommandGuide/llvm-bcanalyzer.rst b/docs/CommandGuide/llvm-bcanalyzer.rst
index f1e4eac1be..7254088ec9 100644
--- a/docs/CommandGuide/llvm-bcanalyzer.rst
+++ b/docs/CommandGuide/llvm-bcanalyzer.rst
@@ -1,424 +1,305 @@
 llvm-bcanalyzer - LLVM bitcode analyzer
 =======================================
 
-
 SYNOPSIS
 --------
-
-**llvm-bcanalyzer** [*options*] [*filename*]
-
+:program:`llvm-bcanalyzer` [*options*] [*filename*]
 
 DESCRIPTION
 -----------
 
+The :program:`llvm-bcanalyzer` command is a small utility for analyzing bitcode
+files. The tool reads a bitcode file (such as generated with the
+:program:`llvm-as` tool) and produces a statistical report on the contents of
+the bitcode file. The tool can also dump a low level but human readable
+version of the bitcode file. This tool is probably not of much interest or
+utility except for those working directly with the bitcode file format. Most
+LLVM users can just ignore this tool.
-The **llvm-bcanalyzer** command is a small utility for analyzing bitcode files.
-The tool reads a bitcode file (such as generated with the **llvm-as** tool) and
-produces a statistical report on the contents of the bitcode file. The tool
-can also dump a low level but human readable version of the bitcode file.
-This tool is probably not of much interest or utility except for those working
-directly with the bitcode file format. Most LLVM users can just ignore
-this tool.
-
-If *filename* is omitted or is ``-``, then **llvm-bcanalyzer** reads its input
-from standard input. This is useful for combining the tool into a pipeline.
-Output is written to the standard output.
-
+If *filename* is omitted or is ``-``, then :program:`llvm-bcanalyzer` reads its
+input from standard input. This is useful for combining the tool into a
+pipeline. Output is written to the standard output.
 
 OPTIONS
 -------
 
+.. program:: llvm-bcanalyzer
+
+.. option:: -nodetails
 
-**-nodetails**
-
- Causes **llvm-bcanalyzer** to abbreviate its output by writing out only a module
- level summary. The details for individual functions are not displayed.
-
-
+  Causes :program:`llvm-bcanalyzer` to abbreviate its output by writing out only
+  a module level summary. The details for individual functions are not
+  displayed.
 
-**-dump**
+.. option:: -dump
 
- Causes **llvm-bcanalyzer** to dump the bitcode in a human readable format. This
- format is significantly different from LLVM assembly and provides details about
- the encoding of the bitcode file.
+  Causes :program:`llvm-bcanalyzer` to dump the bitcode in a human readable
+  format. This format is significantly different from LLVM assembly and
+  provides details about the encoding of the bitcode file.
 
+.. option:: -verify
 
-
-**-verify**
-
- Causes **llvm-bcanalyzer** to verify the module produced by reading the
- bitcode. This ensures that the statistics generated are based on a consistent
+  Causes :program:`llvm-bcanalyzer` to verify the module produced by reading the
+  bitcode. This ensures that the statistics generated are based on a consistent
   module.
 
-
-**-help**
+.. option:: -help
 
   Print a summary of command line options.
 
-
-
-
 EXIT STATUS
 -----------
-
-If **llvm-bcanalyzer** succeeds, it will exit with 0. Otherwise, if an error
-occurs, it will exit with a non-zero value, usually 1.
-
+If :program:`llvm-bcanalyzer` succeeds, it will exit with 0. Otherwise, if an
+error occurs, it will exit with a non-zero value, usually 1.
 
 SUMMARY OUTPUT DEFINITIONS
 --------------------------
-
-The following items are always printed by llvm-bcanalyzer. They comprize the
+The following items are always printed by llvm-bcanalyzer. They comprize the
 summary output.
 
-
 **Bitcode Analysis Of Module**
 
   This just provides the name of the module for which bitcode analysis is being
   generated.
 
-
 **Bitcode Version Number**
 
   The bitcode version (not LLVM version) of the file read by the analyzer.
 
-
 **File Size**
 
   The size, in bytes, of the entire bitcode file.
 
-
 **Module Bytes**
 
-  The size, in bytes, of the module block. Percentage is relative to File Size.
-
-
+  The size, in bytes, of the module block. Percentage is relative to File Size.
 
 **Function Bytes**
 
-  The size, in bytes, of all the function blocks. Percentage is relative to File
+  The size, in bytes, of all the function blocks. Percentage is relative to File
   Size.
 
-
 **Global Types Bytes**
 
-  The size, in bytes, of the Global Types Pool. Percentage is relative to File
-  Size. This is the size of the definitions of all types in the bitcode file.
-
-
+  The size, in bytes, of the Global Types Pool. Percentage is relative to File
+  Size. This is the size of the definitions of all types in the bitcode file.
 
 **Constant Pool Bytes**
 
   The size, in bytes, of the Constant Pool Blocks Percentage is relative to
   File Size.
 
-
 **Module Globals Bytes**
 
   Ths size, in bytes, of the Global Variable Definitions and their initializers.
   Percentage is relative to File Size.
 
-
 **Instruction List Bytes**
 
   The size, in bytes, of all the instruction lists in all the functions.
-  Percentage is relative to File Size. Note that this value is also included in
+  Percentage is relative to File Size. Note that this value is also included in
   the Function Bytes.
 
-
 **Compaction Table Bytes**
 
   The size, in bytes, of all the compaction tables in all the functions.
-  Percentage is relative to File Size. Note that this value is also included in
+  Percentage is relative to File Size. Note that this value is also included in
   the Function Bytes.
 
-
 **Symbol Table Bytes**
 
-  The size, in bytes, of all the symbol tables in all the functions. Percentage is
-  relative to File Size. Note that this value is also included in the Function
+  The size, in bytes, of all the symbol tables in all the functions. Percentage is
+  relative to File Size. Note that this value is also included in the Function
   Bytes.
 
-
 **Dependent Libraries Bytes**
 
-  The size, in bytes, of the list of dependent libraries in the module. Percentage
-  is relative to File Size. Note that this value is also included in the Module
+  The size, in bytes, of the list of dependent libraries in the module. Percentage
+  is relative to File Size. Note that this value is also included in the Module
   Global Bytes.
 
-
 **Number Of Bitcode Blocks**
 
   The total number of blocks of any kind in the bitcode file.
 
-
 **Number Of Functions**
 
   The total number of function definitions in the bitcode file.
 
-
 **Number Of Types**
 
   The total number of types defined in the Global Types Pool.
 
-
 **Number Of Constants**
 
   The total number of constants (of any type) defined in the Constant Pool.
 
-
 **Number Of Basic Blocks**
 
   The total number of basic blocks defined in all functions in the bitcode file.
 
-
 **Number Of Instructions**
 
   The total number of instructions defined in all functions in the bitcode file.
 
-
 **Number Of Long Instructions**
 
   The total number of long instructions defined in all functions in the bitcode
-  file. Long instructions are those taking greater than 4 bytes. Typically long
+  file. Long instructions are those taking greater than 4 bytes. Typically long
   instructions are GetElementPtr with several indices, PHI nodes, and calls to
   functions with large numbers of arguments.
 
-
 **Number Of Operands**
 
   The total number of operands used in all instructions in the bitcode file.
 
-
 **Number Of Compaction Tables**
 
   The total number of compaction tables in all functions in the bitcode file.
 
-
 **Number Of Symbol Tables**
 
   The total number of symbol tables in all functions in the bitcode file.
 
-
 **Number Of Dependent Libs**
 
   The total number of dependent libraries found in the bitcode file.
 
-
 **Total Instruction Size**
 
   The total size of the instructions in all functions in the bitcode file.
 
-
 **Average Instruction Size**
 
   The average number of bytes per instruction across all functions in the bitcode
-  file. This value is computed by dividing Total Instruction Size by Number Of
+  file. This value is computed by dividing Total Instruction Size by Number Of
   Instructions.
 
-
 **Maximum Type Slot Number**
 
-  The maximum value used for a type's slot number. Larger slot number values take
+  The maximum value used for a type's slot number. Larger slot number values take
   more bytes to encode.
 
-
 **Maximum Value Slot Number**
 
-  The maximum value used for a value's slot number. Larger slot number values take
+  The maximum value used for a value's slot number. Larger slot number values take
   more bytes to encode.
 
-
 **Bytes Per Value**
 
-  The average size of a Value definition (of any type). This is computed by
+  The average size of a Value definition (of any type). This is computed by
   dividing File Size by the total number of values of any type.
 
-
 **Bytes Per Global**
 
   The average size of a global definition (constants and global variables).
 
-
 **Bytes Per Function**
 
-  The average number of bytes per function definition. This is computed by
+  The average number of bytes per function definition. This is computed by
   dividing Function Bytes by Number Of Functions.
 
-
 **# of VBR 32-bit Integers**
 
   The total number of 32-bit integers encoded using the Variable Bit Rate
   encoding scheme.
 
-
 **# of VBR 64-bit Integers**
 
   The total number of 64-bit integers encoded using the Variable Bit Rate
   encoding scheme.
 
-
 **# of VBR Compressed Bytes**
 
   The total number of bytes consumed by the 32-bit and 64-bit integers that use
   the Variable Bit Rate encoding scheme.
 
-
 **# of VBR Expanded Bytes**
 
   The total number of bytes that would have been consumed by the 32-bit and
   64-bit integers had they not been compressed with the Variable Bit Rage
   encoding scheme.
 
-
 **Bytes Saved With VBR**
 
   The total number of bytes saved by using the Variable Bit Rate encoding scheme.
   The percentage is relative to # of VBR Expanded Bytes.
 
-
-
 DETAILED OUTPUT DEFINITIONS
 ---------------------------
-
 The following definitions occur only if the -nodetails option was not given.
 The detailed output provides additional information on a per-function basis.
 
-
 **Type**
 
   The type signature of the function.
 
-
 **Byte Size**
 
   The total number of bytes in the function's block.
 
-
 **Basic Blocks**
 
   The number of basic blocks defined by the function.
 
-
 **Instructions**
 
   The number of instructions defined by the function.
 
-
 **Long Instructions**
 
   The number of instructions using the long instruction format in the function.
 
-
 **Operands**
 
   The number of operands used by all instructions in the function.
 
-
 **Instruction Size**
 
   The number of bytes consumed by instructions in the function.
 
-
 **Average Instruction Size**
 
-  The average number of bytes consumed by the instructions in the function. This
-  value is computed by dividing Instruction Size by Instructions.
-
-
+  The average number of bytes consumed by the instructions in the function.
+  This value is computed by dividing Instruction Size by Instructions.
 
 **Bytes Per Instruction**
 
-  The average number of bytes used by the function per instruction. This value is
-  computed by dividing Byte Size by Instructions. Note that this is not the same
-  as Average Instruction Size. It computes a number relative to the total function
-  size not just the size of the instruction list.
-
-
+  The average number of bytes used by the function per instruction. This value
+  is computed by dividing Byte Size by Instructions. Note that this is not the
+  same as Average Instruction Size. It computes a number relative to the total
+  function size not just the size of the instruction list.
 
 **Number of VBR 32-bit Integers**
 
   The total number of 32-bit integers found in this function (for any use).
 
-
 **Number of VBR 64-bit Integers**
 
   The total number of 64-bit integers found in this function (for any use).
 
-
 **Number of VBR Compressed Bytes**
 
   The total number of bytes in this function consumed by the 32-bit and 64-bit
   integers that use the Variable Bit Rate encoding scheme.
 
-
 **Number of VBR Expanded Bytes**
 
   The total number of bytes in this function that would have been consumed by
   the 32-bit and 64-bit integers had they not been compressed with the Variable
   Bit Rate encoding scheme.
 
-
 **Bytes Saved With VBR**
 
   The total number of bytes saved in this function by using the Variable Bit
-  Rate encoding scheme. The percentage is relative to # of VBR Expanded Bytes.
-
-
-
+  Rate encoding scheme. The percentage is relative to # of VBR Expanded Bytes.
 
 SEE ALSO
 --------
 
+:doc:`/CommandGuide/llvm-dis`, :doc:`/BitCodeFormat`
 
-llvm-dis|llvm-dis, `http://llvm.org/docs/BitCodeFormat.html <http://llvm.org/docs/BitCodeFormat.html>`_
diff --git a/docs/CompilerWriterInfo.rst b/docs/CompilerWriterInfo.rst
index 7504d3c75a..f21491994f 100644
--- a/docs/CompilerWriterInfo.rst
+++ b/docs/CompilerWriterInfo.rst
@@ -81,7 +81,7 @@ AMD - Official manuals and docs
 Intel - Official manuals and docs
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-* `IA-32 manuals <http://developer.intel.com/design/pentium4/manuals/index_new.htm>`_
+* `Intel 64 and IA-32 manuals <http://www.intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html>`_
 * `Intel Itanium documentation <http://www.intel.com/design/itanium/documentation.htm?iid=ipp_srvr_proc_itanium2+techdocs>`_
 
 Other x86-specific information
diff --git a/docs/Dummy.html b/docs/Dummy.html
new file mode 100644
index 0000000000..e69de29bb2
--- /dev/null
+++ b/docs/Dummy.html
diff --git a/docs/FAQ.rst b/docs/FAQ.rst
index b0e3ca0456..67e8d0b27b 100644
--- a/docs/FAQ.rst
+++ b/docs/FAQ.rst
@@ -79,7 +79,7 @@ grabbing the wrong linker/assembler/etc, there are two ways to fix it:
 #. Run ``configure`` with an alternative ``PATH`` that is correct. In a
    Bourne compatible shell, the syntax would be:
 
-.. code-block:: bash
+.. code-block:: console
 
   % PATH=[the path without the bad program] ./configure ...
 
@@ -106,7 +106,7 @@ I've modified a Makefile in my source tree, but my build tree keeps using the ol
 If the Makefile already exists in your object tree, you can just run the
 following command in the top level directory of your object tree:
 
-.. code-block:: bash
+.. code-block:: console
 
   % ./config.status <relative path to Makefile>;
 
@@ -133,13 +133,13 @@ This is most likely occurring because you built a profile or release
 For example, if you built LLVM with the command:
 
-.. code-block:: bash
+.. code-block:: console
 
   % gmake ENABLE_PROFILING=1
 
 ...then you must run the tests with the following commands:
 
-.. code-block:: bash
+.. code-block:: console
 
   % cd llvm/test
   % gmake ENABLE_PROFILING=1
 
@@ -175,17 +175,17 @@ After Subversion update, rebuilding gives the error "No rule to make target".
 -----------------------------------------------------------------------------
 
 If the error is of the form:
 
-.. code-block:: bash
+.. code-block:: console
 
   gmake[2]: *** No rule to make target `/path/to/somefile',
-              needed by `/path/to/another/file.d'.
+              needed by `/path/to/another/file.d'. Stop.
 
 This may occur anytime files are moved within the Subversion repository or
 removed entirely. In this case, the best solution is to erase all ``.d``
 files, which list dependencies for source files, and rebuild:
 
-.. code-block:: bash
+.. code-block:: console
 
   % cd $LLVM_OBJ_DIR
   % rm -f `find . -name \*\.d`
diff --git a/docs/GarbageCollection.rst b/docs/GarbageCollection.rst
index b0b2718409..7765bd7d04 100644
--- a/docs/GarbageCollection.rst
+++ b/docs/GarbageCollection.rst
@@ -49,8 +49,6 @@ techniques dominates any low-level losses.
 This document describes the mechanisms and interfaces provided by LLVM to
 support accurate garbage collection.
 
-.. _feature:
-
 Goals and non-goals
 -------------------
 
@@ -121,8 +119,6 @@ lot of work for the developer of a novel language. However, it's easy to get
 started quickly and scale up to a more sophisticated implementation as your
 compiler matures.
 
-.. _quickstart:
-
 
 Getting started
 ===============
 
@@ -177,8 +173,6 @@ To help with several of these tasks (those indicated with a \*), LLVM includes a
 highly portable, built-in ShadowStack code generator. It is compiled into
 ``llc`` and works even with the interpreter and C backends.
 
-.. _quickstart-compiler:
-
 In your compiler
 ----------------
 
@@ -200,8 +194,6 @@ There's no need to use ``@llvm.gcread`` and ``@llvm.gcwrite`` over plain
 ``load`` and ``store`` for now. You will need them when switching to a more
 advanced GC.
 
-.. _quickstart-runtime:
-
 In your runtime
 ---------------
 
@@ -263,8 +255,6 @@ data structure, but there are only 20 lines of meaningful code.)
   }
 }
 
-.. _shadow-stack:
-
 About the shadow stack
 ----------------------
 
@@ -283,8 +273,9 @@ The tradeoff for this simplicity and portability is:
 
 * Not thread-safe.
 
 Still, it's an easy way to get started. After your compiler and runtime are up
-and running, writing a plugin_ will allow you to take advantage of :ref:`more
-advanced GC features <collector-algos>` of LLVM in order to improve performance.
+and running, writing a :ref:`plugin <plugin>` will allow you to take advantage
+of :ref:`more advanced GC features <collector-algos>` of LLVM in order to
+improve performance.
 
 .. _gc_intrinsics:
 
@@ -300,8 +291,6 @@ These facilities are limited to those strictly necessary; they are not intended
 to be a complete interface to any garbage collector. A program will need to
 interface with the GC library using the facilities provided by that program.
 
-.. _gcattr:
-
 Specifying GC code generation: ``gc "..."``
 -------------------------------------------
 
@@ -392,8 +381,6 @@ could be compiled to this LLVM code:
     store %Object* null, %Object** %X
     ...
 
-.. _barriers:
-
 Reading and writing references in the heap
 ------------------------------------------
 
@@ -423,15 +410,13 @@ pointer:
   %derived = getelementptr %object, i32 0, i32 2, i32 %n
 
 LLVM does not enforce this relationship between the object and derived pointer
-(although a plugin_ might). However, it would be an unusual collector that
-violated it.
+(although a :ref:`plugin <plugin>` might). However, it would be an unusual
+collector that violated it.
 
 The use of these intrinsics is naturally optional if the target GC does require
 the corresponding barrier. Such a GC plugin will replace the intrinsic calls
 with the corresponding ``load`` or ``store`` instruction if they are used.
 
-.. _gcwrite:
-
 Write barrier: ``llvm.gcwrite``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -442,14 +427,12 @@ Write barrier: ``llvm.gcwrite``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 For write barriers, LLVM provides the ``llvm.gcwrite`` intrinsic function. It
 has exactly the same semantics as a non-volatile ``store`` to the derived
 pointer (the third argument). The exact code generated is specified by a
-compiler plugin_.
+compiler :ref:`plugin <plugin>`.
 
 Many important algorithms require write barriers, including generational and
 concurrent collectors. Additionally, write barriers could be used to implement
 reference counting.
 
-.. _gcread:
-
 Read barrier: ``llvm.gcread``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -459,8 +442,8 @@ Read barrier: ``llvm.gcread``
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 For read barriers, LLVM provides the ``llvm.gcread`` intrinsic function. It has
 exactly the same semantics as a non-volatile ``load`` from the derived pointer
-(the second argument). The exact code generated is specified by a compiler
-plugin_.
+(the second argument). The exact code generated is specified by a
+:ref:`compiler plugin <plugin>`.
 
 Read barriers are needed by fewer algorithms than write barriers, and may have
 a greater performance impact since pointer reads are more frequent than writes.
 
@@ -739,8 +722,6 @@ Since LLVM does not yet compute liveness information, there is no means of
 distinguishing an uninitialized stack root from an initialized one. Therefore,
 this feature should be used by all GC plugins. It is enabled by default.
 
-.. _custom:
-
 Custom lowering of intrinsics: ``CustomRoots``, ``CustomReadBarriers``, and ``CustomWriteBarriers``
 ---------------------------------------------------------------------------------------------------
 
@@ -777,10 +758,10 @@ If ``CustomReadBarriers`` or ``CustomWriteBarriers`` are specified, then
 ``performCustomLowering`` **must** eliminate the corresponding barriers.
 
 ``performCustomLowering`` must comply with the same restrictions as
-`FunctionPass::runOnFunction <WritingAnLLVMPass.html#runOnFunction>`__
+:ref:`FunctionPass::runOnFunction <writing-an-llvm-pass-runOnFunction>`
 
 Likewise, ``initializeCustomLowering`` has the same semantics as
-`Pass::doInitialization(Module&)
-<WritingAnLLVMPass.html#doInitialization_mod>`__
+:ref:`Pass::doInitialization(Module&)
+<writing-an-llvm-pass-doInitialization-mod>`
 
 The following can be used as a template:
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 8902684c98..d99acbbb48 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -505,7 +505,7 @@ directory:
 If you would like to get the LLVM test suite (a separate package as of 1.4), you
 get it from the Subversion repository:
 
-.. code-block:: bash
+.. code-block:: console
 
   % cd llvm/projects
   % svn co http://llvm.org/svn/llvm-project/test-suite/trunk test-suite
 
@@ -523,13 +523,13 @@ marks (so, you can recreate git-svn metadata locally).
 Note that right now mirrors reflect only ``trunk`` for each project. You can
 do the read-only GIT clone of LLVM via:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git clone http://llvm.org/git/llvm.git
 
 If you want to check out clang too, run:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git clone http://llvm.org/git/llvm.git
   % cd llvm/tools
 
@@ -540,7 +540,7 @@ pull --rebase`` instead of ``git pull`` to avoid generating a non-linear history
 in your clone. To configure ``git pull`` to pass ``--rebase`` by default on the
 master branch, run the following command:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git config branch.master.rebase true
 
@@ -553,13 +553,13 @@ Assume ``master`` points the upstream and ``mybranch`` points your working
 branch, and ``mybranch`` is rebased onto ``master``. At first you may check
 sanity of whitespaces:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git diff --check master..mybranch
 
 The easiest way to generate a patch is as below:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git diff master..mybranch > /path/to/mybranch.diff
 
@@ -570,14 +570,14 @@ could be accepted with ``patch -p1 -N``.
 But you may generate patchset with git-format-patch. It generates by-each-commit
 patchset. To generate patch files to attach to your article:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git format-patch --no-attach master..mybranch -o /path/to/your/patchset
 
 If you would like to send patches directly, you may use git-send-email or
 git-imap-send. Here is an example to generate the patchset in Gmail's [Drafts].
 
-.. code-block:: bash
+.. code-block:: console
 
   % git format-patch --attach master..mybranch --stdout | git imap-send
 
@@ -603,7 +603,7 @@ For developers to work with git-svn
 
 To set up clone from which you can submit code using ``git-svn``, run:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git clone http://llvm.org/git/llvm.git
   % cd llvm
 
@@ -622,7 +622,7 @@ To set up clone from which you can submit code using ``git-svn``, run:
 To update this clone without generating git-svn tags that conflict with the
 upstream git repo, run:
 
-.. code-block:: bash
+.. code-block:: console
 
   % git fetch && (cd tools/clang && git fetch)  # Get matching revisions of both trees.
   % git checkout master
 
@@ -640,7 +640,7 @@ The git-svn metadata can get out of sync after you mess around with branches
 and ``dcommit``. When that happens, ``git svn dcommit`` stops working,
 complaining about files with uncommitted changes. The fix is to rebuild the
 metadata:
 
-.. code-block:: bash
+.. code-block:: console
 
   % rm -rf .git/svn
   % git svn rebase -l
 
@@ -722,13 +722,13 @@ To configure LLVM, follow these steps:
 
 #. Change directory into the object root directory:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % cd OBJ_ROOT
 
 #. Run the ``configure`` script located in the LLVM source tree:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % SRC_ROOT/configure --prefix=/install/path [other options]
 
@@ -764,7 +764,7 @@ Profile Builds
 
 Once you have LLVM configured, you can build it by entering the *OBJ_ROOT*
 directory and issuing the following command:
 
-.. code-block:: bash
+.. code-block:: console
 
   % gmake
 
@@ -775,7 +775,7 @@ If you have multiple processors in your machine, you may wish to use some of
 the parallel build options provided by GNU Make. For example, you could use
 the command:
 
-.. code-block:: bash
+.. code-block:: console
 
   % gmake -j2
 
@@ -857,7 +857,7 @@ For instructions on how to install Sphinx, see
 After following the instructions there for installing Sphinx, build the LLVM
 HTML documentation by doing the following:
 
-.. code-block:: bash
+.. code-block:: console
 
   $ cd SRC_ROOT/docs
   $ make -f Makefile.sphinx
 
@@ -893,13 +893,13 @@ This is accomplished in the typical autoconf manner:
 
 * Change directory to where the LLVM object files should live:
 
-  .. code-block:: bash
+  .. code-block:: console
 
     % cd OBJ_ROOT
 
 * Run the ``configure`` script found in the LLVM source directory:
 
-  .. code-block:: bash
+  .. code-block:: console
 
     % SRC_ROOT/configure
 
@@ -945,7 +945,7 @@ module, and you have root access on the system, you can set your system up to
 execute LLVM bitcode files directly. To do this, use commands like this (the
 first command may not be required if you are already using the module):
 
-.. code-block:: bash
+.. code-block:: console
 
   % mount -t binfmt_misc none /proc/sys/fs/binfmt_misc
   % echo ':llvm:M::BC::/path/to/lli:' > /proc/sys/fs/binfmt_misc/register
 
@@ -955,7 +955,7 @@ first command may not be required if you are already using the module):
 This allows you to execute LLVM bitcode files directly. On Debian, you can
 also use this command instead of the 'echo' command above:
 
-.. code-block:: bash
+.. code-block:: console
 
   % sudo update-binfmts --install llvm /path/to/lli --magic 'BC'
 
@@ -1246,7 +1246,7 @@ Example with clang
 
 #. Next, compile the C file into a native executable:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % clang hello.c -o hello
 
@@ -1257,7 +1257,7 @@ Example with clang
 
 #. Next, compile the C file into a LLVM bitcode file:
 
-   .. code-block:: bash
+   .. code-block:: console
 
     % clang -O3 -emit-llvm hello.c -c -o hello.bc
 
@@ -1267,13 +1267,13 @@ Example with clang
 
 #. Run the program in both forms. To run the program, use:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % ./hello
 
    and
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % lli hello.bc
 
@@ -1282,27 +1282,27 @@ Example with clang
 
 #. Use the ``llvm-dis`` utility to take a look at the LLVM assembly code:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % llvm-dis < hello.bc | less
 
 #. Compile the program to native assembly using the LLC code generator:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % llc hello.bc -o hello.s
 
 #. Assemble the native assembly language file into a program:
 
-   .. code-block:: bash
+   .. code-block:: console
 
-     **Solaris:** % /opt/SUNWspro/bin/cc -xarch=v9 hello.s -o hello.native
+     % /opt/SUNWspro/bin/cc -xarch=v9 hello.s -o hello.native   # On Solaris
 
-     **Others:** % gcc hello.s -o hello.native
+     % gcc hello.s -o hello.native                              # On others
 
 #. Execute the native code program:
 
-   .. code-block:: bash
+   .. code-block:: console
 
      % ./hello.native
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index d568c0b302..cbe1585226 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -20,8 +20,10 @@ A
 B
 -
 
+.. _lexicon-bb-vectorization:
+
 **BB Vectorization**
-  Basic Block Vectorization
+  Basic-Block Vectorization
 
 **BURS**
   Bottom Up Rewriting System --- A method of instruction selection for code
@@ -185,6 +187,10 @@ S
 **SCCP**
   Sparse Conditional Constant Propagation
 
+**SLP**
+  Superword-Level Parallelism, same as :ref:`Basic-Block Vectorization
+  <lexicon-bb-vectorization>`.
+
 **SRoA**
   Scalar Replacement of Aggregates
diff --git a/docs/LinkTimeOptimization.rst b/docs/LinkTimeOptimization.rst
index 7eacf0bd0d..822196ccf4 100644
--- a/docs/LinkTimeOptimization.rst
+++ b/docs/LinkTimeOptimization.rst
@@ -85,9 +85,10 @@ invokes system linker.
       return foo1();
     }
 
-.. code-block:: bash
+To compile, run:
+
+.. code-block:: console
 
-  --- command lines ---
   % clang -emit-llvm -c a.c -o a.o   # <-- a.o is LLVM bitcode file
   % clang -c main.c -o main.o        # <-- main.o is native object file
   % clang a.o main.o -o main         # <-- standard link command without modifications
 
@@ -96,7 +97,7 @@ invokes system linker.
   visible symbol defined in LLVM bitcode file. The linker completes its usual
   symbol resolution pass and finds that ``foo2()`` is not used
   anywhere. This information is used by the LLVM optimizer and it
-  removes ``foo2()``.</li>
+  removes ``foo2()``.
 
 * As soon as ``foo2()`` is removed, the optimizer recognizes that condition ``i
   < 0`` is always false, which means ``foo3()`` is never used. Hence, the
diff --git a/docs/Makefile.sphinx b/docs/Makefile.sphinx
index 3746522db6..21f66488b2 100644
--- a/docs/Makefile.sphinx
+++ b/docs/Makefile.sphinx
@@ -46,10 +46,6 @@ clean:
 html:
 	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
 	@echo
-	@# FIXME: Remove this `cp` once HTML->Sphinx transition is completed.
-	@# Kind of a hack, but HTML-formatted docs are on the way out anyway.
-	@echo "Copying legacy HTML-formatted docs into $(BUILDDIR)/html"
-	@cp -a *.html $(BUILDDIR)/html
 	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
 
 dirhtml:
diff --git a/docs/MakefileGuide.rst b/docs/MakefileGuide.rst
index 2c1d33e962..168b0b3348 100644
--- a/docs/MakefileGuide.rst
+++ b/docs/MakefileGuide.rst
@@ -170,9 +170,9 @@ openable with the ``dlopen`` function and searchable with the ``dlsym``
 function (or your operating system's equivalents). While this isn't strictly
 necessary on Linux and a few other platforms, it is required on systems like
 HP-UX and Darwin. You should use ``LOADABLE_MODULE`` for any shared library that you
-intend to be loaded into an tool via the ``-load`` option. See the
-`WritingAnLLVMPass.html <WritingAnLLVMPass.html#makefile>`_ document for an
-example of why you might want to do this.
+intend to be loaded into an tool via the ``-load`` option. `Pass documentation
+<writing-an-llvm-pass-makefile>`_ has an example of why you might want to do
+this.
 
 Bitcode Modules
 ^^^^^^^^^^^^^^^
diff --git a/docs/Passes.html b/docs/Passes.html
deleted file mode 100644
index 7bffc54d8d..0000000000
--- a/docs/Passes.html
+++ /dev/null
@@ -1,2025 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN"
-          "http://www.w3.org/TR/html4/strict.dtd">
-<html>
-<head>
-  <title>LLVM's Analysis and Transform Passes</title>
-  <link rel="stylesheet" href="_static/llvm.css" type="text/css">
-  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
-</head>
-<body>
-
-<!--
-
-If Passes.html is up to date, the following "one-liner" should print
-an empty diff.
-
-egrep -e '^<tr><td><a href="#.*">-.*</a></td><td>.*</td></tr>$' \
-      -e '^  <a name=".*">.*</a>$' < Passes.html >html; \
-perl >help <<'EOT' && diff -u help html; rm -f help html
-open HTML, "<Passes.html" or die "open: Passes.html: $!\n";
-while (<HTML>) {
-  m:^<tr><td><a href="#(.*)">-.*</a></td><td>.*</td></tr>$: or next;
-  $order{$1} = sprintf("%03d", 1 + int %order);
-}
-open HELP, "../Release/bin/opt -help|" or die "open: opt -help: $!\n";
-while (<HELP>) {
-  m:^ -([^ ]+) +- (.*)$: or next;
-  my $o = $order{$1};
-  $o = "000" unless defined $o;
-  push @x, "$o<tr><td><a href=\"#$1\">-$1</a></td><td>$2</td></tr>\n";
-  push @y, "$o  <a name=\"$1\">-$1: $2</a>\n";
-}
-@x = map { s/^\d\d\d//; $_ } sort @x;
-@y = map { s/^\d\d\d//; $_ } sort @y;
-print @x, @y;
-EOT
-
-This (real) one-liner can also be helpful when converting comments to HTML:
-
-perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !$on && $_ =~ /\S/; print "  </p>\n" if $on && $_ =~ /^\s*$/; print "  $_\n"; $on = ($_ =~ /\S/); } print "  </p>\n" if $on'
-
--->
-
-<h1>LLVM's Analysis and Transform Passes</h1>
-
-<ol>
-  <li><a href="#intro">Introduction</a></li>
-  <li><a href="#analyses">Analysis Passes</a>
-  <li><a href="#transforms">Transform Passes</a></li>
-  <li><a href="#utilities">Utility Passes</a></li>
-</ol>
-
-<div class="doc_author">
-  <p>Written by <a href="mailto:rspencer@x10sys.com">Reid Spencer</a>
-            and Gordon Henriksen</p>
-</div>
-
-<!-- ======================================================================= -->
-<h2><a name="intro">Introduction</a></h2>
-<div>
-  <p>This document serves as a high level summary of the optimization features
-  that LLVM provides. Optimizations are implemented as Passes that traverse some
-  portion of a program to either collect information or transform the program.
-  The table below divides the passes that LLVM provides into three categories.
-  Analysis passes compute information that other passes can use or for debugging
-  or program visualization purposes. Transform passes can use (or invalidate)
-  the analysis passes. Transform passes all mutate the program in some way.
-  Utility passes provides some utility but don't otherwise fit categorization.
-  For example passes to extract functions to bitcode or write a module to
-  bitcode are neither analysis nor transform passes.
-  <p>The table below provides a quick summary of each pass and links to the more
-  complete pass description later in the document.</p>
-
-<table>
-<tr><th colspan="2"><b>ANALYSIS PASSES</b></th></tr>
-<tr><th>Option</th><th>Name</th></tr>
-<tr><td><a href="#aa-eval">-aa-eval</a></td><td>Exhaustive Alias Analysis Precision Evaluator</td></tr>
-<tr><td><a href="#basicaa">-basicaa</a></td><td>Basic Alias Analysis (stateless AA impl)</td></tr>
-<tr><td><a href="#basiccg">-basiccg</a></td><td>Basic CallGraph Construction</td></tr>
-<tr><td><a href="#count-aa">-count-aa</a></td><td>Count Alias Analysis Query Responses</td></tr>
-<tr><td><a href="#da">-da</a></td><td>Dependence Analysis</td></tr>
-<tr><td><a href="#debug-aa">-debug-aa</a></td><td>AA use debugger</td></tr>
-<tr><td><a href="#domfrontier">-domfrontier</a></td><td>Dominance Frontier Construction</td></tr>
-<tr><td><a href="#domtree">-domtree</a></td><td>Dominator Tree Construction</td></tr>
-<tr><td><a href="#dot-callgraph">-dot-callgraph</a></td><td>Print Call Graph to 'dot' file</td></tr>
-<tr><td><a href="#dot-cfg">-dot-cfg</a></td><td>Print CFG of function to 'dot' file</td></tr>
-<tr><td><a href="#dot-cfg-only">-dot-cfg-only</a></td><td>Print CFG of function to 'dot' file (with no function bodies)</td></tr>
-<tr><td><a href="#dot-dom">-dot-dom</a></td><td>Print dominance tree of function to 'dot' file</td></tr>
-<tr><td><a href="#dot-dom-only">-dot-dom-only</a></td><td>Print dominance tree of function to 'dot' file (with no function bodies)</td></tr>
-<tr><td><a href="#dot-postdom">-dot-postdom</a></td><td>Print postdominance tree of function to 'dot' file</td></tr>
-<tr><td><a href="#dot-postdom-only">-dot-postdom-only</a></td><td>Print postdominance tree of function to 'dot' file (with no function bodies)</td></tr>
-<tr><td><a href="#globalsmodref-aa">-globalsmodref-aa</a></td><td>Simple mod/ref analysis for globals</td></tr>
-<tr><td><a href="#instcount">-instcount</a></td><td>Counts the various types of Instructions</td></tr>
-<tr><td><a href="#intervals">-intervals</a></td><td>Interval Partition Construction</td></tr>
-<tr><td><a href="#iv-users">-iv-users</a></td><td>Induction Variable Users</td></tr>
-<tr><td><a href="#lazy-value-info">-lazy-value-info</a></td><td>Lazy Value Information Analysis</td></tr>
-<tr><td><a href="#libcall-aa">-libcall-aa</a></td><td>LibCall Alias Analysis</td></tr>
-<tr><td><a href="#lint">-lint</a></td><td>Statically lint-checks LLVM IR</td></tr>
-<tr><td><a href="#loops">-loops</a></td><td>Natural Loop Information</td></tr>
-<tr><td><a href="#memdep">-memdep</a></td><td>Memory Dependence Analysis</td></tr>
-<tr><td><a href="#module-debuginfo">-module-debuginfo</a></td><td>Decodes module-level debug info</td></tr>
-<tr><td><a href="#no-aa">-no-aa</a></td><td>No Alias Analysis (always returns 'may' alias)</td></tr>
-<tr><td><a href="#no-profile">-no-profile</a></td><td>No Profile Information</td></tr>
-<tr><td><a href="#postdomtree">-postdomtree</a></td><td>Post-Dominator Tree Construction</td></tr>
-<tr><td><a href="#print-alias-sets">-print-alias-sets</a></td><td>Alias Set Printer</td></tr>
-<tr><td><a href="#print-callgraph">-print-callgraph</a></td><td>Print a call graph</td></tr>
-<tr><td><a href="#print-callgraph-sccs">-print-callgraph-sccs</a></td><td>Print SCCs of the Call Graph</td></tr>
-<tr><td><a href="#print-cfg-sccs">-print-cfg-sccs</a></td><td>Print SCCs of each function CFG</td></tr>
-<tr><td><a href="#print-dbginfo">-print-dbginfo</a></td><td>Print debug info in human readable form</td></tr>
-<tr><td><a href="#print-dom-info">-print-dom-info</a></td><td>Dominator Info Printer</td></tr>
-<tr><td><a href="#print-externalfnconstants">-print-externalfnconstants</a></td><td>Print external fn callsites passed constants</td></tr>
-<tr><td><a href="#print-function">-print-function</a></td><td>Print function to stderr</td></tr>
-<tr><td><a href="#print-module">-print-module</a></td><td>Print module to stderr</td></tr>
-<tr><td><a href="#print-used-types">-print-used-types</a></td><td>Find Used Types</td></tr>
-<tr><td><a href="#profile-estimator">-profile-estimator</a></td><td>Estimate profiling information</td></tr>
-<tr><td><a href="#profile-loader">-profile-loader</a></td><td>Load profile information from llvmprof.out</td></tr>
-<tr><td><a href="#profile-verifier">-profile-verifier</a></td><td>Verify profiling information</td></tr>
-<tr><td><a href="#regions">-regions</a></td><td>Detect single entry single exit regions</td></tr>
-<tr><td><a href="#scalar-evolution">-scalar-evolution</a></td><td>Scalar Evolution Analysis</td></tr>
-<tr><td><a href="#scev-aa">-scev-aa</a></td><td>ScalarEvolution-based Alias Analysis</td></tr>
-<tr><td><a href="#targetdata">-targetdata</a></td><td>Target Data Layout</td></tr>
-
-
-<tr><th colspan="2"><b>TRANSFORM PASSES</b></th></tr>
-<tr><th>Option</th><th>Name</th></tr>
-<tr><td><a href="#adce">-adce</a></td><td>Aggressive Dead Code Elimination</td></tr>
-<tr><td><a href="#always-inline">-always-inline</a></td><td>Inliner for always_inline functions</td></tr>
-<tr><td><a href="#argpromotion">-argpromotion</a></td><td>Promote 'by reference' arguments to scalars</td></tr>
-<tr><td><a href="#bb-vectorize">-bb-vectorize</a></td><td>Combine instructions to form vector instructions within basic blocks</td></tr>
-<tr><td><a href="#block-placement">-block-placement</a></td><td>Profile Guided Basic Block Placement</td></tr>
-<tr><td><a href="#break-crit-edges">-break-crit-edges</a></td><td>Break critical edges in CFG</td></tr>
-<tr><td><a href="#codegenprepare">-codegenprepare</a></td><td>Optimize for code generation</td></tr>
-<tr><td><a href="#constmerge">-constmerge</a></td><td>Merge Duplicate Global Constants</td></tr>
-<tr><td><a href="#constprop">-constprop</a></td><td>Simple constant propagation</td></tr>
-<tr><td><a href="#dce">-dce</a></td><td>Dead Code Elimination</td></tr>
-<tr><td><a href="#deadargelim">-deadargelim</a></td><td>Dead Argument Elimination</td></tr>
-<tr><td><a href="#deadtypeelim">-deadtypeelim</a></td><td>Dead Type Elimination</td></tr>
-<tr><td><a href="#die">-die</a></td><td>Dead Instruction Elimination</td></tr>
-<tr><td><a href="#dse">-dse</a></td><td>Dead Store Elimination</td></tr>
-<tr><td><a href="#functionattrs">-functionattrs</a></td><td>Deduce function attributes</td></tr>
-<tr><td><a href="#globaldce">-globaldce</a></td><td>Dead Global Elimination</td></tr>
-<tr><td><a href="#globalopt">-globalopt</a></td><td>Global Variable Optimizer</td></tr>
-<tr><td><a href="#gvn">-gvn</a></td><td>Global Value Numbering</td></tr>
-<tr><td><a href="#indvars">-indvars</a></td><td>Canonicalize Induction Variables</td></tr>
-<tr><td><a href="#inline">-inline</a></td><td>Function Integration/Inlining</td></tr>
-<tr><td><a href="#insert-edge-profiling">-insert-edge-profiling</a></td><td>Insert instrumentation for edge profiling</td></tr>
-<tr><td><a href="#insert-optimal-edge-profiling">-insert-optimal-edge-profiling</a></td><td>Insert optimal instrumentation for edge profiling</td></tr>
-<tr><td><a href="#instcombine">-instcombine</a></td><td>Combine redundant instructions</td></tr>
-<tr><td><a href="#internalize">-internalize</a></td><td>Internalize Global Symbols</td></tr>
-<tr><td><a href="#ipconstprop">-ipconstprop</a></td><td>Interprocedural constant propagation</td></tr>
-<tr><td><a href="#ipsccp">-ipsccp</a></td><td>Interprocedural Sparse Conditional Constant Propagation</td></tr>
-<tr><td><a href="#jump-threading">-jump-threading</a></td><td>Jump Threading</td></tr>
-<tr><td><a href="#lcssa">-lcssa</a></td><td>Loop-Closed SSA Form Pass</td></tr>
-<tr><td><a href="#licm">-licm</a></td><td>Loop Invariant Code Motion</td></tr>
-<tr><td><a href="#loop-deletion">-loop-deletion</a></td><td>Delete dead loops</td></tr>
-<tr><td><a href="#loop-extract">-loop-extract</a></td><td>Extract loops into new functions</td></tr>
-<tr><td><a href="#loop-extract-single">-loop-extract-single</a></td><td>Extract at most one loop into a new function</td></tr>
-<tr><td><a href="#loop-reduce">-loop-reduce</a></td><td>Loop Strength Reduction</td></tr>
-<tr><td><a href="#loop-rotate">-loop-rotate</a></td><td>Rotate Loops</td></tr>
-<tr><td><a href="#loop-simplify">-loop-simplify</a></td><td>Canonicalize natural loops</td></tr>
-<tr><td><a href="#loop-unroll">-loop-unroll</a></td><td>Unroll loops</td></tr>
-<tr><td><a href="#loop-unswitch">-loop-unswitch</a></td><td>Unswitch loops</td></tr>
-<tr><td><a href="#loweratomic">-loweratomic</a></td><td>Lower atomic intrinsics to non-atomic form</td></tr>
-<tr><td><a href="#lowerinvoke">-lowerinvoke</a></td><td>Lower invoke and unwind, for unwindless code generators</td></tr>
-<tr><td><a href="#lowerswitch">-lowerswitch</a></td><td>Lower SwitchInst's to branches</td></tr>
-<tr><td><a href="#mem2reg">-mem2reg</a></td><td>Promote Memory to Register</td></tr>
-<tr><td><a href="#memcpyopt">-memcpyopt</a></td><td>MemCpy Optimization</td></tr>
-<tr><td><a href="#mergefunc">-mergefunc</a></td><td>Merge Functions</td></tr>
-<tr><td><a href="#mergereturn">-mergereturn</a></td><td>Unify function exit nodes</td></tr>
-<tr><td><a href="#partial-inliner">-partial-inliner</a></td><td>Partial Inliner</td></tr>
-<tr><td><a href="#prune-eh">-prune-eh</a></td><td>Remove unused exception handling info</td></tr>
-<tr><td><a href="#reassociate">-reassociate</a></td><td>Reassociate expressions</td></tr>
-<tr><td><a href="#reg2mem">-reg2mem</a></td><td>Demote all values to stack slots</td></tr>
-<tr><td><a href="#scalarrepl">-scalarrepl</a></td><td>Scalar Replacement of Aggregates (DT)</td></tr>
-<tr><td><a href="#sccp">-sccp</a></td><td>Sparse Conditional Constant Propagation</td></tr>
-<tr><td><a href="#simplify-libcalls">-simplify-libcalls</a></td><td>Simplify well-known library calls</td></tr>
-<tr><td><a href="#simplifycfg">-simplifycfg</a></td><td>Simplify the CFG</td></tr>
-<tr><td><a href="#sink">-sink</a></td><td>Code sinking</td></tr>
-<tr><td><a href="#strip">-strip</a></td><td>Strip all symbols from a module</td></tr>
-<tr><td><a href="#strip-dead-debug-info">-strip-dead-debug-info</a></td><td>Strip debug info for unused symbols</td></tr>
-<tr><td><a href="#strip-dead-prototypes">-strip-dead-prototypes</a></td><td>Strip Unused Function Prototypes</td></tr>
-<tr><td><a href="#strip-debug-declare">-strip-debug-declare</a></td><td>Strip all llvm.dbg.declare intrinsics</td></tr>
-<tr><td><a href="#strip-nondebug">-strip-nondebug</a></td><td>Strip all symbols, except dbg symbols, from a module</td></tr>
-<tr><td><a href="#tailcallelim">-tailcallelim</a></td><td>Tail Call Elimination</td></tr>
-
-
-<tr><th colspan="2"><b>UTILITY PASSES</b></th></tr>
-<tr><th>Option</th><th>Name</th></tr>
-<tr><td><a href="#deadarghaX0r">-deadarghaX0r</a></td><td>Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)</td></tr>
-<tr><td><a href="#extract-blocks">-extract-blocks</a></td><td>Extract Basic Blocks From Module (for bugpoint use)</td></tr>
-<tr><td><a href="#instnamer">-instnamer</a></td><td>Assign names to anonymous instructions</td></tr>
-<tr><td><a href="#preverify">-preverify</a></td><td>Preliminary module verification</td></tr>
-<tr><td><a href="#verify">-verify</a></td><td>Module Verifier</td></tr>
-<tr><td><a href="#view-cfg">-view-cfg</a></td><td>View CFG of function</td></tr>
-<tr><td><a href="#view-cfg-only">-view-cfg-only</a></td><td>View CFG of function (with no function bodies)</td></tr>
-<tr><td><a href="#view-dom">-view-dom</a></td><td>View dominance tree of function</td></tr>
-<tr><td><a href="#view-dom-only">-view-dom-only</a></td><td>View dominance tree of function (with no function bodies)</td></tr>
-<tr><td><a href="#view-postdom">-view-postdom</a></td><td>View postdominance tree of function</td></tr>
-<tr><td><a href="#view-postdom-only">-view-postdom-only</a></td><td>View postdominance tree of function (with no function bodies)</td></tr>
-</table>
-
-</div>
-
-<!-- ======================================================================= -->
-<h2><a name="analyses">Analysis Passes</a></h2>
-<div>
-  <p>This section describes the LLVM Analysis Passes.</p>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
-  <a name="aa-eval">-aa-eval: Exhaustive Alias Analysis Precision Evaluator</a>
-</h3>
-<div>
-  <p>This is a simple N^2 alias analysis accuracy evaluator.
-  Basically, for each function in the program, it simply queries to see how the
-  alias analysis implementation answers alias queries between each pair of
-  pointers in the function.</p>
-
-  <p>This is inspired and adapted from code by: Naveen Neelakantam, Francesco
-  Spadini, and Wojciech Stryjewski.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
-  <a name="basicaa">-basicaa: Basic Alias Analysis (stateless AA impl)</a>
-</h3>
-<div>
-  <p>A basic alias analysis pass that implements identities (two different
-  globals cannot alias, etc), but does no stateful analysis.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
-  <a name="basiccg">-basiccg: Basic CallGraph Construction</a>
-</h3>
-<div>
-  <p>Yet to be written.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
-  <a name="count-aa">-count-aa: Count Alias Analysis Query Responses</a>
-</h3>
-<div>
-  <p>
-  A pass which can be used to count how many alias queries
-  are being made and how the alias analysis implementation being used responds.
-  </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
-  <a name="da">-da: Dependence Analysis</a>
-</h3>
-<div>
-  <p>Dependence analysis framework, which is used to detect dependences in
-  memory accesses.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
-  <a name="debug-aa">-debug-aa: AA use debugger</a>
-</h3>
-<div>
-  <p>
-  This simple pass checks alias analysis users to ensure that if they
-  create a new value, they do not query AA without informing it of the value.
- It acts as a shim over any other AA pass you want.
- </p>
-
- <p>
- Yes, keeping track of every value in the program is expensive, but this is
- a debugging pass.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="domfrontier">-domfrontier: Dominance Frontier Construction</a>
-</h3>
-<div>
- <p>
- This pass is a simple dominator construction algorithm for finding forward
- dominator frontiers.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="domtree">-domtree: Dominator Tree Construction</a>
-</h3>
-<div>
- <p>
- This pass is a simple dominator construction algorithm for finding forward
- dominators.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-callgraph">-dot-callgraph: Print Call Graph to 'dot' file</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the call graph into a
- <code>.dot</code> graph. This graph can then be processed with the "dot" tool
- to convert it to postscript or some other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-cfg">-dot-cfg: Print CFG of function to 'dot' file</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the control flow graph
- into a <code>.dot</code> graph. This graph can then be processed with the
- "dot" tool to convert it to postscript or some other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-cfg-only">-dot-cfg-only: Print CFG of function to 'dot' file (with no function bodies)</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the control flow graph
- into a <code>.dot</code> graph, omitting the function bodies. This graph can
- then be processed with the "dot" tool to convert it to postscript or some
- other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-dom">-dot-dom: Print dominance tree of function to 'dot' file</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the dominator tree
- into a <code>.dot</code> graph. This graph can then be processed with the
- "dot" tool to convert it to postscript or some other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-dom-only">-dot-dom-only: Print dominance tree of function to 'dot' file (with no function bodies)</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the dominator tree
- into a <code>.dot</code> graph, omitting the function bodies. This graph can
- then be processed with the "dot" tool to convert it to postscript or some
- other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-postdom">-dot-postdom: Print postdominance tree of function to 'dot' file</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the post dominator tree
- into a <code>.dot</code> graph. This graph can then be processed with the
- "dot" tool to convert it to postscript or some other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dot-postdom-only">-dot-postdom-only: Print postdominance tree of function to 'dot' file (with no function bodies)</a>
-</h3>
-<div>
- <p>
- This pass, only available in <code>opt</code>, prints the post dominator tree
- into a <code>.dot</code> graph, omitting the function bodies. This graph can
- then be processed with the "dot" tool to convert it to postscript or some
- other suitable format.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="globalsmodref-aa">-globalsmodref-aa: Simple mod/ref analysis for globals</a>
-</h3>
-<div>
- <p>
- This simple pass provides alias and mod/ref information for global values
- that do not have their address taken, and keeps track of whether functions
- read or write memory (are "pure"). For this simple (but very common) case,
- we can provide pretty accurate and useful information.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="instcount">-instcount: Counts the various types of Instructions</a>
-</h3>
-<div>
- <p>
- This pass collects the count of all instructions and reports them.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="intervals">-intervals: Interval Partition Construction</a>
-</h3>
-<div>
- <p>
- This analysis calculates and represents the interval partition of a function,
- or a preexisting interval partition.
- </p>
-
- <p>
- In this way, the interval partition may be used to reduce a flow graph down
- to its degenerate single node interval partition (unless it is irreducible).
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="iv-users">-iv-users: Induction Variable Users</a>
-</h3>
-<div>
- <p>Bookkeeping for "interesting" users of expressions computed from
- induction variables.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="lazy-value-info">-lazy-value-info: Lazy Value Information Analysis</a>
-</h3>
-<div>
- <p>Interface for lazy computation of value constraint information.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="libcall-aa">-libcall-aa: LibCall Alias Analysis</a>
-</h3>
-<div>
- <p>LibCall Alias Analysis.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="lint">-lint: Statically lint-checks LLVM IR</a>
-</h3>
-<div>
- <p>This pass statically checks for common and easily-identified constructs
- which produce undefined or likely unintended behavior in LLVM IR.</p>
-
- <p>It is not a guarantee of correctness, in two ways. First, it isn't
- comprehensive. There are checks which could be done statically which are
- not yet implemented. Some of these are indicated by TODO comments, but
- those aren't comprehensive either. Second, many conditions cannot be
- checked statically. This pass does no dynamic instrumentation, so it
- can't check for all possible problems.</p>
-
- <p>Another limitation is that it assumes all code will be executed. A store
- through a null pointer in a basic block which is never reached is harmless,
- but this pass will warn about it anyway.</p>
-
- <p>Optimization passes may make conditions that this pass checks for more or
- less obvious. If an optimization pass appears to be introducing a warning,
- it may be that the optimization pass is merely exposing an existing
- condition in the code.</p>
-
- <p>This code may be run before instcombine. In many cases, instcombine checks
- for the same kinds of things and turns instructions with undefined behavior
- into unreachable (or equivalent). Because of this, this pass makes some
- effort to look through bitcasts and so on.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="loops">-loops: Natural Loop Information</a>
-</h3>
-<div>
- <p>
- This analysis is used to identify natural loops and determine the loop depth
- of various nodes of the CFG. Note that the loops identified may actually be
- several natural loops that share the same header node... not just a single
- natural loop.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="memdep">-memdep: Memory Dependence Analysis</a>
-</h3>
-<div>
- <p>
- An analysis that determines, for a given memory operation, what preceding
- memory operations it depends on. It builds on alias analysis information, and
- tries to provide a lazy, caching interface to a common kind of alias
- information query.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="module-debuginfo">-module-debuginfo: Decodes module-level debug info</a>
-</h3>
-<div>
- <p>This pass decodes the debug info metadata in a module and prints it in a
- human-readable form.
-
- For example, run this pass from opt along with the -analyze option, and
- it will print to standard output.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="no-aa">-no-aa: No Alias Analysis (always returns 'may' alias)</a>
-</h3>
-<div>
- <p>
- This is the default implementation of the Alias Analysis interface. It always
- returns "I don't know" for alias queries. NoAA is unlike other alias analysis
- implementations, in that it does not chain to a previous analysis. As such it
- doesn't follow many of the rules that other alias analyses must.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="no-profile">-no-profile: No Profile Information</a>
-</h3>
-<div>
- <p>
- The default "no profile" implementation of the abstract
- <code>ProfileInfo</code> interface.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="postdomfrontier">-postdomfrontier: Post-Dominance Frontier Construction</a>
-</h3>
-<div>
- <p>
- This pass is a simple post-dominator construction algorithm for finding
- post-dominator frontiers.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="postdomtree">-postdomtree: Post-Dominator Tree Construction</a>
-</h3>
-<div>
- <p>
- This pass is a simple post-dominator construction algorithm for finding
- post-dominators.
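- </p>
-
- <p>
- As a small worked example (a hypothetical function, not taken from the LLVM
- sources), consider a diamond-shaped CFG. Every path from any block to the
- function's exit passes through <code>merge</code>, so <code>merge</code>
- post-dominates all four blocks:
- </p>
-
-<blockquote><pre
->define i32 @diamond(i1 %c) {
-entry:
-  br i1 %c, label %then, label %else
-then:                      ; post-dominated by merge
-  br label %merge
-else:                      ; post-dominated by merge
-  br label %merge
-merge:                     ; post-dominates entry, then, and else
-  ret i32 0
-}</pre></blockquote>
-
- <p>
- Neither <code>then</code> nor <code>else</code> post-dominates
- <code>entry</code>, because the other arm of the branch bypasses it.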
- </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-alias-sets">-print-alias-sets: Alias Set Printer</a> -</h3> -<div> - <p>Yet to be written.</p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-callgraph">-print-callgraph: Print a call graph</a> -</h3> -<div> - <p> - This pass, only available in <code>opt</code>, prints the call graph to - standard error in a human-readable form. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-callgraph-sccs">-print-callgraph-sccs: Print SCCs of the Call Graph</a> -</h3> -<div> - <p> - This pass, only available in <code>opt</code>, prints the SCCs of the call - graph to standard error in a human-readable form. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-cfg-sccs">-print-cfg-sccs: Print SCCs of each function CFG</a> -</h3> -<div> - <p> - This pass, only available in <code>opt</code>, prints the SCCs of each - function CFG to standard error in a human-readable form. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-dbginfo">-print-dbginfo: Print debug info in human readable form</a> -</h3> -<div> - <p>Pass that prints instructions, and associated debug info:</p> - <ul> - - <li>source/line/col information</li> - <li>original variable name</li> - <li>original type name</li> - </ul> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-dom-info">-print-dom-info: Dominator Info Printer</a> -</h3> -<div> - <p>Dominator Info Printer.</p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-externalfnconstants">-print-externalfnconstants: Print external fn callsites passed constants</a> -</h3> -<div> - <p> - This pass, only available in <code>opt</code>, prints out call sites to - external functions that are called with constant arguments. This can be - useful when looking for standard library functions we should constant fold - or handle in alias analyses. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-function">-print-function: Print function to stderr</a> -</h3> -<div> - <p> - The <code>PrintFunctionPass</code> class is designed to be pipelined with - other <code>FunctionPass</code>es, and prints out the functions of the module - as they are processed. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-module">-print-module: Print module to stderr</a> -</h3> -<div> - <p> - This pass simply prints out the entire module when it is executed. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="print-used-types">-print-used-types: Find Used Types</a> -</h3> -<div> - <p> - This pass is used to seek out all of the types in use by the program. Note - that this analysis explicitly does not include types only used by the symbol - table. 
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="profile-estimator">-profile-estimator: Estimate profiling information</a>
-</h3>
-<div>
- <p>A pass that estimates profiling information
- in a very crude and unimaginative way.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="profile-loader">-profile-loader: Load profile information from llvmprof.out</a>
-</h3>
-<div>
- <p>
- A concrete implementation of profiling information that loads the information
- from a profile dump file.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="profile-verifier">-profile-verifier: Verify profiling information</a>
-</h3>
-<div>
- <p>Pass that checks profiling information for plausibility.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="regions">-regions: Detect single entry single exit regions</a>
-</h3>
-<div>
- <p>
- The <code>RegionInfo</code> pass detects single entry single exit regions in a
- function, where a region is defined as any subgraph that is connected to the
- remaining graph at only two spots. Furthermore, a hierarchical region tree is
- built.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="scalar-evolution">-scalar-evolution: Scalar Evolution Analysis</a>
-</h3>
-<div>
- <p>
- The <code>ScalarEvolution</code> analysis can be used to analyze and
- categorize scalar expressions in loops. It specializes in recognizing general
- induction variables, representing them with the abstract and opaque
- <code>SCEV</code> class. Given this analysis, trip counts of loops and other
- important properties can be obtained.
- </p>
-
- <p>
- This analysis is primarily useful for induction variable substitution and
- strength reduction.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="scev-aa">-scev-aa: ScalarEvolution-based Alias Analysis</a>
-</h3>
-<div>
- <p>Simple alias analysis implemented in terms of ScalarEvolution queries.
-
- This differs from traditional loop dependence analysis in that it tests
- for dependencies within a single iteration of a loop, rather than
- dependencies between different iterations.
-
- ScalarEvolution has a more complete understanding of pointer arithmetic
- than BasicAliasAnalysis' collection of ad-hoc analyses.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="targetdata">-targetdata: Target Data Layout</a>
-</h3>
-<div>
- <p>Provides other passes access to information about the size and alignment
- required by the target ABI for various data types.</p>
-</div>
-
-</div>
-
-<!-- ======================================================================= -->
-<h2><a name="transforms">Transform Passes</a></h2>
-<div>
- <p>This section describes the LLVM Transform Passes.</p>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="adce">-adce: Aggressive Dead Code Elimination</a>
-</h3>
-<div>
- <p>ADCE aggressively tries to eliminate code. This pass is similar to
- <a href="#dce">DCE</a> but it assumes that values are dead until proven
- otherwise. This is similar to <a href="#sccp">SCCP</a>, except applied to
- the liveness of values.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="always-inline">-always-inline: Inliner for always_inline functions</a>
-</h3>
-<div>
- <p>A custom inliner that handles only functions that are marked as
- "always inline".</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="argpromotion">-argpromotion: Promote 'by reference' arguments to scalars</a>
-</h3>
-<div>
- <p>
- This pass promotes "by reference" arguments to be "by value" arguments. In
- practice, this means looking for internal functions that have pointer
- arguments. If it can prove, through the use of alias analysis, that an
- argument is *only* loaded, then it can pass the value into the function
- instead of the address of the value. This can cause recursive simplification
- of code and lead to the elimination of allocas (especially in C++ template
- code like the STL).
- </p>
-
- <p>
- This pass also handles aggregate arguments that are passed into a function,
- scalarizing them if the elements of the aggregate are only loaded. Note that
- it refuses to scalarize aggregates which would require passing in more than
- three operands to the function, because passing thousands of operands for a
- large array or structure is unprofitable!
- </p>
-
- <p>
- Note that this transformation could also be done for arguments that are only
- stored to (returning the value instead), but this is not currently
- implemented. This case would be best handled when and if LLVM starts
- supporting multiple return values from functions.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="bb-vectorize">-bb-vectorize: Basic-Block Vectorization</a>
-</h3>
-<div>
- <p>This pass combines instructions inside basic blocks to form vector
- instructions. It iterates over each basic block, attempting to pair
- compatible instructions, repeating this process until no additional
- pairs are selected for vectorization. When the outputs of some pair
- of compatible instructions are used as inputs by some other pair of
- compatible instructions, those pairs are part of a potential
- vectorization chain. Instruction pairs are only fused into vector
- instructions when they are part of a chain longer than some
- threshold length. Moreover, the pass attempts to find the best
- possible chain for each pair of compatible instructions. These
- heuristics are intended to prevent vectorization in cases where
- it would not improve the performance of the resulting code.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="block-placement">-block-placement: Profile Guided Basic Block Placement</a>
-</h3>
-<div>
- <p>This pass is a very simple profile guided basic block placement algorithm.
- The idea is to put frequently executed blocks together at the start of the
- function and hopefully increase the number of fall-through conditional
- branches. If there is no profile information for a particular function, this
- pass basically orders blocks in depth-first order.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="break-crit-edges">-break-crit-edges: Break critical edges in CFG</a>
-</h3>
-<div>
- <p>
- Break all of the critical edges in the CFG by inserting a dummy basic block.
- It may be "required" by passes that cannot deal with critical edges. This
- transformation obviously invalidates the CFG, but can update forward dominator
- (set, immediate dominators, tree, and frontier) information.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="codegenprepare">-codegenprepare: Optimize for code generation</a>
-</h3>
-<div>
- This pass munges the code in the input function to better prepare it for
- SelectionDAG-based code generation. This works around limitations in its
- basic-block-at-a-time approach. It should eventually be removed.
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="constmerge">-constmerge: Merge Duplicate Global Constants</a>
-</h3>
-<div>
- <p>
- Merges duplicate global constants together into a single constant that is
- shared. This is useful because some passes (i.e., TraceValues) insert a lot of
- string constants into the program, regardless of whether or not an existing
- string is available.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="constprop">-constprop: Simple constant propagation</a>
-</h3>
-<div>
- <p>This file implements constant propagation and merging. It looks for
- instructions involving only constant operands and replaces them with a
- constant value instead of an instruction. For example:</p>
- <blockquote><pre>add i32 1, 2</pre></blockquote>
- <p>becomes</p>
- <blockquote><pre>i32 3</pre></blockquote>
- <p>NOTE: this pass has a habit of making definitions be dead. It is a good
- idea to run a <a href="#die">DIE</a> (Dead Instruction Elimination) pass
- sometime after running this pass.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="dce">-dce: Dead Code Elimination</a>
-</h3>
-<div>
- <p>
- Dead code elimination is similar to <a href="#die">dead instruction
- elimination</a>, but it rechecks instructions that were used by removed
- instructions to see if they are newly dead.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="deadargelim">-deadargelim: Dead Argument Elimination</a>
-</h3>
-<div>
- <p>
- This pass deletes dead arguments from internal functions. Dead argument
- elimination removes arguments which are directly dead, as well as arguments
- only passed into function calls as dead arguments of other functions. This
- pass also deletes dead return values in a similar way.
- </p>
-
- <p>
- This pass is often useful as a cleanup pass to run after aggressive
- interprocedural passes, which add possibly-dead arguments.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="deadtypeelim">-deadtypeelim: Dead Type Elimination</a>
-</h3>
-<div>
- <p>
- This pass is used to clean up the output of GCC. It eliminates names for types
- that are unused in the entire translation unit, using the <a
- href="#findusedtypes">find used types</a> pass.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="die">-die: Dead Instruction Elimination</a>
-</h3>
-<div>
- <p>
- Dead instruction elimination performs a single pass over the function,
- removing instructions that are obviously dead.
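- </p>
-
- <p>
- For example (a hypothetical snippet, not taken from the LLVM sources), the
- <tt>add</tt> below has no uses and no side effects, so it is trivially dead
- and is deleted:
- </p>
-
-<blockquote><pre
->%unused = add i32 %x, 1    ; no uses, so -die deletes it
-ret i32 %x</pre></blockquote>
-
- <p>
- Deleting one instruction can make the instructions that computed its operands
- dead in turn; rechecking those operands is what distinguishes the
- <a href="#dce">-dce</a> pass described above.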
- </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="dse">-dse: Dead Store Elimination</a> -</h3> -<div> - <p> - A trivial dead store elimination that only considers basic-block local - redundant stores. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="functionattrs">-functionattrs: Deduce function attributes</a> -</h3> -<div> - <p>A simple interprocedural pass which walks the call-graph, looking for - functions which do not access or only read non-local memory, and marking them - readnone/readonly. In addition, it marks function arguments (of pointer type) - 'nocapture' if a call to the function does not create any copies of the pointer - value that outlive the call. This more or less means that the pointer is only - dereferenced, and not returned from the function or stored in a global. - This pass is implemented as a bottom-up traversal of the call-graph. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="globaldce">-globaldce: Dead Global Elimination</a> -</h3> -<div> - <p> - This transform is designed to eliminate unreachable internal globals from the - program. It uses an aggressive algorithm, searching out globals that are - known to be alive. After it finds all of the globals which are needed, it - deletes whatever is left over. This allows it to delete recursive chunks of - the program which are unreachable. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="globalopt">-globalopt: Global Variable Optimizer</a> -</h3> -<div> - <p> - This pass transforms simple global variables that never have their address - taken. If obviously true, it marks read/write globals as constant, deletes - variables only stored to, etc. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="gvn">-gvn: Global Value Numbering</a> -</h3> -<div> - <p> - This pass performs global value numbering to eliminate fully and partially - redundant instructions. It also performs redundant load elimination. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="indvars">-indvars: Canonicalize Induction Variables</a> -</h3> -<div> - <p> - This transformation analyzes and transforms the induction variables (and - computations derived from them) into simpler forms suitable for subsequent - analysis and transformation. - </p> - - <p> - This transformation makes the following changes to each loop with an - identifiable induction variable: - </p> - - <ol> - <li>All loops are transformed to have a <em>single</em> canonical - induction variable which starts at zero and steps by one.</li> - <li>The canonical induction variable is guaranteed to be the first PHI node - in the loop header block.</li> - <li>Any pointer arithmetic recurrences are raised to use array - subscripts.</li> - </ol> - - <p> - If the trip count of a loop is computable, this pass also makes the following - changes: - </p> - - <ol> - <li>The exit condition for the loop is canonicalized to compare the - induction value against the exit value. 
This turns loops like:
- <blockquote><pre>for (i = 7; i*i < 1000; ++i)</pre></blockquote>
- into
- <blockquote><pre>for (i = 0; i != 25; ++i)</pre></blockquote></li>
- <li>Any use outside of the loop of an expression derived from the indvar
- is changed to compute the derived value outside of the loop, eliminating
- the dependence on the exit value of the induction variable. If the only
- purpose of the loop is to compute the exit value of some derived
- expression, this transformation will make the loop dead.</li>
- </ol>
-
- <p>
- This transformation should be followed by strength reduction after all of the
- desired loop transformations have been performed. Additionally, on targets
- where it is profitable, the loop could be transformed to count down to zero
- (the "do loop" optimization).
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="inline">-inline: Function Integration/Inlining</a>
-</h3>
-<div>
- <p>
- Bottom-up inlining of functions into callers.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="insert-edge-profiling">-insert-edge-profiling: Insert instrumentation for edge profiling</a>
-</h3>
-<div>
- <p>
- This pass instruments the specified program with counters for edge profiling.
- Edge profiling can give a reasonable approximation of the hot paths through a
- program, and is used for a wide variety of program transformations.
- </p>
-
- <p>
- Note that this implementation is very naïve. It inserts a counter for
- <em>every</em> edge in the program, instead of using control flow information
- to prune the number of counters inserted.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="insert-optimal-edge-profiling">-insert-optimal-edge-profiling: Insert optimal instrumentation for edge profiling</a>
-</h3>
-<div>
- <p>This pass instruments the specified program with counters for edge profiling.
- Edge profiling can give a reasonable approximation of the hot paths through a
- program, and is used for a wide variety of program transformations.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="instcombine">-instcombine: Combine redundant instructions</a>
-</h3>
-<div>
- <p>
- Combine instructions to form fewer, simple
- instructions. This pass does not modify the CFG. This pass is where algebraic
- simplification happens.
- </p>
-
- <p>
- This pass combines things like:
- </p>
-
-<blockquote><pre
->%Y = add i32 %X, 1
-%Z = add i32 %Y, 1</pre></blockquote>
-
- <p>
- into:
- </p>
-
-<blockquote><pre
->%Z = add i32 %X, 2</pre></blockquote>
-
- <p>
- This is a simple worklist driven algorithm.
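- </p>
-
- <p>
- As another example of a rewrite this pass performs (a hypothetical snippet,
- not taken from the LLVM sources), a multiply by a constant power of two is
- turned into a shift:
- </p>
-
-<blockquote><pre
->%Y = mul i32 %X, 8</pre></blockquote>
-
- <p>
- becomes:
- </p>
-
-<blockquote><pre
->%Y = shl i32 %X, 3</pre></blockquote>
-
- <p>
- (This rewrite is one of the guaranteed canonicalizations listed below.)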
- </p> - - <p> - This pass guarantees that the following canonicalizations are performed on - the program: - </p> - - <ul> - <li>If a binary operator has a constant operand, it is moved to the right- - hand side.</li> - <li>Bitwise operators with constant operands are always grouped so that - shifts are performed first, then <code>or</code>s, then - <code>and</code>s, then <code>xor</code>s.</li> - <li>Compare instructions are converted from <code><</code>, - <code>></code>, <code>≤</code>, or <code>≥</code> to - <code>=</code> or <code>≠</code> if possible.</li> - <li>All <code>cmp</code> instructions on boolean values are replaced with - logical operations.</li> - <li><code>add <var>X</var>, <var>X</var></code> is represented as - <code>mul <var>X</var>, 2</code> ⇒ <code>shl <var>X</var>, 1</code></li> - <li>Multiplies with a constant power-of-two argument are transformed into - shifts.</li> - <li>… etc.</li> - </ul> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="internalize">-internalize: Internalize Global Symbols</a> -</h3> -<div> - <p> - This pass loops over all of the functions in the input module, looking for a - main function. If a main function is found, all other functions and all - global variables with initializers are marked as internal. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="ipconstprop">-ipconstprop: Interprocedural constant propagation</a> -</h3> -<div> - <p> - This pass implements an <em>extremely</em> simple interprocedural constant - propagation pass. It could certainly be improved in many different ways, - like using a worklist. This pass makes arguments dead, but does not remove - them. The existing dead argument elimination pass should be run after this - to clean up the mess. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="ipsccp">-ipsccp: Interprocedural Sparse Conditional Constant Propagation</a> -</h3> -<div> - <p> - An interprocedural variant of <a href="#sccp">Sparse Conditional Constant - Propagation</a>. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="jump-threading">-jump-threading: Jump Threading</a> -</h3> -<div> - <p> - Jump threading tries to find distinct threads of control flow running through - a basic block. This pass looks at blocks that have multiple predecessors and - multiple successors. If one or more of the predecessors of the block can be - proven to always cause a jump to one of the successors, we forward the edge - from the predecessor to the successor by duplicating the contents of this - block. - </p> - <p> - An example of when this can occur is code like this: - </p> - - <pre ->if () { ... - X = 4; -} -if (X < 3) {</pre> - - <p> - In this case, the unconditional branch at the end of the first if can be - revectored to the false side of the second if. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="lcssa">-lcssa: Loop-Closed SSA Form Pass</a> -</h3> -<div> - <p> - This pass transforms loops by placing phi nodes at the end of the loops for - all values that are live across the loop boundary. For example, it turns - the left into the right code: - </p> - - <pre ->for (...) for (...) - if (c) if (c) - X1 = ... X1 = ... - else else - X2 = ... X2 = ... - X3 = phi(X1, X2) X3 = phi(X1, X2) -... 
= X3 + 4 X4 = phi(X3) - ... = X4 + 4</pre> - - <p> - This is still valid LLVM; the extra phi nodes are purely redundant, and will - be trivially eliminated by <code>InstCombine</code>. The major benefit of - this transformation is that it makes many other loop optimizations, such as - LoopUnswitching, simpler. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="licm">-licm: Loop Invariant Code Motion</a> -</h3> -<div> - <p> - This pass performs loop invariant code motion, attempting to remove as much - code from the body of a loop as possible. It does this by either hoisting - code into the preheader block, or by sinking code to the exit blocks if it is - safe. This pass also promotes must-aliased memory locations in the loop to - live in registers, thus hoisting and sinking "invariant" loads and stores. - </p> - - <p> - This pass uses alias analysis for two purposes: - </p> - - <ul> - <li>Moving loop invariant loads and calls out of loops. If we can determine - that a load or call inside of a loop never aliases anything stored to, - we can hoist it or sink it like any other instruction.</li> - <li>Scalar Promotion of Memory - If there is a store instruction inside of - the loop, we try to move the store to happen AFTER the loop instead of - inside of the loop. This can only happen if a few conditions are true: - <ul> - <li>The pointer stored through is loop invariant.</li> - <li>There are no stores or loads in the loop which <em>may</em> alias - the pointer. There are no calls in the loop which mod/ref the - pointer.</li> - </ul> - If these conditions are true, we can promote the loads and stores in the - loop of the pointer to use a temporary alloca'd variable. We then use - the mem2reg functionality to construct the appropriate SSA form for the - variable.</li> - </ul> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-deletion">-loop-deletion: Delete dead loops</a> -</h3> -<div> - <p> - This file implements the Dead Loop Deletion Pass. This pass is responsible - for eliminating loops with non-infinite computable trip counts that have no - side effects or volatile instructions, and do not contribute to the - computation of the function's return value. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-extract">-loop-extract: Extract loops into new functions</a> -</h3> -<div> - <p> - A pass wrapper around the <code>ExtractLoop()</code> scalar transformation to - extract each top-level loop into its own new function. If the loop is the - <em>only</em> loop in a given function, it is not touched. This is a pass most - useful for debugging via bugpoint. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-extract-single">-loop-extract-single: Extract at most one loop into a new function</a> -</h3> -<div> - <p> - Similar to <a href="#loop-extract">Extract loops into new functions</a>, - this pass extracts one natural loop from the program into a function if it - can. This is used by bugpoint. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-reduce">-loop-reduce: Loop Strength Reduction</a> -</h3> -<div> - <p> - This pass performs a strength reduction on array references inside loops that - have as one or more of their components the loop induction variable. 
This is - accomplished by creating a new value to hold the initial value of the array - access for the first iteration, and then creating a new GEP instruction in - the loop to increment the value by the appropriate amount. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-rotate">-loop-rotate: Rotate Loops</a> -</h3> -<div> - <p>A simple loop rotation transformation.</p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-simplify">-loop-simplify: Canonicalize natural loops</a> -</h3> -<div> - <p> - This pass performs several transformations to transform natural loops into a - simpler form, which makes subsequent analyses and transformations simpler and - more effective. - </p> - - <p> - Loop pre-header insertion guarantees that there is a single, non-critical - entry edge from outside of the loop to the loop header. This simplifies a - number of analyses and transformations, such as LICM. - </p> - - <p> - Loop exit-block insertion guarantees that all exit blocks from the loop - (blocks which are outside of the loop that have predecessors inside of the - loop) only have predecessors from inside of the loop (and are thus dominated - by the loop header). This simplifies transformations such as store-sinking - that are built into LICM. - </p> - - <p> - This pass also guarantees that loops will have exactly one backedge. - </p> - - <p> - Note that the simplifycfg pass will clean up blocks which are split out but - end up being unnecessary, so usage of this pass should not pessimize - generated code. - </p> - - <p> - This pass obviously modifies the CFG, but updates loop information and - dominator information. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-unroll">-loop-unroll: Unroll loops</a> -</h3> -<div> - <p> - This pass implements a simple loop unroller. It works best when loops have - been canonicalized by the <a href="#indvars"><tt>-indvars</tt></a> pass, - allowing it to determine the trip counts of loops easily. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loop-unswitch">-loop-unswitch: Unswitch loops</a> -</h3> -<div> - <p> - This pass transforms loops that contain branches on loop-invariant conditions - to have multiple loops. For example, it turns the left into the right code: - </p> - - <pre ->for (...) if (lic) - A for (...) - if (lic) A; B; C - B else - C for (...) - A; C</pre> - - <p> - This can increase the size of the code exponentially (doubling it every time - a loop is unswitched) so we only unswitch if the resultant code will be - smaller than a threshold. - </p> - - <p> - This pass expects LICM to be run before it to hoist invariant conditions out - of the loop, to make the unswitching opportunity obvious. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="loweratomic">-loweratomic: Lower atomic intrinsics to non-atomic form</a> -</h3> -<div> - <p> - This pass lowers atomic intrinsics to non-atomic form for use in a known - non-preemptible environment. - </p> - - <p> - The pass does not verify that the environment is non-preemptible (in - general this would require knowledge of the entire call graph of the - program including any libraries which may not be available in bitcode form); - it simply lowers every atomic intrinsic. 
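- </p>
-
- <p>
- A sketch of the kind of rewrite involved (a hypothetical snippet, not taken
- from the LLVM sources): an atomic read-modify-write is replaced by its plain,
- non-atomic equivalent:
- </p>
-
-<blockquote><pre
->%old = atomicrmw add i32* %ptr, i32 1 seq_cst</pre></blockquote>
-
- <p>
- becomes:
- </p>
-
-<blockquote><pre
->%old = load i32* %ptr          ; ordinary load
-%new = add i32 %old, 1
-store i32 %new, i32* %ptr      ; ordinary store</pre></blockquote>
-
- <p>
- This is only sound when no other thread or signal handler can touch
- <tt>%ptr</tt> concurrently, which is exactly the non-preemptible assumption
- described above.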
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="lowerinvoke">-lowerinvoke: Lower invoke and unwind, for unwindless code generators</a>
-</h3>
-<div>
- <p>
- This transformation is designed for use by code generators which do not yet
- support stack unwinding. This pass supports two models of exception handling
- lowering, the 'cheap' support and the 'expensive' support.
- </p>
-
- <p>
- 'Cheap' exception handling support gives the program the ability to execute
- any program which does not "throw an exception", by turning 'invoke'
- instructions into calls and by turning 'unwind' instructions into calls to
- abort(). If the program does dynamically use the unwind instruction, the
- program will print a message then abort.
- </p>
-
- <p>
- 'Expensive' exception handling support gives the full exception handling
- support to the program at the cost of making the 'invoke' instruction
- really expensive. It basically inserts setjmp/longjmp calls to emulate the
- exception handling as necessary.
- </p>
-
- <p>
- Because the 'expensive' support slows down programs a lot, and EH is only
- used for a subset of the programs, it must be specifically enabled by the
- <tt>-enable-correct-eh-support</tt> option.
- </p>
-
- <p>
- Note that after this pass runs the CFG is not entirely accurate (exceptional
- control flow edges are not correct anymore) so only very simple things should
- be done after the lowerinvoke pass has run (like generation of native code).
- This should not be used as a general purpose "my LLVM-to-LLVM pass doesn't
- support the invoke instruction yet" lowering pass.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="lowerswitch">-lowerswitch: Lower SwitchInst's to branches</a>
-</h3>
-<div>
- <p>
- Rewrites <tt>switch</tt> instructions with a sequence of branches, which
- allows targets to get away with not implementing the switch instruction until
- it is convenient.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="mem2reg">-mem2reg: Promote Memory to Register</a>
-</h3>
-<div>
- <p>
- This file promotes memory references to be register references. It promotes
- <tt>alloca</tt> instructions which only have <tt>load</tt>s and
- <tt>store</tt>s as uses. An <tt>alloca</tt> is transformed by using dominator
- frontiers to place <tt>phi</tt> nodes, then traversing the function in
- depth-first order to rewrite <tt>load</tt>s and <tt>store</tt>s as
- appropriate. This is just the standard SSA construction algorithm to construct
- "pruned" SSA form.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="memcpyopt">-memcpyopt: MemCpy Optimization</a>
-</h3>
-<div>
- <p>
- This pass performs various transformations related to eliminating memcpy
- calls, or transforming sets of stores into memsets.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="mergefunc">-mergefunc: Merge Functions</a>
-</h3>
-<div>
- <p>This pass looks for equivalent functions that are mergeable and folds them.
-
- A hash is computed from the function, based on its type and number of
- basic blocks.
-
- Once all hashes are computed, we perform an expensive equality comparison
- on each function pair.
This takes n^2/2 comparisons per bucket, so it's - important that the hash function be high quality. The equality comparison - iterates through each instruction in each basic block. - - When a match is found the functions are folded. If both functions are - overridable, we move the functionality into a new internal function and - leave two overridable thunks to it. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="mergereturn">-mergereturn: Unify function exit nodes</a> -</h3> -<div> - <p> - Ensure that functions have at most one <tt>ret</tt> instruction in them. - Additionally, it keeps track of which node is the new exit node of the CFG. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="partial-inliner">-partial-inliner: Partial Inliner</a> -</h3> -<div> - <p>This pass performs partial inlining, typically by inlining an if - statement that surrounds the body of the function. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="prune-eh">-prune-eh: Remove unused exception handling info</a> -</h3> -<div> - <p> - This file implements a simple interprocedural pass which walks the call-graph, - turning <tt>invoke</tt> instructions into <tt>call</tt> instructions if and - only if the callee cannot throw an exception. It implements this as a - bottom-up traversal of the call-graph. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="reassociate">-reassociate: Reassociate expressions</a> -</h3> -<div> - <p> - This pass reassociates commutative expressions in an order that is designed - to promote better constant propagation, GCSE, LICM, PRE, etc. - </p> - - <p> - For example: 4 + (<var>x</var> + 5) ⇒ <var>x</var> + (4 + 5) - </p> - - <p> - In the implementation of this algorithm, constants are assigned rank = 0, - function arguments are rank = 1, and other values are assigned ranks - corresponding to the reverse post order traversal of current function - (starting at 2), which effectively gives values in deep loops higher rank - than values not in loops. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="reg2mem">-reg2mem: Demote all values to stack slots</a> -</h3> -<div> - <p> - This file demotes all registers to memory references. It is intended to be - the inverse of <a href="#mem2reg"><tt>-mem2reg</tt></a>. By converting to - <tt>load</tt> instructions, the only values live across basic blocks are - <tt>alloca</tt> instructions and <tt>load</tt> instructions before - <tt>phi</tt> nodes. It is intended that this should make CFG hacking much - easier. To make later hacking easier, the entry block is split into two, such - that all introduced <tt>alloca</tt> instructions (and nothing else) are in the - entry block. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="scalarrepl">-scalarrepl: Scalar Replacement of Aggregates (DT)</a> -</h3> -<div> - <p> - The well-known scalar replacement of aggregates transformation. This - transform breaks up <tt>alloca</tt> instructions of aggregate type (structure - or array) into individual <tt>alloca</tt> instructions for each member if - possible. Then, if possible, it transforms the individual <tt>alloca</tt> - instructions into nice clean scalar SSA form. 
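- </p>
-
- <p>
- For instance (a hypothetical snippet, not taken from the LLVM sources), an
- <tt>alloca</tt> of a two-element structure whose fields are only ever
- accessed individually:
- </p>
-
-<blockquote><pre
->%pair = alloca { i32, i32 }</pre></blockquote>
-
- <p>
- can be replaced by one <tt>alloca</tt> per member, with each field access
- rewritten to use the corresponding scalar:
- </p>
-
-<blockquote><pre
->%pair.0 = alloca i32
-%pair.1 = alloca i32</pre></blockquote>
-
- <p>
- The scalar <tt>alloca</tt>s are then candidates for promotion to SSA
- registers, which is why this pass is combined with <tt>mem2reg</tt> as
- described next.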
- </p>
-
- <p>
- This combines a simple scalar replacement of aggregates algorithm with the <a
- href="#mem2reg"><tt>mem2reg</tt></a> algorithm because the two often interact,
- especially for C++ programs. As such, iterating between <tt>scalarrepl</tt>,
- then <a href="#mem2reg"><tt>mem2reg</tt></a> until we run out of things to
- promote works well.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="sccp">-sccp: Sparse Conditional Constant Propagation</a>
-</h3>
-<div>
- <p>
- Sparse conditional constant propagation and merging, which can be summarized
- as:
- </p>
-
- <ol>
- <li>Assumes values are constant unless proven otherwise</li>
- <li>Assumes BasicBlocks are dead unless proven otherwise</li>
- <li>Proves values to be constant, and replaces them with constants</li>
- <li>Proves conditional branches to be unconditional</li>
- </ol>
-
- <p>
- Note that this pass has a habit of making definitions be dead. It is a good
- idea to run a DCE pass sometime after running this pass.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="simplify-libcalls">-simplify-libcalls: Simplify well-known library calls</a>
-</h3>
-<div>
- <p>
- Applies a variety of small optimizations for calls to specific well-known
- function calls (e.g. runtime library functions). For example, a call
- <tt>exit(3)</tt> that occurs within the <tt>main()</tt> function can be
- transformed into simply <tt>return 3</tt>.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="simplifycfg">-simplifycfg: Simplify the CFG</a>
-</h3>
-<div>
- <p>
- Performs dead code elimination and basic block merging. Specifically:
- </p>
-
- <ol>
- <li>Removes basic blocks with no predecessors.</li>
- <li>Merges a basic block into its predecessor if there is only one and the
- predecessor only has one successor.</li>
- <li>Eliminates PHI nodes for basic blocks with a single predecessor.</li>
- <li>Eliminates a basic block that only contains an unconditional
- branch.</li>
- </ol>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="sink">-sink: Code sinking</a>
-</h3>
-<div>
- <p>This pass moves instructions into successor blocks, when possible, so that
- they aren't executed on paths where their results aren't needed.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="strip">-strip: Strip all symbols from a module</a>
-</h3>
-<div>
- <p>
- Performs code stripping. This transformation can delete:
- </p>
-
- <ol>
- <li>names for virtual registers</li>
- <li>symbols for internal globals and functions</li>
- <li>debug information</li>
- </ol>
-
- <p>
- Note that this transformation makes code much less readable, so it should
- only be used in situations where the <tt>strip</tt> utility would be used,
- such as reducing code size or making it harder to reverse engineer code.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="strip-dead-debug-info">-strip-dead-debug-info: Strip debug info for unused symbols</a>
-</h3>
-<div>
- <p>
- Performs code stripping. This transformation can delete:
- </p>
-
- <ol>
- <li>names for virtual registers</li>
- <li>symbols for internal globals and functions</li>
- <li>debug information</li>
- </ol>
-
- <p>
- Note that this transformation makes code much less readable, so it should
- only be used in situations where the <tt>strip</tt> utility would be used,
- such as reducing code size or making it harder to reverse engineer code.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="strip-dead-prototypes">-strip-dead-prototypes: Strip Unused Function Prototypes</a>
-</h3>
-<div>
- <p>
- This pass loops over all of the functions in the input module, looking for
- dead declarations and removing them. Dead declarations are declarations of
- functions for which no implementation is available (i.e., declarations for
- unused library functions).
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="strip-debug-declare">-strip-debug-declare: Strip all llvm.dbg.declare intrinsics</a>
-</h3>
-<div>
- <p>This pass implements code stripping. Specifically, it can delete:</p>
- <ul>
- <li>names for virtual registers</li>
- <li>symbols for internal globals and functions</li>
- <li>debug information</li>
- </ul>
- <p>
- Note that this transformation makes code much less readable, so it should
- only be used in situations where the 'strip' utility would be used, such as
- reducing code size or making it harder to reverse engineer code.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="strip-nondebug">-strip-nondebug: Strip all symbols, except dbg symbols, from a module</a>
-</h3>
-<div>
- <p>This pass implements code stripping. Specifically, it can delete:</p>
- <ul>
- <li>names for virtual registers</li>
- <li>symbols for internal globals and functions</li>
- <li>debug information</li>
- </ul>
- <p>
- Note that this transformation makes code much less readable, so it should
- only be used in situations where the 'strip' utility would be used, such as
- reducing code size or making it harder to reverse engineer code.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="tailcallelim">-tailcallelim: Tail Call Elimination</a>
-</h3>
-<div>
- <p>
- This file transforms calls of the current function (self recursion) followed
- by a return instruction with a branch to the entry of the function, creating
- a loop. This pass also implements the following extensions to the basic
- algorithm:
- </p>
-
- <ul>
- <li>Trivial instructions between the call and return do not prevent the
- transformation from taking place, though currently the analysis cannot
- support moving any really useful instructions (only dead ones).
- <li>This pass transforms functions that are prevented from being tail
- recursive by an associative expression to use an accumulator variable,
- thus compiling the typical naive factorial or <tt>fib</tt> implementation
- into efficient code.
- <li>TRE is performed if the function returns void, if the return
- returns the result returned by the call, or if the function returns a
- run-time constant on all exits from the function. It is possible, though
- unlikely, that the return returns something else (like constant 0), and
- can still be TRE'd. It can be TRE'd if <em>all other</em> return
- instructions in the function return the exact same value.
- <li>If it can prove that callees do not access their caller stack frame,
- they are marked as eligible for tail call elimination (by the code
- generator).
- </ul>
-</div>
-
-<!-- ======================================================================= -->
-<h2><a name="utilities">Utility Passes</a></h2>
-<div>
- <p>This section describes the LLVM Utility Passes.</p>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="deadarghaX0r">-deadarghaX0r: Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)</a>
-</h3>
-<div>
- <p>
- Same as dead argument elimination, but deletes arguments to functions which
- are external. This is only for use by <a
- href="Bugpoint.html">bugpoint</a>.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="extract-blocks">-extract-blocks: Extract Basic Blocks From Module (for bugpoint use)</a>
-</h3>
-<div>
- <p>
- This pass is used by bugpoint to extract all blocks from the module into their
- own functions.</p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="instnamer">-instnamer: Assign names to anonymous instructions</a>
-</h3>
-<div>
- <p>This is a little utility pass that gives instructions names; this is mostly
- useful when diffing the effect of an optimization because deleting an
- unnamed instruction can change all other instruction numbering, making the
- diff very noisy.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="preverify">-preverify: Preliminary module verification</a>
-</h3>
-<div>
- <p>
- Ensures that the module is in the form required by the <a
- href="#verifier">Module Verifier</a> pass.
- </p>
-
- <p>
- Running the verifier runs this pass automatically, so there should be no need
- to use it directly.
- </p>
-</div>
-
-<!-------------------------------------------------------------------------- -->
-<h3>
- <a name="verify">-verify: Module Verifier</a>
-</h3>
-<div>
- <p>
- Verifies LLVM IR code. This is useful to run after an optimization which is
- undergoing testing. Note that <tt>llvm-as</tt> verifies its input before
- emitting bitcode, and also that malformed bitcode is likely to make LLVM
- crash. All language front-ends are therefore encouraged to verify their output
- before performing optimizing transformations.
- </p>
-
- <ul>
- <li>Both of a binary operator's parameters are of the same type.</li>
- <li>Verify that the indices of mem access instructions match other
- operands.</li>
- <li>Verify that arithmetic and other things are only performed on
- first-class types.
Verify that shifts and logicals only happen on - integrals f.e.</li> - <li>All of the constants in a switch statement are of the correct type.</li> - <li>The code is in valid SSA form.</li> - <li>It is illegal to put a label into any other type (like a structure) or - to return one.</li> - <li>Only phi nodes can be self referential: <tt>%x = add i32 %x, %x</tt> is - invalid.</li> - <li>PHI nodes must have an entry for each predecessor, with no extras.</li> - <li>PHI nodes must be the first thing in a basic block, all grouped - together.</li> - <li>PHI nodes must have at least one entry.</li> - <li>All basic blocks should only end with terminator insts, not contain - them.</li> - <li>The entry node to a function must not have predecessors.</li> - <li>All Instructions must be embedded into a basic block.</li> - <li>Functions cannot take a void-typed parameter.</li> - <li>Verify that a function's argument list agrees with its declared - type.</li> - <li>It is illegal to specify a name for a void value.</li> - <li>It is illegal to have an internal global value with no initializer.</li> - <li>It is illegal to have a ret instruction that returns a value that does - not agree with the function return value type.</li> - <li>Function call argument types match the function prototype.</li> - <li>All other things that are tested by asserts spread about the code.</li> - </ul> - - <p> - Note that this does not provide full security verification (like Java), but - instead just tries to ensure that code is well-formed. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="view-cfg">-view-cfg: View CFG of function</a> -</h3> -<div> - <p> - Displays the control flow graph using the GraphViz tool. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="view-cfg-only">-view-cfg-only: View CFG of function (with no function bodies)</a> -</h3> -<div> - <p> - Displays the control flow graph using the GraphViz tool, but omitting function - bodies. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="view-dom">-view-dom: View dominance tree of function</a> -</h3> -<div> - <p> - Displays the dominator tree using the GraphViz tool. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="view-dom-only">-view-dom-only: View dominance tree of function (with no function bodies)</a> -</h3> -<div> - <p> - Displays the dominator tree using the GraphViz tool, but omitting function - bodies. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="view-postdom">-view-postdom: View postdominance tree of function</a> -</h3> -<div> - <p> - Displays the post dominator tree using the GraphViz tool. - </p> -</div> - -<!-------------------------------------------------------------------------- --> -<h3> - <a name="view-postdom-only">-view-postdom-only: View postdominance tree of function (with no function bodies)</a> -</h3> -<div> - <p> - Displays the post dominator tree using the GraphViz tool, but omitting - function bodies. 
-  </p>
-</div>
-
-</div>
-
-<!-- *********************************************************************** -->
-
-<hr>
-<address>
-  <a href="http://jigsaw.w3.org/css-validator/check/referer"><img
-  src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a>
-  <a href="http://validator.w3.org/check/referer"><img
-  src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a>
-
-  <a href="mailto:rspencer@x10sys.com">Reid Spencer</a><br>
-  <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br>
-  Last modified: $Date$
-</address>
-
-</body>
-</html>
diff --git a/docs/Passes.rst b/docs/Passes.rst
new file mode 100644
index 0000000000..ed72166663
--- /dev/null
+++ b/docs/Passes.rst
@@ -0,0 +1,1264 @@
+..
+   If Passes.html is up to date, the following "one-liner" should print
+   an empty diff.
+
+   egrep -e '^<tr><td><a href="#.*">-.*</a></td><td>.*</td></tr>$' \
+         -e '^  <a name=".*">.*</a>$' < Passes.html >html; \
+   perl >help <<'EOT' && diff -u help html; rm -f help html
+   open HTML, "<Passes.html" or die "open: Passes.html: $!\n";
+   while (<HTML>) {
+     m:^<tr><td><a href="#(.*)">-.*</a></td><td>.*</td></tr>$: or next;
+     $order{$1} = sprintf("%03d", 1 + int %order);
+   }
+   open HELP, "../Release/bin/opt -help|" or die "open: opt -help: $!\n";
+   while (<HELP>) {
+     m:^    -([^ ]+) +- (.*)$: or next;
+     my $o = $order{$1};
+     $o = "000" unless defined $o;
+     push @x, "$o<tr><td><a href=\"#$1\">-$1</a></td><td>$2</td></tr>\n";
+     push @y, "$o  <a name=\"$1\">-$1: $2</a>\n";
+   }
+   @x = map { s/^\d\d\d//; $_ } sort @x;
+   @y = map { s/^\d\d\d//; $_ } sort @y;
+   print @x, @y;
+   EOT
+
+   This (real) one-liner can also be helpful when converting comments to HTML:
+
+   perl -e '$/ = undef; for (split(/\n/, <>)) { s:^ *///? ?::; print "  <p>\n" if !$on && $_ =~ /\S/; print "  </p>\n" if $on && $_ =~ /^\s*$/; print "  $_\n"; $on = ($_ =~ /\S/); } print "  </p>\n" if $on'
+
+====================================
+LLVM's Analysis and Transform Passes
+====================================
+
+.. contents::
+    :local:
+
+Written by `Reid Spencer <mailto:rspencer@x10sys.com>`_
+           and Gordon Henriksen
+
+Introduction
+============
+
+This document serves as a high level summary of the optimization features that
+LLVM provides. Optimizations are implemented as Passes that traverse some
+portion of a program to either collect information or transform the program.
+The table below divides the passes that LLVM provides into three categories.
+Analysis passes compute information that other passes can use, or that serves
+debugging or program visualization purposes. Transform passes can use (or
+invalidate) the analysis passes; all of them mutate the program in some way.
+Utility passes provide some utility but don't otherwise fit either
+categorization. For example, passes to extract functions to bitcode or write a
+module to bitcode are neither analysis nor transform passes. The table of
+contents above provides a quick summary of each pass and links to the more
+complete pass description later in the document.
+
+Analysis Passes
+===============
+
+This section describes the LLVM Analysis Passes.
+
+``-aa-eval``: Exhaustive Alias Analysis Precision Evaluator
+-----------------------------------------------------------
+
+This is a simple N^2 alias analysis accuracy evaluator. Basically, for each
+function in the program, it simply queries to see how the alias analysis
+implementation answers alias queries between each pair of pointers in the
+function.
+
+This is inspired and adapted from code by: Naveen Neelakantam, Francesco
+Spadini, and Wojciech Stryjewski.
+
+``-basicaa``: Basic Alias Analysis (stateless AA impl)
+------------------------------------------------------
+
+A basic alias analysis pass that implements identities (two different globals
+cannot alias, etc.), but does no stateful analysis.
+
+``-basiccg``: Basic CallGraph Construction
+------------------------------------------
+
+Yet to be written.
+
+``-count-aa``: Count Alias Analysis Query Responses
+---------------------------------------------------
+
+A pass which can be used to count how many alias queries are being made and how
+the alias analysis implementation being used responds.
+
+``-da``: Dependence Analysis
+----------------------------
+
+Dependence analysis framework, which is used to detect dependences in memory
+accesses.
+
+``-debug-aa``: AA use debugger
+------------------------------
+
+This simple pass checks alias analysis users to ensure that if they create a
+new value, they do not query AA without informing it of the value. It acts as
+a shim over any other AA pass you want.
+
+Yes, keeping track of every value in the program is expensive, but this is a
+debugging pass.
+
+``-domfrontier``: Dominance Frontier Construction
+-------------------------------------------------
+
+This pass is a simple dominator construction algorithm for finding forward
+dominator frontiers.
+
+``-domtree``: Dominator Tree Construction
+-----------------------------------------
+
+This pass is a simple dominator construction algorithm for finding forward
+dominators.
+
+``-dot-callgraph``: Print Call Graph to "dot" file
+--------------------------------------------------
+
+This pass, only available in ``opt``, prints the call graph into a ``.dot``
+graph. This graph can then be processed with the :program:`dot` tool to
+convert it to postscript or some other suitable format.
+
+``-dot-cfg``: Print CFG of function to "dot" file
+-------------------------------------------------
+
+This pass, only available in ``opt``, prints the control flow graph into a
+``.dot`` graph. This graph can then be processed with the :program:`dot` tool
+to convert it to postscript or some other suitable format.
+
+``-dot-cfg-only``: Print CFG of function to "dot" file (with no function bodies)
+--------------------------------------------------------------------------------
+
+This pass, only available in ``opt``, prints the control flow graph into a
+``.dot`` graph, omitting the function bodies. This graph can then be processed
+with the :program:`dot` tool to convert it to postscript or some other suitable
+format.
+
+``-dot-dom``: Print dominance tree of function to "dot" file
+------------------------------------------------------------
+
+This pass, only available in ``opt``, prints the dominator tree into a ``.dot``
+graph. This graph can then be processed with the :program:`dot` tool to
+convert it to postscript or some other suitable format.
+
+``-dot-dom-only``: Print dominance tree of function to "dot" file (with no function bodies)
+-------------------------------------------------------------------------------------------
+
+This pass, only available in ``opt``, prints the dominator tree into a ``.dot``
+graph, omitting the function bodies. This graph can then be processed with the
+:program:`dot` tool to convert it to postscript or some other suitable format.
+
+``-dot-postdom``: Print postdominance tree of function to "dot" file
+--------------------------------------------------------------------
+
+This pass, only available in ``opt``, prints the post dominator tree into a
+``.dot`` graph. This graph can then be processed with the :program:`dot` tool
+to convert it to postscript or some other suitable format.
+
+``-dot-postdom-only``: Print postdominance tree of function to "dot" file (with no function bodies)
+---------------------------------------------------------------------------------------------------
+
+This pass, only available in ``opt``, prints the post dominator tree into a
+``.dot`` graph, omitting the function bodies. This graph can then be processed
+with the :program:`dot` tool to convert it to postscript or some other suitable
+format.
+
+``-globalsmodref-aa``: Simple mod/ref analysis for globals
+----------------------------------------------------------
+
+This simple pass provides alias and mod/ref information for global values that
+do not have their address taken, and keeps track of whether functions read or
+write memory (are "pure"). For this simple (but very common) case, we can
+provide pretty accurate and useful information.
+
+``-instcount``: Counts the various types of ``Instruction``\ s
+--------------------------------------------------------------
+
+This pass collects the count of all instructions and reports them.
+
+``-intervals``: Interval Partition Construction
+-----------------------------------------------
+
+This analysis calculates and represents the interval partition of a function,
+or a preexisting interval partition.
+
+In this way, the interval partition may be used to reduce a flow graph down to
+its degenerate single node interval partition (unless it is irreducible).
+
+``-iv-users``: Induction Variable Users
+---------------------------------------
+
+Bookkeeping for "interesting" users of expressions computed from induction
+variables.
+
+``-lazy-value-info``: Lazy Value Information Analysis
+-----------------------------------------------------
+
+Interface for lazy computation of value constraint information.
+
+``-libcall-aa``: LibCall Alias Analysis
+---------------------------------------
+
+LibCall Alias Analysis.
+
+``-lint``: Statically lint-checks LLVM IR
+-----------------------------------------
+
+This pass statically checks for common and easily-identified constructs which
+produce undefined or likely unintended behavior in LLVM IR.
+
+It is not a guarantee of correctness, for two reasons. First, it isn't
+comprehensive. There are checks which could be done statically which are not
+yet implemented. Some of these are indicated by TODO comments, but those
+aren't comprehensive either. Second, many conditions cannot be checked
+statically. This pass does no dynamic instrumentation, so it can't check for
+all possible problems.
+
+Another limitation is that it assumes all code will be executed. A store
+through a null pointer in a basic block which is never reached is harmless, but
+this pass will warn about it anyway.
+
+Optimization passes may make conditions that this pass checks for more or less
+obvious. If an optimization pass appears to be introducing a warning, it may
+be that the optimization pass is merely exposing an existing condition in the
+code.
+
+This code may be run before :ref:`instcombine <passes-instcombine>`. In many
+cases, instcombine checks for the same kinds of things and turns instructions
+with undefined behavior into unreachable (or equivalent). Because of this,
+this pass makes some effort to look through bitcasts and so on.
+
+``-loops``: Natural Loop Information
+------------------------------------
+
+This analysis is used to identify natural loops and determine the loop depth of
+various nodes of the CFG. Note that the loops identified may actually be
+several natural loops that share the same header node... not just a single
+natural loop.
+
+``-memdep``: Memory Dependence Analysis
+---------------------------------------
+
+An analysis that determines, for a given memory operation, what preceding
+memory operations it depends on. It builds on alias analysis information, and
+tries to provide a lazy, caching interface to a common kind of alias
+information query.
+
+``-module-debuginfo``: Decodes module-level debug info
+------------------------------------------------------
+
+This pass decodes the debug info metadata in a module and prints it in a
+(sufficiently prepared) human-readable form.
+
+For example, run this pass from ``opt`` along with the ``-analyze`` option, and
+it'll print to standard output.
+
+``-no-aa``: No Alias Analysis (always returns 'may' alias)
+----------------------------------------------------------
+
+This is the default implementation of the Alias Analysis interface. It always
+returns "I don't know" for alias queries. NoAA is unlike other alias analysis
+implementations, in that it does not chain to a previous analysis. As such it
+doesn't follow many of the rules that other alias analyses must.
+
+``-no-profile``: No Profile Information
+---------------------------------------
+
+The default "no profile" implementation of the abstract ``ProfileInfo``
+interface.
+
+``-postdomfrontier``: Post-Dominance Frontier Construction
+----------------------------------------------------------
+
+This pass is a simple post-dominator construction algorithm for finding
+post-dominator frontiers.
+
+``-postdomtree``: Post-Dominator Tree Construction
+--------------------------------------------------
+
+This pass is a simple post-dominator construction algorithm for finding
+post-dominators.
+
+``-print-alias-sets``: Alias Set Printer
+----------------------------------------
+
+Yet to be written.
+
+``-print-callgraph``: Print a call graph
+----------------------------------------
+
+This pass, only available in ``opt``, prints the call graph to standard error
+in a human-readable form.
+
+``-print-callgraph-sccs``: Print SCCs of the Call Graph
+-------------------------------------------------------
+
+This pass, only available in ``opt``, prints the SCCs of the call graph to
+standard error in a human-readable form.
+
+``-print-cfg-sccs``: Print SCCs of each function CFG
+----------------------------------------------------
+
+This pass, only available in ``opt``, prints the SCCs of each function CFG to
+standard error in a human-readable form.
+
+``-print-dbginfo``: Print debug info in human readable form
+-----------------------------------------------------------
+
+Pass that prints instructions, and associated debug info:
+
+#. source/line/col information
+#. original variable name
+#. original type name
+
+``-print-dom-info``: Dominator Info Printer
+-------------------------------------------
+
+Dominator Info Printer.
+
+``-print-externalfnconstants``: Print external fn callsites passed constants
+----------------------------------------------------------------------------
+
+This pass, only available in ``opt``, prints out call sites to external
+functions that are called with constant arguments. This can be useful when
+looking for standard library functions we should constant fold or handle in
+alias analyses.
+
+``-print-function``: Print function to stderr
+---------------------------------------------
+
+The ``PrintFunctionPass`` class is designed to be pipelined with other
+``FunctionPasses``, and prints out the functions of the module as they are
+processed.
+
+``-print-module``: Print module to stderr
+-----------------------------------------
+
+This pass simply prints out the entire module when it is executed.
+
+.. _passes-print-used-types:
+
+``-print-used-types``: Find Used Types
+--------------------------------------
+
+This pass is used to seek out all of the types in use by the program. Note
+that this analysis explicitly does not include types only used by the symbol
+table.
+
+``-profile-estimator``: Estimate profiling information
+------------------------------------------------------
+
+A pass that estimates profiling information in a very crude and unimaginative
+way.
+
+``-profile-loader``: Load profile information from ``llvmprof.out``
+-------------------------------------------------------------------
+
+A concrete implementation of profiling information that loads the information
+from a profile dump file.
+
+``-profile-verifier``: Verify profiling information
+---------------------------------------------------
+
+Pass that checks profiling information for plausibility.
+
+``-regions``: Detect single entry single exit regions
+-----------------------------------------------------
+
+The ``RegionInfo`` pass detects single entry single exit regions in a function,
+where a region is defined as any subgraph that is connected to the remaining
+graph at only two spots. Furthermore, a hierarchical region tree is built.
+
+``-scalar-evolution``: Scalar Evolution Analysis
+------------------------------------------------
+
+The ``ScalarEvolution`` analysis can be used to analyze and categorize scalar
+expressions in loops. It specializes in recognizing general induction
+variables, representing them with the abstract and opaque ``SCEV`` class.
+Given this analysis, trip counts of loops and other important properties can be
+obtained.
+
+This analysis is primarily useful for induction variable substitution and
+strength reduction.
+
+``-scev-aa``: ScalarEvolution-based Alias Analysis
+--------------------------------------------------
+
+Simple alias analysis implemented in terms of ``ScalarEvolution`` queries.
+
+This differs from traditional loop dependence analysis in that it tests for
+dependencies within a single iteration of a loop, rather than dependencies
+between different iterations.
+
+``ScalarEvolution`` has a more complete understanding of pointer arithmetic
+than ``BasicAliasAnalysis``' collection of ad-hoc analyses.
+
+``-targetdata``: Target Data Layout
+-----------------------------------
+
+Provides other passes access to information about the size and alignment
+required by the target ABI for various data types.
+
+Transform Passes
+================
+
+This section describes the LLVM Transform Passes.
+
+``-adce``: Aggressive Dead Code Elimination
+-------------------------------------------
+
+ADCE aggressively tries to eliminate code. This pass is similar to :ref:`DCE
+<passes-dce>` but it assumes that values are dead until proven otherwise. This
+is similar to :ref:`SCCP <passes-sccp>`, except applied to the liveness of
+values.
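+
+As a hedged illustration (the IR below is a hypothetical example, not taken
+from the LLVM sources): neither ``%dead1`` nor ``%dead2`` reaches a return,
+store, or other live use, so ADCE presumes both dead and deletes them.
+
+.. code-block:: llvm
+
+   define i32 @f(i32 %x) {
+   entry:
+     %dead1 = mul i32 %x, 7     ; never proven live, so it is removed
+     %dead2 = add i32 %dead1, 3 ; likewise removed along with %dead1
+     ret i32 %x
+   }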
+
+``-always-inline``: Inliner for ``always_inline`` functions
+-----------------------------------------------------------
+
+A custom inliner that handles only functions that are marked as "always
+inline".
+
+``-argpromotion``: Promote 'by reference' arguments to scalars
+--------------------------------------------------------------
+
+This pass promotes "by reference" arguments to be "by value" arguments. In
+practice, this means looking for internal functions that have pointer
+arguments. If it can prove, through the use of alias analysis, that an
+argument is *only* loaded, then it can pass the value into the function instead
+of the address of the value. This can cause recursive simplification of code
+and lead to the elimination of allocas (especially in C++ template code like
+the STL).
+
+This pass also handles aggregate arguments that are passed into a function,
+scalarizing them if the elements of the aggregate are only loaded. Note that
+it refuses to scalarize aggregates which would require passing in more than
+three operands to the function, because passing thousands of operands for a
+large array or structure is unprofitable!
+
+Note that this transformation could also be done for arguments that are only
+stored to (returning the value instead), but this is not currently
+implemented. This case would be best handled when and if LLVM starts
+supporting multiple return values from functions.
+
+``-bb-vectorize``: Basic-Block Vectorization
+--------------------------------------------
+
+This pass combines instructions inside basic blocks to form vector
+instructions. It iterates over each basic block, attempting to pair compatible
+instructions, repeating this process until no additional pairs are selected for
+vectorization. When the outputs of some pair of compatible instructions are
+used as inputs by some other pair of compatible instructions, those pairs are
+part of a potential vectorization chain. Instruction pairs are only fused into
+vector instructions when they are part of a chain longer than some threshold
+length. Moreover, the pass attempts to find the best possible chain for each
+pair of compatible instructions. These heuristics are intended to prevent
+vectorization in cases where it would not yield a performance increase in the
+resulting code.
+
+``-block-placement``: Profile Guided Basic Block Placement
+----------------------------------------------------------
+
+This pass is a very simple profile guided basic block placement algorithm. The
+idea is to put frequently executed blocks together at the start of the function
+and hopefully increase the number of fall-through conditional branches. If
+there is no profile information for a particular function, this pass basically
+orders blocks in depth-first order.
+
+``-break-crit-edges``: Break critical edges in CFG
+--------------------------------------------------
+
+Break all of the critical edges in the CFG by inserting a dummy basic block.
+It may be "required" by passes that cannot deal with critical edges. This
+transformation obviously invalidates the CFG, but can update forward dominator
+(set, immediate dominators, tree, and frontier) information.
+
+``-codegenprepare``: Optimize for code generation
+-------------------------------------------------
+
+This pass munges the code in the input function to better prepare it for
+SelectionDAG-based code generation. This works around limitations in its
+basic-block-at-a-time approach. It should eventually be removed.
+
+``-constmerge``: Merge Duplicate Global Constants
+-------------------------------------------------
+
+Merges duplicate global constants together into a single constant that is
+shared. This is useful because some passes (e.g., TraceValues) insert a lot of
+string constants into the program, regardless of whether or not an existing
+string is available.
+
+``-constprop``: Simple constant propagation
+-------------------------------------------
+
+This file implements constant propagation and merging. It looks for
+instructions involving only constant operands and replaces them with a constant
+value instead of an instruction. For example:
+
+.. code-block:: llvm
+
+   add i32 1, 2
+
+becomes
+
+.. code-block:: llvm
+
+   i32 3
+
+NOTE: this pass has a habit of making definitions be dead. It is a good idea
+to run a :ref:`Dead Instruction Elimination <passes-die>` pass sometime after
+running this pass.
+
+.. _passes-dce:
+
+``-dce``: Dead Code Elimination
+-------------------------------
+
+Dead code elimination is similar to :ref:`dead instruction elimination
+<passes-die>`, but it rechecks instructions that were used by removed
+instructions to see if they are newly dead.
+
+``-deadargelim``: Dead Argument Elimination
+-------------------------------------------
+
+This pass deletes dead arguments from internal functions. Dead argument
+elimination removes arguments which are directly dead, as well as arguments
+only passed into function calls as dead arguments of other functions. This
+pass also deletes dead return values in a similar way.
+
+This pass is often useful as a cleanup pass to run after aggressive
+interprocedural passes, which add possibly-dead arguments.
+
+``-deadtypeelim``: Dead Type Elimination
+----------------------------------------
+
+This pass is used to clean up the output of GCC. It eliminates names for types
+that are unused in the entire translation unit, using the :ref:`find used types
+<passes-print-used-types>` pass.
+
+.. _passes-die:
+
+``-die``: Dead Instruction Elimination
+--------------------------------------
+
+Dead instruction elimination performs a single pass over the function, removing
+instructions that are obviously dead.
+
+``-dse``: Dead Store Elimination
+--------------------------------
+
+A trivial dead store elimination that only considers basic-block local
+redundant stores.
+
+``-functionattrs``: Deduce function attributes
+----------------------------------------------
+
+A simple interprocedural pass which walks the call-graph, looking for functions
+which do not access or only read non-local memory, and marking them
+``readnone``/``readonly``. In addition, it marks function arguments (of
+pointer type) "``nocapture``" if a call to the function does not create any
+copies of the pointer value that outlive the call. This more or less means
+that the pointer is only dereferenced, and not returned from the function or
+stored in a global. This pass is implemented as a bottom-up traversal of the
+call-graph.
+
+``-globaldce``: Dead Global Elimination
+---------------------------------------
+
+This transform is designed to eliminate unreachable internal globals from the
+program. It uses an aggressive algorithm, searching out globals that are known
+to be alive. After it finds all of the globals which are needed, it deletes
+whatever is left over. This allows it to delete recursive chunks of the
+program which are unreachable.
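+
+A minimal sketch (hypothetical IR, not from the LLVM sources): nothing
+external can reach ``@helper`` or ``@counter`` below, so ``-globaldce``
+deletes both and only ``@main`` survives.
+
+.. code-block:: llvm
+
+   @counter = internal global i32 0      ; referenced only by @helper
+
+   define internal i32 @helper() {
+     %v = load i32* @counter             ; LLVM 3.2-era load syntax
+     ret i32 %v
+   }
+
+   define i32 @main() {
+     ret i32 0
+   }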
+
+``-globalopt``: Global Variable Optimizer
+-----------------------------------------
+
+This pass transforms simple global variables that never have their address
+taken. If obviously true, it marks read/write globals as constant, deletes
+variables only stored to, etc.
+
+``-gvn``: Global Value Numbering
+--------------------------------
+
+This pass performs global value numbering to eliminate fully and partially
+redundant instructions. It also performs redundant load elimination.
+
+.. _passes-indvars:
+
+``-indvars``: Canonicalize Induction Variables
+----------------------------------------------
+
+This transformation analyzes and transforms the induction variables (and
+computations derived from them) into simpler forms suitable for subsequent
+analysis and transformation.
+
+This transformation makes the following changes to each loop with an
+identifiable induction variable:
+
+* All loops are transformed to have a *single* canonical induction variable
+  which starts at zero and steps by one.
+* The canonical induction variable is guaranteed to be the first PHI node in
+  the loop header block.
+* Any pointer arithmetic recurrences are raised to use array subscripts.
+
+If the trip count of a loop is computable, this pass also makes the following
+changes:
+
+* The exit condition for the loop is canonicalized to compare the induction
+  value against the exit value. This turns loops like:
+
+  .. code-block:: c++
+
+     for (i = 7; i*i < 1000; ++i)
+
+  into
+
+  .. code-block:: c++
+
+     for (i = 0; i != 25; ++i)
+
+* Any use outside of the loop of an expression derived from the indvar is
+  changed to compute the derived value outside of the loop, eliminating the
+  dependence on the exit value of the induction variable. If the only purpose
+  of the loop is to compute the exit value of some derived expression, this
+  transformation will make the loop dead.
+
+This transformation should be followed by strength reduction after all of the
+desired loop transformations have been performed. Additionally, on targets
+where it is profitable, the loop could be transformed to count down to zero
+(the "do loop" optimization).
+
+``-inline``: Function Integration/Inlining
+------------------------------------------
+
+Bottom-up inlining of functions into callers.
+
+``-insert-edge-profiling``: Insert instrumentation for edge profiling
+---------------------------------------------------------------------
+
+This pass instruments the specified program with counters for edge profiling.
+Edge profiling can give a reasonable approximation of the hot paths through a
+program, and is used for a wide variety of program transformations.
+
+Note that this implementation is very naïve. It inserts a counter for *every*
+edge in the program, instead of using control flow information to prune the
+number of counters inserted.
+
+``-insert-optimal-edge-profiling``: Insert optimal instrumentation for edge profiling
+-------------------------------------------------------------------------------------
+
+This pass instruments the specified program with counters for edge profiling.
+Edge profiling can give a reasonable approximation of the hot paths through a
+program, and is used for a wide variety of program transformations.
+
+.. _passes-instcombine:
+
+``-instcombine``: Combine redundant instructions
+------------------------------------------------
+
+Combine instructions to form fewer, simple instructions. This pass does not
+modify the CFG. This pass is where algebraic simplification happens.
+ +This pass combines things like: + +.. code-block:: llvm + + %Y = add i32 %X, 1 + %Z = add i32 %Y, 1 + +into: + +.. code-block:: llvm + + %Z = add i32 %X, 2 + +This is a simple worklist driven algorithm. + +This pass guarantees that the following canonicalizations are performed on the +program: + +#. If a binary operator has a constant operand, it is moved to the right-hand + side. +#. Bitwise operators with constant operands are always grouped so that shifts + are performed first, then ``or``\ s, then ``and``\ s, then ``xor``\ s. +#. Compare instructions are converted from ``<``, ``>``, ``≤``, or ``≥`` to + ``=`` or ``≠`` if possible. +#. All ``cmp`` instructions on boolean values are replaced with logical + operations. +#. ``add X, X`` is represented as ``mul X, 2`` ⇒ ``shl X, 1`` +#. Multiplies with a constant power-of-two argument are transformed into + shifts. +#. … etc. + +``-internalize``: Internalize Global Symbols +-------------------------------------------- + +This pass loops over all of the functions in the input module, looking for a +main function. If a main function is found, all other functions and all global +variables with initializers are marked as internal. + +``-ipconstprop``: Interprocedural constant propagation +------------------------------------------------------ + +This pass implements an *extremely* simple interprocedural constant propagation +pass. It could certainly be improved in many different ways, like using a +worklist. This pass makes arguments dead, but does not remove them. The +existing dead argument elimination pass should be run after this to clean up +the mess. + +``-ipsccp``: Interprocedural Sparse Conditional Constant Propagation +-------------------------------------------------------------------- + +An interprocedural variant of :ref:`Sparse Conditional Constant Propagation +<passes-sccp>`. + +``-jump-threading``: Jump Threading +----------------------------------- + +Jump threading tries to find distinct threads of control flow running through a +basic block. This pass looks at blocks that have multiple predecessors and +multiple successors. If one or more of the predecessors of the block can be +proven to always cause a jump to one of the successors, we forward the edge +from the predecessor to the successor by duplicating the contents of this +block. + +An example of when this can occur is code like this: + +.. code-block:: c++ + + if () { ... + X = 4; + } + if (X < 3) { + +In this case, the unconditional branch at the end of the first if can be +revectored to the false side of the second if. + +``-lcssa``: Loop-Closed SSA Form Pass +------------------------------------- + +This pass transforms loops by placing phi nodes at the end of the loops for all +values that are live across the loop boundary. For example, it turns the left +into the right code: + +.. code-block:: c++ + + for (...) for (...) + if (c) if (c) + X1 = ... X1 = ... + else else + X2 = ... X2 = ... + X3 = phi(X1, X2) X3 = phi(X1, X2) + ... = X3 + 4 X4 = phi(X3) + ... = X4 + 4 + +This is still valid LLVM; the extra phi nodes are purely redundant, and will be +trivially eliminated by ``InstCombine``. The major benefit of this +transformation is that it makes many other loop optimizations, such as +``LoopUnswitch``\ ing, simpler. + +.. _passes-licm: + +``-licm``: Loop Invariant Code Motion +------------------------------------- + +This pass performs loop invariant code motion, attempting to remove as much +code from the body of a loop as possible. 
It does this by either hoisting code +into the preheader block, or by sinking code to the exit blocks if it is safe. +This pass also promotes must-aliased memory locations in the loop to live in +registers, thus hoisting and sinking "invariant" loads and stores. + +This pass uses alias analysis for two purposes: + +#. Moving loop invariant loads and calls out of loops. If we can determine + that a load or call inside of a loop never aliases anything stored to, we + can hoist it or sink it like any other instruction. + +#. Scalar Promotion of Memory. If there is a store instruction inside of the + loop, we try to move the store to happen AFTER the loop instead of inside of + the loop. This can only happen if a few conditions are true: + + #. The pointer stored through is loop invariant. + #. There are no stores or loads in the loop which *may* alias the pointer. + There are no calls in the loop which mod/ref the pointer. + + If these conditions are true, we can promote the loads and stores in the + loop of the pointer to use a temporary alloca'd variable. We then use the + :ref:`mem2reg <passes-mem2reg>` functionality to construct the appropriate + SSA form for the variable. + +``-loop-deletion``: Delete dead loops +------------------------------------- + +This file implements the Dead Loop Deletion Pass. This pass is responsible for +eliminating loops with non-infinite computable trip counts that have no side +effects or volatile instructions, and do not contribute to the computation of +the function's return value. + +.. _passes-loop-extract: + +``-loop-extract``: Extract loops into new functions +--------------------------------------------------- + +A pass wrapper around the ``ExtractLoop()`` scalar transformation to extract +each top-level loop into its own new function. If the loop is the *only* loop +in a given function, it is not touched. This is a pass most useful for +debugging via bugpoint. + +``-loop-extract-single``: Extract at most one loop into a new function +---------------------------------------------------------------------- + +Similar to :ref:`Extract loops into new functions <passes-loop-extract>`, this +pass extracts one natural loop from the program into a function if it can. +This is used by :program:`bugpoint`. + +``-loop-reduce``: Loop Strength Reduction +----------------------------------------- + +This pass performs a strength reduction on array references inside loops that +have as one or more of their components the loop induction variable. This is +accomplished by creating a new value to hold the initial value of the array +access for the first iteration, and then creating a new GEP instruction in the +loop to increment the value by the appropriate amount. + +``-loop-rotate``: Rotate Loops +------------------------------ + +A simple loop rotation transformation. + +``-loop-simplify``: Canonicalize natural loops +---------------------------------------------- + +This pass performs several transformations to transform natural loops into a +simpler form, which makes subsequent analyses and transformations simpler and +more effective. + +Loop pre-header insertion guarantees that there is a single, non-critical entry +edge from outside of the loop to the loop header. This simplifies a number of +analyses and transformations, such as :ref:`LICM <passes-licm>`. 
+ +Loop exit-block insertion guarantees that all exit blocks from the loop (blocks +which are outside of the loop that have predecessors inside of the loop) only +have predecessors from inside of the loop (and are thus dominated by the loop +header). This simplifies transformations such as store-sinking that are built +into LICM. + +This pass also guarantees that loops will have exactly one backedge. + +Note that the :ref:`simplifycfg <passes-simplifycfg>` pass will clean up blocks +which are split out but end up being unnecessary, so usage of this pass should +not pessimize generated code. + +This pass obviously modifies the CFG, but updates loop information and +dominator information. + +``-loop-unroll``: Unroll loops +------------------------------ + +This pass implements a simple loop unroller. It works best when loops have +been canonicalized by the :ref:`indvars <passes-indvars>` pass, allowing it to +determine the trip counts of loops easily. + +``-loop-unswitch``: Unswitch loops +---------------------------------- + +This pass transforms loops that contain branches on loop-invariant conditions +to have multiple loops. For example, it turns the left into the right code: + +.. code-block:: c++ + + for (...) if (lic) + A for (...) + if (lic) A; B; C + B else + C for (...) + A; C + +This can increase the size of the code exponentially (doubling it every time a +loop is unswitched) so we only unswitch if the resultant code will be smaller +than a threshold. + +This pass expects :ref:`LICM <passes-licm>` to be run before it to hoist +invariant conditions out of the loop, to make the unswitching opportunity +obvious. + +``-loweratomic``: Lower atomic intrinsics to non-atomic form +------------------------------------------------------------ + +This pass lowers atomic intrinsics to non-atomic form for use in a known +non-preemptible environment. + +The pass does not verify that the environment is non-preemptible (in general +this would require knowledge of the entire call graph of the program including +any libraries which may not be available in bitcode form); it simply lowers +every atomic intrinsic. + +``-lowerinvoke``: Lower invoke and unwind, for unwindless code generators +------------------------------------------------------------------------- + +This transformation is designed for use by code generators which do not yet +support stack unwinding. This pass supports two models of exception handling +lowering, the "cheap" support and the "expensive" support. + +"Cheap" exception handling support gives the program the ability to execute any +program which does not "throw an exception", by turning "``invoke``" +instructions into calls and by turning "``unwind``" instructions into calls to +``abort()``. If the program does dynamically use the "``unwind``" instruction, +the program will print a message then abort. + +"Expensive" exception handling support gives the full exception handling +support to the program at the cost of making the "``invoke``" instruction +really expensive. It basically inserts ``setjmp``/``longjmp`` calls to emulate +the exception handling as necessary. + +Because the "expensive" support slows down programs a lot, and EH is only used +for a subset of the programs, it must be specifically enabled by the +``-enable-correct-eh-support`` option. 
+ +Note that after this pass runs the CFG is not entirely accurate (exceptional +control flow edges are not correct anymore) so only very simple things should +be done after the ``lowerinvoke`` pass has run (like generation of native +code). This should not be used as a general purpose "my LLVM-to-LLVM pass +doesn't support the ``invoke`` instruction yet" lowering pass. + +``-lowerswitch``: Lower ``SwitchInst``\ s to branches +----------------------------------------------------- + +Rewrites switch instructions with a sequence of branches, which allows targets +to get away with not implementing the switch instruction until it is +convenient. + +.. _passes-mem2reg: + +``-mem2reg``: Promote Memory to Register +---------------------------------------- + +This file promotes memory references to be register references. It promotes +alloca instructions which only have loads and stores as uses. An ``alloca`` is +transformed by using dominator frontiers to place phi nodes, then traversing +the function in depth-first order to rewrite loads and stores as appropriate. +This is just the standard SSA construction algorithm to construct "pruned" SSA +form. + +``-memcpyopt``: MemCpy Optimization +----------------------------------- + +This pass performs various transformations related to eliminating ``memcpy`` +calls, or transforming sets of stores into ``memset``\ s. + +``-mergefunc``: Merge Functions +------------------------------- + +This pass looks for equivalent functions that are mergable and folds them. + +A hash is computed from the function, based on its type and number of basic +blocks. + +Once all hashes are computed, we perform an expensive equality comparison on +each function pair. This takes n^2/2 comparisons per bucket, so it's important +that the hash function be high quality. The equality comparison iterates +through each instruction in each basic block. + +When a match is found the functions are folded. If both functions are +overridable, we move the functionality into a new internal function and leave +two overridable thunks to it. + +``-mergereturn``: Unify function exit nodes +------------------------------------------- + +Ensure that functions have at most one ``ret`` instruction in them. +Additionally, it keeps track of which node is the new exit node of the CFG. + +``-partial-inliner``: Partial Inliner +------------------------------------- + +This pass performs partial inlining, typically by inlining an ``if`` statement +that surrounds the body of the function. + +``-prune-eh``: Remove unused exception handling info +---------------------------------------------------- + +This file implements a simple interprocedural pass which walks the call-graph, +turning invoke instructions into call instructions if and only if the callee +cannot throw an exception. It implements this as a bottom-up traversal of the +call-graph. + +``-reassociate``: Reassociate expressions +----------------------------------------- + +This pass reassociates commutative expressions in an order that is designed to +promote better constant propagation, GCSE, :ref:`LICM <passes-licm>`, PRE, etc. + +For example: 4 + (x + 5) ⇒ x + (4 + 5) + +In the implementation of this algorithm, constants are assigned rank = 0, +function arguments are rank = 1, and other values are assigned ranks +corresponding to the reverse post order traversal of current function (starting +at 2), which effectively gives values in deep loops higher rank than values not +in loops. 
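+
+A short hypothetical IR rendering of the example above (a sketch, not from
+the LLVM test suite): reassociation moves the two constants together so that
+later constant folding can combine them.
+
+.. code-block:: llvm
+
+   %t1 = add i32 %x, 5    ; computes x + 5
+   %t2 = add i32 %t1, 4   ; computes 4 + (x + 5)
+   ; ... after -reassociate plus constant folding this becomes:
+   %t2 = add i32 %x, 9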
+
+``-reg2mem``: Demote all values to stack slots
+----------------------------------------------
+
+This file demotes all registers to memory references. It is intended to be the
+inverse of :ref:`mem2reg <passes-mem2reg>`. By converting to ``load``
+instructions, the only values live across basic blocks are ``alloca``
+instructions and ``load`` instructions before ``phi`` nodes. It is intended
+that this should make CFG hacking much easier. To make later hacking easier,
+the entry block is split into two, such that all introduced ``alloca``
+instructions (and nothing else) are in the entry block.
+
+``-scalarrepl``: Scalar Replacement of Aggregates (DT)
+------------------------------------------------------
+
+The well-known scalar replacement of aggregates transformation. This transform
+breaks up ``alloca`` instructions of aggregate type (structure or array) into
+individual ``alloca`` instructions for each member if possible. Then, if
+possible, it transforms the individual ``alloca`` instructions into nice clean
+scalar SSA form.
+
+This combines a simple scalar replacement of aggregates algorithm with the
+:ref:`mem2reg <passes-mem2reg>` algorithm because they often interact,
+especially for C++ programs. As such, iterating between ``scalarrepl``, then
+:ref:`mem2reg <passes-mem2reg>` until we run out of things to promote works
+well.
+
+.. _passes-sccp:
+
+``-sccp``: Sparse Conditional Constant Propagation
+--------------------------------------------------
+
+Sparse conditional constant propagation and merging, which can be summarized
+as:
+
+* Assumes values are constant unless proven otherwise
+* Assumes BasicBlocks are dead unless proven otherwise
+* Proves values to be constant, and replaces them with constants
+* Proves conditional branches to be unconditional
+
+Note that this pass has a habit of making definitions be dead. It is a good
+idea to run a :ref:`DCE <passes-dce>` pass sometime after running this pass.
+
+``-simplify-libcalls``: Simplify well-known library calls
+---------------------------------------------------------
+
+Applies a variety of small optimizations for calls to specific well-known
+function calls (e.g. runtime library functions). For example, a call
+``exit(3)`` that occurs within the ``main()`` function can be transformed into
+simply ``return 3``.
+
+.. _passes-simplifycfg:
+
+``-simplifycfg``: Simplify the CFG
+----------------------------------
+
+Performs dead code elimination and basic block merging. Specifically:
+
+* Removes basic blocks with no predecessors.
+* Merges a basic block into its predecessor if there is only one and the
+  predecessor only has one successor.
+* Eliminates PHI nodes for basic blocks with a single predecessor.
+* Eliminates a basic block that only contains an unconditional branch.
+
+``-sink``: Code sinking
+-----------------------
+
+This pass moves instructions into successor blocks, when possible, so that they
+aren't executed on paths where their results aren't needed.
+
+``-strip``: Strip all symbols from a module
+-------------------------------------------
+
+Performs code stripping. This transformation can delete:
+
+* names for virtual registers
+* symbols for internal globals and functions
+* debug information
+
+Note that this transformation makes code much less readable, so it should only
+be used in situations where the strip utility would be used, such as reducing
+code size or making it harder to reverse engineer code.
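+
+A hedged before/after sketch (hypothetical IR): ``-strip`` discards the value
+names, leaving anonymous numbered values behind.
+
+.. code-block:: llvm
+
+   define i32 @square(i32 %value) {
+   entry:
+     %result = mul i32 %value, %value
+     ret i32 %result
+   }
+
+becomes something like:
+
+.. code-block:: llvm
+
+   define i32 @square(i32) {
+     %2 = mul i32 %0, %0
+     ret i32 %2
+   }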
+
+``-strip-dead-debug-info``: Strip debug info for unused symbols
+---------------------------------------------------------------
+
+.. FIXME: this description is the same as for -strip
+
+Performs code stripping. This transformation can delete:
+
+* names for virtual registers
+* symbols for internal globals and functions
+* debug information
+
+Note that this transformation makes code much less readable, so it should only
+be used in situations where the strip utility would be used, such as reducing
+code size or making it harder to reverse engineer code.
+
+``-strip-dead-prototypes``: Strip Unused Function Prototypes
+------------------------------------------------------------
+
+This pass loops over all of the functions in the input module, looking for dead
+declarations, and removes them. Dead declarations are declarations of functions
+for which no implementation is available (i.e., declarations for unused library
+functions).
+
+``-strip-debug-declare``: Strip all ``llvm.dbg.declare`` intrinsics
+-------------------------------------------------------------------
+
+.. FIXME: this description is the same as for -strip
+
+This pass implements code stripping. Specifically, it can delete:
+
+#. names for virtual registers
+#. symbols for internal globals and functions
+#. debug information
+
+Note that this transformation makes code much less readable, so it should only
+be used in situations where the 'strip' utility would be used, such as reducing
+code size or making it harder to reverse engineer code.
+
+``-strip-nondebug``: Strip all symbols, except dbg symbols, from a module
+-------------------------------------------------------------------------
+
+.. FIXME: this description is the same as for -strip
+
+This pass implements code stripping. Specifically, it can delete:
+
+#. names for virtual registers
+#. symbols for internal globals and functions
+#. debug information
+
+Note that this transformation makes code much less readable, so it should only
+be used in situations where the 'strip' utility would be used, such as reducing
+code size or making it harder to reverse engineer code.
+
+``-tailcallelim``: Tail Call Elimination
+----------------------------------------
+
+This file transforms calls of the current function (self recursion) followed by
+a return instruction with a branch to the entry of the function, creating a
+loop. This pass also implements the following extensions to the basic
+algorithm:
+
+#. Trivial instructions between the call and return do not prevent the
+   transformation from taking place, though currently the analysis cannot
+   support moving any really useful instructions (only dead ones).
+#. This pass transforms functions that are prevented from being tail recursive
+   by an associative expression to use an accumulator variable, thus compiling
+   the typical naive factorial or fib implementation into efficient code.
+#. TRE is performed if the function returns void, if the return returns the
+   result returned by the call, or if the function returns a run-time constant
+   on all exits from the function. It is possible, though unlikely, that the
+   return returns something else (like constant 0), and can still be TRE'd. It
+   can be TRE'd if *all other* return instructions in the function return the
+   exact same value.
+#. If it can prove that callees do not access their caller's stack frame, they
+   are marked as eligible for tail call elimination (by the code generator).
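+
+A hedged sketch of the core pattern (hypothetical IR): the recursive call
+immediately followed by a ``ret`` of its result is exactly what this pass
+replaces with a branch back to the top of the function.
+
+.. code-block:: llvm
+
+   define i32 @count(i32 %n) {
+   entry:
+     %done = icmp sle i32 %n, 0
+     br i1 %done, label %base, label %recurse
+   base:
+     ret i32 0
+   recurse:
+     %n1 = sub i32 %n, 1
+     %r = call i32 @count(i32 %n1) ; self-recursive call...
+     ret i32 %r                    ; ...returning its result: TRE applies
+   }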
+

Utility Passes
==============

This section describes the LLVM Utility Passes.

``-deadarghaX0r``: Dead Argument Hacking (BUGPOINT USE ONLY; DO NOT USE)
------------------------------------------------------------------------

Same as dead argument elimination, but deletes arguments to functions which are
external. This is only for use by :doc:`bugpoint <Bugpoint>`.

``-extract-blocks``: Extract Basic Blocks From Module (for bugpoint use)
------------------------------------------------------------------------

This pass is used by bugpoint to extract all blocks from the module into their
own functions.

``-instnamer``: Assign names to anonymous instructions
------------------------------------------------------

This is a little utility pass that gives instructions names; this is mostly
useful when diffing the effect of an optimization, because deleting an unnamed
instruction can change all other instruction numbering, making the diff very
noisy.

``-preverify``: Preliminary module verification
-----------------------------------------------

Ensures that the module is in the form required by the :ref:`Module Verifier
<passes-verify>` pass. Running the verifier runs this pass automatically, so
there should be no need to use it directly.

.. _passes-verify:

``-verify``: Module Verifier
----------------------------

Verifies LLVM IR code. This is useful to run after an optimization which is
undergoing testing. Note that ``llvm-as`` verifies its input before emitting
bitcode, and also that malformed bitcode is likely to make LLVM crash. All
language front-ends are therefore encouraged to verify their output before
performing optimizing transformations. Among the properties checked:

#. Both of a binary operator's parameters are of the same type.
#. Verify that the indices of memory access instructions match other operands.
#. Verify that arithmetic and other things are only performed on first-class
   types. For example, verify that shifts and logical operations only happen
   on integral types.
#. All of the constants in a switch statement are of the correct type.
#. The code is in valid SSA form.
#. It is illegal to put a label into any other type (like a structure) or to
   return one.
#. Only phi nodes can be self referential: ``%x = add i32 %x, %x`` is
   invalid.
#. PHI nodes must have an entry for each predecessor, with no extras.
#. PHI nodes must be the first thing in a basic block, all grouped together.
#. PHI nodes must have at least one entry.
#. All basic blocks should only end with terminator instructions, not contain
   them elsewhere.
#. The entry node to a function must not have predecessors.
#. All Instructions must be embedded into a basic block.
#. Functions cannot take a void-typed parameter.
#. Verify that a function's argument list agrees with its declared type.
#. It is illegal to specify a name for a void value.
#. It is illegal to have an internal global value with no initializer.
#. It is illegal to have a ``ret`` instruction that returns a value that does
   not agree with the function return value type.
#. Function call argument types match the function prototype.
#. All other things that are tested by asserts spread about the code.

Note that this does not provide full security verification (like Java), but
instead just tries to ensure that code is well-formed.

``-view-cfg``: View CFG of function
-----------------------------------

Displays the control flow graph using the GraphViz tool.
+

``-view-cfg-only``: View CFG of function (with no function bodies)
------------------------------------------------------------------

Displays the control flow graph using the GraphViz tool, but omitting function
bodies.

``-view-dom``: View dominance tree of function
----------------------------------------------

Displays the dominator tree using the GraphViz tool.

``-view-dom-only``: View dominance tree of function (with no function bodies)
-----------------------------------------------------------------------------

Displays the dominator tree using the GraphViz tool, but omitting function
bodies.

``-view-postdom``: View postdominance tree of function
------------------------------------------------------

Displays the post dominator tree using the GraphViz tool.

``-view-postdom-only``: View postdominance tree of function (with no function bodies)
-------------------------------------------------------------------------------------

Displays the post dominator tree using the GraphViz tool, but omitting function
bodies.

diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index a5922ad983..2de4ebb281 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -61,7 +61,9 @@ for Darwin/ARM targets.
 In the LLVM 3.2 time-frame, the Clang team has made many improvements.
 Highlights include:
 
-#. ...
+#. More powerful warnings, especially ``-Wuninitialized``
+#. Template type diffing in diagnostic messages
+#. Higher quality and more efficient debug info generation
 
 For more details about the changes to Clang since the 3.1 release, see the
 `Clang release notes. <http://clang.llvm.org/docs/ReleaseNotes.html>`_
@@ -83,7 +85,10 @@ for Go, Java, Obj-C and Obj-C++.
 
 The 3.2 release has the following notable changes:
 
-#. ...
+#. Able to load LLVM plugins such as Polly.
+#. Supports thread-local storage models.
+#. Passes knowledge of variable lifetimes to the LLVM optimizers.
+#. No longer requires GCC to be built with LTO support.
 
 compiler-rt: Compiler Runtime Library
 -------------------------------------
@@ -290,7 +295,6 @@ Major New Features
    strong phi elim
    loop dependence analysis
    CorrelatedValuePropagation
-   lib/Transforms/IPO/MergeFunctions.cpp => consider for 3.2.
    Integrated assembler on by default for arm/thumb?
 
  Near dead:
@@ -350,7 +354,21 @@ We vectorize under the following loops:
    '``noalias``' and are checked at runtime.
 #. ...
 
-SROA - We've re-written SROA to be significantly more powerful.
+SROA - We've re-written SROA to be significantly more powerful and generate
+code which is much more friendly to the rest of the optimization pipeline.
+Previously this pass had scaling problems that required it to only operate on
+relatively small aggregates, and at times it would mistakenly replace a large
+aggregate with a single very large integer in order to make it a scalar SSA
+value. The result was a large number of i1024 and i2048 values representing any
+small stack buffer. These in turn slowed down many subsequent optimization
+paths.
+
+The new SROA pass uses a different algorithm that allows it to only promote to
+scalars the pieces of the aggregate actively in use. Because of this it doesn't
+require any thresholds. It also always deduces the scalar values from the uses
+of the aggregate rather than the specific LLVM type of the aggregate. These
+features combine both to optimize more code with the pass and to improve the
+compile time of many functions dramatically.
 
 #. Branch weight metadata is preseved through more of the optimizer.
 #. ...
@@ -371,35 +389,19 @@ Post <http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html>`_.
 Target Independent Code Generator Improvements
 ----------------------------------------------
 
-Stack Coloring - We have implemented a new optimization pass to merge stack
-objects which are used in disjoin areas of the code. This optimization reduces
-the required stack space significantly, in cases where it is clear to the
-optimizer that the stack slot is not shared. We use the lifetime markers to
-tell the codegen that a certain alloca is used within a region.
-
-We now merge consecutive loads and stores.
-
 We have put a significant amount of work into the code generator
 infrastructure, which allows us to implement more aggressive algorithms and
 make it run faster:
 
 #. ...
 
-We added new TableGen infrastructure to support bundling for Very Long
-Instruction Word (VLIW) architectures. TableGen can now automatically generate
-a deterministic finite automaton from a VLIW target's schedule description
-which can be queried to determine legal groupings of instructions in a bundle.
-
-We have added a new target independent VLIW packetizer based on the DFA
-infrastructure to group machine instructions into bundles.
-
-Basic Block Placement
-^^^^^^^^^^^^^^^^^^^^^
+Stack Coloring - We have implemented a new optimization pass to merge stack
+objects which are used in disjoint areas of the code. This optimization reduces
+the required stack space significantly, in cases where it is clear to the
+optimizer that the stack slot is not shared. We use the lifetime markers to
+tell the codegen that a certain alloca is used within a region.
 
-A probability based block placement and code layout algorithm was added to
-LLVM's code generator. This layout pass supports probabilities derived from
-static heuristics as well as source code annotations such as
-``__builtin_expect``.
+We now merge consecutive loads and stores.
 
 X86-32 and X86-64 Target Improvements
 -------------------------------------
@@ -419,21 +421,6 @@ New features of the ARM target include:
 
 .. _armintegratedassembler:
 
-ARM Integrated Assembler
-^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ARM target now includes a full featured macro assembler, including
-direct-to-object module support for clang. The assembler is currently enabled
-by default for Darwin only pending testing and any additional necessary
-platform specific support for Linux.
-
-Full support is included for Thumb1, Thumb2 and ARM modes, along with subtarget
-and CPU specific extensions for VFP2, VFP3 and NEON.
-
-The assembler is Unified Syntax only (see ARM Architecural Reference Manual for
-details). While there is some, and growing, support for pre-unfied (divided)
-syntax, there are still significant gaps in that support.
-
 MIPS Target Improvements
 ------------------------
 
@@ -542,7 +529,7 @@ the `LLVMdev list <http://lists.cs.uiuc.edu/mailman/listinfo/llvmdev>`_.
 
 Known problem areas include:
 
-#. The CellSPU, MSP430, and XCore backends are experimental.
+#. The MSP430 and XCore backends are experimental.
 
 #. The integrated assembler, disassembler, and JIT is not supported by
    several targets.
If an integrated assembler is not supported, then a system diff --git a/docs/SphinxQuickstartTemplate.rst b/docs/SphinxQuickstartTemplate.rst index 640df63db1..b0002bacb9 100644 --- a/docs/SphinxQuickstartTemplate.rst +++ b/docs/SphinxQuickstartTemplate.rst @@ -107,16 +107,32 @@ You can make blocks of code like this: return 0 } -For a shell session, use a ``bash`` code block: +For a shell session, use a ``console`` code block (some existing docs use +``bash``): -.. code-block:: bash +.. code-block:: console $ echo "Goodbye cruel world!" $ rm -rf / If you need to show LLVM IR use the ``llvm`` code block. -You can show preformatted text without any syntax highlighting like this: +.. code-block:: llvm + + define i32 @test1() { + entry: + ret i32 0 + } + +Some other common code blocks you might need are ``c``, ``objc``, ``make``, +and ``cmake``. If you need something beyond that, you can look at the `full +list`_ of supported code blocks. + +.. _`full list`: http://pygments.org/docs/lexers/ + +However, don't waste time fiddling with syntax highlighting when you could +be adding meaningful content. When in doubt, show preformatted text +without any syntax highlighting like this: :: diff --git a/docs/WritingAnLLVMBackend.rst b/docs/WritingAnLLVMBackend.rst index 7803163ae6..7e243fa3ec 100644 --- a/docs/WritingAnLLVMBackend.rst +++ b/docs/WritingAnLLVMBackend.rst @@ -54,8 +54,8 @@ These essential documents must be read before reading this document: file (``.td`` suffix) and generates C++ code that can be used for code generation. -* `Writing an LLVM Pass <WritingAnLLVMPass.html>`_ --- The assembly printer is - a ``FunctionPass``, as are several SelectionDAG processing steps. +* :doc:`WritingAnLLVMPass` --- The assembly printer is a ``FunctionPass``, as + are several ``SelectionDAG`` processing steps. 
To follow the SPARC examples in this document, have a copy of `The SPARC Architecture Manual, Version 8 <http://www.sparc.org/standards/V8.pdf>`_ for diff --git a/docs/WritingAnLLVMPass.html b/docs/WritingAnLLVMPass.html deleted file mode 100644 index af1ffa4fb7..0000000000 --- a/docs/WritingAnLLVMPass.html +++ /dev/null @@ -1,1954 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html4/strict.dtd"> -<html> -<head> - <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> - <title>Writing an LLVM Pass</title> - <link rel="stylesheet" href="_static/llvm.css" type="text/css"> -</head> -<body> - -<h1> - Writing an LLVM Pass -</h1> - -<ol> - <li><a href="#introduction">Introduction - What is a pass?</a></li> - <li><a href="#quickstart">Quick Start - Writing hello world</a> - <ul> - <li><a href="#makefile">Setting up the build environment</a></li> - <li><a href="#basiccode">Basic code required</a></li> - <li><a href="#running">Running a pass with <tt>opt</tt></a></li> - </ul></li> - <li><a href="#passtype">Pass classes and requirements</a> - <ul> - <li><a href="#ImmutablePass">The <tt>ImmutablePass</tt> class</a></li> - <li><a href="#ModulePass">The <tt>ModulePass</tt> class</a> - <ul> - <li><a href="#runOnModule">The <tt>runOnModule</tt> method</a></li> - </ul></li> - <li><a href="#CallGraphSCCPass">The <tt>CallGraphSCCPass</tt> class</a> - <ul> - <li><a href="#doInitialization_scc">The <tt>doInitialization(CallGraph - &)</tt> method</a></li> - <li><a href="#runOnSCC">The <tt>runOnSCC</tt> method</a></li> - <li><a href="#doFinalization_scc">The <tt>doFinalization(CallGraph - &)</tt> method</a></li> - </ul></li> - <li><a href="#FunctionPass">The <tt>FunctionPass</tt> class</a> - <ul> - <li><a href="#doInitialization_mod">The <tt>doInitialization(Module - &)</tt> method</a></li> - <li><a href="#runOnFunction">The <tt>runOnFunction</tt> method</a></li> - <li><a href="#doFinalization_mod">The <tt>doFinalization(Module - &)</tt> method</a></li> - </ul></li> - <li><a href="#LoopPass">The <tt>LoopPass</tt> class</a> - <ul> - <li><a href="#doInitialization_loop">The <tt>doInitialization(Loop *, - LPPassManager &)</tt> method</a></li> - <li><a href="#runOnLoop">The <tt>runOnLoop</tt> method</a></li> - <li><a href="#doFinalization_loop">The <tt>doFinalization() - </tt> method</a></li> - </ul></li> - <li><a href="#RegionPass">The <tt>RegionPass</tt> class</a> - <ul> - <li><a href="#doInitialization_region">The <tt>doInitialization(Region *, - RGPassManager &)</tt> method</a></li> - <li><a href="#runOnRegion">The <tt>runOnRegion</tt> method</a></li> - <li><a href="#doFinalization_region">The <tt>doFinalization() - </tt> method</a></li> - </ul></li> - <li><a href="#BasicBlockPass">The <tt>BasicBlockPass</tt> class</a> - <ul> - <li><a href="#doInitialization_fn">The <tt>doInitialization(Function - &)</tt> method</a></li> - <li><a href="#runOnBasicBlock">The <tt>runOnBasicBlock</tt> - method</a></li> - <li><a href="#doFinalization_fn">The <tt>doFinalization(Function - &)</tt> method</a></li> - </ul></li> - <li><a href="#MachineFunctionPass">The <tt>MachineFunctionPass</tt> - class</a> - <ul> - <li><a href="#runOnMachineFunction">The - <tt>runOnMachineFunction(MachineFunction &)</tt> method</a></li> - </ul></li> - </ul> - <li><a href="#registration">Pass Registration</a> - <ul> - <li><a href="#print">The <tt>print</tt> method</a></li> - </ul></li> - <li><a href="#interaction">Specifying interactions between passes</a> - <ul> - <li><a href="#getAnalysisUsage">The 
<tt>getAnalysisUsage</tt> - method</a></li> - <li><a href="#AU::addRequired">The <tt>AnalysisUsage::addRequired<></tt> and <tt>AnalysisUsage::addRequiredTransitive<></tt> methods</a></li> - <li><a href="#AU::addPreserved">The <tt>AnalysisUsage::addPreserved<></tt> method</a></li> - <li><a href="#AU::examples">Example implementations of <tt>getAnalysisUsage</tt></a></li> - <li><a href="#getAnalysis">The <tt>getAnalysis<></tt> and -<tt>getAnalysisIfAvailable<></tt> methods</a></li> - </ul></li> - <li><a href="#analysisgroup">Implementing Analysis Groups</a> - <ul> - <li><a href="#agconcepts">Analysis Group Concepts</a></li> - <li><a href="#registerag">Using <tt>RegisterAnalysisGroup</tt></a></li> - </ul></li> - <li><a href="#passStatistics">Pass Statistics</a> - <li><a href="#passmanager">What PassManager does</a> - <ul> - <li><a href="#releaseMemory">The <tt>releaseMemory</tt> method</a></li> - </ul></li> - <li><a href="#registering">Registering dynamically loaded passes</a> - <ul> - <li><a href="#registering_existing">Using existing registries</a></li> - <li><a href="#registering_new">Creating new registries</a></li> - </ul></li> - <li><a href="#debughints">Using GDB with dynamically loaded passes</a> - <ul> - <li><a href="#breakpoint">Setting a breakpoint in your pass</a></li> - <li><a href="#debugmisc">Miscellaneous Problems</a></li> - </ul></li> - <li><a href="#future">Future extensions planned</a> - <ul> - <li><a href="#SMP">Multithreaded LLVM</a></li> - </ul></li> -</ol> - -<div class="doc_author"> - <p>Written by <a href="mailto:sabre@nondot.org">Chris Lattner</a> and - <a href="mailto:jlaskey@mac.com">Jim Laskey</a></p> -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="introduction">Introduction - What is a pass?</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>The LLVM Pass Framework is an important part of the LLVM system, because LLVM -passes are where most of the interesting parts of the compiler exist. Passes -perform the transformations and optimizations that make up the compiler, they -build the analysis results that are used by these transformations, and they are, -above all, a structuring technique for compiler code.</p> - -<p>All LLVM passes are subclasses of the <tt><a -href="http://llvm.org/doxygen/classllvm_1_1Pass.html">Pass</a></tt> -class, which implement functionality by overriding virtual methods inherited -from <tt>Pass</tt>. Depending on how your pass works, you should inherit from -the <tt><a href="#ModulePass">ModulePass</a></tt>, <tt><a -href="#CallGraphSCCPass">CallGraphSCCPass</a></tt>, <tt><a -href="#FunctionPass">FunctionPass</a></tt>, or <tt><a -href="#LoopPass">LoopPass</a></tt>, or <tt><a -href="#RegionPass">RegionPass</a></tt>, or <tt><a -href="#BasicBlockPass">BasicBlockPass</a></tt> classes, which gives the system -more information about what your pass does, and how it can be combined with -other passes. One of the main features of the LLVM Pass Framework is that it -schedules passes to run in an efficient way based on the constraints that your -pass meets (which are indicated by which class they derive from).</p> - -<p>We start by showing you how to construct a pass, everything from setting up -the code, to compiling, loading, and executing it. 
After the basics are down, -more advanced features are discussed.</p> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="quickstart">Quick Start - Writing hello world</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>Here we describe how to write the "hello world" of passes. The "Hello" pass -is designed to simply print out the name of non-external functions that exist in -the program being compiled. It does not modify the program at all, it just -inspects it. The source code and files for this pass are available in the LLVM -source tree in the <tt>lib/Transforms/Hello</tt> directory.</p> - -<!-- ======================================================================= --> -<h3> - <a name="makefile">Setting up the build environment</a> -</h3> - -<div> - - <p>First, configure and build LLVM. This needs to be done directly inside the - LLVM source tree rather than in a separate objects directory. - Next, you need to create a new directory somewhere in the LLVM source - base. For this example, we'll assume that you made - <tt>lib/Transforms/Hello</tt>. Finally, you must set up a build script - (Makefile) that will compile the source code for the new pass. To do this, - copy the following into <tt>Makefile</tt>:</p> - <hr> - -<div class="doc_code"><pre> -# Makefile for hello pass - -# Path to top level of LLVM hierarchy -LEVEL = ../../.. - -# Name of the library to build -LIBRARYNAME = Hello - -# Make the shared library become a loadable module so the tools can -# dlopen/dlsym on the resulting library. -LOADABLE_MODULE = 1 - -# Include the makefile implementation stuff -include $(LEVEL)/Makefile.common -</pre></div> - -<p>This makefile specifies that all of the <tt>.cpp</tt> files in the current -directory are to be compiled and linked together into a shared object -<tt>$(LEVEL)/Debug+Asserts/lib/Hello.so</tt> that can be dynamically loaded by -the <tt>opt</tt> or <tt>bugpoint</tt> tools via their <tt>-load</tt> options. -If your operating system uses a suffix other than .so (such as windows or -Mac OS/X), the appropriate extension will be used.</p> - -<p>If you are used CMake to build LLVM, see -<a href="CMake.html#passdev">Developing an LLVM pass with CMake</a>.</p> - -<p>Now that we have the build scripts set up, we just need to write the code for -the pass itself.</p> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="basiccode">Basic code required</a> -</h3> - -<div> - -<p>Now that we have a way to compile our new pass, we just have to write it. -Start out with:</p> - -<div class="doc_code"> -<pre> -<b>#include</b> "<a href="http://llvm.org/doxygen/Pass_8h-source.html">llvm/Pass.h</a>" -<b>#include</b> "<a href="http://llvm.org/doxygen/Function_8h-source.html">llvm/Function.h</a>" -<b>#include</b> "<a href="http://llvm.org/doxygen/raw__ostream_8h.html">llvm/Support/raw_ostream.h</a>" -</pre> -</div> - -<p>Which are needed because we are writing a <tt><a -href="http://llvm.org/doxygen/classllvm_1_1Pass.html">Pass</a></tt>, -we are operating on <tt><a -href="http://llvm.org/doxygen/classllvm_1_1Function.html">Function</a></tt>'s, -and we will be doing some printing.</p> - -<p>Next we have:</p> - -<div class="doc_code"> -<pre> -<b>using namespace llvm;</b> -</pre> -</div> - -<p>... 
which is required because the functions from the include files -live in the llvm namespace.</p> - -<p>Next we have:</p> - -<div class="doc_code"> -<pre> -<b>namespace</b> { -</pre> -</div> - -<p>... which starts out an anonymous namespace. Anonymous namespaces are to C++ -what the "<tt>static</tt>" keyword is to C (at global scope). It makes the -things declared inside of the anonymous namespace visible only to the current -file. If you're not familiar with them, consult a decent C++ book for more -information.</p> - -<p>Next, we declare our pass itself:</p> - -<div class="doc_code"> -<pre> - <b>struct</b> Hello : <b>public</b> <a href="#FunctionPass">FunctionPass</a> { -</pre> -</div> - -<p>This declares a "<tt>Hello</tt>" class that is a subclass of <tt><a -href="http://llvm.org/doxygen/classllvm_1_1FunctionPass.html">FunctionPass</a></tt>. -The different builtin pass subclasses are described in detail <a -href="#passtype">later</a>, but for now, know that <a -href="#FunctionPass"><tt>FunctionPass</tt></a>'s operate on a function at a -time.</p> - -<div class="doc_code"> -<pre> - static char ID; - Hello() : FunctionPass(ID) {} -</pre> -</div> - -<p>This declares pass identifier used by LLVM to identify pass. This allows LLVM -to avoid using expensive C++ runtime information.</p> - -<div class="doc_code"> -<pre> - <b>virtual bool</b> <a href="#runOnFunction">runOnFunction</a>(Function &F) { - errs() << "<i>Hello: </i>"; - errs().write_escaped(F.getName()) << "\n"; - <b>return false</b>; - } - }; <i>// end of struct Hello</i> -} <i>// end of anonymous namespace</i> -</pre> -</div> - -<p>We declare a "<a href="#runOnFunction"><tt>runOnFunction</tt></a>" method, -which overloads an abstract virtual method inherited from <a -href="#FunctionPass"><tt>FunctionPass</tt></a>. This is where we are supposed -to do our thing, so we just print out our message with the name of each -function.</p> - -<div class="doc_code"> -<pre> -char Hello::ID = 0; -</pre> -</div> - -<p>We initialize pass ID here. LLVM uses ID's address to identify a pass, so -initialization value is not important.</p> - -<div class="doc_code"> -<pre> -static RegisterPass<Hello> X("<i>hello</i>", "<i>Hello World Pass</i>", - false /* Only looks at CFG */, - false /* Analysis Pass */); -</pre> -</div> - -<p>Lastly, we <a href="#registration">register our class</a> <tt>Hello</tt>, -giving it a command line argument "<tt>hello</tt>", and a name "<tt>Hello World -Pass</tt>". 
The last two arguments describe its behavior: if a pass walks CFG -without modifying it then the third argument is set to <tt>true</tt>; if a pass -is an analysis pass, for example dominator tree pass, then <tt>true</tt> is -supplied as the fourth argument.</p> - -<p>As a whole, the <tt>.cpp</tt> file looks like:</p> - -<div class="doc_code"> -<pre> -<b>#include</b> "<a href="http://llvm.org/doxygen/Pass_8h-source.html">llvm/Pass.h</a>" -<b>#include</b> "<a href="http://llvm.org/doxygen/Function_8h-source.html">llvm/Function.h</a>" -<b>#include</b> "<a href="http://llvm.org/doxygen/raw__ostream_8h.html">llvm/Support/raw_ostream.h</a>" - -<b>using namespace llvm;</b> - -<b>namespace</b> { - <b>struct Hello</b> : <b>public</b> <a href="#FunctionPass">FunctionPass</a> { - - static char ID; - Hello() : FunctionPass(ID) {} - - <b>virtual bool</b> <a href="#runOnFunction">runOnFunction</a>(Function &F) { - errs() << "<i>Hello: </i>"; - errs().write_escaped(F.getName()) << '\n'; - <b>return false</b>; - } - - }; -} - -char Hello::ID = 0; -static RegisterPass<Hello> X("hello", "Hello World Pass", false, false); -</pre> -</div> - -<p>Now that it's all together, compile the file with a simple "<tt>gmake</tt>" -command in the local directory and you should get a new file -"<tt>Debug+Asserts/lib/Hello.so</tt>" under the top level directory of the LLVM -source tree (not in the local directory). Note that everything in this file is -contained in an anonymous namespace — this reflects the fact that passes -are self contained units that do not need external interfaces (although they can -have them) to be useful.</p> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="running">Running a pass with <tt>opt</tt></a> -</h3> - -<div> - -<p>Now that you have a brand new shiny shared object file, we can use the -<tt>opt</tt> command to run an LLVM program through your pass. Because you -registered your pass with <tt>RegisterPass</tt>, you will be able to -use the <tt>opt</tt> tool to access it, once loaded.</p> - -<p>To test it, follow the example at the end of the <a -href="GettingStarted.html">Getting Started Guide</a> to compile "Hello World" to -LLVM. We can now run the bitcode file (<tt>hello.bc</tt>) for the program -through our transformation like this (or course, any bitcode file will -work):</p> - -<div class="doc_code"><pre> -$ opt -load ../../../Debug+Asserts/lib/Hello.so -hello < hello.bc > /dev/null -Hello: __main -Hello: puts -Hello: main -</pre></div> - -<p>The '<tt>-load</tt>' option specifies that '<tt>opt</tt>' should load your -pass as a shared object, which makes '<tt>-hello</tt>' a valid command line -argument (which is one reason you need to <a href="#registration">register your -pass</a>). Because the hello pass does not modify the program in any -interesting way, we just throw away the result of <tt>opt</tt> (sending it to -<tt>/dev/null</tt>).</p> - -<p>To see what happened to the other string you registered, try running -<tt>opt</tt> with the <tt>-help</tt> option:</p> - -<div class="doc_code"><pre> -$ opt -load ../../../Debug+Asserts/lib/Hello.so -help -OVERVIEW: llvm .bc -> .bc modular optimizer - -USAGE: opt [options] <input bitcode> - -OPTIONS: - Optimizations available: -... 
- -globalopt - Global Variable Optimizer - -globalsmodref-aa - Simple mod/ref analysis for globals - -gvn - Global Value Numbering - <b>-hello - Hello World Pass</b> - -indvars - Induction Variable Simplification - -inline - Function Integration/Inlining - -insert-edge-profiling - Insert instrumentation for edge profiling -... -</pre></div> - -<p>The pass name gets added as the information string for your pass, giving some -documentation to users of <tt>opt</tt>. Now that you have a working pass, you -would go ahead and make it do the cool transformations you want. Once you get -it all working and tested, it may become useful to find out how fast your pass -is. The <a href="#passManager"><tt>PassManager</tt></a> provides a nice command -line option (<tt>--time-passes</tt>) that allows you to get information about -the execution time of your pass along with the other passes you queue up. For -example:</p> - -<div class="doc_code"><pre> -$ opt -load ../../../Debug+Asserts/lib/Hello.so -hello -time-passes < hello.bc > /dev/null -Hello: __main -Hello: puts -Hello: main -=============================================================================== - ... Pass execution timing report ... -=============================================================================== - Total Execution Time: 0.02 seconds (0.0479059 wall clock) - - ---User Time--- --System Time-- --User+System-- ---Wall Time--- --- Pass Name --- - 0.0100 (100.0%) 0.0000 ( 0.0%) 0.0100 ( 50.0%) 0.0402 ( 84.0%) Bitcode Writer - 0.0000 ( 0.0%) 0.0100 (100.0%) 0.0100 ( 50.0%) 0.0031 ( 6.4%) Dominator Set Construction - 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0013 ( 2.7%) Module Verifier - <b> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0033 ( 6.9%) Hello World Pass</b> - 0.0100 (100.0%) 0.0100 (100.0%) 0.0200 (100.0%) 0.0479 (100.0%) TOTAL -</pre></div> - -<p>As you can see, our implementation above is pretty fast :). The additional -passes listed are automatically inserted by the '<tt>opt</tt>' tool to verify -that the LLVM emitted by your pass is still valid and well formed LLVM, which -hasn't been broken somehow.</p> - -<p>Now that you have seen the basics of the mechanics behind passes, we can talk -about some more details of how they work and how to use them.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="passtype">Pass classes and requirements</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>One of the first things that you should do when designing a new pass is to -decide what class you should subclass for your pass. The <a -href="#basiccode">Hello World</a> example uses the <tt><a -href="#FunctionPass">FunctionPass</a></tt> class for its implementation, but we -did not discuss why or when this should occur. Here we talk about the classes -available, from the most general to the most specific.</p> - -<p>When choosing a superclass for your Pass, you should choose the <b>most -specific</b> class possible, while still being able to meet the requirements -listed. 
This gives the LLVM Pass Infrastructure information necessary to -optimize how passes are run, so that the resultant compiler isn't unnecessarily -slow.</p> - -<!-- ======================================================================= --> -<h3> - <a name="ImmutablePass">The <tt>ImmutablePass</tt> class</a> -</h3> - -<div> - -<p>The most plain and boring type of pass is the "<tt><a -href="http://llvm.org/doxygen/classllvm_1_1ImmutablePass.html">ImmutablePass</a></tt>" -class. This pass type is used for passes that do not have to be run, do not -change state, and never need to be updated. This is not a normal type of -transformation or analysis, but can provide information about the current -compiler configuration.</p> - -<p>Although this pass class is very infrequently used, it is important for -providing information about the current target machine being compiled for, and -other static information that can affect the various transformations.</p> - -<p><tt>ImmutablePass</tt>es never invalidate other transformations, are never -invalidated, and are never "run".</p> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="ModulePass">The <tt>ModulePass</tt> class</a> -</h3> - -<div> - -<p>The "<tt><a -href="http://llvm.org/doxygen/classllvm_1_1ModulePass.html">ModulePass</a></tt>" -class is the most general of all superclasses that you can use. Deriving from -<tt>ModulePass</tt> indicates that your pass uses the entire program as a unit, -referring to function bodies in no predictable order, or adding and removing -functions. Because nothing is known about the behavior of <tt>ModulePass</tt> -subclasses, no optimization can be done for their execution.</p> - -<p>A module pass can use function level passes (e.g. dominators) using -the getAnalysis interface -<tt>getAnalysis<DominatorTree>(llvm::Function *)</tt> to provide the -function to retrieve analysis result for, if the function pass does not require -any module or immutable passes. Note that this can only be done for functions for which the -analysis ran, e.g. in the case of dominators you should only ask for the -DominatorTree for function definitions, not declarations.</p> - -<p>To write a correct <tt>ModulePass</tt> subclass, derive from -<tt>ModulePass</tt> and overload the <tt>runOnModule</tt> method with the -following signature:</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnModule">The <tt>runOnModule</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnModule(Module &M) = 0; -</pre></div> - -<p>The <tt>runOnModule</tt> method performs the interesting work of the pass. -It should return true if the module was modified by the transformation and -false otherwise.</p> - -</div> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="CallGraphSCCPass">The <tt>CallGraphSCCPass</tt> class</a> -</h3> - -<div> - -<p>The "<tt><a -href="http://llvm.org/doxygen/classllvm_1_1CallGraphSCCPass.html">CallGraphSCCPass</a></tt>" -is used by passes that need to traverse the program bottom-up on the call graph -(callees before callers). Deriving from CallGraphSCCPass provides some -mechanics for building and traversing the CallGraph, but also allows the system -to optimize execution of CallGraphSCCPass's. 
If your pass meets the -requirements outlined below, and doesn't meet the requirements of a <tt><a -href="#FunctionPass">FunctionPass</a></tt> or <tt><a -href="#BasicBlockPass">BasicBlockPass</a></tt>, you should derive from -<tt>CallGraphSCCPass</tt>.</p> - -<p><b>TODO</b>: explain briefly what SCC, Tarjan's algo, and B-U mean.</p> - -<p>To be explicit, <tt>CallGraphSCCPass</tt> subclasses are:</p> - -<ol> - -<li>... <em>not allowed</em> to inspect or modify any <tt>Function</tt>s other -than those in the current SCC and the direct callers and direct callees of the -SCC.</li> - -<li>... <em>required</em> to preserve the current CallGraph object, updating it -to reflect any changes made to the program.</li> - -<li>... <em>not allowed</em> to add or remove SCC's from the current Module, -though they may change the contents of an SCC.</li> - -<li>... <em>allowed</em> to add or remove global variables from the current -Module.</li> - -<li>... <em>allowed</em> to maintain state across invocations of - <a href="#runOnSCC"><tt>runOnSCC</tt></a> (including global data).</li> -</ol> - -<p>Implementing a <tt>CallGraphSCCPass</tt> is slightly tricky in some cases -because it has to handle SCCs with more than one node in it. All of the virtual -methods described below should return true if they modified the program, or -false if they didn't.</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doInitialization_scc"> - The <tt>doInitialization(CallGraph &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doInitialization(CallGraph &CG); -</pre></div> - -<p>The <tt>doIninitialize</tt> method is allowed to do most of the things that -<tt>CallGraphSCCPass</tt>'s are not allowed to do. They can add and remove -functions, get pointers to functions, etc. The <tt>doInitialization</tt> method -is designed to do simple initialization type of stuff that does not depend on -the SCCs being processed. The <tt>doInitialization</tt> method call is not -scheduled to overlap with any other pass executions (thus it should be very -fast).</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnSCC">The <tt>runOnSCC</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnSCC(CallGraphSCC &SCC) = 0; -</pre></div> - -<p>The <tt>runOnSCC</tt> method performs the interesting work of the pass, and -should return true if the module was modified by the transformation, false -otherwise.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doFinalization_scc"> - The <tt>doFinalization(CallGraph &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doFinalization(CallGraph &CG); -</pre></div> - -<p>The <tt>doFinalization</tt> method is an infrequently used method that is -called when the pass framework has finished calling <a -href="#runOnFunction"><tt>runOnFunction</tt></a> for every function in the -program being compiled.</p> - -</div> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="FunctionPass">The <tt>FunctionPass</tt> class</a> -</h3> - -<div> - -<p>In contrast to <tt>ModulePass</tt> subclasses, <tt><a -href="http://llvm.org/doxygen/classllvm_1_1Pass.html">FunctionPass</a></tt> -subclasses do have a predictable, local behavior that can be expected by the -system. 
All <tt>FunctionPass</tt> execute on each function in the program -independent of all of the other functions in the program. -<tt>FunctionPass</tt>'s do not require that they are executed in a particular -order, and <tt>FunctionPass</tt>'s do not modify external functions.</p> - -<p>To be explicit, <tt>FunctionPass</tt> subclasses are not allowed to:</p> - -<ol> -<li>Modify a Function other than the one currently being processed.</li> -<li>Add or remove Function's from the current Module.</li> -<li>Add or remove global variables from the current Module.</li> -<li>Maintain state across invocations of - <a href="#runOnFunction"><tt>runOnFunction</tt></a> (including global data)</li> -</ol> - -<p>Implementing a <tt>FunctionPass</tt> is usually straightforward (See the <a -href="#basiccode">Hello World</a> pass for example). <tt>FunctionPass</tt>'s -may overload three virtual methods to do their work. All of these methods -should return true if they modified the program, or false if they didn't.</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doInitialization_mod"> - The <tt>doInitialization(Module &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doInitialization(Module &M); -</pre></div> - -<p>The <tt>doIninitialize</tt> method is allowed to do most of the things that -<tt>FunctionPass</tt>'s are not allowed to do. They can add and remove -functions, get pointers to functions, etc. The <tt>doInitialization</tt> method -is designed to do simple initialization type of stuff that does not depend on -the functions being processed. The <tt>doInitialization</tt> method call is not -scheduled to overlap with any other pass executions (thus it should be very -fast).</p> - -<p>A good example of how this method should be used is the <a -href="http://llvm.org/doxygen/LowerAllocations_8cpp-source.html">LowerAllocations</a> -pass. This pass converts <tt>malloc</tt> and <tt>free</tt> instructions into -platform dependent <tt>malloc()</tt> and <tt>free()</tt> function calls. It -uses the <tt>doInitialization</tt> method to get a reference to the malloc and -free functions that it needs, adding prototypes to the module if necessary.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnFunction">The <tt>runOnFunction</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnFunction(Function &F) = 0; -</pre></div><p> - -<p>The <tt>runOnFunction</tt> method must be implemented by your subclass to do -the transformation or analysis work of your pass. 
As usual, a true value should -be returned if the function is modified.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doFinalization_mod"> - The <tt>doFinalization(Module &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doFinalization(Module &M); -</pre></div> - -<p>The <tt>doFinalization</tt> method is an infrequently used method that is -called when the pass framework has finished calling <a -href="#runOnFunction"><tt>runOnFunction</tt></a> for every function in the -program being compiled.</p> - -</div> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="LoopPass">The <tt>LoopPass</tt> class </a> -</h3> - -<div> - -<p> All <tt>LoopPass</tt> execute on each loop in the function independent of -all of the other loops in the function. <tt>LoopPass</tt> processes loops in -loop nest order such that outer most loop is processed last. </p> - -<p> <tt>LoopPass</tt> subclasses are allowed to update loop nest using -<tt>LPPassManager</tt> interface. Implementing a loop pass is usually -straightforward. <tt>LoopPass</tt>'s may overload three virtual methods to -do their work. All these methods should return true if they modified the -program, or false if they didn't. </p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doInitialization_loop"> - The <tt>doInitialization(Loop *,LPPassManager &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doInitialization(Loop *, LPPassManager &LPM); -</pre></div> - -<p>The <tt>doInitialization</tt> method is designed to do simple initialization -type of stuff that does not depend on the functions being processed. The -<tt>doInitialization</tt> method call is not scheduled to overlap with any -other pass executions (thus it should be very fast). LPPassManager -interface should be used to access Function or Module level analysis -information.</p> - -</div> - - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnLoop">The <tt>runOnLoop</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnLoop(Loop *, LPPassManager &LPM) = 0; -</pre></div><p> - -<p>The <tt>runOnLoop</tt> method must be implemented by your subclass to do -the transformation or analysis work of your pass. As usual, a true value should -be returned if the function is modified. <tt>LPPassManager</tt> interface -should be used to update loop nest.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doFinalization_loop">The <tt>doFinalization()</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doFinalization(); -</pre></div> - -<p>The <tt>doFinalization</tt> method is an infrequently used method that is -called when the pass framework has finished calling <a -href="#runOnLoop"><tt>runOnLoop</tt></a> for every loop in the -program being compiled. </p> - -</div> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="RegionPass">The <tt>RegionPass</tt> class </a> -</h3> - -<div> - -<p> <tt>RegionPass</tt> is similar to <a href="#LoopPass"><tt>LoopPass</tt></a>, -but executes on each single entry single exit region in the function. 
-<tt>RegionPass</tt> processes regions in nested order such that the outer most -region is processed last. </p> - -<p> <tt>RegionPass</tt> subclasses are allowed to update the region tree by using -the <tt>RGPassManager</tt> interface. You may overload three virtual methods of -<tt>RegionPass</tt> to implement your own region pass. All these -methods should return true if they modified the program, or false if they didn not. -</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doInitialization_region"> - The <tt>doInitialization(Region *, RGPassManager &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doInitialization(Region *, RGPassManager &RGM); -</pre></div> - -<p>The <tt>doInitialization</tt> method is designed to do simple initialization -type of stuff that does not depend on the functions being processed. The -<tt>doInitialization</tt> method call is not scheduled to overlap with any -other pass executions (thus it should be very fast). RPPassManager -interface should be used to access Function or Module level analysis -information.</p> - -</div> - - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnRegion">The <tt>runOnRegion</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnRegion(Region *, RGPassManager &RGM) = 0; -</pre></div><p> - -<p>The <tt>runOnRegion</tt> method must be implemented by your subclass to do -the transformation or analysis work of your pass. As usual, a true value should -be returned if the region is modified. <tt>RGPassManager</tt> interface -should be used to update region tree.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doFinalization_region">The <tt>doFinalization()</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doFinalization(); -</pre></div> - -<p>The <tt>doFinalization</tt> method is an infrequently used method that is -called when the pass framework has finished calling <a -href="#runOnRegion"><tt>runOnRegion</tt></a> for every region in the -program being compiled. </p> - -</div> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="BasicBlockPass">The <tt>BasicBlockPass</tt> class</a> -</h3> - -<div> - -<p><tt>BasicBlockPass</tt>'s are just like <a -href="#FunctionPass"><tt>FunctionPass</tt></a>'s, except that they must limit -their scope of inspection and modification to a single basic block at a time. -As such, they are <b>not</b> allowed to do any of the following:</p> - -<ol> -<li>Modify or inspect any basic blocks outside of the current one</li> -<li>Maintain state across invocations of - <a href="#runOnBasicBlock"><tt>runOnBasicBlock</tt></a></li> -<li>Modify the control flow graph (by altering terminator instructions)</li> -<li>Any of the things forbidden for - <a href="#FunctionPass"><tt>FunctionPass</tt></a>es.</li> -</ol> - -<p><tt>BasicBlockPass</tt>es are useful for traditional local and "peephole" -optimizations. 
They may override the same <a -href="#doInitialization_mod"><tt>doInitialization(Module &)</tt></a> and <a -href="#doFinalization_mod"><tt>doFinalization(Module &)</tt></a> methods that <a -href="#FunctionPass"><tt>FunctionPass</tt></a>'s have, but also have the following virtual methods that may also be implemented:</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doInitialization_fn"> - The <tt>doInitialization(Function &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doInitialization(Function &F); -</pre></div> - -<p>The <tt>doIninitialize</tt> method is allowed to do most of the things that -<tt>BasicBlockPass</tt>'s are not allowed to do, but that -<tt>FunctionPass</tt>'s can. The <tt>doInitialization</tt> method is designed -to do simple initialization that does not depend on the -BasicBlocks being processed. The <tt>doInitialization</tt> method call is not -scheduled to overlap with any other pass executions (thus it should be very -fast).</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnBasicBlock">The <tt>runOnBasicBlock</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnBasicBlock(BasicBlock &BB) = 0; -</pre></div> - -<p>Override this function to do the work of the <tt>BasicBlockPass</tt>. This -function is not allowed to inspect or modify basic blocks other than the -parameter, and are not allowed to modify the CFG. A true value must be returned -if the basic block is modified.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="doFinalization_fn"> - The <tt>doFinalization(Function &)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> doFinalization(Function &F); -</pre></div> - -<p>The <tt>doFinalization</tt> method is an infrequently used method that is -called when the pass framework has finished calling <a -href="#runOnBasicBlock"><tt>runOnBasicBlock</tt></a> for every BasicBlock in the -program being compiled. This can be used to perform per-function -finalization.</p> - -</div> - -</div> - -<!-- ======================================================================= --> -<h3> - <a name="MachineFunctionPass">The <tt>MachineFunctionPass</tt> class</a> -</h3> - -<div> - -<p>A <tt>MachineFunctionPass</tt> is a part of the LLVM code generator that -executes on the machine-dependent representation of each LLVM function in the -program.</p> - -<p>Code generator passes are registered and initialized specially by -<tt>TargetMachine::addPassesToEmitFile</tt> and similar routines, so they -cannot generally be run from the <tt>opt</tt> or <tt>bugpoint</tt> -commands.</p> - -<p>A <tt>MachineFunctionPass</tt> is also a <tt>FunctionPass</tt>, so all -the restrictions that apply to a <tt>FunctionPass</tt> also apply to it. -<tt>MachineFunctionPass</tt>es also have additional restrictions. 
In particular, -<tt>MachineFunctionPass</tt>es are not allowed to do any of the following:</p> - -<ol> -<li>Modify or create any LLVM IR Instructions, BasicBlocks, Arguments, - Functions, GlobalVariables, GlobalAliases, or Modules.</li> -<li>Modify a MachineFunction other than the one currently being processed.</li> -<li>Maintain state across invocations of <a -href="#runOnMachineFunction"><tt>runOnMachineFunction</tt></a> (including global -data)</li> -</ol> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="runOnMachineFunction"> - The <tt>runOnMachineFunction(MachineFunction &MF)</tt> method - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual bool</b> runOnMachineFunction(MachineFunction &MF) = 0; -</pre></div> - -<p><tt>runOnMachineFunction</tt> can be considered the main entry point of a -<tt>MachineFunctionPass</tt>; that is, you should override this method to do the -work of your <tt>MachineFunctionPass</tt>.</p> - -<p>The <tt>runOnMachineFunction</tt> method is called on every -<tt>MachineFunction</tt> in a <tt>Module</tt>, so that the -<tt>MachineFunctionPass</tt> may perform optimizations on the machine-dependent -representation of the function. If you want to get at the LLVM <tt>Function</tt> -for the <tt>MachineFunction</tt> you're working on, use -<tt>MachineFunction</tt>'s <tt>getFunction()</tt> accessor method -- but -remember, you may not modify the LLVM <tt>Function</tt> or its contents from a -<tt>MachineFunctionPass</tt>.</p> - -</div> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="registration">Pass registration</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>In the <a href="#basiccode">Hello World</a> example pass we illustrated how -pass registration works, and discussed some of the reasons that it is used and -what it does. Here we discuss how and why passes are registered.</p> - -<p>As we saw above, passes are registered with the <b><tt>RegisterPass</tt></b> -template. The template parameter is the name of the pass that is to be used on -the command line to specify that the pass should be added to a program (for -example, with <tt>opt</tt> or <tt>bugpoint</tt>). The first argument is the -name of the pass, which is to be used for the <tt>-help</tt> output of -programs, as -well as for debug output generated by the <tt>--debug-pass</tt> option.</p> - -<p>If you want your pass to be easily dumpable, you should -implement the virtual <tt>print</tt> method:</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="print">The <tt>print</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual void</b> print(std::ostream &O, <b>const</b> Module *M) <b>const</b>; -</pre></div> - -<p>The <tt>print</tt> method must be implemented by "analyses" in order to print -a human readable version of the analysis results. This is useful for debugging -an analysis itself, as well as for other people to figure out how an analysis -works. Use the <tt>opt -analyze</tt> argument to invoke this method.</p> - -<p>The <tt>llvm::OStream</tt> parameter specifies the stream to write the results on, -and the <tt>Module</tt> parameter gives a pointer to the top level module of the -program that has been analyzed. 
Note however that this pointer may be null in -certain circumstances (such as calling the <tt>Pass::dump()</tt> from a -debugger), so it should only be used to enhance debug output, it should not be -depended on.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="interaction">Specifying interactions between passes</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>One of the main responsibilities of the <tt>PassManager</tt> is to make sure -that passes interact with each other correctly. Because <tt>PassManager</tt> -tries to <a href="#passmanager">optimize the execution of passes</a> it must -know how the passes interact with each other and what dependencies exist between -the various passes. To track this, each pass can declare the set of passes that -are required to be executed before the current pass, and the passes which are -invalidated by the current pass.</p> - -<p>Typically this functionality is used to require that analysis results are -computed before your pass is run. Running arbitrary transformation passes can -invalidate the computed analysis results, which is what the invalidation set -specifies. If a pass does not implement the <tt><a -href="#getAnalysisUsage">getAnalysisUsage</a></tt> method, it defaults to not -having any prerequisite passes, and invalidating <b>all</b> other passes.</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="getAnalysisUsage">The <tt>getAnalysisUsage</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> -<b>virtual void</b> getAnalysisUsage(AnalysisUsage &Info) <b>const</b>; -</pre></div> - -<p>By implementing the <tt>getAnalysisUsage</tt> method, the required and -invalidated sets may be specified for your transformation. The implementation -should fill in the <tt><a -href="http://llvm.org/doxygen/classllvm_1_1AnalysisUsage.html">AnalysisUsage</a></tt> -object with information about which passes are required and not invalidated. To -do this, a pass may call any of the following methods on the AnalysisUsage -object:</p> -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="AU::addRequired"> - The <tt>AnalysisUsage::addRequired<></tt> - and <tt>AnalysisUsage::addRequiredTransitive<></tt> methods - </a> -</h4> - -<div> -<p> -If your pass requires a previous pass to be executed (an analysis for example), -it can use one of these methods to arrange for it to be run before your pass. -LLVM has many different types of analyses and passes that can be required, -spanning the range from <tt>DominatorSet</tt> to <tt>BreakCriticalEdges</tt>. -Requiring <tt>BreakCriticalEdges</tt>, for example, guarantees that there will -be no critical edges in the CFG when your pass has been run. -</p> - -<p> -Some analyses chain to other analyses to do their job. For example, an <a -href="AliasAnalysis.html">AliasAnalysis</a> implementation is required to <a -href="AliasAnalysis.html#chaining">chain</a> to other alias analysis passes. In -cases where analyses chain, the <tt>addRequiredTransitive</tt> method should be -used instead of the <tt>addRequired</tt> method. This informs the PassManager -that the transitively required pass should be alive as long as the requiring -pass is. 
-</p> -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="AU::addPreserved"> - The <tt>AnalysisUsage::addPreserved<></tt> method - </a> -</h4> - -<div> -<p> -One of the jobs of the PassManager is to optimize how and when analyses are run. -In particular, it attempts to avoid recomputing data unless it needs to. For -this reason, passes are allowed to declare that they preserve (i.e., they don't -invalidate) an existing analysis if it's available. For example, a simple -constant folding pass would not modify the CFG, so it can't possibly affect the -results of dominator analysis. By default, all passes are assumed to invalidate -all others. -</p> - -<p> -The <tt>AnalysisUsage</tt> class provides several methods which are useful in -certain circumstances that are related to <tt>addPreserved</tt>. In particular, -the <tt>setPreservesAll</tt> method can be called to indicate that the pass does -not modify the LLVM program at all (which is true for analyses), and the -<tt>setPreservesCFG</tt> method can be used by transformations that change -instructions in the program but do not modify the CFG or terminator instructions -(note that this property is implicitly set for <a -href="#BasicBlockPass">BasicBlockPass</a>'s). -</p> - -<p> -<tt>addPreserved</tt> is particularly useful for transformations like -<tt>BreakCriticalEdges</tt>. This pass knows how to update a small set of loop -and dominator related analyses if they exist, so it can preserve them, despite -the fact that it hacks on the CFG. -</p> -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="AU::examples"> - Example implementations of <tt>getAnalysisUsage</tt> - </a> -</h4> - -<div> - -<div class="doc_code"><pre> -<i>// This example modifies the program, but does not modify the CFG</i> -<b>void</b> <a href="http://llvm.org/doxygen/structLICM.html">LICM</a>::getAnalysisUsage(AnalysisUsage &AU) <b>const</b> { - AU.setPreservesCFG(); - AU.addRequired<<a href="http://llvm.org/doxygen/classllvm_1_1LoopInfo.html">LoopInfo</a>>(); -} -</pre></div> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="getAnalysis"> - The <tt>getAnalysis<></tt> and - <tt>getAnalysisIfAvailable<></tt> methods - </a> -</h4> - -<div> - -<p>The <tt>Pass::getAnalysis<></tt> method is automatically inherited by -your class, providing you with access to the passes that you declared that you -required with the <a href="#getAnalysisUsage"><tt>getAnalysisUsage</tt></a> -method. It takes a single template argument that specifies which pass class you -want, and returns a reference to that pass. For example:</p> - -<div class="doc_code"><pre> -bool LICM::runOnFunction(Function &F) { - LoopInfo &LI = getAnalysis<LoopInfo>(); - ... -} -</pre></div> - -<p>This method call returns a reference to the pass desired. You may get a -runtime assertion failure if you attempt to get an analysis that you did not -declare as required in your <a -href="#getAnalysisUsage"><tt>getAnalysisUsage</tt></a> implementation. This -method can be called by your <tt>run*</tt> method implementation, or by any -other local method invoked by your <tt>run*</tt> method. - -A module level pass can use function level analysis info using this interface. -For example:</p> - -<div class="doc_code"><pre> -bool ModuleLevelPass::runOnModule(Module &M) { - ... - DominatorTree &DT = getAnalysis<DominatorTree>(Func); - ... 
-} -</pre></div> - -<p>In above example, runOnFunction for DominatorTree is called by pass manager -before returning a reference to the desired pass.</p> - -<p> -If your pass is capable of updating analyses if they exist (e.g., -<tt>BreakCriticalEdges</tt>, as described above), you can use the -<tt>getAnalysisIfAvailable</tt> method, which returns a pointer to the analysis -if it is active. For example:</p> - -<div class="doc_code"><pre> -... -if (DominatorSet *DS = getAnalysisIfAvailable<DominatorSet>()) { - <i>// A DominatorSet is active. This code will update it.</i> -} -... -</pre></div> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="analysisgroup">Implementing Analysis Groups</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>Now that we understand the basics of how passes are defined, how they are -used, and how they are required from other passes, it's time to get a little bit -fancier. All of the pass relationships that we have seen so far are very -simple: one pass depends on one other specific pass to be run before it can run. -For many applications, this is great, for others, more flexibility is -required.</p> - -<p>In particular, some analyses are defined such that there is a single simple -interface to the analysis results, but multiple ways of calculating them. -Consider alias analysis for example. The most trivial alias analysis returns -"may alias" for any alias query. The most sophisticated analysis a -flow-sensitive, context-sensitive interprocedural analysis that can take a -significant amount of time to execute (and obviously, there is a lot of room -between these two extremes for other implementations). To cleanly support -situations like this, the LLVM Pass Infrastructure supports the notion of -Analysis Groups.</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="agconcepts">Analysis Group Concepts</a> -</h4> - -<div> - -<p>An Analysis Group is a single simple interface that may be implemented by -multiple different passes. Analysis Groups can be given human readable names -just like passes, but unlike passes, they need not derive from the <tt>Pass</tt> -class. An analysis group may have one or more implementations, one of which is -the "default" implementation.</p> - -<p>Analysis groups are used by client passes just like other passes are: the -<tt>AnalysisUsage::addRequired()</tt> and <tt>Pass::getAnalysis()</tt> methods. -In order to resolve this requirement, the <a href="#passmanager">PassManager</a> -scans the available passes to see if any implementations of the analysis group -are available. If none is available, the default implementation is created for -the pass to use. All standard rules for <A href="#interaction">interaction -between passes</a> still apply.</p> - -<p>Although <a href="#registration">Pass Registration</a> is optional for normal -passes, all analysis group implementations must be registered, and must use the -<A href="#registerag"><tt>INITIALIZE_AG_PASS</tt></a> template to join the -implementation pool. Also, a default implementation of the interface -<b>must</b> be registered with <A -href="#registerag"><tt>RegisterAnalysisGroup</tt></a>.</p> - -<p>As a concrete example of an Analysis Group in action, consider the <a -href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html">AliasAnalysis</a> -analysis group. 
The default implementation of the alias analysis interface (the -<tt><a -href="http://llvm.org/doxygen/structBasicAliasAnalysis.html">basicaa</a></tt> -pass) just does a few simple checks that don't require significant analysis to -compute (such as: two different globals can never alias each other, etc). -Passes that use the <tt><a -href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html">AliasAnalysis</a></tt> -interface (for example the <tt><a -href="http://llvm.org/doxygen/structGCSE.html">gcse</a></tt> pass), do -not care which implementation of alias analysis is actually provided, they just -use the designated interface.</p> - -<p>From the user's perspective, commands work just like normal. Issuing the -command '<tt>opt -gcse ...</tt>' will cause the <tt>basicaa</tt> class to be -instantiated and added to the pass sequence. Issuing the command '<tt>opt --somefancyaa -gcse ...</tt>' will cause the <tt>gcse</tt> pass to use the -<tt>somefancyaa</tt> alias analysis (which doesn't actually exist, it's just a -hypothetical example) instead.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="registerag">Using <tt>RegisterAnalysisGroup</tt></a> -</h4> - -<div> - -<p>The <tt>RegisterAnalysisGroup</tt> template is used to register the analysis -group itself, while the <tt>INITIALIZE_AG_PASS</tt> is used to add pass -implementations to the analysis group. First, -an analysis group should be registered, with a human readable name -provided for it. -Unlike registration of passes, there is no command line argument to be specified -for the Analysis Group Interface itself, because it is "abstract":</p> - -<div class="doc_code"><pre> -<b>static</b> RegisterAnalysisGroup<<a href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html">AliasAnalysis</a>> A("<i>Alias Analysis</i>"); -</pre></div> - -<p>Once the analysis is registered, passes can declare that they are valid -implementations of the interface by using the following code:</p> - -<div class="doc_code"><pre> -<b>namespace</b> { - //<i> Declare that we implement the AliasAnalysis interface</i> - INITIALIZE_AG_PASS(FancyAA, <a href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html">AliasAnalysis</a>, "<i>somefancyaa</i>", - "<i>A more complex alias analysis implementation</i>", - false, // <i>Is CFG Only?</i> - true, // <i>Is Analysis?</i> - false); // <i>Is default Analysis Group implementation?</i> -} -</pre></div> - -<p>This just shows a class <tt>FancyAA</tt> that -uses the <tt>INITIALIZE_AG_PASS</tt> macro both to register and -to "join" the <tt><a href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html">AliasAnalysis</a></tt> -analysis group. Every implementation of an analysis group should join using -this macro.</p> - -<div class="doc_code"><pre> -<b>namespace</b> { - //<i> Declare that we implement the AliasAnalysis interface</i> - INITIALIZE_AG_PASS(BasicAA, <a href="http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html">AliasAnalysis</a>, "<i>basicaa</i>", - "<i>Basic Alias Analysis (default AA impl)</i>", - false, // <i>Is CFG Only?</i> - true, // <i>Is Analysis?</i> - true); // <i>Is default Analysis Group implementation?</i> -} -</pre></div> - -<p>Here we show how the default implementation is specified (using the final -argument to the <tt>INITIALIZE_AG_PASS</tt> template). There must be exactly -one default implementation available at all times for an Analysis Group to be -used. 
Only default implementation can derive from <tt>ImmutablePass</tt>. -Here we declare that the - <tt><a href="http://llvm.org/doxygen/structBasicAliasAnalysis.html">BasicAliasAnalysis</a></tt> -pass is the default implementation for the interface.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="passStatistics">Pass Statistics</a> -</h2> -<!-- *********************************************************************** --> - -<div> -<p>The <a -href="http://llvm.org/doxygen/Statistic_8h-source.html"><tt>Statistic</tt></a> -class is designed to be an easy way to expose various success -metrics from passes. These statistics are printed at the end of a -run, when the -stats command line option is enabled on the command -line. See the <a href="http://llvm.org/docs/ProgrammersManual.html#Statistic">Statistics section</a> in the Programmer's Manual for details. - -</div> - - -<!-- *********************************************************************** --> -<h2> - <a name="passmanager">What PassManager does</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>The <a -href="http://llvm.org/doxygen/PassManager_8h-source.html"><tt>PassManager</tt></a> -<a -href="http://llvm.org/doxygen/classllvm_1_1PassManager.html">class</a> -takes a list of passes, ensures their <a href="#interaction">prerequisites</a> -are set up correctly, and then schedules passes to run efficiently. All of the -LLVM tools that run passes use the <tt>PassManager</tt> for execution of these -passes.</p> - -<p>The <tt>PassManager</tt> does two main things to try to reduce the execution -time of a series of passes:</p> - -<ol> -<li><b>Share analysis results</b> - The PassManager attempts to avoid -recomputing analysis results as much as possible. This means keeping track of -which analyses are available already, which analyses get invalidated, and which -analyses are needed to be run for a pass. An important part of work is that the -<tt>PassManager</tt> tracks the exact lifetime of all analysis results, allowing -it to <a href="#releaseMemory">free memory</a> allocated to holding analysis -results as soon as they are no longer needed.</li> - -<li><b>Pipeline the execution of passes on the program</b> - The -<tt>PassManager</tt> attempts to get better cache and memory usage behavior out -of a series of passes by pipelining the passes together. This means that, given -a series of consecutive <a href="#FunctionPass"><tt>FunctionPass</tt></a>'s, it -will execute all of the <a href="#FunctionPass"><tt>FunctionPass</tt></a>'s on -the first function, then all of the <a -href="#FunctionPass"><tt>FunctionPass</tt></a>es on the second function, -etc... until the entire program has been run through the passes. - -<p>This improves the cache behavior of the compiler, because it is only touching -the LLVM program representation for a single function at a time, instead of -traversing the entire program. It reduces the memory consumption of compiler, -because, for example, only one <a -href="http://llvm.org/doxygen/classllvm_1_1DominatorSet.html"><tt>DominatorSet</tt></a> -needs to be calculated at a time. This also makes it possible to implement -some <a -href="#SMP">interesting enhancements</a> in the future.</p></li> - -</ol> - -<p>The effectiveness of the <tt>PassManager</tt> is influenced directly by how -much information it has about the behaviors of the passes it is scheduling. 
For -example, the "preserved" set is intentionally conservative in the face of an -unimplemented <a href="#getAnalysisUsage"><tt>getAnalysisUsage</tt></a> method. -Not implementing when it should be implemented will have the effect of not -allowing any analysis results to live across the execution of your pass.</p> - -<p>The <tt>PassManager</tt> class exposes a <tt>--debug-pass</tt> command line -options that is useful for debugging pass execution, seeing how things work, and -diagnosing when you should be preserving more analyses than you currently are -(To get information about all of the variants of the <tt>--debug-pass</tt> -option, just type '<tt>opt -help-hidden</tt>').</p> - -<p>By using the <tt>--debug-pass=Structure</tt> option, for example, we can see -how our <a href="#basiccode">Hello World</a> pass interacts with other passes. -Lets try it out with the <tt>gcse</tt> and <tt>licm</tt> passes:</p> - -<div class="doc_code"><pre> -$ opt -load ../../../Debug+Asserts/lib/Hello.so -gcse -licm --debug-pass=Structure < hello.bc > /dev/null -Module Pass Manager - Function Pass Manager - Dominator Set Construction - Immediate Dominators Construction - Global Common Subexpression Elimination --- Immediate Dominators Construction --- Global Common Subexpression Elimination - Natural Loop Construction - Loop Invariant Code Motion --- Natural Loop Construction --- Loop Invariant Code Motion - Module Verifier --- Dominator Set Construction --- Module Verifier - Bitcode Writer ---Bitcode Writer -</pre></div> - -<p>This output shows us when passes are constructed and when the analysis -results are known to be dead (prefixed with '<tt>--</tt>'). Here we see that -GCSE uses dominator and immediate dominator information to do its job. The LICM -pass uses natural loop information, which uses dominator sets, but not immediate -dominators. Because immediate dominators are no longer useful after the GCSE -pass, it is immediately destroyed. The dominator sets are then reused to -compute natural loop information, which is then used by the LICM pass.</p> - -<p>After the LICM pass, the module verifier runs (which is automatically added -by the '<tt>opt</tt>' tool), which uses the dominator set to check that the -resultant LLVM code is well formed. After it finishes, the dominator set -information is destroyed, after being computed once, and shared by three -passes.</p> - -<p>Lets see how this changes when we run the <a href="#basiccode">Hello -World</a> pass in between the two passes:</p> - -<div class="doc_code"><pre> -$ opt -load ../../../Debug+Asserts/lib/Hello.so -gcse -hello -licm --debug-pass=Structure < hello.bc > /dev/null -Module Pass Manager - Function Pass Manager - Dominator Set Construction - Immediate Dominators Construction - Global Common Subexpression Elimination -<b>-- Dominator Set Construction</b> --- Immediate Dominators Construction --- Global Common Subexpression Elimination -<b> Hello World Pass --- Hello World Pass - Dominator Set Construction</b> - Natural Loop Construction - Loop Invariant Code Motion --- Natural Loop Construction --- Loop Invariant Code Motion - Module Verifier --- Dominator Set Construction --- Module Verifier - Bitcode Writer ---Bitcode Writer -Hello: __main -Hello: puts -Hello: main -</pre></div> - -<p>Here we see that the <a href="#basiccode">Hello World</a> pass has killed the -Dominator Set pass, even though it doesn't modify the code at all! 
To fix this, -we need to add the following <a -href="#getAnalysisUsage"><tt>getAnalysisUsage</tt></a> method to our pass:</p> - -<div class="doc_code"><pre> -<i>// We don't modify the program, so we preserve all analyses</i> -<b>virtual void</b> getAnalysisUsage(AnalysisUsage &AU) <b>const</b> { - AU.setPreservesAll(); -} -</pre></div> - -<p>Now when we run our pass, we get this output:</p> - -<div class="doc_code"><pre> -$ opt -load ../../../Debug+Asserts/lib/Hello.so -gcse -hello -licm --debug-pass=Structure < hello.bc > /dev/null -Pass Arguments: -gcse -hello -licm -Module Pass Manager - Function Pass Manager - Dominator Set Construction - Immediate Dominators Construction - Global Common Subexpression Elimination --- Immediate Dominators Construction --- Global Common Subexpression Elimination - Hello World Pass --- Hello World Pass - Natural Loop Construction - Loop Invariant Code Motion --- Loop Invariant Code Motion --- Natural Loop Construction - Module Verifier --- Dominator Set Construction --- Module Verifier - Bitcode Writer ---Bitcode Writer -Hello: __main -Hello: puts -Hello: main -</pre></div> - -<p>Which shows that we don't accidentally invalidate dominator information -anymore, and therefore do not have to compute it twice.</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="releaseMemory">The <tt>releaseMemory</tt> method</a> -</h4> - -<div> - -<div class="doc_code"><pre> - <b>virtual void</b> releaseMemory(); -</pre></div> - -<p>The <tt>PassManager</tt> automatically determines when to compute analysis -results, and how long to keep them around for. Because the lifetime of the pass -object itself is effectively the entire duration of the compilation process, we -need some way to free analysis results when they are no longer useful. The -<tt>releaseMemory</tt> virtual method is the way to do this.</p> - -<p>If you are writing an analysis or any other pass that retains a significant -amount of state (for use by another pass which "requires" your pass and uses the -<a href="#getAnalysis">getAnalysis</a> method) you should implement -<tt>releaseMemory</tt> to, well, release the memory allocated to maintain this -internal state. This method is called after the <tt>run*</tt> method for the -class, before the next call of <tt>run*</tt> in your pass.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="registering">Registering dynamically loaded passes</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p><i>Size matters</i> when constructing production quality tools using llvm, -both for the purposes of distribution, and for regulating the resident code size -when running on the target system. Therefore, it becomes desirable to -selectively use some passes, while omitting others and maintain the flexibility -to change configurations later on. You want to be able to do all this, and, -provide feedback to the user. This is where pass registration comes into -play.</p> - -<p>The fundamental mechanisms for pass registration are the -<tt>MachinePassRegistry</tt> class and subclasses of -<tt>MachinePassRegistryNode</tt>.</p> - -<p>An instance of <tt>MachinePassRegistry</tt> is used to maintain a list of -<tt>MachinePassRegistryNode</tt> objects. 
This instance maintains the list and -communicates additions and deletions to the command line interface.</p> - -<p>An instance of <tt>MachinePassRegistryNode</tt> subclass is used to maintain -information provided about a particular pass. This information includes the -command line name, the command help string and the address of the function used -to create an instance of the pass. A global static constructor of one of these -instances <i>registers</i> with a corresponding <tt>MachinePassRegistry</tt>, -the static destructor <i>unregisters</i>. Thus a pass that is statically linked -in the tool will be registered at start up. A dynamically loaded pass will -register on load and unregister at unload.</p> - -<!-- _______________________________________________________________________ --> -<h3> - <a name="registering_existing">Using existing registries</a> -</h3> - -<div> - -<p>There are predefined registries to track instruction scheduling -(<tt>RegisterScheduler</tt>) and register allocation (<tt>RegisterRegAlloc</tt>) -machine passes. Here we will describe how to <i>register</i> a register -allocator machine pass.</p> - -<p>Implement your register allocator machine pass. In your register allocator -<tt>.cpp</tt> file add the following include;</p> - -<div class="doc_code"><pre> -#include "llvm/CodeGen/RegAllocRegistry.h" -</pre></div> - -<p>Also in your register allocator .cpp file, define a creator function in the -form; </p> - -<div class="doc_code"><pre> -FunctionPass *createMyRegisterAllocator() { - return new MyRegisterAllocator(); -} -</pre></div> - -<p>Note that the signature of this function should match the type of -<tt>RegisterRegAlloc::FunctionPassCtor</tt>. In the same file add the -"installing" declaration, in the form;</p> - -<div class="doc_code"><pre> -static RegisterRegAlloc myRegAlloc("myregalloc", - "my register allocator help string", - createMyRegisterAllocator); -</pre></div> - -<p>Note the two spaces prior to the help string produces a tidy result on the --help query.</p> - -<div class="doc_code"><pre> -$ llc -help - ... - -regalloc - Register allocator to use (default=linearscan) - =linearscan - linear scan register allocator - =local - local register allocator - =simple - simple register allocator - =myregalloc - my register allocator help string - ... -</pre></div> - -<p>And that's it. The user is now free to use <tt>-regalloc=myregalloc</tt> as -an option. Registering instruction schedulers is similar except use the -<tt>RegisterScheduler</tt> class. Note that the -<tt>RegisterScheduler::FunctionPassCtor</tt> is significantly different from -<tt>RegisterRegAlloc::FunctionPassCtor</tt>.</p> - -<p>To force the load/linking of your register allocator into the llc/lli tools, -add your creator function's global declaration to "Passes.h" and add a "pseudo" -call line to <tt>llvm/Codegen/LinkAllCodegenComponents.h</tt>.</p> - -</div> - - -<!-- _______________________________________________________________________ --> -<h3> - <a name="registering_new">Creating new registries</a> -</h3> - -<div> - -<p>The easiest way to get started is to clone one of the existing registries; we -recommend <tt>llvm/CodeGen/RegAllocRegistry.h</tt>. The key things to modify -are the class name and the <tt>FunctionPassCtor</tt> type.</p> - -<p>Then you need to declare the registry. 
Example: if your pass registry is -<tt>RegisterMyPasses</tt> then define;</p> - -<div class="doc_code"><pre> -MachinePassRegistry RegisterMyPasses::Registry; -</pre></div> - -<p>And finally, declare the command line option for your passes. Example:</p> - -<div class="doc_code"><pre> -cl::opt<RegisterMyPasses::FunctionPassCtor, false, - RegisterPassParser<RegisterMyPasses> > -MyPassOpt("mypass", - cl::init(&createDefaultMyPass), - cl::desc("my pass option help")); -</pre></div> - -<p>Here the command option is "mypass", with createDefaultMyPass as the default -creator.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="debughints">Using GDB with dynamically loaded passes</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>Unfortunately, using GDB with dynamically loaded passes is not as easy as it -should be. First of all, you can't set a breakpoint in a shared object that has -not been loaded yet, and second of all there are problems with inlined functions -in shared objects. Here are some suggestions to debugging your pass with -GDB.</p> - -<p>For sake of discussion, I'm going to assume that you are debugging a -transformation invoked by <tt>opt</tt>, although nothing described here depends -on that.</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="breakpoint">Setting a breakpoint in your pass</a> -</h4> - -<div> - -<p>First thing you do is start <tt>gdb</tt> on the <tt>opt</tt> process:</p> - -<div class="doc_code"><pre> -$ <b>gdb opt</b> -GNU gdb 5.0 -Copyright 2000 Free Software Foundation, Inc. -GDB is free software, covered by the GNU General Public License, and you are -welcome to change it and/or distribute copies of it under certain conditions. -Type "show copying" to see the conditions. -There is absolutely no warranty for GDB. Type "show warranty" for details. -This GDB was configured as "sparc-sun-solaris2.6"... -(gdb) -</pre></div> - -<p>Note that <tt>opt</tt> has a lot of debugging information in it, so it takes -time to load. Be patient. Since we cannot set a breakpoint in our pass yet -(the shared object isn't loaded until runtime), we must execute the process, and -have it stop before it invokes our pass, but after it has loaded the shared -object. The most foolproof way of doing this is to set a breakpoint in -<tt>PassManager::run</tt> and then run the process with the arguments you -want:</p> - -<div class="doc_code"><pre> -(gdb) <b>break llvm::PassManager::run</b> -Breakpoint 1 at 0x2413bc: file Pass.cpp, line 70. 
-(gdb) <b>run test.bc -load $(LLVMTOP)/llvm/Debug+Asserts/lib/[libname].so -[passoption]</b> -Starting program: opt test.bc -load $(LLVMTOP)/llvm/Debug+Asserts/lib/[libname].so -[passoption] -Breakpoint 1, PassManager::run (this=0xffbef174, M=@0x70b298) at Pass.cpp:70 -70 bool PassManager::run(Module &M) { return PM->run(M); } -(gdb) -</pre></div> - -<p>Once the <tt>opt</tt> stops in the <tt>PassManager::run</tt> method you are -now free to set breakpoints in your pass so that you can trace through execution -or do other standard debugging stuff.</p> - -</div> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="debugmisc">Miscellaneous Problems</a> -</h4> - -<div> - -<p>Once you have the basics down, there are a couple of problems that GDB has, -some with solutions, some without.</p> - -<ul> -<li>Inline functions have bogus stack information. In general, GDB does a -pretty good job getting stack traces and stepping through inline functions. -When a pass is dynamically loaded however, it somehow completely loses this -capability. The only solution I know of is to de-inline a function (move it -from the body of a class to a .cpp file).</li> - -<li>Restarting the program breaks breakpoints. After following the information -above, you have succeeded in getting some breakpoints planted in your pass. Nex -thing you know, you restart the program (i.e., you type '<tt>run</tt>' again), -and you start getting errors about breakpoints being unsettable. The only way I -have found to "fix" this problem is to <tt>delete</tt> the breakpoints that are -already set in your pass, run the program, and re-set the breakpoints once -execution stops in <tt>PassManager::run</tt>.</li> - -</ul> - -<p>Hopefully these tips will help with common case debugging situations. If -you'd like to contribute some tips of your own, just contact <a -href="mailto:sabre@nondot.org">Chris</a>.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="future">Future extensions planned</a> -</h2> -<!-- *********************************************************************** --> - -<div> - -<p>Although the LLVM Pass Infrastructure is very capable as it stands, and does -some nifty stuff, there are things we'd like to add in the future. Here is -where we are going:</p> - -<!-- _______________________________________________________________________ --> -<h4> - <a name="SMP">Multithreaded LLVM</a> -</h4> - -<div> - -<p>Multiple CPU machines are becoming more common and compilation can never be -fast enough: obviously we should allow for a multithreaded compiler. Because of -the semantics defined for passes above (specifically they cannot maintain state -across invocations of their <tt>run*</tt> methods), a nice clean way to -implement a multithreaded compiler would be for the <tt>PassManager</tt> class -to create multiple instances of each pass object, and allow the separate -instances to be hacking on different parts of the program at the same time.</p> - -<p>This implementation would prevent each of the passes from having to implement -multithreaded constructs, requiring only the LLVM core to have locking in a few -places (for global resources). Although this is a simple extension, we simply -haven't had time (or multiprocessor machines, thus a reason) to implement this. 
-Despite that, we have kept the LLVM passes SMP ready, and you should too.</p> - -</div> - -</div> - -<!-- *********************************************************************** --> -<hr> -<address> - <a href="http://jigsaw.w3.org/css-validator/check/referer"><img - src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a> - <a href="http://validator.w3.org/check/referer"><img - src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a> - - <a href="mailto:sabre@nondot.org">Chris Lattner</a><br> - <a href="http://llvm.org/">The LLVM Compiler Infrastructure</a><br> - Last modified: $Date$ -</address> - -</body> -</html> diff --git a/docs/WritingAnLLVMPass.rst b/docs/WritingAnLLVMPass.rst new file mode 100644 index 0000000000..db47fefd93 --- /dev/null +++ b/docs/WritingAnLLVMPass.rst @@ -0,0 +1,1439 @@ +==================== +Writing an LLVM Pass +==================== + +.. contents:: + :local: + +Written by `Chris Lattner <mailto:sabre@nondot.org>`_ and +`Jim Laskey <mailto:jlaskey@mac.com>`_ + +Introduction --- What is a pass? +================================ + +The LLVM Pass Framework is an important part of the LLVM system, because LLVM +passes are where most of the interesting parts of the compiler exist. Passes +perform the transformations and optimizations that make up the compiler, they +build the analysis results that are used by these transformations, and they +are, above all, a structuring technique for compiler code. + +All LLVM passes are subclasses of the `Pass +<http://llvm.org/doxygen/classllvm_1_1Pass.html>`_ class, which implement +functionality by overriding virtual methods inherited from ``Pass``. Depending +on how your pass works, you should inherit from the :ref:`ModulePass +<writing-an-llvm-pass-ModulePass>` , :ref:`CallGraphSCCPass +<writing-an-llvm-pass-CallGraphSCCPass>`, :ref:`FunctionPass +<writing-an-llvm-pass-FunctionPass>` , or :ref:`LoopPass +<writing-an-llvm-pass-LoopPass>`, or :ref:`RegionPass +<writing-an-llvm-pass-RegionPass>`, or :ref:`BasicBlockPass +<writing-an-llvm-pass-BasicBlockPass>` classes, which gives the system more +information about what your pass does, and how it can be combined with other +passes. One of the main features of the LLVM Pass Framework is that it +schedules passes to run in an efficient way based on the constraints that your +pass meets (which are indicated by which class they derive from). + +We start by showing you how to construct a pass, everything from setting up the +code, to compiling, loading, and executing it. After the basics are down, more +advanced features are discussed. + +Quick Start --- Writing hello world +=================================== + +Here we describe how to write the "hello world" of passes. The "Hello" pass is +designed to simply print out the name of non-external functions that exist in +the program being compiled. It does not modify the program at all, it just +inspects it. The source code and files for this pass are available in the LLVM +source tree in the ``lib/Transforms/Hello`` directory. + +.. _writing-an-llvm-pass-makefile: + +Setting up the build environment +-------------------------------- + +.. FIXME: Why does this recommend to build in-tree? + +First, configure and build LLVM. This needs to be done directly inside the +LLVM source tree rather than in a separate objects directory. Next, you need +to create a new directory somewhere in the LLVM source base. For this example, +we'll assume that you made ``lib/Transforms/Hello``. 
Finally, you must set up
+a build script (``Makefile``) that will compile the source code for the new
+pass.  To do this, copy the following into ``Makefile``:
+
+.. code-block:: make
+
+    # Makefile for hello pass
+
+    # Path to top level of LLVM hierarchy
+    LEVEL = ../../..
+
+    # Name of the library to build
+    LIBRARYNAME = Hello
+
+    # Make the shared library become a loadable module so the tools can
+    # dlopen/dlsym on the resulting library.
+    LOADABLE_MODULE = 1
+
+    # Include the makefile implementation stuff
+    include $(LEVEL)/Makefile.common
+
+This makefile specifies that all of the ``.cpp`` files in the current directory
+are to be compiled and linked together into a shared object
+``$(LEVEL)/Debug+Asserts/lib/Hello.so`` that can be dynamically loaded by the
+:program:`opt` or :program:`bugpoint` tools via their :option:`-load` options.
+If your operating system uses a suffix other than ``.so`` (as Windows and Mac
+OS X do), the appropriate extension will be used.
+
+If you used CMake to build LLVM, see :ref:`cmake-out-of-source-pass`.
+
+Now that we have the build scripts set up, we just need to write the code for
+the pass itself.
+
+.. _writing-an-llvm-pass-basiccode:
+
+Basic code required
+-------------------
+
+Now that we have a way to compile our new pass, we just have to write it.
+Start out with:
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include "llvm/Function.h"
+  #include "llvm/Support/raw_ostream.h"
+
+Which are needed because we are writing a `Pass
+<http://llvm.org/doxygen/classllvm_1_1Pass.html>`_, we are operating on
+`Function <http://llvm.org/doxygen/classllvm_1_1Function.html>`_\ s, and we
+will be doing some printing.
+
+Next we have:
+
+.. code-block:: c++
+
+  using namespace llvm;
+
+... which is required because the functions from the include files live in the
+llvm namespace.
+
+Next we have:
+
+.. code-block:: c++
+
+  namespace {
+
+... which starts out an anonymous namespace.  Anonymous namespaces are to C++
+what the "``static``" keyword is to C (at global scope).  It makes the things
+declared inside of the anonymous namespace visible only to the current file.
+If you're not familiar with them, consult a decent C++ book for more
+information.
+
+Next, we declare our pass itself:
+
+.. code-block:: c++
+
+  struct Hello : public FunctionPass {
+
+This declares a "``Hello``" class that is a subclass of :ref:`FunctionPass
+<writing-an-llvm-pass-FunctionPass>`.  The different builtin pass subclasses
+are described in detail :ref:`later <writing-an-llvm-pass-pass-classes>`, but
+for now, know that ``FunctionPass`` operates on a function at a time.
+
+.. code-block:: c++
+
+  static char ID;
+  Hello() : FunctionPass(ID) {}
+
+This declares a pass identifier that LLVM uses to identify the pass.  This
+allows LLVM to avoid using expensive C++ runtime type information.
+
+.. code-block:: c++
+
+    virtual bool runOnFunction(Function &F) {
+      errs() << "Hello: ";
+      errs().write_escaped(F.getName()) << "\n";
+      return false;
+    }
+  }; // end of struct Hello
+  } // end of anonymous namespace
+
+We declare a :ref:`runOnFunction <writing-an-llvm-pass-runOnFunction>` method,
+which overrides an abstract virtual method inherited from :ref:`FunctionPass
+<writing-an-llvm-pass-FunctionPass>`.  This is where we are supposed to do our
+thing, so we just print out our message with the name of each function.
+
+.. code-block:: c++
+
+  char Hello::ID = 0;
+
+We initialize the pass ID here.  LLVM uses the ID's address to identify the
+pass, so the initialization value is not important.
+
+.. code-block:: c++
+
+  static RegisterPass<Hello> X("hello", "Hello World Pass",
+                               false /* Only looks at CFG */,
+                               false /* Analysis Pass */);
+
+Lastly, we :ref:`register our class <writing-an-llvm-pass-registration>`
+``Hello``, giving it a command line argument "``hello``", and a name "Hello
+World Pass".  The last two arguments describe its behavior: if a pass walks the
+CFG without modifying it, the third argument is set to ``true``; if a pass is
+an analysis pass, for example a dominator tree pass, then ``true`` is supplied
+as the fourth argument.
+
+As a whole, the ``.cpp`` file looks like:
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include "llvm/Function.h"
+  #include "llvm/Support/raw_ostream.h"
+
+  using namespace llvm;
+
+  namespace {
+    struct Hello : public FunctionPass {
+      static char ID;
+      Hello() : FunctionPass(ID) {}
+
+      virtual bool runOnFunction(Function &F) {
+        errs() << "Hello: ";
+        errs().write_escaped(F.getName()) << '\n';
+        return false;
+      }
+    };
+  }
+
+  char Hello::ID = 0;
+  static RegisterPass<Hello> X("hello", "Hello World Pass", false, false);
+
+Now that it's all together, compile the file with a simple "``gmake``" command
+in the local directory and you should get a new file
+"``Debug+Asserts/lib/Hello.so``" under the top level directory of the LLVM
+source tree (not in the local directory).  Note that everything in this file
+is contained in an anonymous namespace --- this reflects the fact that passes
+are self contained units that do not need external interfaces (although they
+can have them) to be useful.
+
+Running a pass with ``opt``
+---------------------------
+
+Now that you have a brand new shiny shared object file, we can use the
+:program:`opt` command to run an LLVM program through your pass.  Because you
+registered your pass with ``RegisterPass``, you will be able to use the
+:program:`opt` tool to access it, once loaded.
+
+To test it, follow the example at the end of the :doc:`GettingStarted` to
+compile "Hello World" to LLVM.  We can now run the bitcode file (hello.bc) for
+the program through our transformation like this (of course, any bitcode file
+will work):
+
+.. code-block:: console
+
+  $ opt -load ../../../Debug+Asserts/lib/Hello.so -hello < hello.bc > /dev/null
+  Hello: __main
+  Hello: puts
+  Hello: main
+
+The :option:`-load` option specifies that :program:`opt` should load your pass
+as a shared object, which makes "``-hello``" a valid command line argument
+(which is one reason you need to :ref:`register your pass
+<writing-an-llvm-pass-registration>`).  Because the Hello pass does not modify
+the program in any interesting way, we just throw away the result of
+:program:`opt` (sending it to ``/dev/null``).
+
+To see what happened to the other string you registered, try running
+:program:`opt` with the :option:`-help` option:
+
+.. code-block:: console
+
+  $ opt -load ../../../Debug+Asserts/lib/Hello.so -help
+  OVERVIEW: llvm .bc -> .bc modular optimizer
+
+  USAGE: opt [options] <input bitcode>
+
+  OPTIONS:
+    Optimizations available:
+  ...
+      -globalopt                - Global Variable Optimizer
+      -globalsmodref-aa         - Simple mod/ref analysis for globals
+      -gvn                      - Global Value Numbering
+      -hello                    - Hello World Pass
+      -indvars                  - Induction Variable Simplification
+      -inline                   - Function Integration/Inlining
+      -insert-edge-profiling    - Insert instrumentation for edge profiling
+  ...
+
+The pass name gets added as the information string for your pass, giving some
+documentation to users of :program:`opt`.
+Now that you have a working pass, you would go ahead and make it do the cool
+transformations you want.  Once you get it all working and tested, it may
+become useful to find out how fast your pass is.  The :ref:`PassManager
+<writing-an-llvm-pass-passmanager>` provides a nice command line option
+(:option:`--time-passes`) that allows you to get information about the
+execution time of your pass along with the other passes you queue up.  For
+example:
+
+.. code-block:: console
+
+  $ opt -load ../../../Debug+Asserts/lib/Hello.so -hello -time-passes < hello.bc > /dev/null
+  Hello: __main
+  Hello: puts
+  Hello: main
+  ===============================================================================
+                        ... Pass execution timing report ...
+  ===============================================================================
+    Total Execution Time: 0.02 seconds (0.0479059 wall clock)
+
+     ---User Time---   --System Time--   --User+System--   ---Wall Time---  --- Pass Name ---
+     0.0100 (100.0%)   0.0000 (  0.0%)   0.0100 ( 50.0%)   0.0402 ( 84.0%)  Bitcode Writer
+     0.0000 (  0.0%)   0.0100 (100.0%)   0.0100 ( 50.0%)   0.0031 (  6.4%)  Dominator Set Construction
+     0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0013 (  2.7%)  Module Verifier
+     0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0033 (  6.9%)  Hello World Pass
+     0.0100 (100.0%)   0.0100 (100.0%)   0.0200 (100.0%)   0.0479 (100.0%)  TOTAL
+
+As you can see, our implementation above is pretty fast.  The additional
+passes listed are automatically inserted by the :program:`opt` tool to verify
+that the LLVM emitted by your pass is still valid and well-formed, and hasn't
+been broken somehow.
+
+Now that you have seen the basics of the mechanics behind passes, we can talk
+about some more details of how they work and how to use them.
+
+.. _writing-an-llvm-pass-pass-classes:
+
+Pass classes and requirements
+=============================
+
+One of the first things that you should do when designing a new pass is to
+decide what class you should subclass for your pass.  The :ref:`Hello World
+<writing-an-llvm-pass-basiccode>` example uses the :ref:`FunctionPass
+<writing-an-llvm-pass-FunctionPass>` class for its implementation, but we did
+not discuss why or when this should occur.  Here we talk about the classes
+available, from the most general to the most specific.
+
+When choosing a superclass for your ``Pass``, you should choose the **most
+specific** class possible, while still being able to meet the requirements
+listed.  This gives the LLVM Pass Infrastructure information necessary to
+optimize how passes are run, so that the resultant compiler isn't
+unnecessarily slow.
+
+The ``ImmutablePass`` class
+---------------------------
+
+The most plain and boring type of pass is the "`ImmutablePass
+<http://llvm.org/doxygen/classllvm_1_1ImmutablePass.html>`_" class.  This pass
+type is used for passes that do not have to be run, do not change state, and
+never need to be updated.  This is not a normal type of transformation or
+analysis, but can provide information about the current compiler
+configuration.
+
+Although this pass class is very infrequently used, it is important for
+providing information about the current target machine being compiled for, and
+other static information that can affect the various transformations.
+
+``ImmutablePass``\ es never invalidate other transformations, are never
+invalidated, and are never "run".
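+
+A minimal sketch of the shape of such a pass (the ``TargetConfigInfo`` name
+and its ``isBigEndian`` query are hypothetical examples; only the
+``ImmutablePass`` base class and the ``char ID`` idiom are real):
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+
+  using namespace llvm;
+
+  namespace {
+    // Hypothetical ImmutablePass: it is never "run"; it only exposes
+    // static configuration information to passes that require it.
+    struct TargetConfigInfo : public ImmutablePass {
+      static char ID;
+      TargetConfigInfo() : ImmutablePass(ID) {}
+
+      // Answers like this one never change during a compilation.
+      bool isBigEndian() const { return false; }
+    };
+  }
+
+  char TargetConfigInfo::ID = 0;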
+
+.. _writing-an-llvm-pass-ModulePass:
+
+The ``ModulePass`` class
+------------------------
+
+The `ModulePass <http://llvm.org/doxygen/classllvm_1_1ModulePass.html>`_ class
+is the most general of all superclasses that you can use.  Deriving from
+``ModulePass`` indicates that your pass uses the entire program as a unit,
+referring to function bodies in no predictable order, or adding and removing
+functions.  Because nothing is known about the behavior of ``ModulePass``
+subclasses, no optimization can be done for their execution.
+
+A module pass can use function-level passes (e.g. dominators) through the
+``getAnalysis`` interface: ``getAnalysis<DominatorTree>(F)``, where ``F`` is
+the ``llvm::Function`` whose analysis result should be retrieved.  This works
+only if the function pass does not require any module or immutable passes.
+Note that this can only be done for functions for which the analysis ran,
+e.g. in the case of dominators you should only ask for the ``DominatorTree``
+for function definitions, not declarations.
+
+To write a correct ``ModulePass`` subclass, derive from ``ModulePass`` and
+overload the ``runOnModule`` method with the following signature:
+
+The ``runOnModule`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnModule(Module &M) = 0;
+
+The ``runOnModule`` method performs the interesting work of the pass.  It
+should return ``true`` if the module was modified by the transformation and
+``false`` otherwise.
+
+.. _writing-an-llvm-pass-CallGraphSCCPass:
+
+The ``CallGraphSCCPass`` class
+------------------------------
+
+The `CallGraphSCCPass
+<http://llvm.org/doxygen/classllvm_1_1CallGraphSCCPass.html>`_ is used by
+passes that need to traverse the program bottom-up on the call graph (callees
+before callers).  Deriving from ``CallGraphSCCPass`` provides some mechanics
+for building and traversing the ``CallGraph``, but also allows the system to
+optimize execution of ``CallGraphSCCPass``\ es.  If your pass meets the
+requirements outlined below, and doesn't meet the requirements of a
+:ref:`FunctionPass <writing-an-llvm-pass-FunctionPass>` or
+:ref:`BasicBlockPass <writing-an-llvm-pass-BasicBlockPass>`, you should derive
+from ``CallGraphSCCPass``.
+
+Here, an SCC (strongly connected component) of the call graph is a maximal set
+of functions that all transitively call each other; Tarjan's algorithm is used
+to compute the SCCs, which are then visited bottom-up, i.e. callees before
+callers.
+
+To be explicit, ``CallGraphSCCPass`` subclasses are:
+
+#. ... *not allowed* to inspect or modify any ``Function``\ s other than those
+   in the current SCC and the direct callers and direct callees of the SCC.
+#. ... *required* to preserve the current ``CallGraph`` object, updating it to
+   reflect any changes made to the program.
+#. ... *not allowed* to add or remove SCCs from the current ``Module``, though
+   they may change the contents of an SCC.
+#. ... *allowed* to add or remove global variables from the current
+   ``Module``.
+#. ... *allowed* to maintain state across invocations of :ref:`runOnSCC
+   <writing-an-llvm-pass-runOnSCC>` (including global data).
+
+Implementing a ``CallGraphSCCPass`` is slightly tricky in some cases because
+it has to handle SCCs with more than one node in it.  All of the virtual
+methods described below should return ``true`` if they modified the program,
+or ``false`` if they didn't.
+
+The ``doInitialization(CallGraph &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doInitialization(CallGraph &CG);
+
+The ``doInitialization`` method is allowed to do most of the things that
+``CallGraphSCCPass``\ es are not allowed to do.  They can add and remove
+functions, get pointers to functions, etc.  The ``doInitialization`` method is
+designed to do simple initialization type of stuff that does not depend on the
+SCCs being processed.  The ``doInitialization`` method call is not scheduled
+to overlap with any other pass executions (thus it should be very fast).
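+
+The overall shape of such a pass is sketched below; ``PrintSCC`` is a
+hypothetical example, though the iteration over the ``CallGraphNode``\ s of a
+``CallGraphSCC`` is the real interface, assuming headers of this vintage:
+
+.. code-block:: c++
+
+  #include "llvm/CallGraphSCCPass.h"
+  #include "llvm/Analysis/CallGraph.h"
+  #include "llvm/Support/raw_ostream.h"
+
+  using namespace llvm;
+
+  namespace {
+    struct PrintSCC : public CallGraphSCCPass {
+      static char ID;
+      PrintSCC() : CallGraphSCCPass(ID) {}
+
+      virtual bool runOnSCC(CallGraphSCC &SCC) {
+        // Each element of the SCC is a CallGraphNode; its Function may be
+        // null for the external node, so check before using it.
+        for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I)
+          if (Function *F = (*I)->getFunction())
+            errs() << "in SCC: " << F->getName() << "\n";
+        return false; // We inspected the SCC but did not modify it.
+      }
+    };
+  }
+
+  char PrintSCC::ID = 0;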
+
+.. _writing-an-llvm-pass-runOnSCC:
+
+The ``runOnSCC`` method
+^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnSCC(CallGraphSCC &SCC) = 0;
+
+The ``runOnSCC`` method performs the interesting work of the pass, and should
+return ``true`` if the module was modified by the transformation, ``false``
+otherwise.
+
+The ``doFinalization(CallGraph &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doFinalization(CallGraph &CG);
+
+The ``doFinalization`` method is an infrequently used method that is called
+when the pass framework has finished calling :ref:`runOnSCC
+<writing-an-llvm-pass-runOnSCC>` for every SCC in the program being compiled.
+
+.. _writing-an-llvm-pass-FunctionPass:
+
+The ``FunctionPass`` class
+--------------------------
+
+In contrast to ``ModulePass`` subclasses, `FunctionPass
+<http://llvm.org/doxygen/classllvm_1_1FunctionPass.html>`_ subclasses do have
+a predictable, local behavior that can be expected by the system.  All
+``FunctionPass``\ es execute on each function in the program independent of
+all of the other functions in the program.  ``FunctionPass``\ es do not
+require that they are executed in a particular order, and
+``FunctionPass``\ es do not modify external functions.
+
+To be explicit, ``FunctionPass`` subclasses are not allowed to:
+
+#. Modify a ``Function`` other than the one currently being processed.
+#. Add or remove ``Function``\ s from the current ``Module``.
+#. Add or remove global variables from the current ``Module``.
+#. Maintain state across invocations of :ref:`runOnFunction
+   <writing-an-llvm-pass-runOnFunction>` (including global data).
+
+Implementing a ``FunctionPass`` is usually straightforward (see the
+:ref:`Hello World <writing-an-llvm-pass-basiccode>` pass for example).
+``FunctionPass``\ es may overload three virtual methods to do their work.  All
+of these methods should return ``true`` if they modified the program, or
+``false`` if they didn't.
+
+.. _writing-an-llvm-pass-doInitialization-mod:
+
+The ``doInitialization(Module &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doInitialization(Module &M);
+
+The ``doInitialization`` method is allowed to do most of the things that
+``FunctionPass``\ es are not allowed to do.  They can add and remove
+functions, get pointers to functions, etc.  The ``doInitialization`` method is
+designed to do simple initialization type of stuff that does not depend on the
+functions being processed.  The ``doInitialization`` method call is not
+scheduled to overlap with any other pass executions (thus it should be very
+fast).
+
+A good example of how this method should be used is the `LowerAllocations
+<http://llvm.org/doxygen/LowerAllocations_8cpp-source.html>`_ pass.  This pass
+converts ``malloc`` and ``free`` instructions into platform dependent
+``malloc()`` and ``free()`` function calls.  It uses the ``doInitialization``
+method to get a reference to the ``malloc`` and ``free`` functions that it
+needs, adding prototypes to the module if necessary.
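+
+In that spirit, a minimal sketch of the pattern follows; the
+``InsertFreeCalls`` pass is a hypothetical example, and the exact headers and
+the variadic ``getOrInsertFunction`` signature shown are assumptions that vary
+between LLVM versions:
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include "llvm/Module.h"
+  #include "llvm/Constants.h"
+  #include "llvm/DerivedTypes.h"
+  #include "llvm/LLVMContext.h"
+
+  using namespace llvm;
+
+  namespace {
+    struct InsertFreeCalls : public FunctionPass {
+      static char ID;
+      Constant *FreeFunc;
+      InsertFreeCalls() : FunctionPass(ID), FreeFunc(0) {}
+
+      // doInitialization may modify the Module: make sure a prototype for
+      // free() exists before any function is visited.
+      virtual bool doInitialization(Module &M) {
+        LLVMContext &C = M.getContext();
+        FreeFunc = M.getOrInsertFunction("free", Type::getVoidTy(C),
+                                         Type::getInt8PtrTy(C), (Type *)0);
+        return true;
+      }
+
+      virtual bool runOnFunction(Function &F) {
+        // ... transform F, emitting calls through FreeFunc as needed ...
+        return false;
+      }
+    };
+  }
+
+  char InsertFreeCalls::ID = 0;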
+
+.. _writing-an-llvm-pass-runOnFunction:
+
+The ``runOnFunction`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnFunction(Function &F) = 0;
+
+The ``runOnFunction`` method must be implemented by your subclass to do the
+transformation or analysis work of your pass.  As usual, a ``true`` value
+should be returned if the function is modified.
+
+.. _writing-an-llvm-pass-doFinalization-mod:
+
+The ``doFinalization(Module &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doFinalization(Module &M);
+
+The ``doFinalization`` method is an infrequently used method that is called
+when the pass framework has finished calling :ref:`runOnFunction
+<writing-an-llvm-pass-runOnFunction>` for every function in the program being
+compiled.
+
+.. _writing-an-llvm-pass-LoopPass:
+
+The ``LoopPass`` class
+----------------------
+
+All ``LoopPass``\ es execute on each loop in the function independent of all
+of the other loops in the function.  A ``LoopPass`` processes loops in loop
+nest order, such that the outermost loop is processed last.
+
+``LoopPass`` subclasses are allowed to update the loop nest using the
+``LPPassManager`` interface.  Implementing a loop pass is usually
+straightforward.  ``LoopPass``\ es may overload three virtual methods to do
+their work.  All these methods should return ``true`` if they modified the
+program, or ``false`` if they didn't.
+
+The ``doInitialization(Loop *, LPPassManager &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doInitialization(Loop *, LPPassManager &LPM);
+
+The ``doInitialization`` method is designed to do simple initialization type
+of stuff that does not depend on the functions being processed.  The
+``doInitialization`` method call is not scheduled to overlap with any other
+pass executions (thus it should be very fast).  The ``LPPassManager``
+interface should be used to access ``Function`` or ``Module`` level analysis
+information.
+
+.. _writing-an-llvm-pass-runOnLoop:
+
+The ``runOnLoop`` method
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnLoop(Loop *, LPPassManager &LPM) = 0;
+
+The ``runOnLoop`` method must be implemented by your subclass to do the
+transformation or analysis work of your pass.  As usual, a ``true`` value
+should be returned if the loop is modified.  The ``LPPassManager`` interface
+should be used to update the loop nest.
+
+The ``doFinalization()`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doFinalization();
+
+The ``doFinalization`` method is an infrequently used method that is called
+when the pass framework has finished calling :ref:`runOnLoop
+<writing-an-llvm-pass-runOnLoop>` for every loop in the program being
+compiled.
+
+.. _writing-an-llvm-pass-RegionPass:
+
+The ``RegionPass`` class
+------------------------
+
+``RegionPass`` is similar to :ref:`LoopPass <writing-an-llvm-pass-LoopPass>`,
+but executes on each single-entry, single-exit region in the function.
+``RegionPass`` processes regions in nested order such that the outermost
+region is processed last.
+
+``RegionPass`` subclasses are allowed to update the region tree by using the
+``RGPassManager`` interface.  You may overload three virtual methods of
+``RegionPass`` to implement your own region pass.  All these methods should
+return ``true`` if they modified the program, or ``false`` if they did not.
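+
+A minimal skeleton is sketched below; ``PrintRegions`` is a hypothetical
+example, and the assumption is that ``RegionPass`` and ``RGPassManager`` come
+from ``llvm/Analysis/RegionPass.h`` as in LLVM trees of this vintage:
+
+.. code-block:: c++
+
+  #include "llvm/Analysis/RegionPass.h"
+  #include "llvm/Analysis/RegionInfo.h"
+  #include "llvm/Support/raw_ostream.h"
+
+  using namespace llvm;
+
+  namespace {
+    struct PrintRegions : public RegionPass {
+      static char ID;
+      PrintRegions() : RegionPass(ID) {}
+
+      virtual bool runOnRegion(Region *R, RGPassManager &RGM) {
+        // Regions are visited innermost first; just name each one.
+        errs() << "region: " << R->getNameStr() << "\n";
+        return false; // Nothing was modified.
+      }
+    };
+  }
+
+  char PrintRegions::ID = 0;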
+
+The ``doInitialization(Region *, RGPassManager &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doInitialization(Region *, RGPassManager &RGM);
+
+The ``doInitialization`` method is designed to do simple initialization type
+of stuff that does not depend on the functions being processed.  The
+``doInitialization`` method call is not scheduled to overlap with any other
+pass executions (thus it should be very fast).  The ``RGPassManager``
+interface should be used to access ``Function`` or ``Module`` level analysis
+information.
+
+.. _writing-an-llvm-pass-runOnRegion:
+
+The ``runOnRegion`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnRegion(Region *, RGPassManager &RGM) = 0;
+
+The ``runOnRegion`` method must be implemented by your subclass to do the
+transformation or analysis work of your pass.  As usual, a ``true`` value
+should be returned if the region is modified.  The ``RGPassManager``
+interface should be used to update the region tree.
+
+The ``doFinalization()`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doFinalization();
+
+The ``doFinalization`` method is an infrequently used method that is called
+when the pass framework has finished calling :ref:`runOnRegion
+<writing-an-llvm-pass-runOnRegion>` for every region in the program being
+compiled.
+
+.. _writing-an-llvm-pass-BasicBlockPass:
+
+The ``BasicBlockPass`` class
+----------------------------
+
+``BasicBlockPass``\ es are just like :ref:`FunctionPasses
+<writing-an-llvm-pass-FunctionPass>`, except that they must limit their scope
+of inspection and modification to a single basic block at a time.  As such,
+they are **not** allowed to do any of the following:
+
+#. Modify or inspect any basic blocks outside of the current one.
+#. Maintain state across invocations of :ref:`runOnBasicBlock
+   <writing-an-llvm-pass-runOnBasicBlock>`.
+#. Modify the control flow graph (by altering terminator instructions).
+#. Any of the things forbidden for :ref:`FunctionPasses
+   <writing-an-llvm-pass-FunctionPass>`.
+
+``BasicBlockPass``\ es are useful for traditional local and "peephole"
+optimizations.  They may override the same :ref:`doInitialization(Module &)
+<writing-an-llvm-pass-doInitialization-mod>` and :ref:`doFinalization(Module &)
+<writing-an-llvm-pass-doFinalization-mod>` methods that :ref:`FunctionPasses
+<writing-an-llvm-pass-FunctionPass>` have, but also have the following virtual
+methods that may also be implemented:
+
+The ``doInitialization(Function &)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool doInitialization(Function &F);
+
+The ``doInitialization`` method is allowed to do most of the things that
+``BasicBlockPass``\ es are not allowed to do, but that ``FunctionPass``\ es
+can.  The ``doInitialization`` method is designed to do simple initialization
+that does not depend on the ``BasicBlock``\ s being processed.  The
+``doInitialization`` method call is not scheduled to overlap with any other
+pass executions (thus it should be very fast).
+
+.. _writing-an-llvm-pass-runOnBasicBlock:
+
+The ``runOnBasicBlock`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnBasicBlock(BasicBlock &BB) = 0;
+
+Override this function to do the work of the ``BasicBlockPass``.  This
+function is not allowed to inspect or modify basic blocks other than the
+parameter, and is not allowed to modify the CFG.  A ``true`` value must be
+returned if the basic block is modified.
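+
+A minimal sketch of such a pass (``CountInsts`` is a hypothetical example that
+only inspects its block, which is exactly what the restrictions above allow):
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include "llvm/BasicBlock.h"
+  #include "llvm/Support/raw_ostream.h"
+
+  using namespace llvm;
+
+  namespace {
+    struct CountInsts : public BasicBlockPass {
+      static char ID;
+      CountInsts() : BasicBlockPass(ID) {}
+
+      virtual bool runOnBasicBlock(BasicBlock &BB) {
+        // Purely local inspection: only the current block is touched.
+        errs() << BB.getName() << ": " << BB.size() << " instructions\n";
+        return false; // The block was not modified.
+      }
+    };
+  }
+
+  char CountInsts::ID = 0;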
+
+  virtual bool doFinalization(Function &F);
+
+The ``doFinalization`` method is an infrequently used method that is called
+when the pass framework has finished calling :ref:`runOnBasicBlock
+<writing-an-llvm-pass-runOnBasicBlock>` for every ``BasicBlock`` in the program
+being compiled.  This can be used to perform per-function finalization.
+
+The ``MachineFunctionPass`` class
+---------------------------------
+
+A ``MachineFunctionPass`` is a part of the LLVM code generator that executes on
+the machine-dependent representation of each LLVM function in the program.
+
+Code generator passes are registered and initialized specially by
+``TargetMachine::addPassesToEmitFile`` and similar routines, so they cannot
+generally be run from the :program:`opt` or :program:`bugpoint` commands.
+
+A ``MachineFunctionPass`` is also a ``FunctionPass``, so all the restrictions
+that apply to a ``FunctionPass`` also apply to it.  ``MachineFunctionPass``\ es
+also have additional restrictions.  In particular, ``MachineFunctionPass``\ es
+are not allowed to do any of the following:
+
+#. Modify or create any LLVM IR ``Instruction``\ s, ``BasicBlock``\ s,
+   ``Argument``\ s, ``Function``\ s, ``GlobalVariable``\ s,
+   ``GlobalAlias``\ es, or ``Module``\ s.
+#. Modify a ``MachineFunction`` other than the one currently being processed.
+#. Maintain state across invocations of :ref:`runOnMachineFunction
+   <writing-an-llvm-pass-runOnMachineFunction>` (including global data).
+
+.. _writing-an-llvm-pass-runOnMachineFunction:
+
+The ``runOnMachineFunction(MachineFunction &MF)`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual bool runOnMachineFunction(MachineFunction &MF) = 0;
+
+``runOnMachineFunction`` can be considered the main entry point of a
+``MachineFunctionPass``; that is, you should override this method to do the
+work of your ``MachineFunctionPass``.
+
+The ``runOnMachineFunction`` method is called on every ``MachineFunction`` in a
+``Module``, so that the ``MachineFunctionPass`` may perform optimizations on
+the machine-dependent representation of the function.  If you want to get at
+the LLVM ``Function`` for the ``MachineFunction`` you're working on, use
+``MachineFunction``'s ``getFunction()`` accessor method --- but remember, you
+may not modify the LLVM ``Function`` or its contents from a
+``MachineFunctionPass``.
+
+.. _writing-an-llvm-pass-registration:
+
+Pass registration
+-----------------
+
+In the :ref:`Hello World <writing-an-llvm-pass-basiccode>` example pass we
+illustrated how pass registration works, and discussed some of the reasons that
+it is used and what it does.  Here we discuss how and why passes are
+registered.
+
+As we saw above, passes are registered with the ``RegisterPass`` template.
+The template parameter is your pass class.  The first argument is the name of
+the pass as it is used on the command line to specify that the pass should be
+added to a program (for example, with :program:`opt` or :program:`bugpoint`),
+and the second argument is the name of the pass, which is used for the
+:option:`-help` output of programs, as well as for debug output generated by
+the :option:`--debug-pass` option.
+
+If you want your pass to be easily dumpable, you should implement the virtual
+print method:
+
+The ``print`` method
+^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual void print(llvm::raw_ostream &O, const Module *M) const;
+
+The ``print`` method must be implemented by "analyses" in order to print a
+human readable version of the analysis results.  This is useful for debugging
+an analysis itself, as well as for other people to figure out how an analysis
+works.  Use the :program:`opt` ``-analyze`` argument to invoke this method.
+
+The ``llvm::raw_ostream`` parameter specifies the stream to write the results
+on, and the ``Module`` parameter gives a pointer to the top level module of the
+program that has been analyzed.  Note however that this pointer may be ``NULL``
+in certain circumstances (such as calling the ``Pass::dump()`` from a
+debugger), so it should only be used to enhance debug output; it should not be
+depended on.
+
+.. _writing-an-llvm-pass-interaction:
+
+Specifying interactions between passes
+--------------------------------------
+
+One of the main responsibilities of the ``PassManager`` is to make sure that
+passes interact with each other correctly.  Because ``PassManager`` tries to
+:ref:`optimize the execution of passes <writing-an-llvm-pass-passmanager>` it
+must know how the passes interact with each other and what dependencies exist
+between the various passes.  To track this, each pass can declare the set of
+passes that are required to be executed before the current pass, and the passes
+which are invalidated by the current pass.
+
+Typically this functionality is used to require that analysis results are
+computed before your pass is run.  Running arbitrary transformation passes can
+invalidate the computed analysis results, which is what the invalidation set
+specifies.  If a pass does not implement the :ref:`getAnalysisUsage
+<writing-an-llvm-pass-getAnalysisUsage>` method, it defaults to not having any
+prerequisite passes, and invalidating **all** other passes.
+
+.. _writing-an-llvm-pass-getAnalysisUsage:
+
+The ``getAnalysisUsage`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual void getAnalysisUsage(AnalysisUsage &Info) const;
+
+By implementing the ``getAnalysisUsage`` method, the required and invalidated
+sets may be specified for your transformation.  The implementation should fill
+in the `AnalysisUsage
+<http://llvm.org/doxygen/classllvm_1_1AnalysisUsage.html>`_ object with
+information about which passes are required and not invalidated.  To do this, a
+pass may call any of the following methods on the ``AnalysisUsage`` object:
+
+The ``AnalysisUsage::addRequired<>`` and ``AnalysisUsage::addRequiredTransitive<>`` methods
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If your pass requires a previous pass to be executed (an analysis for example),
+it can use one of these methods to arrange for it to be run before your pass.
+LLVM has many different types of analyses and passes that can be required,
+spanning the range from ``DominatorSet`` to ``BreakCriticalEdges``.  Requiring
+``BreakCriticalEdges``, for example, guarantees that there will be no critical
+edges in the CFG when your pass has been run.
+
+Some analyses chain to other analyses to do their job.  For example, an
+`AliasAnalysis <AliasAnalysis.html>`_ implementation is required to :ref:`chain
+<aliasanalysis-chaining>` to other alias analysis passes.  In cases where
+analyses chain, the ``addRequiredTransitive`` method should be used instead of
+the ``addRequired`` method.  This informs the ``PassManager`` that the
+transitively required pass should be alive as long as the requiring pass is.
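+
+For example, an analysis whose results embed references into alias analysis
+results might declare the following (a sketch; the pass name is hypothetical):
+
+.. code-block:: c++
+
+  void MyAliasBasedAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
+    // This pass's results point into alias analysis results, so the alias
+    // analysis must stay alive for as long as this pass is alive.
+    AU.addRequiredTransitive<AliasAnalysis>();
+    AU.setPreservesAll();
+  }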
+
+The ``AnalysisUsage::addPreserved<>`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+One of the jobs of the ``PassManager`` is to optimize how and when analyses are
+run.  In particular, it attempts to avoid recomputing data unless it needs to.
+For this reason, passes are allowed to declare that they preserve (i.e., they
+don't invalidate) an existing analysis if it's available.  For example, a
+simple constant folding pass would not modify the CFG, so it can't possibly
+affect the results of dominator analysis.  By default, all passes are assumed
+to invalidate all others.
+
+The ``AnalysisUsage`` class provides several methods which are useful in
+certain circumstances that are related to ``addPreserved``.  In particular, the
+``setPreservesAll`` method can be called to indicate that the pass does not
+modify the LLVM program at all (which is true for analyses), and the
+``setPreservesCFG`` method can be used by transformations that change
+instructions in the program but do not modify the CFG or terminator
+instructions (note that this property is implicitly set for
+:ref:`BasicBlockPass <writing-an-llvm-pass-BasicBlockPass>`\ es).
+
+``addPreserved`` is particularly useful for transformations like
+``BreakCriticalEdges``.  This pass knows how to update a small set of loop and
+dominator related analyses if they exist, so it can preserve them, despite the
+fact that it hacks on the CFG.
+
+Example implementations of ``getAnalysisUsage``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  // This example modifies the program, but does not modify the CFG
+  void LICM::getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesCFG();
+    AU.addRequired<LoopInfo>();
+  }
+
+.. _writing-an-llvm-pass-getAnalysis:
+
+The ``getAnalysis<>`` and ``getAnalysisIfAvailable<>`` methods
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``Pass::getAnalysis<>`` method is automatically inherited by your class,
+providing you with access to the passes that you declared that you required
+with the :ref:`getAnalysisUsage <writing-an-llvm-pass-getAnalysisUsage>`
+method.  It takes a single template argument that specifies which pass class
+you want, and returns a reference to that pass.  For example:
+
+.. code-block:: c++
+
+  bool LICM::runOnFunction(Function &F) {
+    LoopInfo &LI = getAnalysis<LoopInfo>();
+    // ...
+  }
+
+This method call returns a reference to the pass desired.  You may get a
+runtime assertion failure if you attempt to get an analysis that you did not
+declare as required in your :ref:`getAnalysisUsage
+<writing-an-llvm-pass-getAnalysisUsage>` implementation.  This method can be
+called by your ``run*`` method implementation, or by any other local method
+invoked by your ``run*`` method.
+
+A module level pass can use function level analysis info using this interface.
+For example:
+
+.. code-block:: c++
+
+  bool ModuleLevelPass::runOnModule(Module &M) {
+    // ...
+    // Func is a Function in M whose dominator tree we want to inspect.
+    DominatorTree &DT = getAnalysis<DominatorTree>(Func);
+    // ...
+  }
+
+In the above example, ``runOnFunction`` for ``DominatorTree`` is called by the
+pass manager before returning a reference to the desired pass.
+
+If your pass is capable of updating analyses if they exist (e.g.,
+``BreakCriticalEdges``, as described above), you can use the
+``getAnalysisIfAvailable`` method, which returns a pointer to the analysis if
+it is active.  For example:
+
+.. code-block:: c++
+
+  if (DominatorSet *DS = getAnalysisIfAvailable<DominatorSet>()) {
+    // A DominatorSet is active.  This code will update it.
+  }
+
+Implementing Analysis Groups
+----------------------------
+
+Now that we understand the basics of how passes are defined, how they are used,
+and how they are required from other passes, it's time to get a little bit
+fancier.  All of the pass relationships that we have seen so far are very
+simple: one pass depends on one other specific pass to be run before it can
+run.  For many applications, this is great; for others, more flexibility is
+required.
+
+In particular, some analyses are defined such that there is a single simple
+interface to the analysis results, but multiple ways of calculating them.
+Consider alias analysis for example.  The most trivial alias analysis returns
+"may alias" for any alias query.  The most sophisticated analysis is a
+flow-sensitive, context-sensitive interprocedural analysis that can take a
+significant amount of time to execute (and obviously, there is a lot of room
+between these two extremes for other implementations).  To cleanly support
+situations like this, the LLVM Pass Infrastructure supports the notion of
+Analysis Groups.
+
+Analysis Group Concepts
+^^^^^^^^^^^^^^^^^^^^^^^
+
+An Analysis Group is a single simple interface that may be implemented by
+multiple different passes.  Analysis Groups can be given human readable names
+just like passes, but unlike passes, they need not derive from the ``Pass``
+class.  An analysis group may have one or more implementations, one of which is
+the "default" implementation.
+
+Analysis groups are used by client passes just like other passes are: through
+the ``AnalysisUsage::addRequired()`` and ``Pass::getAnalysis()`` methods.  In
+order to resolve this requirement, the :ref:`PassManager
+<writing-an-llvm-pass-passmanager>` scans the available passes to see if any
+implementations of the analysis group are available.  If none is available, the
+default implementation is created for the pass to use.  All standard rules for
+:ref:`interaction between passes <writing-an-llvm-pass-interaction>` still
+apply.
+
+Although :ref:`Pass Registration <writing-an-llvm-pass-registration>` is
+optional for normal passes, all analysis group implementations must be
+registered, and must use the :ref:`INITIALIZE_AG_PASS
+<writing-an-llvm-pass-RegisterAnalysisGroup>` template to join the
+implementation pool.  Also, a default implementation of the interface **must**
+be registered with :ref:`RegisterAnalysisGroup
+<writing-an-llvm-pass-RegisterAnalysisGroup>`.
+
+As a concrete example of an Analysis Group in action, consider the
+`AliasAnalysis <http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`_
+analysis group.  The default implementation of the alias analysis interface
+(the `basicaa <http://llvm.org/doxygen/structBasicAliasAnalysis.html>`_ pass)
+just does a few simple checks that don't require significant analysis to
+compute (such as: two different globals can never alias each other, etc).
+Passes that use the `AliasAnalysis
+<http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`_ interface (for
+example the `gcse <http://llvm.org/doxygen/structGCSE.html>`_ pass), do not
+care which implementation of alias analysis is actually provided, they just use
+the designated interface.
+
+From the user's perspective, commands work just like normal.  Issuing the
+command ``opt -gcse ...`` will cause the ``basicaa`` class to be instantiated
+and added to the pass sequence.  Issuing the command ``opt -somefancyaa -gcse
+...`` will cause the ``gcse`` pass to use the ``somefancyaa`` alias analysis
+(which doesn't actually exist, it's just a hypothetical example) instead.
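+
+Inside a client pass, requiring and querying the analysis group looks exactly
+like it does for any other analysis; a minimal sketch (the client pass name is
+hypothetical):
+
+.. code-block:: c++
+
+  void MyClientPass::getAnalysisUsage(AnalysisUsage &AU) const {
+    // Some implementation of the group (at least the default) will be run.
+    AU.addRequired<AliasAnalysis>();
+  }
+
+  bool MyClientPass::runOnFunction(Function &F) {
+    AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+    // ... issue AA.alias(...) queries against the designated interface ...
+    return false;
+  }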
+
+.. _writing-an-llvm-pass-RegisterAnalysisGroup:
+
+Using ``RegisterAnalysisGroup``
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``RegisterAnalysisGroup`` template is used to register the analysis group
+itself, while the ``INITIALIZE_AG_PASS`` is used to add pass implementations to
+the analysis group.  First, an analysis group should be registered, with a
+human readable name provided for it.  Unlike registration of passes, there is
+no command line argument to be specified for the Analysis Group Interface
+itself, because it is "abstract":
+
+.. code-block:: c++
+
+  static RegisterAnalysisGroup<AliasAnalysis> A("Alias Analysis");
+
+Once the analysis is registered, passes can declare that they are valid
+implementations of the interface by using the following code:
+
+.. code-block:: c++
+
+  namespace {
+    // Declare that we implement the AliasAnalysis interface
+    INITIALIZE_AG_PASS(FancyAA, AliasAnalysis, "somefancyaa",
+                       "A more complex alias analysis implementation",
+                       false,  // Is CFG Only?
+                       true,   // Is Analysis?
+                       false); // Is default Analysis Group implementation?
+  }
+
+This just shows a class ``FancyAA`` that uses the ``INITIALIZE_AG_PASS`` macro
+both to register and to "join" the `AliasAnalysis
+<http://llvm.org/doxygen/classllvm_1_1AliasAnalysis.html>`_ analysis group.
+Every implementation of an analysis group should join using this macro.
+
+.. code-block:: c++
+
+  namespace {
+    // Declare that we implement the AliasAnalysis interface
+    INITIALIZE_AG_PASS(BasicAA, AliasAnalysis, "basicaa",
+                       "Basic Alias Analysis (default AA impl)",
+                       false, // Is CFG Only?
+                       true,  // Is Analysis?
+                       true); // Is default Analysis Group implementation?
+  }
+
+Here we show how the default implementation is specified (using the final
+argument to the ``INITIALIZE_AG_PASS`` template).  There must be exactly one
+default implementation available at all times for an Analysis Group to be used.
+Only the default implementation can derive from ``ImmutablePass``.  Here we
+declare that the `BasicAliasAnalysis
+<http://llvm.org/doxygen/structBasicAliasAnalysis.html>`_ pass is the default
+implementation for the interface.
+
+Pass Statistics
+===============
+
+The `Statistic <http://llvm.org/doxygen/Statistic_8h-source.html>`_ class is
+designed to be an easy way to expose various success metrics from passes.
+These statistics are printed at the end of a run, when the :option:`-stats`
+command line option is enabled on the command line.  See the :ref:`Statistics
+section <Statistic>` in the Programmer's Manual for details.
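+
+As a quick sketch (the counter name here is hypothetical), a pass defines a
+statistic once and increments it wherever the transformation actually fires:
+
+.. code-block:: c++
+
+  #define DEBUG_TYPE "mypassname"  // must be defined before Statistic.h
+  #include "llvm/ADT/Statistic.h"
+
+  STATISTIC(NumTransformed, "Number of instructions transformed");
+
+  // ... then, inside the pass's run method:
+  //   ++NumTransformed;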
+
+.. _writing-an-llvm-pass-passmanager:
+
+What PassManager does
+---------------------
+
+The `PassManager <http://llvm.org/doxygen/PassManager_8h-source.html>`_ `class
+<http://llvm.org/doxygen/classllvm_1_1PassManager.html>`_ takes a list of
+passes, ensures their :ref:`prerequisites <writing-an-llvm-pass-interaction>`
+are set up correctly, and then schedules passes to run efficiently.  All of the
+LLVM tools that run passes use the PassManager for execution of these passes.
+
+The PassManager does two main things to try to reduce the execution time of a
+series of passes:
+
+#. **Share analysis results.**  The ``PassManager`` attempts to avoid
+   recomputing analysis results as much as possible.  This means keeping track
+   of which analyses are available already, which analyses get invalidated, and
+   which analyses need to be run for a pass.  An important part of this work
+   is that the ``PassManager`` tracks the exact lifetime of all analysis
+   results, allowing it to :ref:`free memory
+   <writing-an-llvm-pass-releaseMemory>` allocated to holding analysis results
+   as soon as they are no longer needed.
+
+#. **Pipeline the execution of passes on the program.**  The ``PassManager``
+   attempts to get better cache and memory usage behavior out of a series of
+   passes by pipelining the passes together.  This means that, given a series
+   of consecutive :ref:`FunctionPasses <writing-an-llvm-pass-FunctionPass>`, it
+   will execute all of the :ref:`FunctionPasses
+   <writing-an-llvm-pass-FunctionPass>` on the first function, then all of the
+   :ref:`FunctionPasses <writing-an-llvm-pass-FunctionPass>` on the second
+   function, etc... until the entire program has been run through the passes.
+
+   This improves the cache behavior of the compiler, because it is only
+   touching the LLVM program representation for a single function at a time,
+   instead of traversing the entire program.  It reduces the memory consumption
+   of the compiler, because, for example, only one `DominatorSet
+   <http://llvm.org/doxygen/classllvm_1_1DominatorSet.html>`_ needs to be
+   calculated at a time.  This also makes it possible to implement some
+   :ref:`interesting enhancements <writing-an-llvm-pass-SMP>` in the future.
+
+The effectiveness of the ``PassManager`` is influenced directly by how much
+information it has about the behaviors of the passes it is scheduling.  For
+example, the "preserved" set is intentionally conservative in the face of an
+unimplemented :ref:`getAnalysisUsage <writing-an-llvm-pass-getAnalysisUsage>`
+method.  Not implementing it when it should be implemented will have the effect
+of not allowing any analysis results to live across the execution of your pass.
+
+The ``PassManager`` class exposes a ``--debug-pass`` command line option that
+is useful for debugging pass execution, seeing how things work, and diagnosing
+when you should be preserving more analyses than you currently are.  (To get
+information about all of the variants of the ``--debug-pass`` option, just type
+"``opt -help-hidden``").
+
+By using the ``--debug-pass=Structure`` option, for example, we can see how our
+:ref:`Hello World <writing-an-llvm-pass-basiccode>` pass interacts with other
+passes.  Let's try it out with the ``gcse`` and ``licm`` passes:
+
+.. code-block:: console
+
+  $ opt -load ../../../Debug+Asserts/lib/Hello.so -gcse -licm --debug-pass=Structure < hello.bc > /dev/null
+  Module Pass Manager
+    Function Pass Manager
+      Dominator Set Construction
+      Immediate Dominators Construction
+      Global Common Subexpression Elimination
+  --  Immediate Dominators Construction
+  --  Global Common Subexpression Elimination
+      Natural Loop Construction
+      Loop Invariant Code Motion
+  --  Natural Loop Construction
+  --  Loop Invariant Code Motion
+      Module Verifier
+  --  Dominator Set Construction
+  --  Module Verifier
+    Bitcode Writer
+  --Bitcode Writer
+
+This output shows us when passes are constructed and when the analysis results
+are known to be dead (prefixed with "``--``").  Here we see that GCSE uses
+dominator and immediate dominator information to do its job.  The LICM pass
+uses natural loop information, which uses dominator sets, but not immediate
+dominators.  Because immediate dominators are no longer useful after the GCSE
+pass, it is immediately destroyed.  The dominator sets are then reused to
+compute natural loop information, which is then used by the LICM pass.
+
+After the LICM pass, the module verifier runs (which is automatically added by
+the :program:`opt` tool), which uses the dominator set to check that the
+resultant LLVM code is well formed.  After it finishes, the dominator set
+information is destroyed, having been computed once and shared by three passes.
+
+Let's see how this changes when we run the :ref:`Hello World
+<writing-an-llvm-pass-basiccode>` pass in between the two passes:
+
+.. code-block:: console
+
+  $ opt -load ../../../Debug+Asserts/lib/Hello.so -gcse -hello -licm --debug-pass=Structure < hello.bc > /dev/null
+  Module Pass Manager
+    Function Pass Manager
+      Dominator Set Construction
+      Immediate Dominators Construction
+      Global Common Subexpression Elimination
+  --  Dominator Set Construction
+  --  Immediate Dominators Construction
+  --  Global Common Subexpression Elimination
+      Hello World Pass
+  --  Hello World Pass
+      Dominator Set Construction
+      Natural Loop Construction
+      Loop Invariant Code Motion
+  --  Natural Loop Construction
+  --  Loop Invariant Code Motion
+      Module Verifier
+  --  Dominator Set Construction
+  --  Module Verifier
+    Bitcode Writer
+  --Bitcode Writer
+  Hello: __main
+  Hello: puts
+  Hello: main
+
+Here we see that the :ref:`Hello World <writing-an-llvm-pass-basiccode>` pass
+has killed the Dominator Set pass, even though it doesn't modify the code at
+all!  To fix this, we need to add the following :ref:`getAnalysisUsage
+<writing-an-llvm-pass-getAnalysisUsage>` method to our pass:
+
+.. code-block:: c++
+
+  // We don't modify the program, so we preserve all analyses
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.setPreservesAll();
+  }
+
+Now when we run our pass, we get this output:
+
+.. code-block:: console
+
+  $ opt -load ../../../Debug+Asserts/lib/Hello.so -gcse -hello -licm --debug-pass=Structure < hello.bc > /dev/null
+  Pass Arguments:  -gcse -hello -licm
+  Module Pass Manager
+    Function Pass Manager
+      Dominator Set Construction
+      Immediate Dominators Construction
+      Global Common Subexpression Elimination
+  --  Immediate Dominators Construction
+  --  Global Common Subexpression Elimination
+      Hello World Pass
+  --  Hello World Pass
+      Natural Loop Construction
+      Loop Invariant Code Motion
+  --  Loop Invariant Code Motion
+  --  Natural Loop Construction
+      Module Verifier
+  --  Dominator Set Construction
+  --  Module Verifier
+    Bitcode Writer
+  --Bitcode Writer
+  Hello: __main
+  Hello: puts
+  Hello: main
+
+Which shows that we don't accidentally invalidate dominator information
+anymore, and therefore do not have to compute it twice.
+
+.. _writing-an-llvm-pass-releaseMemory:
+
+The ``releaseMemory`` method
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c++
+
+  virtual void releaseMemory();
+
+The ``PassManager`` automatically determines when to compute analysis results,
+and how long to keep them around for.  Because the lifetime of the pass object
+itself is effectively the entire duration of the compilation process, we need
+some way to free analysis results when they are no longer useful.  The
+``releaseMemory`` virtual method is the way to do this.
+
+If you are writing an analysis or any other pass that retains a significant
+amount of state (for use by another pass which "requires" your pass and uses
+the :ref:`getAnalysis <writing-an-llvm-pass-getAnalysis>` method) you should
+implement ``releaseMemory`` to, well, release the memory allocated to maintain
+this internal state.  This method is called after the ``run*`` method for the
+class, before the next call of ``run*`` in your pass.
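+
+A sketch of what this typically looks like for an analysis that caches
+expensive per-function results (all names here are hypothetical):
+
+.. code-block:: c++
+
+  #include "llvm/Pass.h"
+  #include <vector>
+  using namespace llvm;
+
+  namespace {
+    struct MyAnalysis : public FunctionPass {
+      static char ID;
+      MyAnalysis() : FunctionPass(ID) {}
+
+      std::vector<unsigned> Results;  // expensive state built per function
+
+      virtual bool runOnFunction(Function &F) {
+        // ... populate Results for passes that getAnalysis<MyAnalysis>() ...
+        return false;
+      }
+
+      virtual void releaseMemory() {
+        Results.clear();  // free the state once no pass requires it anymore
+      }
+    };
+  }
+
+  char MyAnalysis::ID = 0;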
+
+Registering dynamically loaded passes
+=====================================
+
+*Size matters* when constructing production quality tools using LLVM, both for
+the purposes of distribution, and for regulating the resident code size when
+running on the target system.  Therefore, it becomes desirable to selectively
+use some passes, while omitting others, and to maintain the flexibility to
+change configurations later on.  You want to be able to do all this, and
+provide feedback to the user.  This is where pass registration comes into
+play.
+
+The fundamental mechanisms for pass registration are the
+``MachinePassRegistry`` class and subclasses of ``MachinePassRegistryNode``.
+
+An instance of ``MachinePassRegistry`` is used to maintain a list of
+``MachinePassRegistryNode`` objects.  This instance maintains the list and
+communicates additions and deletions to the command line interface.
+
+An instance of a ``MachinePassRegistryNode`` subclass is used to maintain
+information provided about a particular pass.  This information includes the
+command line name, the command help string and the address of the function used
+to create an instance of the pass.  A global static constructor of one of these
+instances *registers* with a corresponding ``MachinePassRegistry``; the static
+destructor *unregisters*.  Thus a pass that is statically linked in the tool
+will be registered at start up.  A dynamically loaded pass will register on
+load and unregister at unload.
+
+Using existing registries
+-------------------------
+
+There are predefined registries to track instruction scheduling
+(``RegisterScheduler``) and register allocation (``RegisterRegAlloc``) machine
+passes.  Here we will describe how to *register* a register allocator machine
+pass.
+
+Implement your register allocator machine pass.  In your register allocator
+``.cpp`` file add the following include:
+
+.. code-block:: c++
+
+  #include "llvm/CodeGen/RegAllocRegistry.h"
+
+Also in your register allocator ``.cpp`` file, define a creator function in the
+form:
+
+.. code-block:: c++
+
+  FunctionPass *createMyRegisterAllocator() {
+    return new MyRegisterAllocator();
+  }
+
+Note that the signature of this function should match the type of
+``RegisterRegAlloc::FunctionPassCtor``.  In the same file add the "installing"
+declaration, in the form:
+
+.. code-block:: c++
+
+  static RegisterRegAlloc myRegAlloc("myregalloc",
+                                     "  my register allocator help string",
+                                     createMyRegisterAllocator);
+
+Note that the two spaces prior to the help string produce a tidy result on the
+:option:`-help` query.
+
+.. code-block:: console
+
+  $ llc -help
+    ...
+    -regalloc             - Register allocator to use (default=linearscan)
+      =linearscan         -   linear scan register allocator
+      =local              -   local register allocator
+      =simple             -   simple register allocator
+      =myregalloc         -   my register allocator help string
+    ...
+
+And that's it.  The user is now free to use ``-regalloc=myregalloc`` as an
+option.  Registering instruction schedulers is similar except use the
+``RegisterScheduler`` class.  Note that the
+``RegisterScheduler::FunctionPassCtor`` is significantly different from
+``RegisterRegAlloc::FunctionPassCtor``.
+
+To force the load/linking of your register allocator into the
+:program:`llc`/:program:`lli` tools, add your creator function's global
+declaration to ``Passes.h`` and add a "pseudo" call line to
+``llvm/CodeGen/LinkAllCodegenComponents.h``.
+
+Creating new registries
+-----------------------
+
+The easiest way to get started is to clone one of the existing registries; we
+recommend ``llvm/CodeGen/RegAllocRegistry.h``.  The key things to modify are
+the class name and the ``FunctionPassCtor`` type.
+
+Then you need to declare the registry.  Example: if your pass registry is
+``RegisterMyPasses`` then define:
+
+.. code-block:: c++
+
+  MachinePassRegistry RegisterMyPasses::Registry;
+
+And finally, declare the command line option for your passes.  Example:
+
+.. code-block:: c++
+
+  cl::opt<RegisterMyPasses::FunctionPassCtor, false,
+          RegisterPassParser<RegisterMyPasses> >
+  MyPassOpt("mypass",
+            cl::init(&createDefaultMyPass),
+            cl::desc("my pass option help"));
+
+Here the command option is "``mypass``", with ``createDefaultMyPass`` as the
+default creator.
+
+Using GDB with dynamically loaded passes
+----------------------------------------
+
+Unfortunately, using GDB with dynamically loaded passes is not as easy as it
+should be.  First of all, you can't set a breakpoint in a shared object that
+has not been loaded yet, and second of all there are problems with inlined
+functions in shared objects.  Here are some suggestions for debugging your
+pass with GDB.
+
+For the sake of discussion, I'm going to assume that you are debugging a
+transformation invoked by :program:`opt`, although nothing described here
+depends on that.
+
+Setting a breakpoint in your pass
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The first thing you do is start :program:`gdb` on the :program:`opt` process:
+
+.. code-block:: console
+
+  $ gdb opt
+  GNU gdb 5.0
+  Copyright 2000 Free Software Foundation, Inc.
+  GDB is free software, covered by the GNU General Public License, and you are
+  welcome to change it and/or distribute copies of it under certain conditions.
+  Type "show copying" to see the conditions.
+  There is absolutely no warranty for GDB.  Type "show warranty" for details.
+  This GDB was configured as "sparc-sun-solaris2.6"...
+  (gdb)
+
+Note that :program:`opt` has a lot of debugging information in it, so it takes
+time to load.  Be patient.  Since we cannot set a breakpoint in our pass yet
+(the shared object isn't loaded until runtime), we must execute the process,
+and have it stop before it invokes our pass, but after it has loaded the shared
+object.  The most foolproof way of doing this is to set a breakpoint in
+``PassManager::run`` and then run the process with the arguments you want:
+
+.. code-block:: console
+
+  (gdb) break llvm::PassManager::run
+  Breakpoint 1 at 0x2413bc: file Pass.cpp, line 70.
+  (gdb) run test.bc -load $(LLVMTOP)/llvm/Debug+Asserts/lib/[libname].so -[passoption]
+  Starting program: opt test.bc -load $(LLVMTOP)/llvm/Debug+Asserts/lib/[libname].so -[passoption]
+  Breakpoint 1, PassManager::run (this=0xffbef174, M=@0x70b298) at Pass.cpp:70
+  70      bool PassManager::run(Module &M) { return PM->run(M); }
+  (gdb)
+
+Once :program:`opt` stops in the ``PassManager::run`` method you are now free
+to set breakpoints in your pass so that you can trace through execution or do
+other standard debugging stuff.
+
+Miscellaneous Problems
+^^^^^^^^^^^^^^^^^^^^^^
+
+Once you have the basics down, there are a couple of problems that GDB has,
+some with solutions, some without.
+
+* Inline functions have bogus stack information.  In general, GDB does a pretty
+  good job getting stack traces and stepping through inline functions.  When a
+  pass is dynamically loaded however, it somehow completely loses this
+  capability.  The only solution I know of is to de-inline a function (move it
+  from the body of a class to a ``.cpp`` file).
+
+* Restarting the program breaks breakpoints.  After following the information
+  above, you have succeeded in getting some breakpoints planted in your pass.
+  Next thing you know, you restart the program (i.e., you type "``run``" again),
+  and you start getting errors about breakpoints being unsettable.  The only
+  way I have found to "fix" this problem is to delete the breakpoints that are
+  already set in your pass, run the program, and re-set the breakpoints once
+  execution stops in ``PassManager::run``.
+
+Hopefully these tips will help with common case debugging situations.  If you'd
+like to contribute some tips of your own, just contact `Chris
+<mailto:sabre@nondot.org>`_.
+
+Future extensions planned
+-------------------------
+
+Although the LLVM Pass Infrastructure is very capable as it stands, and does
+some nifty stuff, there are things we'd like to add in the future.  Here is
+where we are going:
+
+.. _writing-an-llvm-pass-SMP:
+
+Multithreaded LLVM
+^^^^^^^^^^^^^^^^^^
+
+Multiple CPU machines are becoming more common and compilation can never be
+fast enough: obviously we should allow for a multithreaded compiler.  Because
+of the semantics defined for passes above (specifically they cannot maintain
+state across invocations of their ``run*`` methods), a nice clean way to
+implement a multithreaded compiler would be for the ``PassManager`` class to
+create multiple instances of each pass object, and allow the separate instances
+to be hacking on different parts of the program at the same time.
+
+This implementation would prevent each of the passes from having to implement
+multithreaded constructs, requiring only the LLVM core to have locking in a few
+places (for global resources).  Although this is a simple extension, we simply
+haven't had time (or multiprocessor machines, thus a reason) to implement this.
+Despite that, we have kept the LLVM passes SMP ready, and you should too.
+
diff --git a/docs/YamlIO.rst b/docs/YamlIO.rst
new file mode 100644
index 0000000000..b009b67ef4
--- /dev/null
+++ b/docs/YamlIO.rst
@@ -0,0 +1,862 @@
+.. _yamlio:
+
+=====================
+YAML I/O
+=====================
+
+.. contents::
+   :local:
+
+Introduction to YAML
+====================
+
+YAML is a human readable data serialization language.  The full YAML language
+spec can be read at `yaml.org
+<http://www.yaml.org/spec/1.2/spec.html#Introduction>`_.  The simplest form of
+YAML is just "scalars", "mappings", and "sequences".  A scalar is any number
+or string.  The pound/hash symbol (#) begins a comment line.  A mapping is
+a set of key-value pairs where the key ends with a colon.  For example:
+
+.. code-block:: yaml
+
+  # a mapping
+  name: Tom
+  hat-size: 7
+
+A sequence is a list of items where each item starts with a leading dash ('-').
+For example:
+
+.. code-block:: yaml
+
+  # a sequence
+  - x86
+  - x86_64
+  - PowerPC
+
+You can combine mappings and sequences by indenting.  For example, a sequence
+of mappings in which one of the mapping values is itself a sequence:
+
+.. code-block:: yaml
+
+  # a sequence of mappings with one key's value being a sequence
+  - name: Tom
+    cpus:
+      - x86
+      - x86_64
+  - name: Bob
+    cpus:
+      - x86
+  - name: Dan
+    cpus:
+      - PowerPC
+      - x86
+
+Sometimes sequences are known to be short and one entry per line is too
+verbose, so YAML offers an alternate syntax for sequences called a "Flow
+Sequence" in which you put comma separated sequence elements into square
+brackets.  The above example could then be simplified to:
+
+.. code-block:: yaml
+
+  # a sequence of mappings with one key's value being a flow sequence
+  - name: Tom
+    cpus: [ x86, x86_64 ]
+  - name: Bob
+    cpus: [ x86 ]
+  - name: Dan
+    cpus: [ PowerPC, x86 ]
+
+Introduction to YAML I/O
+========================
+
+The use of indenting makes the YAML easy for a human to read and understand,
+but having a program read and write YAML involves a lot of tedious details.
+The YAML I/O library structures and simplifies reading and writing YAML
+documents.
+
+YAML I/O assumes you have some "native" data structures which you want to be
+able to dump as YAML and recreate from YAML.  The first step is to try
+writing example YAML for your data structures.  You may find after looking at
+possible YAML representations that a direct mapping of your data structures
+to YAML is not very readable.  Often the fields are not in the order that
+a human would find readable.  Or the same information is replicated in multiple
+locations, making it hard for a human to write such YAML correctly.
+
+In relational database theory there is a design step called normalization in
+which you reorganize fields and tables.  The same considerations need to
+go into the design of your YAML encoding.  But you may not want to change
+your existing native data structures.  Therefore, when writing out YAML
+there may be a normalization step, and when reading YAML there would be a
+corresponding denormalization step.
+
+YAML I/O uses a non-invasive, traits based design.  YAML I/O defines some
+abstract base templates.  You specialize those templates on your data types.
+For instance, if you have an enumerated type FooBar you could specialize
+ScalarEnumerationTraits on that type and define the enumeration() method:
+
+.. code-block:: c++
+
+  using llvm::yaml::ScalarEnumerationTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct ScalarEnumerationTraits<FooBar> {
+    static void enumeration(IO &io, FooBar &value) {
+      ...
+    }
+  };
+
+As with all YAML I/O template specializations, the ScalarEnumerationTraits
+specialization is used for both reading and writing YAML.  That is, the mapping
+between in-memory enum values and the YAML string representation is in only one
+place.  This assures that the code for writing and parsing of YAML stays in
+sync.
+
+To specify a YAML mapping, you define a specialization on
+llvm::yaml::MappingTraits.
+If your native data structure happens to be a struct that is already normalized,
+then the specialization is simple.  For example:
+
+.. code-block:: c++
+
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct MappingTraits<Person> {
+    static void mapping(IO &io, Person &info) {
+      io.mapRequired("name", info.name);
+      io.mapOptional("hat-size", info.hatSize);
+    }
+  };
+
+A YAML sequence is automatically inferred if your data type has begin()/end()
+iterators and a push_back() method.  Therefore any of the STL containers
+(such as std::vector<>) will automatically translate to YAML sequences.
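+
+The Person examples in this document assume a native struct along these lines
+(the exact definition is an assumption; the field types are chosen from the
+built-in scalar support listed below):
+
+.. code-block:: c++
+
+  #include "llvm/ADT/StringRef.h"
+  #include <stdint.h>
+
+  // Hypothetical native type used by the Person examples.
+  struct Person {
+    llvm::StringRef name;
+    uint32_t hatSize;
+  };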
+
+Once you have defined specializations for your data types, you can
+programmatically use YAML I/O to write a YAML document:
+
+.. code-block:: c++
+
+  using llvm::yaml::Output;
+
+  Person tom;
+  tom.name = "Tom";
+  tom.hatSize = 8;
+  Person dan;
+  dan.name = "Dan";
+  dan.hatSize = 7;
+  std::vector<Person> persons;
+  persons.push_back(tom);
+  persons.push_back(dan);
+
+  Output yout(llvm::outs());
+  yout << persons;
+
+This would write the following:
+
+.. code-block:: yaml
+
+  - name: Tom
+    hat-size: 8
+  - name: Dan
+    hat-size: 7
+
+And you can also read such YAML documents with the following code:
+
+.. code-block:: c++
+
+  using llvm::yaml::Input;
+
+  typedef std::vector<Person> PersonList;
+  std::vector<PersonList> docs;
+
+  // "document" is an llvm::MemoryBuffer holding the YAML file contents.
+  Input yin(document.getBuffer());
+  yin >> docs;
+
+  if ( yin.error() )
+    return;
+
+  // Process read document
+  for ( PersonList &pl : docs ) {
+    for ( Person &person : pl ) {
+      cout << "name=" << person.name;
+    }
+  }
+
+One other feature of YAML is the ability to define multiple documents in a
+single file.  That is why reading YAML produces a vector of your document type.
+
+Error Handling
+==============
+
+When parsing a YAML document, if the input does not match your schema (as
+expressed in your XxxTraits<> specializations), YAML I/O will print out an
+error message and your Input object's error() method will return true.  For
+instance the following document:
+
+.. code-block:: yaml
+
+  - name: Tom
+    shoe-size: 12
+  - name: Dan
+    hat-size: 7
+
+has a key (shoe-size) that is not defined in the schema.  YAML I/O will
+automatically generate this error:
+
+.. code-block:: yaml
+
+  YAML:2:2: error: unknown key 'shoe-size'
+    shoe-size:       12
+    ^~~~~~~~~
+
+Similar errors are produced for other input not conforming to the schema.
+
+Scalars
+=======
+
+YAML scalars are just strings (i.e. not a sequence or mapping).  The YAML I/O
+library provides support for translating between YAML scalars and specific
+C++ types.
+
+Built-in types
+--------------
+The following types have built-in support in YAML I/O:
+
+* bool
+* float
+* double
+* StringRef
+* int64_t
+* int32_t
+* int16_t
+* int8_t
+* uint64_t
+* uint32_t
+* uint16_t
+* uint8_t
+
+That is, you can use those types in fields of MappingTraits specializations or
+as the element type of a sequence.  When reading, YAML I/O will validate that
+the string found is convertible to that type and error out if not.
+
+Unique types
+------------
+Given that YAML I/O is trait based, the selection of how to convert your data
+to YAML is based on the type of your data.  But in C++ type matching, typedefs
+do not generate unique type names.  That means if you have two typedefs of
+unsigned int, to YAML I/O both types look exactly like unsigned int.  To
+facilitate making unique type names, YAML I/O provides a macro which is used
+like a typedef on built-in types, but expands to create a class with conversion
+operators to and from the base type.  For example:
+
+.. code-block:: c++
+
+  LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyFooFlags)
+  LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyBarFlags)
+
+This generates two classes MyFooFlags and MyBarFlags which you can use in your
+native data structures instead of uint32_t.  They are implicitly
+converted to and from uint32_t.  The point of creating these unique types
+is that you can now specify traits on them to get different YAML conversions.
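+
+For instance (struct and field names hypothetical), two wrapped fields can
+carry the same underlying integer yet be converted differently:
+
+.. code-block:: c++
+
+  LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyFooFlags)
+  LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyBarFlags)
+
+  // Both fields hold a uint32_t, but because the wrapper types are
+  // distinct, each can be given its own traits specialization.
+  struct Gadget {
+    MyFooFlags fooFlags;  // e.g. could be converted as a bit set
+    MyBarFlags barFlags;  // e.g. could be converted as plain decimal
+  };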
+
+Hex types
+---------
+An example use of a unique type is that YAML I/O provides fixed sized unsigned
+integers that are written with YAML I/O as hexadecimal instead of the decimal
+format used by the built-in integer types:
+
+* Hex64
+* Hex32
+* Hex16
+* Hex8
+
+You can use llvm::yaml::Hex32 instead of uint32_t and the only difference will
+be that when YAML I/O writes out that type it will be formatted in hexadecimal.
+
+ScalarEnumerationTraits
+-----------------------
+YAML I/O supports translating between in-memory enumerations and a set of
+string values in YAML documents.  This is done by specializing
+ScalarEnumerationTraits<> on your enumeration type and defining an
+enumeration() method.
+For instance, suppose you had an enumeration of CPUs and a struct with it as
+a field:
+
+.. code-block:: c++
+
+  enum CPUs {
+    cpu_x86_64 = 5,
+    cpu_x86 = 7,
+    cpu_PowerPC = 8
+  };
+
+  struct Info {
+    CPUs cpu;
+    uint32_t flags;
+  };
+
+To support reading and writing of this enumeration, you can define a
+ScalarEnumerationTraits specialization on CPUs, which can then be used
+as a field type:
+
+.. code-block:: c++
+
+  using llvm::yaml::ScalarEnumerationTraits;
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct ScalarEnumerationTraits<CPUs> {
+    static void enumeration(IO &io, CPUs &value) {
+      io.enumCase(value, "x86_64", cpu_x86_64);
+      io.enumCase(value, "x86", cpu_x86);
+      io.enumCase(value, "PowerPC", cpu_PowerPC);
+    }
+  };
+
+  template <>
+  struct MappingTraits<Info> {
+    static void mapping(IO &io, Info &info) {
+      io.mapRequired("cpu", info.cpu);
+      io.mapOptional("flags", info.flags, 0);
+    }
+  };
+
+When reading YAML, if the string found does not match any of the strings
+specified by enumCase() methods, an error is automatically generated.
+When writing YAML, if the value being written does not match any of the values
+specified by the enumCase() methods, a runtime assertion is triggered.
+
+BitValue
+--------
+Another common data structure in C++ is a field where each bit has a unique
+meaning.  This is often used in a "flags" field.  YAML I/O has support for
+converting such fields to a flow sequence.  For instance suppose you
+had the following bit flags defined:
+
+.. code-block:: c++
+
+  enum {
+    flagsPointy = 1,
+    flagsHollow = 2,
+    flagsFlat = 4,
+    flagsRound = 8
+  };
+
+  LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyFlags)
+
+To support reading and writing of MyFlags, you specialize ScalarBitSetTraits<>
+on MyFlags and provide the bit values and their names.
+
+.. code-block:: c++
+
+  using llvm::yaml::ScalarBitSetTraits;
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct ScalarBitSetTraits<MyFlags> {
+    static void bitset(IO &io, MyFlags &value) {
+      io.bitSetCase(value, "hollow", flagsHollow);
+      io.bitSetCase(value, "flat", flagsFlat);
+      io.bitSetCase(value, "round", flagsRound);
+      io.bitSetCase(value, "pointy", flagsPointy);
+    }
+  };
+
+  struct Info {
+    StringRef name;
+    MyFlags flags;
+  };
+
+  template <>
+  struct MappingTraits<Info> {
+    static void mapping(IO &io, Info &info) {
+      io.mapRequired("name", info.name);
+      io.mapRequired("flags", info.flags);
+    }
+  };
+
+With the above, YAML I/O (when writing) will mask each value in the bitset
+trait against the flags field, and each one that matches will cause the
+corresponding string to be added to the flow sequence.  The opposite is done
+when reading, and any unknown string values will result in an error.  With
+the above schema, a sample valid YAML document is:
+
+.. code-block:: yaml
+
+  name: Tom
+  flags: [ pointy, flat ]
+
+Custom Scalar
+-------------
+Sometimes for readability a scalar needs to be formatted in a custom way.  For
+instance your internal data structure may use an integer for time (seconds
+since some epoch), but in YAML it would be much nicer to express that integer
+in some time format (e.g. 4-May-2012 10:30pm).  YAML I/O has a way to support
+custom formatting and parsing of scalar types by specializing ScalarTraits<> on
+your data type.  When writing, YAML I/O will provide the native type and
+your specialization must create a temporary llvm::StringRef.  When reading,
+YAML I/O will provide an llvm::StringRef of the scalar and your specialization
+must convert that to your native data type.  An outline of a custom scalar type
+looks like:
+
+.. code-block:: c++
+
+  using llvm::yaml::ScalarTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct ScalarTraits<MyCustomType> {
+    static void output(const MyCustomType &value, llvm::raw_ostream &out) {
+      out << value;  // do custom formatting here
+    }
+    static StringRef input(StringRef scalar, MyCustomType &value) {
+      // do custom parsing here.  Return the empty string on success,
+      // or an error message on failure.
+      return StringRef();
+    }
+  };
+
+Mappings
+========
+
+To be translated to or from a YAML mapping for your type T you must specialize
+llvm::yaml::MappingTraits on T and implement the "void mapping(IO &io, T&)"
+method.  If your native data structures use pointers to a class everywhere,
+you can specialize on the class pointer.  Examples:
+
+.. code-block:: c++
+
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  // Example of struct Foo which is used by value
+  template <>
+  struct MappingTraits<Foo> {
+    static void mapping(IO &io, Foo &foo) {
+      io.mapOptional("size", foo.size);
+      ...
+    }
+  };
+
+  // Example of struct Bar which is natively always a pointer
+  template <>
+  struct MappingTraits<Bar*> {
+    static void mapping(IO &io, Bar *&bar) {
+      io.mapOptional("size", bar->size);
+      ...
+    }
+  };
+
+No Normalization
+----------------
+
+The mapping() method is responsible, if needed, for normalizing and
+denormalizing.  In a simple case where the native data structure requires no
+normalization, the mapping method just uses mapOptional() or mapRequired() to
+bind the struct's fields to YAML key names.  For example:
+
+.. code-block:: c++
+
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct MappingTraits<Person> {
+    static void mapping(IO &io, Person &info) {
+      io.mapRequired("name", info.name);
+      io.mapOptional("hat-size", info.hatSize);
+    }
+  };
+
+Normalization
+-------------
+
+When [de]normalization is required, the mapping() method needs a way to access
+normalized values as fields.  To help with this, there is
+a template MappingNormalization<> which you can then use to automatically
+do the normalization and denormalization.  The template is used to create
+a local variable in your mapping() method which contains the normalized keys.
+
+Suppose you have a native data type
+Polar which specifies a position in polar coordinates (distance, angle):
+
+.. code-block:: c++
+
+  struct Polar {
+    float distance;
+    float angle;
+  };
+
+but you've decided the normalized YAML form should be in x,y coordinates.  That
+is, you want the YAML to look like:
+
+.. code-block:: yaml
+
+  x: 10.3
+  y: -4.7
+
+You can support this by defining a MappingTraits that normalizes the polar
+coordinates to x,y coordinates when writing YAML and denormalizes x,y
+coordinates into polar when reading YAML.
+
+.. code-block:: c++
+
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  template <>
+  struct MappingTraits<Polar> {
+
+    class NormalizedPolar {
+    public:
+      NormalizedPolar(IO &io)
+        : x(0.0), y(0.0) {
+      }
+      NormalizedPolar(IO &, Polar &polar)
+        : x(polar.distance * cos(polar.angle)),
+          y(polar.distance * sin(polar.angle)) {
+      }
+      Polar denormalize(IO &) {
+        Polar p;
+        p.distance = sqrt(x*x + y*y);
+        p.angle = atan2(y, x);
+        return p;
+      }
+
+      float x;
+      float y;
+    };
+
+    static void mapping(IO &io, Polar &polar) {
+      MappingNormalization<NormalizedPolar, Polar> keys(io, polar);
+
+      io.mapRequired("x", keys->x);
+      io.mapRequired("y", keys->y);
+    }
+  };
+
+When writing YAML, the local variable "keys" will be a stack allocated
+instance of NormalizedPolar, constructed from the supplied polar object, which
+initializes its x and y fields.  The mapRequired() methods then write out the x
+and y values as key/value pairs.
+
+When reading YAML, the local variable "keys" will be a stack allocated instance
+of NormalizedPolar, constructed by the empty constructor.  The mapRequired
+methods will find the matching key in the YAML document and fill in the x and y
+fields of the NormalizedPolar object keys.  At the end of the mapping() method
+when the local keys variable goes out of scope, the denormalize() method will
+automatically be called to convert the read values back to polar coordinates,
+and the result is then assigned back to the second parameter to mapping().
+
+In some cases, the normalized class may be a subclass of the native type and
+could be returned by the denormalize() method, except that the temporary
+normalized instance is stack allocated.  In these cases, the utility template
+MappingNormalizationHeap<> can be used instead.  It is just like
+MappingNormalization<> except that it heap allocates the normalized object
+when reading YAML.  It never destroys the normalized object.  The denormalize()
+method can then return "this".
+
+Default values
+--------------
+Within a mapping() method, calls to io.mapRequired() mean that that key is
+required to exist when parsing YAML documents; otherwise YAML I/O will issue
+an error.
+
+On the other hand, keys registered with io.mapOptional() are allowed to not
+exist in the YAML document being read.  So what value is put in the field
+for those optional keys?
+There are two steps to how those optional fields are filled in.  First, the
+second parameter to the mapping() method is a reference to a native class.  That
+native class must have a default constructor.  Whatever value the default
+constructor initially sets for an optional field will be that field's value.
+Second, the mapOptional() method has an optional third parameter.  If provided
+it is the value that mapOptional() should set that field to if the YAML document
+does not have that key.
+
+There is one important difference between those two ways (default constructor
+and third parameter to mapOptional).  When YAML I/O generates a YAML document,
+if the mapOptional() third parameter is used and the actual value being written
+is the same as (using ==) the default value, then that key/value pair is not
+written.
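+
+For instance (values hypothetical), a mapping that defaults hat-size to 7 when
+the key is absent on input, and omits the key on output while the value is
+still 7:
+
+.. code-block:: c++
+
+  template <>
+  struct MappingTraits<Person> {
+    static void mapping(IO &io, Person &info) {
+      io.mapRequired("name", info.name);
+      // Missing "hat-size" on input yields hatSize == 7; on output the
+      // key is omitted whenever hatSize == 7.
+      io.mapOptional("hat-size", info.hatSize, (uint32_t)7);
+    }
+  };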
+
+Order of Keys
+-------------
+
+When writing out a YAML document, the keys are written in the order that the
+calls to mapRequired()/mapOptional() are made in the mapping() method.  This
+gives you a chance to write the fields in an order that a human reader of
+the YAML document would find natural.  This may be different from the order
+of the fields in the native class.
+
+When reading in a YAML document, the keys in the document can be in any order,
+but they are processed in the order that the calls to mapRequired()/mapOptional()
+are made in the mapping() method.  That enables some interesting
+functionality.  For instance, if the first field bound is the cpu and the second
+field bound is flags, and the flags are cpu specific, you can programmatically
+switch how the flags are converted to and from YAML based on the cpu.
+This works for both reading and writing.  For example:
+
+.. code-block:: c++
+
+  using llvm::yaml::MappingTraits;
+  using llvm::yaml::IO;
+
+  struct Info {
+    CPUs cpu;
+    uint32_t flags;
+  };
+
+  template <>
+  struct MappingTraits<Info> {
+    static void mapping(IO &io, Info &info) {
+      io.mapRequired("cpu", info.cpu);
+      // flags must come after cpu for this to work when reading yaml
+      if ( info.cpu == cpu_x86_64 )
+        io.mapRequired("flags", *(My86_64Flags *)&info.flags);
+      else
+        io.mapRequired("flags", *(My86Flags *)&info.flags);
+    }
+  };
+
+Sequence
+========
+
+To be translated to or from a YAML sequence for your type T you must specialize
+llvm::yaml::SequenceTraits on T and implement two methods:
+"size_t size(IO &io, T&)" and "T::value_type& element(IO &io, T&, size_t indx)".
+For example:
+
+.. code-block:: c++
+
+  template <>
+  struct SequenceTraits<MySeq> {
+    static size_t size(IO &io, MySeq &list) { ... }
+    static MySeqEl &element(IO &io, MySeq &list, size_t index) { ... }
+  };
+
+The size() method returns how many elements are currently in your sequence.
+The element() method returns a reference to the i'th element in the sequence.
+When parsing YAML, the element() method may be called with an index one bigger
+than the current size.  Your element() method should allocate space for one
+more element (using the default constructor if the element is a C++ object) and
+return a reference to that newly allocated space.
+
+Flow Sequence
+-------------
+A YAML "flow sequence" is a sequence that when written to YAML uses the
+inline notation (e.g. [ foo, bar ]).  To specify that a sequence type should
+be written in YAML as a flow sequence, your SequenceTraits specialization should
+add "static const bool flow = true;".  For instance:
+
+.. code-block:: c++
+
+  template <>
+  struct SequenceTraits<MyList> {
+    static size_t size(IO &io, MyList &list) { ... }
+    static MyListEl &element(IO &io, MyList &list, size_t index) { ... }
+
+    // The existence of this member causes YAML I/O to use a flow sequence
+    static const bool flow = true;
+  };
+
+With the above, if you used MyList as the data type in your native data
+structures, then when converted to YAML, a flow sequence of integers
+will be used (e.g. [ 10, -3, 4 ]).
+
+Utility Macros
+--------------
+Since a common source of sequences is std::vector<>, YAML I/O provides macros:
+LLVM_YAML_IS_SEQUENCE_VECTOR() and LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR() which
+can be used to easily specify SequenceTraits<> on a std::vector type.  YAML
+I/O does not partially specialize SequenceTraits on std::vector<> because that
+would force all vectors to be sequences.  An example use of the macros:
+
+.. code-block:: c++
+
+  // std::vector<MyType1> will be converted as a normal (block) sequence.
+  LLVM_YAML_IS_SEQUENCE_VECTOR(MyType1)
+  // std::vector<MyType2> will be converted as a flow sequence.
+  LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(MyType2)
+
+Document List
+=============
+
+YAML allows you to define multiple "documents" in a single YAML file.  Each
+new document starts with a left aligned "---" token.  The end of all documents
+is denoted with a left aligned "..." token.  Many users of YAML will never
+have need for multiple documents.  The top level node in their YAML schema
+will be a mapping or sequence.  For those cases, the following is not needed.
+But for cases where you do want multiple documents, you can specify a
+trait for your document list type.  The trait has the same methods as
+SequenceTraits but is named DocumentListTraits.  For example:
+
+.. code-block:: c++
+
+  template <>
+  struct DocumentListTraits<MyDocList> {
+    static size_t size(IO &io, MyDocList &list) { ... }
+    static MyDocType &element(IO &io, MyDocList &list, size_t index) { ... }
+  };
+
+User Context Data
+=================
+When an llvm::yaml::Input or llvm::yaml::Output object is created their
+constructors take an optional "context" parameter.  This is a pointer to
+whatever state information you might need.
+
+For instance, in a previous example we showed how the conversion type for a
+flags field could be determined at runtime based on the value of another field
+in the mapping.  But what if an inner mapping needs to know some field value
+of an outer mapping?  That is where the "context" parameter comes in.  You
+can set values in the context in the outer map's mapping() method and
+retrieve those values in the inner map's mapping() method.
+
+The context value is just a void*.  All your traits which use the context
+and operate on your native data types need to agree on what the context value
+actually is.  It could be a pointer to an object or struct which your various
+traits use to share context sensitive information.
+
+Output
+======
+
+The llvm::yaml::Output class is used to generate a YAML document from your
+in-memory data structures, using traits defined on your data types.
+To instantiate an Output object you need an llvm::raw_ostream, and optionally
+a context pointer:
+
+.. code-block:: c++
+
+  class Output : public IO {
+  public:
+    Output(llvm::raw_ostream &, void *context=NULL);
+
+Once you have an Output object, you can use the C++ stream operator on it
+to write your native data as YAML.  One thing to recall is that a YAML file
+can contain multiple "documents".  If the top level data structure you are
+streaming as YAML is a mapping, scalar, or sequence, then Output assumes you
+are generating one document and wraps the mapping output
+with "``---``" and trailing "``...``".
+
+.. code-block:: c++
+
+  using llvm::yaml::Output;
+
+  void dumpMyMapDoc(const MyMapType &info) {
+    Output yout(llvm::outs());
+    yout << info;
+  }
+
+The above could produce output like:
+
+.. code-block:: yaml
+
+  ---
+  name: Tom
+  hat-size: 7
+  ...
+
+On the other hand, if the top level data structure you are streaming as YAML
+has a DocumentListTraits specialization, then Output walks through each element
+of your DocumentList and generates a "---" before the start of each element
+and ends with a "...".
+
+.. code-block:: c++
+
+  using llvm::yaml::Output;
+
+  void dumpMyDocList(const MyDocListType &docList) {
+    Output yout(llvm::outs());
+    yout << docList;
+  }
+
+The above could produce output like:
+
+.. code-block:: yaml
+
+  ---
+  name: Tom
+  hat-size: 7
+  ---
+  name: Tom
+  shoe-size: 11
+  ...
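+
+Combining Output with the Input class described next, a minimal round trip
+through an in-memory string might look like this (a sketch; it assumes the
+Person type and traits shown earlier):
+
+.. code-block:: c++
+
+  #include "llvm/Support/raw_ostream.h"
+  #include <string>
+  #include <vector>
+
+  using llvm::yaml::Input;
+  using llvm::yaml::Output;
+
+  void roundTrip(std::vector<Person> &persons) {
+    // Write the sequence of persons to an in-memory string.
+    std::string buffer;
+    llvm::raw_string_ostream stream(buffer);
+    Output yout(stream);
+    yout << persons;
+    stream.flush();
+
+    // Read the documents back into a fresh vector.
+    std::vector<Person> readBack;
+    Input yin(buffer);
+    yin >> readBack;
+    if ( yin.error() )
+      return;
+  }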
+
+
+Output
+======
+
+The llvm::yaml::Output class is used to generate a YAML document from your
+in-memory data structures, using traits defined on your data types.
+To instantiate an Output object you need an llvm::raw_ostream, and optionally
+a context pointer:
+
+.. code-block:: c++
+
+    class Output : public IO {
+    public:
+      Output(llvm::raw_ostream &, void *context=NULL);
+
+Once you have an Output object, you can use the C++ stream operator on it
+to write your native data as YAML. One thing to recall is that a YAML file
+can contain multiple "documents". If the top level data structure you are
+streaming as YAML is a mapping, scalar, or sequence, then Output assumes you
+are generating one document and wraps the output
+with "``---``" and trailing "``...``".
+
+.. code-block:: c++
+
+    using llvm::yaml::Output;
+
+    void dumpMyMapDoc(const MyMapType &info) {
+      Output yout(llvm::outs());
+      yout << info;
+    }
+
+The above could produce output like:
+
+.. code-block:: yaml
+
+     ---
+     name:      Tom
+     hat-size:  7
+     ...
+
+On the other hand, if the top level data structure you are streaming as YAML
+has a DocumentListTraits specialization, then Output walks through each element
+of your DocumentList and generates a "---" before the start of each element
+and ends with a "...".
+
+.. code-block:: c++
+
+    using llvm::yaml::Output;
+
+    void dumpMyDocList(const MyDocListType &docList) {
+      Output yout(llvm::outs());
+      yout << docList;
+    }
+
+The above could produce output like:
+
+.. code-block:: yaml
+
+     ---
+     name:      Tom
+     hat-size:  7
+     ---
+     name:      Tom
+     shoe-size: 11
+     ...
+
+
+Input
+=====
+
+The llvm::yaml::Input class is used to parse YAML document(s) into your native
+data structures. To instantiate an Input object you need a StringRef to the
+entire YAML file, and optionally a context pointer:
+
+.. code-block:: c++
+
+    class Input : public IO {
+    public:
+      Input(StringRef inputContent, void *context=NULL);
+
+Once you have an Input object, you can use the C++ stream operator to read
+the document(s). If you expect there might be multiple YAML documents in
+one file, you'll need to specialize DocumentListTraits on a list of your
+document type and stream in that document list type. Otherwise you can
+just stream in the document type. Also, you can check whether there were
+any syntax errors in the YAML by calling the error() method on the Input
+object. For example:
+
+.. code-block:: c++
+
+     // Reading a single document
+     using llvm::yaml::Input;
+
+     Input yin(mb.getBuffer());
+
+     // Parse the YAML file
+     MyDocType theDoc;
+     yin >> theDoc;
+
+     // Check for error
+     if ( yin.error() )
+       return;
+
+
+.. code-block:: c++
+
+     // Reading multiple documents in one file
+     using llvm::yaml::Input;
+
+     LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(std::vector<MyDocType>)
+
+     Input yin(mb.getBuffer());
+
+     // Parse the YAML file
+     std::vector<MyDocType> theDocList;
+     yin >> theDocList;
+
+     // Check for error
+     if ( yin.error() )
+       return;
+
+
diff --git a/docs/subsystems.rst b/docs/subsystems.rst
index 275955be6e..6c21d335af 100644
--- a/docs/subsystems.rst
+++ b/docs/subsystems.rst
@@ -23,14 +23,15 @@ Subsystem Documentation
    SourceLevelDebugging
    WritingAnLLVMBackend
    GarbageCollection
+   WritingAnLLVMPass
 
 .. FIXME: once LangRef is Sphinxified, HowToUseInstrMappings should be put
    under LangRef's toctree instead of this page's toctree.
 
-* `Writing an LLVM Pass <WritingAnLLVMPass.html>`_
-
+* :doc:`WritingAnLLVMPass`
+
   Information on how to write LLVM transformations and analyses.
-
+
 * :doc:`WritingAnLLVMBackend`
 
   Information on how to write LLVM backends for machine targets.
diff --git a/docs/userguides.rst b/docs/userguides.rst
index cfb6dbeb5e..7e4e3b7bc0 100644
--- a/docs/userguides.rst
+++ b/docs/userguides.rst
@@ -23,6 +23,8 @@ User Guides
    TestingGuide
    tutorial/index
    ReleaseNotes
+   Passes
+   YamlIO
 
 * :ref:`getting_started`
 
@@ -58,10 +60,10 @@ User Guides
   A reference manual for the LLVM command line utilities ("man" pages for LLVM
   tools).
 
-* `LLVM's Analysis and Transform Passes <Passes.html>`_
+* :doc:`Passes`
 
   A list of optimizations and analyses implemented in LLVM.
-
+
 * :ref:`faq`
 
   A list of common questions and problems and their solutions.
@@ -99,6 +101,10 @@ User Guides
 
   Instructions for adding new builder to LLVM buildbot master.
 
+* :ref:`yamlio`
+
+  A reference guide for using LLVM's YAML I/O library.
+
 * **IRC** -- You can probably find help on the unofficial LLVM IRC.
 
   We often are on irc.oftc.net in the #llvm channel. If you are using the
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index e9b72a2fa7..721f0ddd2e 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -44,6 +44,20 @@ namespace llvm {
                              const TargetLibraryInfo *TLI = 0,
                              const DominatorTree *DT = 0);
 
+  /// Given operands for an FAdd, see if we can fold the result. If not, this
+  /// returns null.
+ Value *SimplifyFAddInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const DataLayout *TD = 0, + const TargetLibraryInfo *TLI = 0, + const DominatorTree *DT = 0); + + /// Given operands for an FSub, see if we can fold the result. If not, this + /// returns null. + Value *SimplifyFSubInst(Value *LHS, Value *RHS, FastMathFlags FMF, + const DataLayout *TD = 0, + const TargetLibraryInfo *TLI = 0, + const DominatorTree *DT = 0); + /// Given operands for an FMul, see if we can fold the result. If not, this /// returns null. Value *SimplifyFMulInst(Value *LHS, Value *RHS, diff --git a/include/llvm/Analysis/PtrUseVisitor.h b/include/llvm/Analysis/PtrUseVisitor.h index e15f2b45a8..f5daf2c63e 100644 --- a/include/llvm/Analysis/PtrUseVisitor.h +++ b/include/llvm/Analysis/PtrUseVisitor.h @@ -29,7 +29,6 @@ #include "llvm/InstVisitor.h" #include "llvm/IntrinsicInst.h" #include "llvm/Support/Compiler.h" -#include "llvm/Support/GetElementPtrTypeIterator.h" namespace llvm { diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index a85752446b..875c47dc6b 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -45,13 +45,12 @@ namespace llvm { void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, const DataLayout *TD = 0, unsigned Depth = 0); - /// isPowerOfTwo - Return true if the given value is known to have exactly one - /// bit set when defined. For vectors return true if every element is known to - /// be a power of two when defined. Supports values with integer or pointer - /// type and vectors of integers. If 'OrZero' is set then returns true if the - /// given value is either a power of two or zero. - bool isPowerOfTwo(Value *V, const DataLayout *TD = 0, bool OrZero = false, - unsigned Depth = 0); + /// isKnownToBeAPowerOfTwo - Return true if the given value is known to have + /// exactly one bit set when defined. For vectors return true if every + /// element is known to be a power of two when defined. Supports values with + /// integer or pointer type and vectors of integers. If 'OrZero' is set then + /// returns true if the given value is either a power of two or zero. + bool isKnownToBeAPowerOfTwo(Value *V, bool OrZero = false, unsigned Depth = 0); /// isKnownNonZero - Return true if the given value is known to be non-zero /// when defined. 
For vectors return true if every element is known to be diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h index e7a654b423..a66e05baff 100644 --- a/include/llvm/CodeGen/CommandFlags.h +++ b/include/llvm/CodeGen/CommandFlags.h @@ -1,4 +1,4 @@ -//===-- CommandFlags.h - Register Coalescing Interface ----------*- C++ -*-===// +//===-- CommandFlags.h - Command Line Flags Interface -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // diff --git a/include/llvm/CodeGen/FunctionLoweringInfo.h b/include/llvm/CodeGen/FunctionLoweringInfo.h index 789f77f26e..451dc99867 100644 --- a/include/llvm/CodeGen/FunctionLoweringInfo.h +++ b/include/llvm/CodeGen/FunctionLoweringInfo.h @@ -136,7 +136,7 @@ public: return ValueMap.count(V); } - unsigned CreateReg(EVT VT); + unsigned CreateReg(MVT VT); unsigned CreateRegs(Type *Ty); diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h index 81912742fa..91dcf1c4c3 100644 --- a/include/llvm/CodeGen/MachineBasicBlock.h +++ b/include/llvm/CodeGen/MachineBasicBlock.h @@ -463,37 +463,57 @@ public: return Insts.insertAfter(I.getInstrIterator(), M); } - /// erase - Remove the specified element or range from the instruction list. - /// These functions delete any instructions removed. + /// Remove an instruction from the instruction list and delete it. /// - instr_iterator erase(instr_iterator I) { - return Insts.erase(I); - } - instr_iterator erase(instr_iterator I, instr_iterator E) { - return Insts.erase(I, E); - } + /// If the instruction is part of a bundle, the other instructions in the + /// bundle will still be bundled after removing the single instruction. + instr_iterator erase(instr_iterator I); + + /// Remove an instruction from the instruction list and delete it. + /// + /// If the instruction is part of a bundle, the other instructions in the + /// bundle will still be bundled after removing the single instruction. instr_iterator erase_instr(MachineInstr *I) { - instr_iterator MII(I); - return erase(MII); + return erase(instr_iterator(I)); } - iterator erase(iterator I); + /// Remove a range of instructions from the instruction list and delete them. iterator erase(iterator I, iterator E) { return Insts.erase(I.getInstrIterator(), E.getInstrIterator()); } + + /// Remove an instruction or bundle from the instruction list and delete it. + /// + /// If I points to a bundle of instructions, they are all erased. + iterator erase(iterator I) { + return erase(I, llvm::next(I)); + } + + /// Remove an instruction from the instruction list and delete it. + /// + /// If I is the head of a bundle of instructions, the whole bundle will be + /// erased. iterator erase(MachineInstr *I) { - iterator MII(I); - return erase(MII); + return erase(iterator(I)); } - /// remove - Remove the instruction from the instruction list. This function - /// does not delete the instruction. WARNING: Note, if the specified - /// instruction is a bundle this function will remove all the bundled - /// instructions as well. It is up to the caller to keep a list of the - /// bundled instructions and re-insert them if desired. This function is - /// *not recommended* for manipulating instructions with bundles. Use - /// splice instead. - MachineInstr *remove(MachineInstr *I); + /// Remove the unbundled instruction from the instruction list without + /// deleting it. 
+  ///
+  /// This function cannot be used to remove bundled instructions; use
+  /// remove_instr to remove individual instructions from a bundle.
+  MachineInstr *remove(MachineInstr *I) {
+    assert(!I->isBundled() && "Cannot remove bundled instructions");
+    return Insts.remove(I);
+  }
+
+  /// Remove the possibly bundled instruction from the instruction list
+  /// without deleting it.
+  ///
+  /// If the instruction is part of a bundle, the other instructions in the
+  /// bundle will still be bundled after removing the single instruction.
+  MachineInstr *remove_instr(MachineInstr *I);
+
   void clear() { Insts.clear(); }
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 57da779ca1..320cd0dc53 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -590,14 +590,33 @@ public:
   bool isIdenticalTo(const MachineInstr *Other,
                      MICheckType Check = CheckDefs) const;
 
-  /// removeFromParent - This method unlinks 'this' from the containing basic
-  /// block, and returns it, but does not delete it.
+  /// Unlink 'this' from the containing basic block, and return it without
+  /// deleting it.
+  ///
+  /// This function cannot be used on bundled instructions; use
+  /// removeFromBundle() to remove individual instructions from a bundle.
   MachineInstr *removeFromParent();
 
-  /// eraseFromParent - This method unlinks 'this' from the containing basic
-  /// block and deletes it.
+  /// Unlink this instruction from its basic block and return it without
+  /// deleting it.
+  ///
+  /// If the instruction is part of a bundle, the other instructions in the
+  /// bundle remain bundled.
+  MachineInstr *removeFromBundle();
+
+  /// Unlink 'this' from the containing basic block and delete it.
+  ///
+  /// If this instruction is the header of a bundle, the whole bundle is erased.
+  /// This function cannot be used for instructions inside a bundle; use
+  /// eraseFromBundle() to erase individual bundled instructions.
   void eraseFromParent();
 
+  /// Unlink 'this' from its basic block and delete it.
+  ///
+  /// If the instruction is part of a bundle, the other instructions in the
+  /// bundle remain bundled.
+  void eraseFromBundle();
+
   /// isLabel - Returns true if the MachineInstr represents a label.
   ///
   bool isLabel() const {
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index 408c2a8789..c485112846 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -363,6 +363,18 @@ public:
     }
   }
 
+  /// Create an MIBundleBuilder representing an existing instruction or bundle
+  /// that has MI as its head.
+  explicit MIBundleBuilder(MachineInstr *MI)
+    : MBB(*MI->getParent()), Begin(MI) {
+    MachineBasicBlock::iterator I = MI;
+    ++I;
+    End = I.getInstrIterator();
+  }
+
+  /// Return a reference to the basic block containing this bundle.
+  MachineBasicBlock &getMBB() const { return MBB; }
+
   /// Return true if no instructions have been inserted in this bundle yet.
   /// Empty bundles aren't representable in a MachineBasicBlock.
   bool empty() const { return Begin == End; }
@@ -373,25 +385,38 @@ public:
   /// Return an iterator beyond the last bundled instruction.
   MachineBasicBlock::instr_iterator end() const { return End; }
 
+  /// Insert MI into this bundle before I, which must point to an instruction
+  /// in the bundle, or end().
+ MIBundleBuilder &insert(MachineBasicBlock::instr_iterator I, + MachineInstr *MI) { + MBB.insert(I, MI); + if (I == Begin) { + if (!empty()) + MI->bundleWithSucc(); + Begin = MI; + return *this; + } + if (I == End) { + MI->bundleWithPred(); + return *this; + } + // MI was inserted in the middle of the bundle, so its neighbors' flags are + // already fine. Update MI's bundle flags manually. + MI->setFlag(MachineInstr::BundledPred); + MI->setFlag(MachineInstr::BundledSucc); + return *this; + } + /// Insert MI into MBB by prepending it to the instructions in the bundle. /// MI will become the first instruction in the bundle. MIBundleBuilder &prepend(MachineInstr *MI) { - MBB.insert(Begin, MI); - if (!empty()) - MI->bundleWithSucc(); - Begin = MI; - return *this; + return insert(begin(), MI); } /// Insert MI into MBB by appending it to the instructions in the bundle. /// MI will become the last instruction in the bundle. MIBundleBuilder &append(MachineInstr *MI) { - MBB.insert(End, MI); - if (empty()) - Begin = MI; - else - MI->bundleWithPred(); - return *this; + return insert(end(), MI); } }; diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h index 252d9ca173..d5acdac591 100644 --- a/include/llvm/CodeGen/SelectionDAGNodes.h +++ b/include/llvm/CodeGen/SelectionDAGNodes.h @@ -130,6 +130,11 @@ public: /// inline EVT getValueType() const; + /// Return the simple ValueType of the referenced return value. + MVT getSimpleValueType() const { + return getValueType().getSimpleVT(); + } + /// getValueSizeInBits - Returns the size of the value in bits. /// unsigned getValueSizeInBits() const { @@ -595,6 +600,12 @@ public: return ValueList[ResNo]; } + /// Return the type of a specified result as a simple type. + /// + MVT getSimpleValueType(unsigned ResNo) const { + return getValueType(ResNo).getSimpleVT(); + } + /// getValueSizeInBits - Returns MVT::getSizeInBits(getValueType(ResNo)). /// unsigned getValueSizeInBits(unsigned ResNo) const { diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake index ca64124729..ff765ccd37 100644 --- a/include/llvm/Config/config.h.cmake +++ b/include/llvm/Config/config.h.cmake @@ -6,6 +6,9 @@ /* Bug report URL. 
*/ #define BUG_REPORT_URL "${BUG_REPORT_URL}" +/* Define if we have libxml2 */ +#cmakedefine CLANG_HAVE_LIBXML ${CLANG_HAVE_LIBXML} + /* Relative directory for resource files */ #define CLANG_RESOURCE_DIR "${CLANG_RESOURCE_DIR}" diff --git a/include/llvm/Config/llvm-config.h.cmake b/include/llvm/Config/llvm-config.h.cmake index 39442926dc..fbc3040dd2 100644 --- a/include/llvm/Config/llvm-config.h.cmake +++ b/include/llvm/Config/llvm-config.h.cmake @@ -112,6 +112,12 @@ /* Installation prefix directory */ #cmakedefine LLVM_PREFIX "${LLVM_PREFIX}" +/* Define if we have the Intel JIT API runtime support library */ +#cmakedefine LLVM_USE_INTEL_JITEVENTS 1 + +/* Define if we have the oprofile JIT-support library */ +#cmakedefine LLVM_USE_OPROFILE 1 + /* Major version of the LLVM API */ #cmakedefine LLVM_VERSION_MAJOR ${LLVM_VERSION_MAJOR} diff --git a/include/llvm/Config/llvm-config.h.in b/include/llvm/Config/llvm-config.h.in index 9489dfe016..af3a324855 100644 --- a/include/llvm/Config/llvm-config.h.in +++ b/include/llvm/Config/llvm-config.h.in @@ -112,6 +112,12 @@ /* Installation prefix directory */ #undef LLVM_PREFIX +/* Define if we have the Intel JIT API runtime support library */ +#undef LLVM_USE_INTEL_JITEVENTS + +/* Define if we have the oprofile JIT-support library */ +#undef LLVM_USE_OPROFILE + /* Major version of the LLVM API */ #undef LLVM_VERSION_MAJOR diff --git a/include/llvm/DebugInfo.h b/include/llvm/DebugInfo.h index 43af6ed080..5bfbc63eff 100644 --- a/include/llvm/DebugInfo.h +++ b/include/llvm/DebugInfo.h @@ -635,6 +635,7 @@ namespace llvm { DIFile F = getFieldAs<DIFile>(3); return F.getCompileUnit(); } + DIFile getFile() const { return getFieldAs<DIFile>(3); } unsigned getLineNumber() const { return (getUnsignedField(4) << 8) >> 8; } diff --git a/include/llvm/ExecutionEngine/JITEventListener.h b/include/llvm/ExecutionEngine/JITEventListener.h index 7bd1fad934..3aa014ee32 100644 --- a/include/llvm/ExecutionEngine/JITEventListener.h +++ b/include/llvm/ExecutionEngine/JITEventListener.h @@ -15,7 +15,7 @@ #ifndef LLVM_EXECUTION_ENGINE_JIT_EVENTLISTENER_H #define LLVM_EXECUTION_ENGINE_JIT_EVENTLISTENER_H -#include "llvm/Config/config.h" +#include "llvm/Config/llvm-config.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/DebugLoc.h" #include <vector> diff --git a/include/llvm/Function.h b/include/llvm/Function.h index b49b8c1457..6edc660c9c 100644 --- a/include/llvm/Function.h +++ b/include/llvm/Function.h @@ -89,7 +89,7 @@ private: // HasLazyArguments is stored in Value::SubclassData. /*bool HasLazyArguments;*/ - + // The Calling Convention is stored in Value::SubclassData. /*CallingConv::ID CallingConvention;*/ @@ -131,7 +131,7 @@ public: Type *getReturnType() const; // Return the type of the ret val FunctionType *getFunctionType() const; // Return the FunctionType for me - /// getContext - Return a pointer to the LLVMContext associated with this + /// getContext - Return a pointer to the LLVMContext associated with this /// function, or NULL if this function is not bound to a context yet. LLVMContext &getContext() const; @@ -159,7 +159,7 @@ public: setValueSubclassData((getSubclassDataFromValue() & 1) | (static_cast<unsigned>(CC) << 1)); } - + /// getAttributes - Return the attribute list for this Function. /// const AttributeSet &getAttributes() const { return AttributeList; } @@ -176,15 +176,15 @@ public: /// addFnAttr - Add function attributes to this function. 
/// - void addFnAttr(Attributes::AttrVal N) { - // Function Attributes are stored at ~0 index + void addFnAttr(Attributes::AttrVal N) { + // Function Attributes are stored at ~0 index addAttribute(AttributeSet::FunctionIndex, Attributes::get(getContext(), N)); } /// removeFnAttr - Remove function attributes from this function. /// void removeFnAttr(Attributes N) { - // Function Attributes are stored at ~0 index + // Function Attributes are stored at ~0 index removeAttribute(~0U, N); } @@ -208,7 +208,7 @@ public: /// addAttribute - adds the attribute to the list of attributes. void addAttribute(unsigned i, Attributes attr); - + /// removeAttribute - removes the attribute from the list of attributes. void removeAttribute(unsigned i, Attributes attr); @@ -264,7 +264,7 @@ public: return hasUWTable() || !doesNotThrow(); } - /// @brief Determine if the function returns a structure through first + /// @brief Determine if the function returns a structure through first /// pointer argument. bool hasStructRetAttr() const { return getParamAttributes(1).hasAttribute(Attributes::StructRet); diff --git a/include/llvm/IRBuilder.h b/include/llvm/IRBuilder.h index bb86875828..75aa8e76f8 100644 --- a/include/llvm/IRBuilder.h +++ b/include/llvm/IRBuilder.h @@ -1332,7 +1332,7 @@ public: LandingPadInst *CreateLandingPad(Type *Ty, Value *PersFn, unsigned NumClauses, const Twine &Name = "") { - return Insert(LandingPadInst::Create(Ty, PersFn, NumClauses, Name)); + return Insert(LandingPadInst::Create(Ty, PersFn, NumClauses), Name); } //===--------------------------------------------------------------------===// diff --git a/include/llvm/Instructions.h b/include/llvm/Instructions.h index 9c5a2db2fe..b5c75e3782 100644 --- a/include/llvm/Instructions.h +++ b/include/llvm/Instructions.h @@ -29,9 +29,10 @@ namespace llvm { +class APInt; class ConstantInt; class ConstantRange; -class APInt; +class DataLayout; class LLVMContext; enum AtomicOrdering { @@ -850,6 +851,16 @@ public: /// isInBounds - Determine whether the GEP has the inbounds flag. bool isInBounds() const; + /// \brief Accumulate the constant address offset of this GEP if possible. + /// + /// This routine accepts an APInt into which it will accumulate the constant + /// offset of this GEP if the GEP is in fact constant. If the GEP is not + /// all-constant, it returns false and the value of the offset APInt is + /// undefined (it is *not* preserved!). The APInt passed into this routine + /// must be at least as wide as the IntPtr type for the address space of + /// the base GEP pointer. 
+ bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const; + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { return (I->getOpcode() == Instruction::GetElementPtr); diff --git a/include/llvm/IntrinsicInst.h b/include/llvm/IntrinsicInst.h index 9b2afd56e0..d5d27e6998 100644 --- a/include/llvm/IntrinsicInst.h +++ b/include/llvm/IntrinsicInst.h @@ -54,7 +54,7 @@ namespace llvm { return isa<CallInst>(V) && classof(cast<CallInst>(V)); } }; - + /// DbgInfoIntrinsic - This is the common base class for debug info intrinsics /// class DbgInfoIntrinsic : public IntrinsicInst { diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td index d3a548cd4e..d5c01ccbf3 100644 --- a/include/llvm/Intrinsics.td +++ b/include/llvm/Intrinsics.td @@ -472,3 +472,4 @@ include "llvm/IntrinsicsXCore.td" include "llvm/IntrinsicsHexagon.td" include "llvm/IntrinsicsNVVM.td" include "llvm/IntrinsicsMips.td" +include "llvm/IntrinsicsR600.td" diff --git a/include/llvm/IntrinsicsR600.td b/include/llvm/IntrinsicsR600.td new file mode 100644 index 0000000000..ecb5668d8e --- /dev/null +++ b/include/llvm/IntrinsicsR600.td @@ -0,0 +1,36 @@ +//===- IntrinsicsR600.td - Defines R600 intrinsics ---------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines all of the R600-specific intrinsics. +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "r600" in { + +class R600ReadPreloadRegisterIntrinsic<string name> + : Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>, + GCCBuiltin<name>; + +multiclass R600ReadPreloadRegisterIntrinsic_xyz<string prefix> { + def _x : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_x")>; + def _y : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_y")>; + def _z : R600ReadPreloadRegisterIntrinsic<!strconcat(prefix, "_z")>; +} + +defm int_r600_read_global_size : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_global_size">; +defm int_r600_read_local_size : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_local_size">; +defm int_r600_read_ngroups : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_ngroups">; +defm int_r600_read_tgid : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_tgid">; +defm int_r600_read_tidig : R600ReadPreloadRegisterIntrinsic_xyz < + "__builtin_r600_read_tidig">; +} // End TargetPrefix = "r600" diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h index 72ed1a317c..48bd9d0742 100644 --- a/include/llvm/MC/MCAsmBackend.h +++ b/include/llvm/MC/MCAsmBackend.h @@ -41,6 +41,9 @@ protected: // Can only create subclasses. public: virtual ~MCAsmBackend(); + /// lifetime management + virtual void reset() { } + /// createObjectWriter - Create a new MCObjectWriter instance for use by the /// assembler backend to emit the final object file. virtual MCObjectWriter *createObjectWriter(raw_ostream &OS) const = 0; diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 45f03300ae..38cf060636 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -102,6 +102,9 @@ namespace llvm { /// LabelSuffix - This is appended to emitted labels. 
const char *LabelSuffix;                 // Defaults to ":"
 
+    /// DebugLabelSuffix - This is appended to emitted debug labels.
+    const char *DebugLabelSuffix;            // Defaults to ":"
+
     /// GlobalPrefix - If this is set to a non-empty string, it is prepended
     /// onto all global symbols. This is often used for "_" or ".".
     const char *GlobalPrefix;                // Defaults to ""
@@ -426,6 +429,11 @@ namespace llvm {
     const char *getLabelSuffix() const {
       return LabelSuffix;
     }
+
+    const char *getDebugLabelSuffix() const {
+      return DebugLabelSuffix;
+    }
+
     const char *getGlobalPrefix() const {
       return GlobalPrefix;
     }
diff --git a/include/llvm/MC/MCAsmLayout.h b/include/llvm/MC/MCAsmLayout.h
index bcc63dc231..267e3a57f5 100644
--- a/include/llvm/MC/MCAsmLayout.h
+++ b/include/llvm/MC/MCAsmLayout.h
@@ -39,14 +39,15 @@ private:
   /// The last fragment which was laid out, or 0 if nothing has been laid
   /// out. Fragments are always laid out in order, so all fragments with a
-  /// lower ordinal will be up to date.
-  mutable DenseMap<const MCSectionData*, MCFragment *> LastValidFragment;
+  /// lower ordinal will be valid.
+  mutable DenseMap<const MCSectionData*, MCFragment*> LastValidFragment;
 
   /// \brief Make sure that the layout for the given fragment is valid, lazily
   /// computing it if necessary.
-  void EnsureValid(const MCFragment *F) const;
+  void ensureValid(const MCFragment *F) const;
 
-  bool isFragmentUpToDate(const MCFragment *F) const;
+  /// \brief Is the layout for this fragment valid?
+  bool isFragmentValid(const MCFragment *F) const;
 
 public:
   MCAsmLayout(MCAssembler &_Assembler);
@@ -61,7 +62,7 @@ public:
   /// \brief Perform layout for a single fragment, assuming that the previous
   /// fragment has already been laid out correctly, and the parent section has
   /// been initialized.
-  void LayoutFragment(MCFragment *Fragment);
+  void layoutFragment(MCFragment *Fragment);
 
   /// @name Section Access (in layout order)
   /// @{
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 0ecb66c734..8dba3b9c09 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -802,6 +802,10 @@ public:
               raw_ostream &OS);
   ~MCAssembler();
 
+  /// Reuse an assembler instance
+  ///
+  void reset();
+
   MCContext &getContext() const { return Context; }
   MCAsmBackend &getBackend() const { return Backend; }
diff --git a/include/llvm/MC/MCCodeEmitter.h b/include/llvm/MC/MCCodeEmitter.h
index 0574890902..9bfa08eb5d 100644
--- a/include/llvm/MC/MCCodeEmitter.h
+++ b/include/llvm/MC/MCCodeEmitter.h
@@ -29,6 +29,9 @@ protected: // Can only create subclasses.
 public:
   virtual ~MCCodeEmitter();
 
+  /// Lifetime management
+  virtual void reset() { }
+
   /// EncodeInstruction - Encode the given \p Inst to bytes on the output
   /// stream \p OS.
   virtual void EncodeInstruction(const MCInst &Inst, raw_ostream &OS,
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index 111ad484ff..e92d3b9e71 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -94,6 +94,12 @@ namespace llvm {
     /// .secure_log_reset appearing between them.
     bool SecureLogUsed;
 
+    /// The compilation directory to use for DW_AT_comp_dir.
+    std::string CompilationDir;
+
+    /// The main file name if passed in explicitly.
+    std::string MainFileName;
+
     /// The dwarf file and directory tables from the dwarf .file directive.
std::vector<MCDwarfFile *> MCDwarfFiles; std::vector<StringRef> MCDwarfDirs; @@ -137,16 +143,15 @@ namespace llvm { void *MachOUniquingMap, *ELFUniquingMap, *COFFUniquingMap; - /// Do automatic initialization in constructor and finalization in - /// destructor - bool AutoInitializationFinalization; + /// Do automatic reset in destructor + bool AutoReset; MCSymbol *CreateSymbol(StringRef Name); public: explicit MCContext(const MCAsmInfo &MAI, const MCRegisterInfo &MRI, const MCObjectFileInfo *MOFI, const SourceMgr *Mgr = 0, - bool AutoInitializationFinalization = true); + bool DoAutoReset = true); ~MCContext(); const SourceMgr *getSourceManager() const { return SrcMgr; } @@ -162,11 +167,9 @@ namespace llvm { /// @name Module Lifetime Management /// @{ - /// doInitialization - prepare to process a new module - void doInitialization(); - - /// doFinalization - clean up state from the current module - void doFinalization(); + /// reset - return object to right after construction state to prepare + /// to process a new module + void reset(); /// @} @@ -251,6 +254,24 @@ namespace llvm { /// @name Dwarf Management /// @{ + /// \brief Get the compilation directory for DW_AT_comp_dir + /// This can be overridden by clients which want to control the reported + /// compilation directory and have it be something other than the current + /// working directory. + const std::string &getCompilationDir() const { return CompilationDir; } + + /// \brief Set the compilation directory for DW_AT_comp_dir + /// Override the default (CWD) compilation directory. + void setCompilationDir(StringRef S) { CompilationDir = S.str(); } + + /// \brief Get the main file name for use in error messages and debug + /// info. This can be set to ensure we've got the correct file name + /// after preprocessing or for -save-temps. + const std::string &getMainFileName() const { return MainFileName; } + + /// \brief Set the main file name and override the default. + void setMainFileName(StringRef S) { MainFileName = S.str(); } + /// GetDwarfFile - creates an entry in the dwarf file and directory tables. unsigned GetDwarfFile(StringRef Directory, StringRef FileName, unsigned FileNumber); diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h index 3b0152bba4..03a57c7ee3 100644 --- a/include/llvm/MC/MCELFStreamer.h +++ b/include/llvm/MC/MCELFStreamer.h @@ -47,6 +47,7 @@ public: virtual void InitSections(); virtual void ChangeSection(const MCSection *Section); virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol); diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h index 93872edf2f..475981fa90 100644 --- a/include/llvm/MC/MCExpr.h +++ b/include/llvm/MC/MCExpr.h @@ -160,6 +160,7 @@ public: VK_TLVP, // Mach-O thread local variable relocation VK_SECREL, // FIXME: We'd really like to use the generic Kinds listed above for these. + VK_ARM_NONE, VK_ARM_PLT, // ARM-style PLT references. 
i.e., (PLT) instead of @PLT VK_ARM_TLSGD, // ditto for TLSGD, GOT, GOTOFF, TPOFF and GOTTPOFF VK_ARM_GOT, @@ -168,6 +169,7 @@ public: VK_ARM_GOTTPOFF, VK_ARM_TARGET1, VK_ARM_TARGET2, + VK_ARM_PREL31, VK_PPC_TOC, // TOC base VK_PPC_TOC_ENTRY, // TOC entry @@ -177,10 +179,19 @@ public: VK_PPC_GAS_LO16, // symbol@l VK_PPC_TPREL16_HA, // symbol@tprel@ha VK_PPC_TPREL16_LO, // symbol@tprel@l + VK_PPC_DTPREL16_HA, // symbol@dtprel@ha + VK_PPC_DTPREL16_LO, // symbol@dtprel@l VK_PPC_TOC16_HA, // symbol@toc@ha VK_PPC_TOC16_LO, // symbol@toc@l - VK_PPC_GOT_TPREL16_DS, // symbol@got@tprel + VK_PPC_GOT_TPREL16_HA, // symbol@got@tprel@ha + VK_PPC_GOT_TPREL16_LO, // symbol@got@tprel@l VK_PPC_TLS, // symbol@tls + VK_PPC_GOT_TLSGD16_HA, // symbol@got@tlsgd@ha + VK_PPC_GOT_TLSGD16_LO, // symbol@got@tlsgd@l + VK_PPC_TLSGD, // symbol@tlsgd + VK_PPC_GOT_TLSLD16_HA, // symbol@got@tlsld@ha + VK_PPC_GOT_TLSLD16_LO, // symbol@got@tlsld@l + VK_PPC_TLSLD, // symbol@tlsld VK_Mips_GPREL, VK_Mips_GOT_CALL, diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h index efaabfb9e8..2ddde0b68a 100644 --- a/include/llvm/MC/MCMachObjectWriter.h +++ b/include/llvm/MC/MCMachObjectWriter.h @@ -45,6 +45,13 @@ protected: public: virtual ~MCMachObjectTargetWriter(); + /// @name Lifetime Management + /// @{ + + virtual void reset() {}; + + /// @} + /// @name Accessors /// @{ @@ -111,6 +118,13 @@ public: : MCObjectWriter(_OS, _IsLittleEndian), TargetObjectWriter(MOTW) { } + /// @name Lifetime management Methods + /// @{ + + virtual void reset(); + + /// @} + /// @name Utility Methods /// @{ diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index db30562649..df98fea41c 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -45,6 +45,11 @@ protected: MCAssembler *_Assembler); ~MCObjectStreamer(); +public: + /// state management + virtual void reset(); + +protected: MCSectionData *getCurrentSectionData() const { return CurSectionData; } @@ -64,6 +69,7 @@ public: /// @{ virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); virtual void EmitValueImpl(const MCExpr *Value, unsigned Size, unsigned AddrSpace); diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h index f77b7d853d..9d5c1a785e 100644 --- a/include/llvm/MC/MCObjectWriter.h +++ b/include/llvm/MC/MCObjectWriter.h @@ -51,6 +51,9 @@ protected: // Can only create subclasses. public: virtual ~MCObjectWriter(); + /// lifetime management + virtual void reset() { } + bool isLittleEndian() const { return IsLittleEndian; } raw_ostream &getStream() { return OS; } diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h index 21fdb6bd39..e5754249e9 100644 --- a/include/llvm/MC/MCSection.h +++ b/include/llvm/MC/MCSection.h @@ -14,6 +14,7 @@ #ifndef LLVM_MC_MCSECTION_H #define LLVM_MC_MCSECTION_H +#include "llvm/ADT/StringRef.h" #include "llvm/MC/SectionKind.h" #include "llvm/Support/Compiler.h" @@ -49,6 +50,11 @@ namespace llvm { virtual void PrintSwitchToSection(const MCAsmInfo &MAI, raw_ostream &OS) const = 0; + // Convenience routines to get label names for the beginning/end of a + // section. + virtual std::string getLabelBeginName() const = 0; + virtual std::string getLabelEndName() const = 0; + /// isBaseAddressKnownZero - Return true if we know that this section will /// get a base address of zero. 
In cases where we know that this is true we /// can emit section offsets as direct references to avoid a subtraction diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h index 83bc63e652..07c47144cb 100644 --- a/include/llvm/MC/MCSectionCOFF.h +++ b/include/llvm/MC/MCSectionCOFF.h @@ -50,6 +50,12 @@ namespace llvm { bool ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const; StringRef getSectionName() const { return SectionName; } + virtual std::string getLabelBeginName() const { + return SectionName.str() + "_begin"; + } + virtual std::string getLabelEndName() const { + return SectionName.str() + "_end"; + } unsigned getCharacteristics() const { return Characteristics; } int getSelection () const { return Selection; } diff --git a/include/llvm/MC/MCSectionELF.h b/include/llvm/MC/MCSectionELF.h index 329c75cb1d..451a1623c1 100644 --- a/include/llvm/MC/MCSectionELF.h +++ b/include/llvm/MC/MCSectionELF.h @@ -17,6 +17,8 @@ #include "llvm/ADT/StringRef.h" #include "llvm/MC/MCSection.h" #include "llvm/Support/ELF.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Debug.h" namespace llvm { @@ -57,6 +59,11 @@ public: bool ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const; StringRef getSectionName() const { return SectionName; } + virtual std::string getLabelBeginName() const { + return SectionName.str() + "_begin"; } + virtual std::string getLabelEndName() const { + return SectionName.str() + "_end"; + } unsigned getType() const { return Type; } unsigned getFlags() const { return Flags; } unsigned getEntrySize() const { return EntrySize; } diff --git a/include/llvm/MC/MCSectionMachO.h b/include/llvm/MC/MCSectionMachO.h index 65ad7961b3..898f571490 100644 --- a/include/llvm/MC/MCSectionMachO.h +++ b/include/llvm/MC/MCSectionMachO.h @@ -145,6 +145,14 @@ public: return StringRef(SectionName); } + virtual std::string getLabelBeginName() const { + return StringRef(getSegmentName().str() + getSectionName().str() + "_begin"); + } + + virtual std::string getLabelEndName() const { + return StringRef(getSegmentName().str() + getSectionName().str() + "_end"); + } + unsigned getTypeAndAttributes() const { return TypeAndAttributes; } unsigned getStubSize() const { return Reserved2; } diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index c411030e2b..ecf5e78b5e 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -92,6 +92,10 @@ namespace llvm { public: virtual ~MCStreamer(); + /// State management + /// + virtual void reset(); + MCContext &getContext() const { return Context; } unsigned getNumFrameInfos() { @@ -240,6 +244,8 @@ namespace llvm { /// used in an assignment. 
virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); + virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); diff --git a/include/llvm/Object/MachOFormat.h b/include/llvm/Object/MachOFormat.h index c0f700d3c8..a17d58dae2 100644 --- a/include/llvm/Object/MachOFormat.h +++ b/include/llvm/Object/MachOFormat.h @@ -237,6 +237,10 @@ namespace macho { /// @name Section Data /// @{ + enum SectionFlags { + SF_PureInstructions = 0x80000000 + }; + struct Section { char Name[16]; char SegmentName[16]; diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h index d31e09e2b4..b2bb2e6b31 100644 --- a/include/llvm/Operator.h +++ b/include/llvm/Operator.h @@ -16,9 +16,11 @@ #define LLVM_OPERATOR_H #include "llvm/Constants.h" +#include "llvm/DataLayout.h" #include "llvm/DerivedTypes.h" #include "llvm/Instruction.h" #include "llvm/Type.h" +#include "llvm/Support/GetElementPtrTypeIterator.h" namespace llvm { @@ -430,6 +432,45 @@ public: } return true; } + + /// \brief Accumulate the constant address offset of this GEP if possible. + /// + /// This routine accepts an APInt into which it will accumulate the constant + /// offset of this GEP if the GEP is in fact constant. If the GEP is not + /// all-constant, it returns false and the value of the offset APInt is + /// undefined (it is *not* preserved!). The APInt passed into this routine + /// must be at least as wide as the IntPtr type for the address space of + /// the base GEP pointer. + bool accumulateConstantOffset(const DataLayout &DL, APInt &Offset) const { + assert(Offset.getBitWidth() == + DL.getPointerSizeInBits(getPointerAddressSpace()) && + "The offset must have exactly as many bits as our pointer."); + + for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this); + GTI != GTE; ++GTI) { + ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); + if (!OpC) + return false; + if (OpC->isZero()) + continue; + + // Handle a struct index, which adds its field offset to the pointer. + if (StructType *STy = dyn_cast<StructType>(*GTI)) { + unsigned ElementIdx = OpC->getZExtValue(); + const StructLayout *SL = DL.getStructLayout(STy); + Offset += APInt(Offset.getBitWidth(), + SL->getElementOffset(ElementIdx)); + continue; + } + + // For array or vector indices, scale the index by the size of the type. + APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); + Offset += Index * APInt(Offset.getBitWidth(), + DL.getTypeAllocSize(GTI.getIndexedType())); + } + return true; + } + }; } // End llvm namespace diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h index 81b3ce153c..3633f4703e 100644 --- a/include/llvm/PassSupport.h +++ b/include/llvm/PassSupport.h @@ -305,7 +305,7 @@ struct RegisterAnalysisGroup : public RegisterAGBase { /// clients that are interested in which passes get registered and unregistered /// at runtime (which can be because of the RegisterPass constructors being run /// as the program starts up, or may be because a shared object just got -/// loaded). Deriving from the PassRegistationListener class automatically +/// loaded). Deriving from the PassRegistrationListener class automatically /// registers your object to receive callbacks indicating when passes are loaded /// and removed. 
/// diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index 0614a14cc8..d0422b3bfa 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -478,7 +478,16 @@ enum { R_PPC64_TOC16_DS = 63, R_PPC64_TOC16_LO_DS = 64, R_PPC64_TLS = 67, - R_PPC64_GOT_TPREL16_DS = 87 + R_PPC64_DTPREL16_LO = 75, + R_PPC64_DTPREL16_HA = 77, + R_PPC64_GOT_TLSGD16_LO = 80, + R_PPC64_GOT_TLSGD16_HA = 82, + R_PPC64_GOT_TLSLD16_LO = 84, + R_PPC64_GOT_TLSLD16_HA = 86, + R_PPC64_GOT_TPREL16_LO_DS = 88, + R_PPC64_GOT_TPREL16_HA = 90, + R_PPC64_TLSGD = 107, + R_PPC64_TLSLD = 108 }; // ARM Specific e_flags diff --git a/include/llvm/Support/PatternMatch.h b/include/llvm/Support/PatternMatch.h index 36b6db7a72..7420fab1a7 100644 --- a/include/llvm/Support/PatternMatch.h +++ b/include/llvm/Support/PatternMatch.h @@ -31,7 +31,9 @@ #include "llvm/Constants.h" #include "llvm/Instructions.h" +#include "llvm/IntrinsicInst.h" #include "llvm/Operator.h" +#include "llvm/Support/CallSite.h" namespace llvm { namespace PatternMatch { @@ -75,6 +77,52 @@ inline class_match<UndefValue> m_Undef() { return class_match<UndefValue>(); } inline class_match<Constant> m_Constant() { return class_match<Constant>(); } +/// Matching combinators +template<typename LTy, typename RTy> +struct match_combine_or { + LTy L; + RTy R; + + match_combine_or(const LTy &Left, const RTy &Right) : L(Left), R(Right) { } + + template<typename ITy> + bool match(ITy *V) { + if (L.match(V)) + return true; + if (R.match(V)) + return true; + return false; + } +}; + +template<typename LTy, typename RTy> +struct match_combine_and { + LTy L; + RTy R; + + match_combine_and(const LTy &Left, const RTy &Right) : L(Left), R(Right) { } + + template<typename ITy> + bool match(ITy *V) { + if (L.match(V)) + if (R.match(V)) + return true; + return false; + } +}; + +/// Combine two pattern matchers matching L || R +template<typename LTy, typename RTy> +inline match_combine_or<LTy, RTy> m_CombineOr(const LTy &L, const RTy &R) { + return match_combine_or<LTy, RTy>(L, R); +} + +/// Combine two pattern matchers matching L && R +template<typename LTy, typename RTy> +inline match_combine_and<LTy, RTy> m_CombineAnd(const LTy &L, const RTy &R) { + return match_combine_and<LTy, RTy>(L, R); +} + struct match_zero { template<typename ITy> bool match(ITy *V) { @@ -88,6 +136,27 @@ struct match_zero { /// zero_initializer for vectors and ConstantPointerNull for pointers. inline match_zero m_Zero() { return match_zero(); } +struct match_neg_zero { + template<typename ITy> + bool match(ITy *V) { + if (const Constant *C = dyn_cast<Constant>(V)) + return C->isNegativeZeroValue(); + return false; + } +}; + +/// m_NegZero() - Match an arbitrary zero/null constant. This includes +/// zero_initializer for vectors and ConstantPointerNull for pointers. For +/// floating point constants, this will match negative zero but not positive +/// zero +inline match_neg_zero m_NegZero() { return match_neg_zero(); } + +/// m_AnyZero() - Match an arbitrary zero/null constant. This includes +/// zero_initializer for vectors and ConstantPointerNull for pointers. For +/// floating point constants, this will match negative zero and positive zero +inline match_combine_or<match_zero, match_neg_zero> m_AnyZero() { + return m_CombineOr(m_Zero(), m_NegZero()); +} struct apint_match { const APInt *&Res; @@ -98,19 +167,13 @@ struct apint_match { Res = &CI->getValue(); return true; } - // FIXME: Remove this. 
- if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) - if (ConstantInt *CI = - dyn_cast_or_null<ConstantInt>(CV->getSplatValue())) { - Res = &CI->getValue(); - return true; - } - if (ConstantDataVector *CV = dyn_cast<ConstantDataVector>(V)) - if (ConstantInt *CI = - dyn_cast_or_null<ConstantInt>(CV->getSplatValue())) { - Res = &CI->getValue(); - return true; - } + if (V->getType()->isVectorTy()) + if (const Constant *C = dyn_cast<Constant>(V)) + if (ConstantInt *CI = + dyn_cast_or_null<ConstantInt>(C->getSplatValue())) { + Res = &CI->getValue(); + return true; + } return false; } }; @@ -151,13 +214,11 @@ struct cst_pred_ty : public Predicate { bool match(ITy *V) { if (const ConstantInt *CI = dyn_cast<ConstantInt>(V)) return this->isValue(CI->getValue()); - // FIXME: Remove this. - if (const ConstantVector *CV = dyn_cast<ConstantVector>(V)) - if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CV->getSplatValue())) - return this->isValue(CI->getValue()); - if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(V)) - if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CV->getSplatValue())) - return this->isValue(CI->getValue()); + if (V->getType()->isVectorTy()) + if (const Constant *C = dyn_cast<Constant>(V)) + if (const ConstantInt *CI = + dyn_cast_or_null<ConstantInt>(C->getSplatValue())) + return this->isValue(CI->getValue()); return false; } }; @@ -175,21 +236,13 @@ struct api_pred_ty : public Predicate { Res = &CI->getValue(); return true; } - - // FIXME: remove. - if (const ConstantVector *CV = dyn_cast<ConstantVector>(V)) - if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CV->getSplatValue())) - if (this->isValue(CI->getValue())) { - Res = &CI->getValue(); - return true; - } - - if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(V)) - if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(CV->getSplatValue())) - if (this->isValue(CI->getValue())) { - Res = &CI->getValue(); - return true; - } + if (V->getType()->isVectorTy()) + if (const Constant *C = dyn_cast<Constant>(V)) + if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(C->getSplatValue())) + if (this->isValue(CI->getValue())) { + Res = &CI->getValue(); + return true; + } return false; } @@ -252,6 +305,9 @@ inline bind_ty<ConstantInt> m_ConstantInt(ConstantInt *&CI) { return CI; } /// m_Constant - Match a Constant, capturing the value if we match. inline bind_ty<Constant> m_Constant(Constant *&C) { return C; } +/// m_ConstantFP - Match a ConstantFP, capturing the value if we match. +inline bind_ty<ConstantFP> m_ConstantFP(ConstantFP *&C) { return C; } + /// specificval_ty - Match a specified Value*. struct specificval_ty { const Value *Val; @@ -266,6 +322,31 @@ struct specificval_ty { /// m_Specific - Match if we have a specific specified value. inline specificval_ty m_Specific(const Value *V) { return V; } +/// Match a specified floating point value or vector of all elements of that +/// value. +struct specific_fpval { + double Val; + specific_fpval(double V) : Val(V) {} + + template<typename ITy> + bool match(ITy *V) { + if (const ConstantFP *CFP = dyn_cast<ConstantFP>(V)) + return CFP->isExactlyValue(Val); + if (V->getType()->isVectorTy()) + if (const Constant *C = dyn_cast<Constant>(V)) + if (ConstantFP *CFP = dyn_cast_or_null<ConstantFP>(C->getSplatValue())) + return CFP->isExactlyValue(Val); + return false; + } +}; + +/// Match a specific floating point value or vector with all elements equal to +/// the value. 
+inline specific_fpval m_SpecificFP(double V) { return specific_fpval(V); }
+
+/// Match a float 1.0 or vector with all elements equal to 1.0.
+inline specific_fpval m_FPOne() { return m_SpecificFP(1.0); }
+
 struct bind_const_intval_ty {
   uint64_t &VR;
   bind_const_intval_ty(uint64_t &V) : VR(V) {}
@@ -818,6 +899,102 @@ m_UMin(const LHS &L, const RHS &R) {
   return MaxMin_match<LHS, RHS, umin_pred_ty>(L, R);
 }
 
+template<typename Opnd_t>
+struct Argument_match {
+  unsigned OpI;
+  Opnd_t Val;
+  Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) { }
+
+  template<typename OpTy>
+  bool match(OpTy *V) {
+    CallSite CS(V);
+    return CS.isCall() && Val.match(CS.getArgument(OpI));
+  }
+};
+
+/// Match an argument
+template<unsigned OpI, typename Opnd_t>
+inline Argument_match<Opnd_t> m_Argument(const Opnd_t &Op) {
+  return Argument_match<Opnd_t>(OpI, Op);
+}
+
+/// Intrinsic matchers.
+struct IntrinsicID_match {
+  unsigned ID;
+  IntrinsicID_match(unsigned IntrID) : ID(IntrID) { }
+
+  template<typename OpTy>
+  bool match(OpTy *V) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(V);
+    return II && II->getIntrinsicID() == ID;
+  }
+};
+
+/// Intrinsic matches are combinations of ID matchers and argument
+/// matchers. Higher arity matchers are defined recursively in terms of and-ing
+/// them with lower arity matchers. Here are some convenient typedefs for up to
+/// several arguments, and more can be added as needed.
+template <typename T0 = void, typename T1 = void, typename T2 = void,
+          typename T3 = void, typename T4 = void, typename T5 = void,
+          typename T6 = void, typename T7 = void, typename T8 = void,
+          typename T9 = void, typename T10 = void> struct m_Intrinsic_Ty;
+template <typename T0>
+struct m_Intrinsic_Ty<T0> {
+  typedef match_combine_and<IntrinsicID_match, Argument_match<T0> > Ty;
+};
+template <typename T0, typename T1>
+struct m_Intrinsic_Ty<T0, T1> {
+  typedef match_combine_and<typename m_Intrinsic_Ty<T0>::Ty,
+                            Argument_match<T1> > Ty;
+};
+template <typename T0, typename T1, typename T2>
+struct m_Intrinsic_Ty<T0, T1, T2> {
+  typedef match_combine_and<typename m_Intrinsic_Ty<T0, T1>::Ty,
+                            Argument_match<T2> > Ty;
+};
+template <typename T0, typename T1, typename T2, typename T3>
+struct m_Intrinsic_Ty<T0, T1, T2, T3> {
+  typedef match_combine_and<typename m_Intrinsic_Ty<T0, T1, T2>::Ty,
+                            Argument_match<T3> > Ty;
+};
+
+/// Match intrinsic calls like this:
+/// m_Intrinsic<Intrinsic::fabs>(m_Value(X))
+template <unsigned IntrID>
+inline IntrinsicID_match
+m_Intrinsic() { return IntrinsicID_match(IntrID); }
+
+template<unsigned IntrID, typename T0>
+inline typename m_Intrinsic_Ty<T0>::Ty
+m_Intrinsic(const T0 &Op0) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(), m_Argument<0>(Op0));
+}
+
+template<unsigned IntrID, typename T0, typename T1>
+inline typename m_Intrinsic_Ty<T0, T1>::Ty
+m_Intrinsic(const T0 &Op0, const T1 &Op1) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(Op0), m_Argument<1>(Op1));
+}
+
+template<unsigned IntrID, typename T0, typename T1, typename T2>
+inline typename m_Intrinsic_Ty<T0, T1, T2>::Ty
+m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1), m_Argument<2>(Op2));
+}
+
+template<unsigned IntrID, typename T0, typename T1, typename T2, typename T3>
+inline typename m_Intrinsic_Ty<T0, T1, T2, T3>::Ty
+m_Intrinsic(const T0 &Op0, const T1 &Op1, const T2 &Op2, const T3 &Op3) {
+  return m_CombineAnd(m_Intrinsic<IntrID>(Op0, Op1, Op2), m_Argument<3>(Op3));
+}
+
+// Helper intrinsic matching
specializations
+template<typename Opnd0>
+inline typename m_Intrinsic_Ty<Opnd0>::Ty
+m_BSwap(const Opnd0 &Op0) {
+  return m_Intrinsic<Intrinsic::bswap>(Op0);
+}
+
 } // end namespace PatternMatch
 } // end namespace llvm
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index bc832e0c9e..aaee344755 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -95,6 +95,10 @@ public:
     return Buffers[i].Buffer;
   }
 
+  unsigned getNumBuffers() const {
+    return Buffers.size();
+  }
+
   SMLoc getParentIncludeLoc(unsigned i) const {
     assert(i < Buffers.size() && "Invalid Buffer ID!");
     return Buffers[i].IncludeLoc;
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
new file mode 100644
index 0000000000..4487eae248
--- /dev/null
+++ b/include/llvm/Support/YAMLTraits.h
@@ -0,0 +1,1111 @@
+//===- llvm/Support/YAMLTraits.h --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_YAML_TRAITS_H_
+#define LLVM_YAML_TRAITS_H_
+
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/YAMLParser.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/system_error.h"
+#include "llvm/Support/type_traits.h"
+
+
+namespace llvm {
+namespace yaml {
+
+
+/// This class should be specialized by any type that needs to be converted
+/// to/from a YAML mapping. For example:
+///
+///     struct MappingTraits<MyStruct> {
+///       static void mapping(IO &io, MyStruct &s) {
+///         io.mapRequired("name", s.name);
+///         io.mapRequired("size", s.size);
+///         io.mapOptional("age",  s.age);
+///       }
+///     };
+template<class T>
+struct MappingTraits {
+  // Must provide:
+  // static void mapping(IO &io, T &fields);
+};
+
+
+/// This class should be specialized by any integral type that converts
+/// to/from a YAML scalar where there is a one-to-one mapping between
+/// in-memory values and a string in YAML. For example:
+///
+///     struct ScalarEnumerationTraits<Colors> {
+///       static void enumeration(IO &io, Colors &value) {
+///         io.enumCase(value, "red",   cRed);
+///         io.enumCase(value, "blue",  cBlue);
+///         io.enumCase(value, "green", cGreen);
+///       }
+///     };
+template<typename T>
+struct ScalarEnumerationTraits {
+  // Must provide:
+  // static void enumeration(IO &io, T &value);
+};
+
+
+/// This class should be specialized by any integer type that is a union
+/// of bit values and the YAML representation is a flow sequence of
+/// strings. For example:
+///
+///     struct ScalarBitSetTraits<MyFlags> {
+///       static void bitset(IO &io, MyFlags &value) {
+///         io.bitSetCase(value, "big",   flagBig);
+///         io.bitSetCase(value, "flat",  flagFlat);
+///         io.bitSetCase(value, "round", flagRound);
+///       }
+///     };
+template<typename T>
+struct ScalarBitSetTraits {
+  // Must provide:
+  // static void bitset(IO &io, T &value);
+};
+
+
+/// This class should be specialized by any type that requires custom
+/// conversion to/from a YAML scalar.
For example:
+///
+///     template<>
+///     struct ScalarTraits<MyType> {
+///       static void output(const MyType &val, void*, llvm::raw_ostream &out) {
+///         // stream out custom formatting
+///         out << llvm::format("%x", val);
+///       }
+///       static StringRef input(StringRef scalar, void*, MyType &value) {
+///         // parse scalar and set `value`
+///         // return empty string on success, or error string
+///         return StringRef();
+///       }
+///     };
+template<typename T>
+struct ScalarTraits {
+  // Must provide:
+  //
+  // Function to write the value as a string:
+  //static void output(const T &value, void *ctxt, llvm::raw_ostream &out);
+  //
+  // Function to convert a string to a value. Returns the empty
+  // StringRef on success or an error string if the string is malformed:
+  //static StringRef input(StringRef scalar, void *ctxt, T &value);
+};
+
+
+/// This class should be specialized by any type that needs to be converted
+/// to/from a YAML sequence. For example:
+///
+///     template<>
+///     struct SequenceTraits< std::vector<MyType> > {
+///       static size_t size(IO &io, std::vector<MyType> &seq) {
+///         return seq.size();
+///       }
+///       static MyType& element(IO &, std::vector<MyType> &seq, size_t index) {
+///         if ( index >= seq.size() )
+///           seq.resize(index+1);
+///         return seq[index];
+///       }
+///     };
+template<typename T>
+struct SequenceTraits {
+  // Must provide:
+  // static size_t size(IO &io, T &seq);
+  // static T::value_type& element(IO &io, T &seq, size_t index);
+  //
+  // The following is optional and will cause generated YAML to use
+  // a flow sequence (e.g. [a,b,c]).
+  // static const bool flow = true;
+};
+
+
+/// This class should be specialized by any type that needs to be converted
+/// to/from a list of YAML documents.
+template<typename T>
+struct DocumentListTraits {
+  // Must provide:
+  // static size_t size(IO &io, T &seq);
+  // static T::value_type& element(IO &io, T &seq, size_t index);
+};
+
+
+// Only used by compiler if both template types are the same
+template <typename T, T>
+struct SameType;
+
+// Only used for better diagnostics of missing traits
+template <typename T>
+struct MissingTrait;
+
+
+
+// Test if ScalarEnumerationTraits<T> is defined on type T.
+template <class T>
+struct has_ScalarEnumerationTraits
+{
+  typedef void (*Signature_enumeration)(class IO&, T&);
+
+  template <typename U>
+  static char test(SameType<Signature_enumeration, &U::enumeration>*);
+
+  template <typename U>
+  static double test(...);
+
+public:
+  static bool const value = (sizeof(test<ScalarEnumerationTraits<T> >(0)) == 1);
+};
+
+
+// Test if ScalarBitSetTraits<T> is defined on type T.
+template <class T>
+struct has_ScalarBitSetTraits
+{
+  typedef void (*Signature_bitset)(class IO&, T&);
+
+  template <typename U>
+  static char test(SameType<Signature_bitset, &U::bitset>*);
+
+  template <typename U>
+  static double test(...);
+
+public:
+  static bool const value = (sizeof(test<ScalarBitSetTraits<T> >(0)) == 1);
+};
+
+
+// Test if ScalarTraits<T> is defined on type T.
+template <class T>
+struct has_ScalarTraits
+{
+  typedef llvm::StringRef (*Signature_input)(llvm::StringRef, void*, T&);
+  typedef void (*Signature_output)(const T&, void*, llvm::raw_ostream&);
+
+  template <typename U>
+  static char test(SameType<Signature_input, &U::input>*,
+                   SameType<Signature_output, &U::output>*);
+
+  template <typename U>
+  static double test(...);
+
+public:
+  static bool const value = (sizeof(test<ScalarTraits<T> >(0,0)) == 1);
+};
+
+
+// Test if MappingTraits<T> is defined on type T.
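+// (This detector, like the others in this file, uses the SFINAE sizeof
+// trick: the char-returning test() overload is viable only when U has a
+// member whose signature exactly matches the typedef, so the sizeof
+// comparison below is true exactly when the trait is usable.)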
+// Test if MappingTraits<T> is defined on type T.
+template <class T>
+struct has_MappingTraits
+{
+  typedef void (*Signature_mapping)(class IO&, T&);
+
+  template <typename U>
+  static char test(SameType<Signature_mapping, &U::mapping>*);
+
+  template <typename U>
+  static double test(...);
+
+public:
+  static bool const value = (sizeof(test<MappingTraits<T> >(0)) == 1);
+};
+
+
+// Test if SequenceTraits<T> is defined on type T.
+template <class T>
+struct has_SequenceMethodTraits
+{
+  typedef size_t (*Signature_size)(class IO&, T&);
+
+  template <typename U>
+  static char test(SameType<Signature_size, &U::size>*);
+
+  template <typename U>
+  static double test(...);
+
+public:
+  static bool const value = (sizeof(test<SequenceTraits<T> >(0)) == 1);
+};
+
+
+// has_FlowTraits<int> will cause an error with some compilers because
+// it subclasses int.  Using this wrapper only instantiates the
+// real has_FlowTraits if the template type is a class.
+template <typename T, bool Enabled = llvm::is_class<T>::value>
+class has_FlowTraits
+{
+public:
+  static const bool value = false;
+};
+
+// Some older gcc compilers don't support straightforward tests
+// for members, so test for ambiguity caused by the base and derived
+// classes both defining the member.
+template <class T>
+struct has_FlowTraits<T, true>
+{
+  struct Fallback { bool flow; };
+  struct Derived : T, Fallback { };
+
+  template<typename C>
+  static char (&f(SameType<bool Fallback::*, &C::flow>*))[1];
+
+  template<typename C>
+  static char (&f(...))[2];
+
+public:
+  static bool const value = sizeof(f<Derived>(0)) == 2;
+};
+
+
+
+// Test if SequenceTraits<T> is defined on type T
+// and SequenceTraits<T>::flow is *not* defined.
+template<typename T>
+struct has_SequenceTraits : public llvm::integral_constant<bool,
+                                      has_SequenceMethodTraits<T>::value
+                                      && !has_FlowTraits<T>::value > { };
+
+
+// Test if SequenceTraits<T> is defined on type T
+// and SequenceTraits<T>::flow is defined.
+template<typename T>
+struct has_FlowSequenceTraits : public llvm::integral_constant<bool,
+                                      has_SequenceMethodTraits<T>::value
+                                      && has_FlowTraits<T>::value > { };
+
+
+
+// Test if DocumentListTraits<T> is defined on type T
+template <class T>
+struct has_DocumentListTraits
+{
+  typedef size_t (*Signature_size)(class IO&, T&);
+
+  template <typename U>
+  static char test(SameType<Signature_size, &U::size>*);
+
+  template <typename U>
+  static double test(...);
+
+public:
+  static bool const value = (sizeof(test<DocumentListTraits<T> >(0)) == 1);
+};
+
+
+
+
+template<typename T>
+struct missingTraits : public llvm::integral_constant<bool,
+                                  !has_ScalarEnumerationTraits<T>::value
+                               && !has_ScalarBitSetTraits<T>::value
+                               && !has_ScalarTraits<T>::value
+                               && !has_MappingTraits<T>::value
+                               && !has_SequenceTraits<T>::value
+                               && !has_FlowSequenceTraits<T>::value
+                               && !has_DocumentListTraits<T>::value > {};
+
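
Before moving on to the IO machinery, a sketch of how the mapping trait is
typically written by a client. `Person` is an invented example type used in
the sketches through the rest of this header, not something the patch defines:

    struct Person {
      llvm::StringRef Name;
      uint32_t        Age;
    };

    namespace llvm {
    namespace yaml {
      template <>
      struct MappingTraits<Person> {
        static void mapping(IO &io, Person &p) {
          io.mapRequired("name", p.Name);               // error if key absent
          io.mapOptional("age",  p.Age, uint32_t(21));  // default when absent
        }
      };
    } // end namespace yaml
    } // end namespace llvm

+// Base class for Input and Output.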
+class IO { +public: + + IO(void *Ctxt=NULL); + virtual ~IO(); + + virtual bool outputting() = 0; + + virtual unsigned beginSequence() = 0; + virtual bool preflightElement(unsigned, void *&) = 0; + virtual void postflightElement(void*) = 0; + virtual void endSequence() = 0; + + virtual unsigned beginFlowSequence() = 0; + virtual bool preflightFlowElement(unsigned, void *&) = 0; + virtual void postflightFlowElement(void*) = 0; + virtual void endFlowSequence() = 0; + + virtual void beginMapping() = 0; + virtual void endMapping() = 0; + virtual bool preflightKey(const char*, bool, bool, bool &, void *&) = 0; + virtual void postflightKey(void*) = 0; + + virtual void beginEnumScalar() = 0; + virtual bool matchEnumScalar(const char*, bool) = 0; + virtual void endEnumScalar() = 0; + + virtual bool beginBitSetScalar(bool &) = 0; + virtual bool bitSetMatch(const char*, bool) = 0; + virtual void endBitSetScalar() = 0; + + virtual void scalarString(StringRef &) = 0; + + virtual void setError(const Twine &) = 0; + + template <typename T> + void enumCase(T &Val, const char* Str, const T ConstVal) { + if ( matchEnumScalar(Str, (Val == ConstVal)) ) { + Val = ConstVal; + } + } + + // allow anonymous enum values to be used with LLVM_YAML_STRONG_TYPEDEF + template <typename T> + void enumCase(T &Val, const char* Str, const uint32_t ConstVal) { + if ( matchEnumScalar(Str, (Val == static_cast<T>(ConstVal))) ) { + Val = ConstVal; + } + } + + template <typename T> + void bitSetCase(T &Val, const char* Str, const T ConstVal) { + if ( bitSetMatch(Str, ((Val & ConstVal) == ConstVal)) ) { + Val = Val | ConstVal; + } + } + + // allow anonymous enum values to be used with LLVM_YAML_STRONG_TYPEDEF + template <typename T> + void bitSetCase(T &Val, const char* Str, const uint32_t ConstVal) { + if ( bitSetMatch(Str, ((Val & ConstVal) == ConstVal)) ) { + Val = Val | ConstVal; + } + } + + void *getContext(); + void setContext(void *); + + template <typename T> + void mapRequired(const char* Key, T& Val) { + this->processKey(Key, Val, true); + } + + template <typename T> + typename llvm::enable_if_c<has_SequenceTraits<T>::value,void>::type + mapOptional(const char* Key, T& Val) { + // omit key/value instead of outputting empty sequence + if ( this->outputting() && !(Val.begin() != Val.end()) ) + return; + this->processKey(Key, Val, false); + } + + template <typename T> + typename llvm::enable_if_c<!has_SequenceTraits<T>::value,void>::type + mapOptional(const char* Key, T& Val) { + this->processKey(Key, Val, false); + } + + template <typename T> + void mapOptional(const char* Key, T& Val, const T& Default) { + this->processKeyWithDefault(Key, Val, Default, false); + } + + +private: + template <typename T> + void processKeyWithDefault(const char *Key, T &Val, const T& DefaultValue, + bool Required) { + void *SaveInfo; + bool UseDefault; + const bool sameAsDefault = (Val == DefaultValue); + if ( this->preflightKey(Key, Required, sameAsDefault, UseDefault, + SaveInfo) ) { + yamlize(*this, Val, Required); + this->postflightKey(SaveInfo); + } + else { + if ( UseDefault ) + Val = DefaultValue; + } + } + + template <typename T> + void processKey(const char *Key, T &Val, bool Required) { + void *SaveInfo; + bool UseDefault; + if ( this->preflightKey(Key, Required, false, UseDefault, SaveInfo) ) { + yamlize(*this, Val, Required); + this->postflightKey(SaveInfo); + } + } + +private: + void *Ctxt; +}; + + + +template<typename T> +typename llvm::enable_if_c<has_ScalarEnumerationTraits<T>::value,void>::type +yamlize(IO &io, T &Val, bool) 
{ + io.beginEnumScalar(); + ScalarEnumerationTraits<T>::enumeration(io, Val); + io.endEnumScalar(); +} + +template<typename T> +typename llvm::enable_if_c<has_ScalarBitSetTraits<T>::value,void>::type +yamlize(IO &io, T &Val, bool) { + bool DoClear; + if ( io.beginBitSetScalar(DoClear) ) { + if ( DoClear ) + Val = static_cast<T>(0); + ScalarBitSetTraits<T>::bitset(io, Val); + io.endBitSetScalar(); + } +} + + +template<typename T> +typename llvm::enable_if_c<has_ScalarTraits<T>::value,void>::type +yamlize(IO &io, T &Val, bool) { + if ( io.outputting() ) { + std::string Storage; + llvm::raw_string_ostream Buffer(Storage); + ScalarTraits<T>::output(Val, io.getContext(), Buffer); + StringRef Str = Buffer.str(); + io.scalarString(Str); + } + else { + StringRef Str; + io.scalarString(Str); + StringRef Result = ScalarTraits<T>::input(Str, io.getContext(), Val); + if ( !Result.empty() ) { + io.setError(llvm::Twine(Result)); + } + } +} + + +template<typename T> +typename llvm::enable_if_c<has_MappingTraits<T>::value, void>::type +yamlize(IO &io, T &Val, bool) { + io.beginMapping(); + MappingTraits<T>::mapping(io, Val); + io.endMapping(); +} + +template<typename T> +typename llvm::enable_if_c<missingTraits<T>::value, void>::type +yamlize(IO &io, T &Val, bool) { + char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)]; +} + +template<typename T> +typename llvm::enable_if_c<has_SequenceTraits<T>::value,void>::type +yamlize(IO &io, T &Seq, bool) { + unsigned incount = io.beginSequence(); + unsigned count = io.outputting() ? SequenceTraits<T>::size(io, Seq) : incount; + for(unsigned i=0; i < count; ++i) { + void *SaveInfo; + if ( io.preflightElement(i, SaveInfo) ) { + yamlize(io, SequenceTraits<T>::element(io, Seq, i), true); + io.postflightElement(SaveInfo); + } + } + io.endSequence(); +} + +template<typename T> +typename llvm::enable_if_c<has_FlowSequenceTraits<T>::value,void>::type +yamlize(IO &io, T &Seq, bool) { + unsigned incount = io.beginFlowSequence(); + unsigned count = io.outputting() ? 
SequenceTraits<T>::size(io, Seq) : incount; + for(unsigned i=0; i < count; ++i) { + void *SaveInfo; + if ( io.preflightFlowElement(i, SaveInfo) ) { + yamlize(io, SequenceTraits<T>::element(io, Seq, i), true); + io.postflightFlowElement(SaveInfo); + } + } + io.endFlowSequence(); +} + + + +template<> +struct ScalarTraits<bool> { + static void output(const bool &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, bool &); +}; + +template<> +struct ScalarTraits<StringRef> { + static void output(const StringRef &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, StringRef &); +}; + +template<> +struct ScalarTraits<uint8_t> { + static void output(const uint8_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, uint8_t &); +}; + +template<> +struct ScalarTraits<uint16_t> { + static void output(const uint16_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, uint16_t &); +}; + +template<> +struct ScalarTraits<uint32_t> { + static void output(const uint32_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, uint32_t &); +}; + +template<> +struct ScalarTraits<uint64_t> { + static void output(const uint64_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, uint64_t &); +}; + +template<> +struct ScalarTraits<int8_t> { + static void output(const int8_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, int8_t &); +}; + +template<> +struct ScalarTraits<int16_t> { + static void output(const int16_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, int16_t &); +}; + +template<> +struct ScalarTraits<int32_t> { + static void output(const int32_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, int32_t &); +}; + +template<> +struct ScalarTraits<int64_t> { + static void output(const int64_t &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, int64_t &); +}; + +template<> +struct ScalarTraits<float> { + static void output(const float &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, float &); +}; + +template<> +struct ScalarTraits<double> { + static void output(const double &, void*, llvm::raw_ostream &); + static llvm::StringRef input(llvm::StringRef , void*, double &); +}; + + + +// Utility for use within MappingTraits<>::mapping() method +// to [de]normalize an object for use with YAML conversion. +template <typename TNorm, typename TFinal> +struct MappingNormalization { + MappingNormalization(IO &i_o, TFinal &Obj) + : io(i_o), BufPtr(NULL), Result(Obj) { + if ( io.outputting() ) { + BufPtr = new (&Buffer) TNorm(io, Obj); + } + else { + BufPtr = new (&Buffer) TNorm(io); + } + } + + ~MappingNormalization() { + if ( ! io.outputting() ) { + Result = BufPtr->denormalize(io); + } + BufPtr->~TNorm(); + } + + TNorm* operator->() { return BufPtr; } + +private: + typedef llvm::AlignedCharArrayUnion<TNorm> Storage; + + Storage Buffer; + IO &io; + TNorm *BufPtr; + TFinal &Result; +}; + + + +// Utility for use within MappingTraits<>::mapping() method +// to [de]normalize an object for use with YAML conversion. 
+template <typename TNorm, typename TFinal>
+struct MappingNormalizationHeap {
+  MappingNormalizationHeap(IO &i_o, TFinal &Obj)
+    : io(i_o), BufPtr(NULL), Result(Obj) {
+    if ( io.outputting() ) {
+      BufPtr = new (&Buffer) TNorm(io, Obj);
+    }
+    else {
+      BufPtr = new TNorm(io);
+    }
+  }
+
+  ~MappingNormalizationHeap() {
+    if ( io.outputting() ) {
+      BufPtr->~TNorm();
+    }
+    else {
+      Result = BufPtr->denormalize(io);
+    }
+  }
+
+  TNorm* operator->() { return BufPtr; }
+
+private:
+  typedef llvm::AlignedCharArrayUnion<TNorm> Storage;
+
+  Storage       Buffer;
+  IO           &io;
+  TNorm        *BufPtr;
+  TFinal       &Result;
+};
+
+
+
+///
+/// The Input class is used to parse a YAML document into in-memory structs
+/// and vectors.
+///
+/// It works by using YAMLParser to do a syntax parse of the entire YAML
+/// document, then the Input class builds a graph of HNodes which wrap
+/// each YAML Node.  The extra layer is buffering.  The low-level YAML
+/// parser only lets you look at each node once.  The buffering layer lets
+/// you search and iterate multiple times.  This is necessary because
+/// the mapRequired() method calls may not be in the same order
+/// as the keys in the document.
+///
+class Input : public IO {
+public:
+  // Construct a YAML Input object from a StringRef and optional user-data.
+  Input(StringRef InputContent, void *Ctxt=NULL);
+
+  // Check if there was a syntax or semantic error during parsing.
+  llvm::error_code error();
+
+  // To set alternate error reporting.
+  void setDiagHandler(llvm::SourceMgr::DiagHandlerTy Handler, void *Ctxt = 0);
+
+private:
+  virtual bool outputting();
+  virtual void beginMapping();
+  virtual void endMapping();
+  virtual bool preflightKey(const char *, bool, bool, bool &, void *&);
+  virtual void postflightKey(void *);
+  virtual unsigned beginSequence();
+  virtual void endSequence();
+  virtual bool preflightElement(unsigned index, void *&);
+  virtual void postflightElement(void *);
+  virtual unsigned beginFlowSequence();
+  virtual bool preflightFlowElement(unsigned , void *&);
+  virtual void postflightFlowElement(void *);
+  virtual void endFlowSequence();
+  virtual void beginEnumScalar();
+  virtual bool matchEnumScalar(const char*, bool);
+  virtual void endEnumScalar();
+  virtual bool beginBitSetScalar(bool &);
+  virtual bool bitSetMatch(const char *, bool );
+  virtual void endBitSetScalar();
+  virtual void scalarString(StringRef &);
+  virtual void setError(const Twine &message);
+
+  class HNode {
+  public:
+    HNode(Node *n) : _node(n) { }
+    static inline bool classof(const HNode *) { return true; }
+
+    Node *_node;
+  };
+
+  class EmptyHNode : public HNode {
+  public:
+    EmptyHNode(Node *n) : HNode(n) { }
+    static inline bool classof(const HNode *n) {
+      return NullNode::classof(n->_node);
+    }
+    static inline bool classof(const EmptyHNode *) { return true; }
+  };
+
+  class ScalarHNode : public HNode {
+  public:
+    ScalarHNode(Node *n, StringRef s) : HNode(n), _value(s) { }
+
+    StringRef value() const { return _value; }
+
+    static inline bool classof(const HNode *n) {
+      return ScalarNode::classof(n->_node);
+    }
+    static inline bool classof(const ScalarHNode *) { return true; }
+  protected:
+    StringRef _value;
+  };
+
+  class MapHNode : public HNode {
+  public:
+    MapHNode(Node *n) : HNode(n) { }
+
+    static inline bool classof(const HNode *n) {
+      return MappingNode::classof(n->_node);
+    }
+    static inline bool classof(const MapHNode *) { return true; }
+
+    struct StrMappingInfo {
+      static StringRef getEmptyKey() { return StringRef(); }
+      static StringRef getTombstoneKey() { return StringRef(" ", 0); }
+      static unsigned getHashValue(StringRef const val) {
+        return llvm::HashString(val); }
+      static bool isEqual(StringRef const lhs,
+                          StringRef const rhs) { return lhs.equals(rhs); }
+    };
+    typedef llvm::DenseMap<StringRef, HNode*, StrMappingInfo> NameToNode;
+
+    bool isValidKey(StringRef key);
+
+    NameToNode                        Mapping;
+    llvm::SmallVector<const char*, 6> ValidKeys;
+  };
+
+  class SequenceHNode : public HNode {
+  public:
+    SequenceHNode(Node *n) : HNode(n) { }
+
+    static inline bool classof(const HNode *n) {
+      return SequenceNode::classof(n->_node);
+    }
+    static inline bool classof(const SequenceHNode *) { return true; }
+
+    std::vector<HNode*> Entries;
+  };
+
+  Input::HNode *createHNodes(Node *node);
+  void setError(HNode *hnode, const Twine &message);
+  void setError(Node *node, const Twine &message);
+
+
+public:
+  // These are only used by operator>>.  They could be private
+  // if those templated things could be made friends.
+  bool setCurrentDocument();
+  void nextDocument();
+
+private:
+  llvm::yaml::Stream              *Strm;
+  llvm::SourceMgr                  SrcMgr;
+  llvm::error_code                 EC;
+  llvm::BumpPtrAllocator           Allocator;
+  llvm::yaml::document_iterator    DocIterator;
+  std::vector<bool>                BitValuesUsed;
+  HNode                           *CurrentNode;
+  bool                             ScalarMatchFound;
+};
+
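
A hedged usage sketch for Input, reusing the invented `Person` mapping from
earlier (the YAML literal and the error handling are illustrative, not
mandated by the patch):

    void readPerson(llvm::StringRef Buf) {  // e.g. Buf = "name: Tom\nage: 30\n"
      Person P;
      llvm::yaml::Input yin(Buf);
      yin >> P;                             // drives MappingTraits<Person>
      if (yin.error())
        llvm::errs() << "malformed person document\n";
    }
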
+
+
+
+///
+/// The Output class is used to generate a YAML document from in-memory structs
+/// and vectors.
+///
+class Output : public IO {
+public:
+  Output(llvm::raw_ostream &, void *Ctxt=NULL);
+  virtual ~Output();
+
+  virtual bool outputting();
+  virtual void beginMapping();
+  virtual void endMapping();
+  virtual bool preflightKey(const char *key, bool, bool, bool &, void *&);
+  virtual void postflightKey(void *);
+  virtual unsigned beginSequence();
+  virtual void endSequence();
+  virtual bool preflightElement(unsigned, void *&);
+  virtual void postflightElement(void *);
+  virtual unsigned beginFlowSequence();
+  virtual bool preflightFlowElement(unsigned, void *&);
+  virtual void postflightFlowElement(void *);
+  virtual void endFlowSequence();
+  virtual void beginEnumScalar();
+  virtual bool matchEnumScalar(const char*, bool);
+  virtual void endEnumScalar();
+  virtual bool beginBitSetScalar(bool &);
+  virtual bool bitSetMatch(const char *, bool );
+  virtual void endBitSetScalar();
+  virtual void scalarString(StringRef &);
+  virtual void setError(const Twine &message);
+
+public:
+  // These are only used by operator<<.  They could be private
+  // if that templated operator could be made a friend.
+  void beginDocuments();
+  bool preflightDocument(unsigned);
+  void postflightDocument();
+  void endDocuments();
+
+private:
+  void output(StringRef s);
+  void outputUpToEndOfLine(StringRef s);
+  void newLineCheck();
+  void outputNewLine();
+  void paddedKey(StringRef key);
+
+  enum InState { inSeq, inFlowSeq, inMapFirstKey, inMapOtherKey };
+
+  llvm::raw_ostream       &Out;
+  SmallVector<InState, 8>  StateStack;
+  int                      Column;
+  int                      ColumnAtFlowStart;
+  bool                     NeedBitValueComma;
+  bool                     NeedFlowSequenceComma;
+  bool                     EnumerationMatchFound;
+  bool                     NeedsNewLine;
+};
+
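
And the mirror-image sketch for Output (again hedged; the exact emitted text
is an assumption based on the default document framing):

    void writePerson(llvm::raw_ostream &OS) {
      Person P;
      P.Name = "Tom";
      P.Age  = 30;
      llvm::yaml::Output yout(OS);
      yout << P;   // emits roughly: ---\nname: Tom\nage: 30\n...
    }
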
+
+
+
+/// YAML I/O does conversion based on types.  But often native data types
+/// are just typedefs of built-in integral types (e.g. int), and the C++
+/// type matching system sees through the typedef, so all the typedefed types
+/// look like a built-in type.  This will cause the generic YAML I/O conversion
+/// to be used.  To provide better control over the YAML conversion, you can
+/// use this macro instead of a typedef.  It will create a class with one field
+/// and automatic conversion operators to and from the base type.
+/// Based on BOOST_STRONG_TYPEDEF
+#define LLVM_YAML_STRONG_TYPEDEF(_base, _type)                                 \
+    struct _type {                                                             \
+        _type() { }                                                            \
+        _type(const _base v) : value(v) { }                                    \
+        _type(const _type &v) : value(v.value) {}                              \
+        _type &operator=(const _type &rhs) { value = rhs.value; return *this; }\
+        _type &operator=(const _base &rhs) { value = rhs; return *this; }      \
+        operator const _base & () const { return value; }                      \
+        bool operator==(const _type &rhs) const { return value == rhs.value; } \
+        bool operator==(const _base &rhs) const { return value == rhs; }       \
+        bool operator<(const _type &rhs) const { return value < rhs.value; }   \
+        _base value;                                                           \
+    };
+
+
+
+///
+/// Use these types instead of uintXX_t in any mapping to have
+/// its YAML output formatted as hexadecimal.
+///
+LLVM_YAML_STRONG_TYPEDEF(uint8_t, Hex8)
+LLVM_YAML_STRONG_TYPEDEF(uint16_t, Hex16)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, Hex32)
+LLVM_YAML_STRONG_TYPEDEF(uint64_t, Hex64)
+
+
+template<>
+struct ScalarTraits<Hex8> {
+  static void output(const Hex8 &, void*, llvm::raw_ostream &);
+  static llvm::StringRef input(llvm::StringRef , void*, Hex8 &);
+};
+
+template<>
+struct ScalarTraits<Hex16> {
+  static void output(const Hex16 &, void*, llvm::raw_ostream &);
+  static llvm::StringRef input(llvm::StringRef , void*, Hex16 &);
+};
+
+template<>
+struct ScalarTraits<Hex32> {
+  static void output(const Hex32 &, void*, llvm::raw_ostream &);
+  static llvm::StringRef input(llvm::StringRef , void*, Hex32 &);
+};
+
+template<>
+struct ScalarTraits<Hex64> {
+  static void output(const Hex64 &, void*, llvm::raw_ostream &);
+  static llvm::StringRef input(llvm::StringRef , void*, Hex64 &);
+};
+
+
+// Define non-member operator>> so that Input can stream in a document list.
+template <typename T>
+inline
+typename llvm::enable_if_c<has_DocumentListTraits<T>::value,Input &>::type
+operator>>(Input &yin, T &docList) {
+  int i = 0;
+  while ( yin.setCurrentDocument() ) {
+    yamlize(yin, DocumentListTraits<T>::element(yin, docList, i), true);
+    if ( yin.error() )
+      return yin;
+    yin.nextDocument();
+    ++i;
+  }
+  return yin;
+}
+
+// Define non-member operator>> so that Input can stream in a map as a document.
+template <typename T>
+inline
+typename llvm::enable_if_c<has_MappingTraits<T>::value,Input &>::type
+operator>>(Input &yin, T &docMap) {
+  yin.setCurrentDocument();
+  yamlize(yin, docMap, true);
+  return yin;
+}
+
+// Define non-member operator>> so that Input can stream in a sequence as
+// a document.
+template <typename T>
+inline
+typename llvm::enable_if_c<has_SequenceTraits<T>::value,Input &>::type
+operator>>(Input &yin, T &docSeq) {
+  yin.setCurrentDocument();
+  yamlize(yin, docSeq, true);
+  return yin;
+}
+
+// Provide a better error message about types missing a trait specialization.
+template <typename T>
+inline
+typename llvm::enable_if_c<missingTraits<T>::value,Input &>::type
+operator>>(Input &yin, T &docSeq) {
+  char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)];
+  return yin;
+}
+
+
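
A brief sketch of the strong-typedef escape hatch described above. `MyPid`
and its textual "pid-<number>" form are invented for this example; the point
is that the wrapper is a distinct C++ type, so it can carry its own
ScalarTraits without affecting ordinary uint32_t fields:

    LLVM_YAML_STRONG_TYPEDEF(uint32_t, MyPid)

    namespace llvm {
    namespace yaml {
      template <>
      struct ScalarTraits<MyPid> {
        static void output(const MyPid &V, void*, llvm::raw_ostream &Out) {
          Out << "pid-" << V.value;                 // custom textual form
        }
        static StringRef input(StringRef S, void*, MyPid &V) {
          if (!S.startswith("pid-") || S.substr(4).getAsInteger(10, V.value))
            return "expected pid-<number>";         // non-empty == parse error
          return StringRef();                       // empty == success
        }
      };
    } // end namespace yaml
    } // end namespace llvm
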
+// Define non-member operator<< so that Output can stream out a document list.
+template <typename T>
+inline
+typename llvm::enable_if_c<has_DocumentListTraits<T>::value,Output &>::type
+operator<<(Output &yout, T &docList) {
+  yout.beginDocuments();
+  const size_t count = DocumentListTraits<T>::size(yout, docList);
+  for(size_t i=0; i < count; ++i) {
+    if ( yout.preflightDocument(i) ) {
+      yamlize(yout, DocumentListTraits<T>::element(yout, docList, i), true);
+      yout.postflightDocument();
+    }
+  }
+  yout.endDocuments();
+  return yout;
+}
+
+// Define non-member operator<< so that Output can stream out a map.
+template <typename T>
+inline
+typename llvm::enable_if_c<has_MappingTraits<T>::value,Output &>::type
+operator<<(Output &yout, T &map) {
+  yout.beginDocuments();
+  if ( yout.preflightDocument(0) ) {
+    yamlize(yout, map, true);
+    yout.postflightDocument();
+  }
+  yout.endDocuments();
+  return yout;
+}
+
+// Define non-member operator<< so that Output can stream out a sequence.
+template <typename T>
+inline
+typename llvm::enable_if_c<has_SequenceTraits<T>::value,Output &>::type
+operator<<(Output &yout, T &seq) {
+  yout.beginDocuments();
+  if ( yout.preflightDocument(0) ) {
+    yamlize(yout, seq, true);
+    yout.postflightDocument();
+  }
+  yout.endDocuments();
+  return yout;
+}
+
+// Provide a better error message about types missing a trait specialization.
+template <typename T>
+inline
+typename llvm::enable_if_c<missingTraits<T>::value,Output &>::type
+operator<<(Output &yout, T &seq) {
+  char missing_yaml_trait_for_type[sizeof(MissingTrait<T>)];
+  return yout;
+}
+
+
+} // namespace yaml
+} // namespace llvm
+
+
+/// Utility for declaring that a std::vector of a particular type
+/// should be considered a YAML sequence.
+#define LLVM_YAML_IS_SEQUENCE_VECTOR(_type)                                 \
+  namespace llvm {                                                          \
+  namespace yaml {                                                          \
+    template<>                                                              \
+    struct SequenceTraits< std::vector<_type> > {                           \
+      static size_t size(IO &io, std::vector<_type> &seq) {                 \
+        return seq.size();                                                  \
+      }                                                                     \
+      static _type& element(IO &io, std::vector<_type> &seq, size_t index) {\
+        if ( index >= seq.size() )                                          \
+          seq.resize(index+1);                                              \
+        return seq[index];                                                  \
+      }                                                                     \
+    };                                                                      \
+  }                                                                         \
+  }
+
+/// Utility for declaring that a std::vector of a particular type
+/// should be considered a YAML flow sequence.
+#define LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(_type)                            \
+  namespace llvm {                                                          \
+  namespace yaml {                                                          \
+    template<>                                                              \
+    struct SequenceTraits< std::vector<_type> > {                           \
+      static size_t size(IO &io, std::vector<_type> &seq) {                 \
+        return seq.size();                                                  \
+      }                                                                     \
+      static _type& element(IO &io, std::vector<_type> &seq, size_t index) {\
+        if ( index >= seq.size() )                                          \
+          seq.resize(index+1);                                              \
+        return seq[index];                                                  \
+      }                                                                     \
+      static const bool flow = true;                                        \
+    };                                                                      \
+  }                                                                         \
+  }
+
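
The three LLVM_YAML_IS_*_VECTOR macros just stamp out the boilerplate
SequenceTraits/DocumentListTraits specializations shown earlier in this
header. A hedged one-liner usage, reusing the invented `Person` type:

    LLVM_YAML_IS_SEQUENCE_VECTOR(Person)

    void writeTeam(llvm::raw_ostream &OS, std::vector<Person> &Team) {
      llvm::yaml::Output yout(OS);
      yout << Team;   // a single document holding a sequence of mappings
    }

+/// Utility for declaring that a std::vector of a particular type
+/// should be considered a YAML document list.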
+#define LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(_type) \ + namespace llvm { \ + namespace yaml { \ + template<> \ + struct DocumentListTraits< std::vector<_type> > { \ + static size_t size(IO &io, std::vector<_type> &seq) { \ + return seq.size(); \ + } \ + static _type& element(IO &io, std::vector<_type> &seq, size_t index) {\ + if ( index >= seq.size() ) \ + seq.resize(index+1); \ + return seq[index]; \ + } \ + }; \ + } \ + } + + + +#endif // LLVM_YAML_TRAITS_H_ diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index d2e20107ef..debf296e3b 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -232,9 +232,8 @@ public: /// getRegClassFor - Return the register class that should be used for the /// specified value type. - virtual const TargetRegisterClass *getRegClassFor(EVT VT) const { - assert(VT.isSimple() && "getRegClassFor called on illegal type!"); - const TargetRegisterClass *RC = RegClassForVT[VT.getSimpleVT().SimpleTy]; + virtual const TargetRegisterClass *getRegClassFor(MVT VT) const { + const TargetRegisterClass *RC = RegClassForVT[VT.SimpleTy]; assert(RC && "This value type is not natively supported!"); return RC; } @@ -244,9 +243,8 @@ public: /// legal super-reg register class for the register class of the value type. /// For example, on i386 the rep register class for i8, i16, and i32 are GR32; /// while the rep register class is GR64 on x86_64. - virtual const TargetRegisterClass *getRepRegClassFor(EVT VT) const { - assert(VT.isSimple() && "getRepRegClassFor called on illegal type!"); - const TargetRegisterClass *RC = RepRegClassForVT[VT.getSimpleVT().SimpleTy]; + virtual const TargetRegisterClass *getRepRegClassFor(MVT VT) const { + const TargetRegisterClass *RC = RepRegClassForVT[VT.SimpleTy]; return RC; } @@ -280,8 +278,8 @@ public: return (LegalizeTypeAction)ValueTypeActions[VT.SimpleTy]; } - void setTypeAction(EVT VT, LegalizeTypeAction Action) { - unsigned I = VT.getSimpleVT().SimpleTy; + void setTypeAction(MVT VT, LegalizeTypeAction Action) { + unsigned I = VT.SimpleTy; ValueTypeActions[I] = Action; } }; @@ -371,16 +369,6 @@ public: return false; } - /// isIntImmLegal - Returns true if the target can instruction select the - /// specified integer immediate natively (that is, it's materialized with one - /// instruction). The current *assumption* in isel is all of integer - /// immediates are "legal" and only the memcpy / memset expansion code is - /// making use of this. The rest of isel doesn't have proper cost model for - /// immediate materialization. - virtual bool isIntImmLegal(const APInt &/*Imm*/, EVT /*VT*/) const { - return true; - } - /// isShuffleMaskLegal - Targets can use this to indicate that they only /// support *some* VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values @@ -444,17 +432,17 @@ public: /// either it is legal, needs to be promoted to a larger size, needs to be /// expanded to some other code sequence, or the target has a custom expander /// for it. 
- LegalizeAction getLoadExtAction(unsigned ExtType, EVT VT) const { - assert(ExtType < ISD::LAST_LOADEXT_TYPE && - VT.getSimpleVT() < MVT::LAST_VALUETYPE && + LegalizeAction getLoadExtAction(unsigned ExtType, MVT VT) const { + assert(ExtType < ISD::LAST_LOADEXT_TYPE && VT < MVT::LAST_VALUETYPE && "Table isn't big enough!"); - return (LegalizeAction)LoadExtActions[VT.getSimpleVT().SimpleTy][ExtType]; + return (LegalizeAction)LoadExtActions[VT.SimpleTy][ExtType]; } /// isLoadExtLegal - Return true if the specified load with extension is legal /// on this target. bool isLoadExtLegal(unsigned ExtType, EVT VT) const { - return VT.isSimple() && getLoadExtAction(ExtType, VT) == Legal; + return VT.isSimple() && + getLoadExtAction(ExtType, VT.getSimpleVT()) == Legal; } /// getTruncStoreAction - Return how this store with truncation should be @@ -588,7 +576,11 @@ public: } return EVT::getEVT(Ty, AllowUnknown); } - + + /// Return the MVT corresponding to this LLVM type. See getValueType. + MVT getSimpleValueType(Type *Ty, bool AllowUnknown = false) const { + return getValueType(Ty, AllowUnknown).getSimpleVT(); + } /// getByValTypeAlignment - Return the desired alignment for ByVal aggregate /// function arguments in the caller parameter area. This is the actual @@ -711,21 +703,31 @@ public: /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, - /// probably because the source does not need to be loaded. If - /// 'IsZeroVal' is true, that means it's safe to return a - /// non-scalar-integer type, e.g. empty string source, constant, or loaded - /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is - /// constant so it does not need to be loaded. + /// probably because the source does not need to be loaded. If 'IsMemset' is + /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that + /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy + /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. virtual EVT getOptimalMemOpType(uint64_t /*Size*/, unsigned /*DstAlign*/, unsigned /*SrcAlign*/, - bool /*IsZeroVal*/, + bool /*IsMemset*/, + bool /*ZeroMemset*/, bool /*MemcpyStrSrc*/, MachineFunction &/*MF*/) const { return MVT::Other; } + /// isSafeMemOpType - Returns true if it's safe to use load / store of the + /// specified type to expand memcpy / memset inline. This is mostly true + /// for all types except for some special cases. For example, on X86 + /// targets without SSE2 f64 load / store are done with fldl / fstpl which + /// also does type conversion. Note the specified type doesn't have to be + /// legal as the hook is used before type legalization. + virtual bool isSafeMemOpType(MVT VT) const { + return true; + } + /// usesUnderscoreSetJmp - Determine if we should use _setjmp or setjmp /// to implement llvm.setjmp. 
bool usesUnderscoreSetJmp() const { diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index a891626c37..5756f2c552 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -18,8 +18,6 @@ #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" #include "llvm/Target/TargetOptions.h" -#include "llvm/Target/TargetTransformImpl.h" -#include "llvm/TargetTransformInfo.h" #include <cassert> #include <string> @@ -43,6 +41,8 @@ class TargetPassConfig; class TargetRegisterInfo; class TargetSelectionDAGInfo; class TargetSubtargetInfo; +class ScalarTargetTransformInfo; +class VectorTargetTransformInfo; class formatted_raw_ostream; class raw_ostream; diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h index 519ccb9263..59fcf3b785 100644 --- a/include/llvm/TargetTransformInfo.h +++ b/include/llvm/TargetTransformInfo.h @@ -140,6 +140,13 @@ public: virtual PopcntHwSupport getPopcntHwSupport(unsigned IntTyWidthInBit) const { return None; } + + /// getIntImmCost - Return the expected cost of materializing the given + /// integer immediate of the specified type. + virtual unsigned getIntImmCost(const APInt&, Type*) const { + // Default assumption is immediate is cheap. + return 1; + } }; /// VectorTargetTransformInfo - This interface is used by the vectorizers diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h index fc1cd59e4e..e6eb8d38bb 100644 --- a/include/llvm/Transforms/IPO.h +++ b/include/llvm/Transforms/IPO.h @@ -15,7 +15,7 @@ #ifndef LLVM_TRANSFORMS_IPO_H #define LLVM_TRANSFORMS_IPO_H -#include <vector> +#include "llvm/ADT/ArrayRef.h" namespace llvm { @@ -109,7 +109,7 @@ Pass *createPruneEHPass(); /// /// Note that commandline options that are used with the above function are not /// used now! -ModulePass *createInternalizePass(const std::vector<const char *> &exportList); +ModulePass *createInternalizePass(ArrayRef<const char *> exportList); /// createInternalizePass - Same as above, but with an empty exportList. ModulePass *createInternalizePass(); diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 702628d7cd..0c3be289ed 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -252,6 +252,11 @@ bool LowerDbgDeclare(Function &F); /// an alloca, if any. DbgDeclareInst *FindAllocaDbgDeclare(Value *V); +/// replaceDbgDeclareForAlloca - Replaces llvm.dbg.declare instruction when +/// alloca is replaced with a new value. +bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder); + } // End llvm namespace #endif diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 00689475a4..0b88f45762 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -657,39 +657,6 @@ Value *llvm::SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, RecursionLimit); } -/// \brief Accumulate the constant integer offset a GEP represents. -/// -/// Given a getelementptr instruction/constantexpr, accumulate the constant -/// offset from the base pointer into the provided APInt 'Offset'. Returns true -/// if the GEP has all-constant indices. Returns false if any non-constant -/// index is encountered leaving the 'Offset' in an undefined state. The -/// 'Offset' APInt must be the bitwidth of the target's pointer size. 
-static bool accumulateGEPOffset(const DataLayout &TD, GEPOperator *GEP, - APInt &Offset) { - unsigned IntPtrWidth = TD.getPointerSizeInBits(); - assert(IntPtrWidth == Offset.getBitWidth()); - - gep_type_iterator GTI = gep_type_begin(GEP); - for (User::op_iterator I = GEP->op_begin() + 1, E = GEP->op_end(); I != E; - ++I, ++GTI) { - ConstantInt *OpC = dyn_cast<ConstantInt>(*I); - if (!OpC) return false; - if (OpC->isZero()) continue; - - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - unsigned ElementIdx = OpC->getZExtValue(); - const StructLayout *SL = TD.getStructLayout(STy); - Offset += APInt(IntPtrWidth, SL->getElementOffset(ElementIdx)); - continue; - } - - APInt TypeSize(IntPtrWidth, TD.getTypeAllocSize(GTI.getIndexedType())); - Offset += OpC->getValue().sextOrTrunc(IntPtrWidth) * TypeSize; - } - return true; -} - /// \brief Compute the base pointer and cumulative constant offsets for V. /// /// This strips all constant offsets off of V, leaving it the base pointer, and @@ -710,7 +677,7 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &TD, Visited.insert(V); do { if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) { - if (!GEP->isInBounds() || !accumulateGEPOffset(TD, GEP, Offset)) + if (!GEP->isInBounds() || !GEP->accumulateConstantOffset(TD, Offset)) break; V = GEP->getPointerOperand(); } else if (Operator::getOpcode(V) == Instruction::BitCast) { @@ -886,6 +853,85 @@ Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW, RecursionLimit); } +/// Given operands for an FAdd, see if we can fold the result. If not, this +/// returns null. +static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const Query &Q, unsigned MaxRecurse) { + if (Constant *CLHS = dyn_cast<Constant>(Op0)) { + if (Constant *CRHS = dyn_cast<Constant>(Op1)) { + Constant *Ops[] = { CLHS, CRHS }; + return ConstantFoldInstOperands(Instruction::FAdd, CLHS->getType(), + Ops, Q.TD, Q.TLI); + } + + // Canonicalize the constant to the RHS. + std::swap(Op0, Op1); + } + + // fadd X, -0 ==> X + if (match(Op1, m_NegZero())) + return Op0; + + // fadd X, 0 ==> X, when we know X is not -0 + if (match(Op1, m_Zero()) && + (FMF.noSignedZeros() || CannotBeNegativeZero(Op0))) + return Op0; + + // fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0 + // where nnan and ninf have to occur at least once somewhere in this + // expression + Value *SubOp = 0; + if (match(Op1, m_FSub(m_AnyZero(), m_Specific(Op0)))) + SubOp = Op1; + else if (match(Op0, m_FSub(m_AnyZero(), m_Specific(Op1)))) + SubOp = Op0; + if (SubOp) { + Instruction *FSub = cast<Instruction>(SubOp); + if ((FMF.noNaNs() || FSub->hasNoNaNs()) && + (FMF.noInfs() || FSub->hasNoInfs())) + return Constant::getNullValue(Op0->getType()); + } + + return 0; +} + +/// Given operands for an FSub, see if we can fold the result. If not, this +/// returns null. 
+static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const Query &Q, unsigned MaxRecurse) { + if (Constant *CLHS = dyn_cast<Constant>(Op0)) { + if (Constant *CRHS = dyn_cast<Constant>(Op1)) { + Constant *Ops[] = { CLHS, CRHS }; + return ConstantFoldInstOperands(Instruction::FSub, CLHS->getType(), + Ops, Q.TD, Q.TLI); + } + } + + // fsub X, 0 ==> X + if (match(Op1, m_Zero())) + return Op0; + + // fsub X, -0 ==> X, when we know X is not -0 + if (match(Op1, m_NegZero()) && + (FMF.noSignedZeros() || CannotBeNegativeZero(Op0))) + return Op0; + + // fsub 0, (fsub -0.0, X) ==> X + Value *X; + if (match(Op0, m_AnyZero())) { + if (match(Op1, m_FSub(m_NegZero(), m_Value(X)))) + return X; + if (FMF.noSignedZeros() && match(Op1, m_FSub(m_AnyZero(), m_Value(X)))) + return X; + } + + // fsub nnan ninf x, x ==> 0.0 + if (FMF.noNaNs() && FMF.noInfs() && Op0 == Op1) + return Constant::getNullValue(Op0->getType()); + + return 0; +} + /// Given the operands for an FMul, see if we can fold the result static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, @@ -897,19 +943,19 @@ static Value *SimplifyFMulInst(Value *Op0, Value *Op1, return ConstantFoldInstOperands(Instruction::FMul, CLHS->getType(), Ops, Q.TD, Q.TLI); } - } - // Check for some fast-math optimizations - if (FMF.noNaNs()) { - if (FMF.noSignedZeros()) { - // fmul N S 0, x ==> 0 - if (match(Op0, m_Zero())) - return Op0; - if (match(Op1, m_Zero())) - return Op1; - } + // Canonicalize the constant to the RHS. + std::swap(Op0, Op1); } + // fmul X, 1.0 ==> X + if (match(Op1, m_FPOne())) + return Op0; + + // fmul nnan nsz X, 0 ==> 0 + if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero())) + return Op1; + return 0; } @@ -978,6 +1024,18 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q, return 0; } +Value *llvm::SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const DataLayout *TD, const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return ::SimplifyFAddInst(Op0, Op1, FMF, Query (TD, TLI, DT), RecursionLimit); +} + +Value *llvm::SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF, + const DataLayout *TD, const TargetLibraryInfo *TLI, + const DominatorTree *DT) { + return ::SimplifyFSubInst(Op0, Op1, FMF, Query (TD, TLI, DT), RecursionLimit); +} + Value *llvm::SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF, const DataLayout *TD, @@ -1399,9 +1457,9 @@ static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q, // A & (-A) = A if A is a power of two or zero. 
if (match(Op0, m_Neg(m_Specific(Op1))) || match(Op1, m_Neg(m_Specific(Op0)))) { - if (isPowerOfTwo(Op0, Q.TD, /*OrZero*/true)) + if (isKnownToBeAPowerOfTwo(Op0, /*OrZero*/true)) return Op0; - if (isPowerOfTwo(Op1, Q.TD, /*OrZero*/true)) + if (isKnownToBeAPowerOfTwo(Op1, /*OrZero*/true)) return Op1; } @@ -2732,10 +2790,18 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS, case Instruction::Add: return SimplifyAddInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false, Q, MaxRecurse); + case Instruction::FAdd: + return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + case Instruction::Sub: return SimplifySubInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false, Q, MaxRecurse); + case Instruction::FSub: + return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse); + case Instruction::Mul: return SimplifyMulInst (LHS, RHS, Q, MaxRecurse); + case Instruction::FMul: + return SimplifyFMulInst (LHS, RHS, FastMathFlags(), Q, MaxRecurse); case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse); case Instruction::FDiv: return SimplifyFDivInst(LHS, RHS, Q, MaxRecurse); @@ -2822,12 +2888,20 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout *TD, default: Result = ConstantFoldInstruction(I, TD, TLI); break; + case Instruction::FAdd: + Result = SimplifyFAddInst(I->getOperand(0), I->getOperand(1), + I->getFastMathFlags(), TD, TLI, DT); + break; case Instruction::Add: Result = SimplifyAddInst(I->getOperand(0), I->getOperand(1), cast<BinaryOperator>(I)->hasNoSignedWrap(), cast<BinaryOperator>(I)->hasNoUnsignedWrap(), TD, TLI, DT); break; + case Instruction::FSub: + Result = SimplifyFSubInst(I->getOperand(0), I->getOperand(1), + I->getFastMathFlags(), TD, TLI, DT); + break; case Instruction::Sub: Result = SimplifySubInst(I->getOperand(0), I->getOperand(1), cast<BinaryOperator>(I)->hasNoSignedWrap(), diff --git a/lib/Analysis/PtrUseVisitor.cpp b/lib/Analysis/PtrUseVisitor.cpp index b48e86ea6e..0a342b2167 100644 --- a/lib/Analysis/PtrUseVisitor.cpp +++ b/lib/Analysis/PtrUseVisitor.cpp @@ -32,27 +32,5 @@ bool detail::PtrUseVisitorBase::adjustOffsetForGEP(GetElementPtrInst &GEPI) { if (!IsOffsetKnown) return false; - for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI); - GTI != GTE; ++GTI) { - ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand()); - if (!OpC) - return false; - if (OpC->isZero()) - continue; - - // Handle a struct index, which adds its field offset to the pointer. - if (StructType *STy = dyn_cast<StructType>(*GTI)) { - unsigned ElementIdx = OpC->getZExtValue(); - const StructLayout *SL = DL.getStructLayout(STy); - Offset += APInt(Offset.getBitWidth(), - SL->getElementOffset(ElementIdx)); - continue; - } - - // For array or vector indices, scale the index by the size of the type. 
- APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth()); - Offset += Index * APInt(Offset.getBitWidth(), - DL.getTypeAllocSize(GTI.getIndexedType())); - } - return true; + return GEPI.accumulateConstantOffset(DL, Offset); } diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 13313e753b..64e132e2e2 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -799,12 +799,11 @@ void llvm::ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, KnownZero = ZeroBits[BitWidth - 1]; } -/// isPowerOfTwo - Return true if the given value is known to have exactly one +/// isKnownToBeAPowerOfTwo - Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer /// types and vectors of integers. -bool llvm::isPowerOfTwo(Value *V, const DataLayout *TD, bool OrZero, - unsigned Depth) { +bool llvm::isKnownToBeAPowerOfTwo(Value *V, bool OrZero, unsigned Depth) { if (Constant *C = dyn_cast<Constant>(V)) { if (C->isNullValue()) return OrZero; @@ -831,19 +830,19 @@ bool llvm::isPowerOfTwo(Value *V, const DataLayout *TD, bool OrZero, // A shift of a power of two is a power of two or zero. if (OrZero && (match(V, m_Shl(m_Value(X), m_Value())) || match(V, m_Shr(m_Value(X), m_Value())))) - return isPowerOfTwo(X, TD, /*OrZero*/true, Depth); + return isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth); if (ZExtInst *ZI = dyn_cast<ZExtInst>(V)) - return isPowerOfTwo(ZI->getOperand(0), TD, OrZero, Depth); + return isKnownToBeAPowerOfTwo(ZI->getOperand(0), OrZero, Depth); if (SelectInst *SI = dyn_cast<SelectInst>(V)) - return isPowerOfTwo(SI->getTrueValue(), TD, OrZero, Depth) && - isPowerOfTwo(SI->getFalseValue(), TD, OrZero, Depth); + return isKnownToBeAPowerOfTwo(SI->getTrueValue(), OrZero, Depth) && + isKnownToBeAPowerOfTwo(SI->getFalseValue(), OrZero, Depth); if (OrZero && match(V, m_And(m_Value(X), m_Value(Y)))) { // A power of two and'd with anything is a power of two or zero. - if (isPowerOfTwo(X, TD, /*OrZero*/true, Depth) || - isPowerOfTwo(Y, TD, /*OrZero*/true, Depth)) + if (isKnownToBeAPowerOfTwo(X, /*OrZero*/true, Depth) || + isKnownToBeAPowerOfTwo(Y, /*OrZero*/true, Depth)) return true; // X & (-X) is always a power of two or zero. if (match(X, m_Neg(m_Specific(Y))) || match(Y, m_Neg(m_Specific(X)))) @@ -856,7 +855,7 @@ bool llvm::isPowerOfTwo(Value *V, const DataLayout *TD, bool OrZero, // copying a sign bit (sdiv int_min, 2). if (match(V, m_Exact(m_LShr(m_Value(), m_Value()))) || match(V, m_Exact(m_UDiv(m_Value(), m_Value())))) { - return isPowerOfTwo(cast<Operator>(V)->getOperand(0), TD, OrZero, Depth); + return isKnownToBeAPowerOfTwo(cast<Operator>(V)->getOperand(0), OrZero, Depth); } return false; @@ -954,7 +953,7 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { return true; } - unsigned BitWidth = getBitWidth(V->getType(), TD); + unsigned BitWidth = getBitWidth(V->getType()->getScalarType(), TD); // X | Y != 0 if X != 0 or Y != 0. Value *X = 0, *Y = 0; @@ -1028,9 +1027,9 @@ bool llvm::isKnownNonZero(Value *V, const DataLayout *TD, unsigned Depth) { } // The sum of a non-negative number and a power of two is not zero. 
- if (XKnownNonNegative && isPowerOfTwo(Y, TD, /*OrZero*/false, Depth)) + if (XKnownNonNegative && isKnownToBeAPowerOfTwo(Y, /*OrZero*/false, Depth)) return true; - if (YKnownNonNegative && isPowerOfTwo(X, TD, /*OrZero*/false, Depth)) + if (YKnownNonNegative && isKnownToBeAPowerOfTwo(X, /*OrZero*/false, Depth)) return true; } // X * Y. diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 68ed280c55..475b82557d 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -946,6 +946,8 @@ bool AsmPrinter::doFinalization(Module &M) { MMI = 0; OutStreamer.Finish(); + OutStreamer.reset(); + return false; } diff --git a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp index 2d7a5f32ae..aec21f1c2f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp @@ -122,8 +122,9 @@ void DwarfCFIException::BeginFunction(const MachineFunction *MF) { const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(Per, Asm->Mang, MMI); Asm->OutStreamer.EmitCFIPersonality(Sym, PerEncoding); - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("eh_func_begin", - Asm->getFunctionNumber())); + Asm->OutStreamer.EmitDebugLabel + (Asm->GetTempSymbol("eh_func_begin", + Asm->getFunctionNumber())); // Provide LSDA information. if (!shouldEmitLSDA) diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index f0ea8893ca..83dfa54da5 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1559,6 +1559,9 @@ DIE *CompileUnit::createMemberDIE(DIDerivedType DT) { MemberDie->addValue(dwarf::DW_AT_APPLE_property, dwarf::DW_FORM_ref4, PropertyDie); + if (DT.isArtificial()) + addFlag(MemberDie, dwarf::DW_AT_artificial); + // This is only for backward compatibility. StringRef PropertyName = DT.getObjCPropertyName(); if (!PropertyName.empty()) { diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index ad18024559..73502f2153 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -935,7 +935,7 @@ void DwarfDebug::endModule() { if (useDarwinGDBCompat()) emitDebugInlineInfo(); } else { - // TODO: Fill this in for Fission sections and separate + // TODO: Fill this in for separated debug sections and separate // out information into new sections. // Emit the debug info section and compile units. @@ -1814,16 +1814,22 @@ void DwarfDebug::emitDIE(DIE *Die) { } } -void DwarfDebug::emitCompileUnits(const MCSection *Section) { - Asm->OutStreamer.SwitchSection(Section); - for (DenseMap<const MDNode *, CompileUnit *>::iterator I = CUMap.begin(), - E = CUMap.end(); I != E; ++I) { - CompileUnit *TheCU = I->second; +// Emit the various dwarf units to the unit section USection with +// the abbreviations going into ASection. +void DwarfUnits::emitUnits(DwarfDebug *DD, + const MCSection *USection, + const MCSection *ASection, + const MCSymbol *ASectionSym) { + Asm->OutStreamer.SwitchSection(USection); + for (SmallVector<CompileUnit *, 1>::iterator I = CUs.begin(), + E = CUs.end(); I != E; ++I) { + CompileUnit *TheCU = *I; DIE *Die = TheCU->getCUDie(); // Emit the compile units header. 
- Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("info_begin", - TheCU->getUniqueID())); + Asm->OutStreamer + .EmitLabel(Asm->GetTempSymbol(USection->getLabelBeginName(), + TheCU->getUniqueID())); // Emit size of content not including length itself unsigned ContentSize = Die->getSize() + @@ -1836,23 +1842,24 @@ void DwarfDebug::emitCompileUnits(const MCSection *Section) { Asm->OutStreamer.AddComment("DWARF version number"); Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->OutStreamer.AddComment("Offset Into Abbrev. Section"); - Asm->EmitSectionOffset(Asm->GetTempSymbol("abbrev_begin"), - DwarfAbbrevSectionSym); + Asm->EmitSectionOffset(Asm->GetTempSymbol(ASection->getLabelBeginName()), + ASectionSym); Asm->OutStreamer.AddComment("Address Size (in bytes)"); Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); - emitDIE(Die); - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("info_end", + DD->emitDIE(Die); + Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(USection->getLabelEndName(), TheCU->getUniqueID())); } } // Emit the debug info section. void DwarfDebug::emitDebugInfo() { - if (!useSplitDwarf()) - emitCompileUnits(Asm->getObjFileLowering().getDwarfInfoSection()); - else - emitSkeletonCU(Asm->getObjFileLowering().getDwarfInfoSection()); + DwarfUnits &Holder = useSplitDwarf() ? SkeletonHolder : InfoHolder; + + Holder.emitUnits(this, Asm->getObjFileLowering().getDwarfInfoSection(), + Asm->getObjFileLowering().getDwarfAbbrevSection(), + DwarfAbbrevSectionSym); } // Emit the abbreviation section. @@ -1860,10 +1867,11 @@ void DwarfDebug::emitAbbreviations() { // Check to see if it is worth the effort. if (!Abbreviations.empty()) { // Start the debug abbrev section. - Asm->OutStreamer.SwitchSection( - Asm->getObjFileLowering().getDwarfAbbrevSection()); + const MCSection *ASec = Asm->getObjFileLowering().getDwarfAbbrevSection(); + Asm->OutStreamer.SwitchSection(ASec); - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("abbrev_begin")); + MCSymbol *Begin = Asm->GetTempSymbol(ASec->getLabelBeginName()); + Asm->OutStreamer.EmitLabel(Begin); // For each abbrevation. for (unsigned i = 0, N = Abbreviations.size(); i < N; ++i) { @@ -1880,7 +1888,8 @@ void DwarfDebug::emitAbbreviations() { // Mark end of abbreviations. Asm->EmitULEB128(0, "EOM(3)"); - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("abbrev_end")); + MCSymbol *End = Asm->GetTempSymbol(ASec->getLabelEndName()); + Asm->OutStreamer.EmitLabel(End); } } @@ -2046,14 +2055,15 @@ void DwarfDebug::emitDebugPubTypes() { Asm->EmitInt16(dwarf::DWARF_VERSION); Asm->OutStreamer.AddComment("Offset of Compilation Unit Info"); - Asm->EmitSectionOffset(Asm->GetTempSymbol("info_begin", + const MCSection *ISec = Asm->getObjFileLowering().getDwarfInfoSection(); + Asm->EmitSectionOffset(Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()), DwarfInfoSectionSym); Asm->OutStreamer.AddComment("Compilation Unit Length"); - Asm->EmitLabelDifference(Asm->GetTempSymbol("info_end", + Asm->EmitLabelDifference(Asm->GetTempSymbol(ISec->getLabelEndName(), TheCU->getUniqueID()), - Asm->GetTempSymbol("info_begin", + Asm->GetTempSymbol(ISec->getLabelBeginName(), TheCU->getUniqueID()), 4); @@ -2315,7 +2325,7 @@ void DwarfDebug::emitDebugInlineInfo() { Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("debug_inlined_end", 1)); } -// DWARF5 Experimental Fission emitters. +// DWARF5 Experimental Separate Dwarf emitters. 
// This DIE has the following attributes: DW_AT_comp_dir, DW_AT_stmt_list, // DW_AT_low_pc, DW_AT_high_pc, DW_AT_ranges, DW_AT_dwo_name, DW_AT_dwo_id, @@ -2358,7 +2368,7 @@ void DwarfDebug::emitSkeletonCU(const MCSection *Section) { DIE *Die = SkeletonCU->getCUDie(); // Emit the compile units header. - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("skel_info_begin", + Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(Section->getLabelBeginName(), SkeletonCU->getUniqueID())); // Emit size of content not including length itself @@ -2378,15 +2388,18 @@ void DwarfDebug::emitSkeletonCU(const MCSection *Section) { Asm->EmitInt8(Asm->getDataLayout().getPointerSize()); emitDIE(Die); - Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol("skel_info_end", + Asm->OutStreamer.EmitLabel(Asm->GetTempSymbol(Section->getLabelEndName(), SkeletonCU->getUniqueID())); } -// Emit the .debug_info.dwo section for fission. This contains the compile -// units that would normally be in debug_info. +// Emit the .debug_info.dwo section for separated dwarf. This contains the +// compile units that would normally be in debug_info. void DwarfDebug::emitDebugInfoDWO() { assert(useSplitDwarf() && "No split dwarf debug info?"); - emitCompileUnits(Asm->getObjFileLowering().getDwarfInfoDWOSection()); + // FIXME for Abbrev DWO. + InfoHolder.emitUnits(this, Asm->getObjFileLowering().getDwarfInfoDWOSection(), + Asm->getObjFileLowering().getDwarfAbbrevSection(), + DwarfAbbrevSectionSym); } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 9ddefee8e7..63a3214c17 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -41,6 +41,7 @@ class DIEAbbrev; class DIE; class DIEBlock; class DIEEntry; +class DwarfDebug; //===----------------------------------------------------------------------===// /// \brief This class is used to record source line correspondence. @@ -219,6 +220,11 @@ public: /// \brief Add a unit to the list of CUs. void addUnit(CompileUnit *CU) { CUs.push_back(CU); } + + /// \brief Emit all of the units to the section listed with the given + /// abbreviation section. + void emitUnits(DwarfDebug *, const MCSection *, const MCSection *, + const MCSymbol *); }; /// \brief Collects and handles dwarf debug information. @@ -358,12 +364,12 @@ class DwarfDebug { bool HasDwarfAccelTables; bool HasSplitDwarf; - // Fission Variables + // Separated Dwarf Variables // In general these will all be for bits that are left in the // original object file, rather than things that are meant // to be in the .dwo sections. - // The CU left in the original object file for Fission debug info. + // The CU left in the original object file for separated debug info. CompileUnit *SkeletonCU; DwarfUnits SkeletonHolder; @@ -394,9 +400,6 @@ private: /// \brief Emit initial Dwarf sections with a label at the start of each one. void emitSectionLabels(); - /// \brief Recursively Emits a debug information entry. - void emitDIE(DIE *Die); - /// \brief Compute the size and offset of a DIE given an incoming Offset. unsigned computeSizeAndOffset(DIE *Die, unsigned Offset); @@ -417,9 +420,6 @@ private: /// open. void endSections(); - /// \brief Emit all of the compile units to the target section. - void emitCompileUnits(const MCSection *); - /// \brief Emit the debug info section. void emitDebugInfo(); @@ -569,6 +569,9 @@ public: /// string text. MCSymbol *getStringPoolEntry(StringRef Str); + /// \brief Recursively Emits a debug information entry. 
+ void emitDIE(DIE *Die); + /// \brief Returns whether or not to limit some of our debug /// output to the limitations of darwin gdb. bool useDarwinGDBCompat() { return IsDarwinGDBCompat; } diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 48008537e7..b4ff6b4fcf 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -788,27 +788,30 @@ MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ, Pass *P) { return NMBB; } -MachineBasicBlock::iterator -MachineBasicBlock::erase(MachineBasicBlock::iterator I) { - if (I->isBundle()) { - MachineBasicBlock::iterator E = llvm::next(I); - return Insts.erase(I.getInstrIterator(), E.getInstrIterator()); - } - - return Insts.erase(I.getInstrIterator()); +/// Prepare MI to be removed from its bundle. This fixes bundle flags on MI's +/// neighboring instructions so the bundle won't be broken by removing MI. +static void unbundleSingleMI(MachineInstr *MI) { + // Removing the first instruction in a bundle. + if (MI->isBundledWithSucc() && !MI->isBundledWithPred()) + MI->unbundleFromSucc(); + // Removing the last instruction in a bundle. + if (MI->isBundledWithPred() && !MI->isBundledWithSucc()) + MI->unbundleFromPred(); + // If MI is not bundled, or if it is internal to a bundle, the neighbor flags + // are already fine. } -MachineInstr *MachineBasicBlock::remove(MachineInstr *I) { - if (I->isBundle()) { - instr_iterator MII = llvm::next(I); - iterator E = end(); - while (MII != E && MII->isInsideBundle()) { - MachineInstr *MI = &*MII++; - Insts.remove(MI); - } - } +MachineBasicBlock::instr_iterator +MachineBasicBlock::erase(MachineBasicBlock::instr_iterator I) { + unbundleSingleMI(I); + return Insts.erase(I); +} - return Insts.remove(I); +MachineInstr *MachineBasicBlock::remove_instr(MachineInstr *MI) { + unbundleSingleMI(MI); + MI->clearFlag(MachineInstr::BundledPred); + MI->clearFlag(MachineInstr::BundledSucc); + return Insts.remove(MI); } void MachineBasicBlock::splice(MachineBasicBlock::iterator where, diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp index 3aebdcdabb..f545a9ce56 100644 --- a/lib/CodeGen/MachineInstr.cpp +++ b/lib/CodeGen/MachineInstr.cpp @@ -838,46 +838,25 @@ bool MachineInstr::isIdenticalTo(const MachineInstr *Other, return true; } -/// removeFromParent - This method unlinks 'this' from the containing basic -/// block, and returns it, but does not delete it. MachineInstr *MachineInstr::removeFromParent() { assert(getParent() && "Not embedded in a basic block!"); - - // If it's a bundle then remove the MIs inside the bundle as well. - if (isBundle()) { - MachineBasicBlock *MBB = getParent(); - MachineBasicBlock::instr_iterator MII = *this; ++MII; - MachineBasicBlock::instr_iterator E = MBB->instr_end(); - while (MII != E && MII->isInsideBundle()) { - MachineInstr *MI = &*MII; - ++MII; - MBB->remove(MI); - } - } - getParent()->remove(this); - return this; + return getParent()->remove(this); } +MachineInstr *MachineInstr::removeFromBundle() { + assert(getParent() && "Not embedded in a basic block!"); + return getParent()->remove_instr(this); +} -/// eraseFromParent - This method unlinks 'this' from the containing basic -/// block, and deletes it. void MachineInstr::eraseFromParent() { assert(getParent() && "Not embedded in a basic block!"); - // If it's a bundle then remove the MIs inside the bundle as well. 
- if (isBundle()) { - MachineBasicBlock *MBB = getParent(); - MachineBasicBlock::instr_iterator MII = *this; ++MII; - MachineBasicBlock::instr_iterator E = MBB->instr_end(); - while (MII != E && MII->isInsideBundle()) { - MachineInstr *MI = &*MII; - ++MII; - MBB->erase(MI); - } - } - // Erase the individual instruction, which may itself be inside a bundle. - getParent()->erase_instr(this); + getParent()->erase(this); } +void MachineInstr::eraseFromBundle() { + assert(getParent() && "Not embedded in a basic block!"); + getParent()->erase_instr(this); +} /// getNumExplicitOperands - Returns the number of non-implicit operands. /// diff --git a/lib/CodeGen/MachineInstrBundle.cpp b/lib/CodeGen/MachineInstrBundle.cpp index dd46ecb17d..ffca550f8b 100644 --- a/lib/CodeGen/MachineInstrBundle.cpp +++ b/lib/CodeGen/MachineInstrBundle.cpp @@ -47,8 +47,8 @@ bool UnpackMachineBundles::runOnMachineFunction(MachineFunction &MF) { // Remove BUNDLE instruction and the InsideBundle flags from bundled // instructions. if (MI->isBundle()) { - while (++MII != MIE && MII->isInsideBundle()) { - MII->setIsInsideBundle(false); + while (++MII != MIE && MII->isBundledWithPred()) { + MII->unbundleFromPred(); for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) { MachineOperand &MO = MII->getOperand(i); if (MO.isReg() && MO.isInternalRead()) @@ -101,13 +101,15 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator FirstMI, MachineBasicBlock::instr_iterator LastMI) { assert(FirstMI != LastMI && "Empty bundle?"); + MIBundleBuilder Bundle(MBB, FirstMI, LastMI); const TargetMachine &TM = MBB.getParent()->getTarget(); const TargetInstrInfo *TII = TM.getInstrInfo(); const TargetRegisterInfo *TRI = TM.getRegisterInfo(); - MachineInstrBuilder MIB = BuildMI(MBB, FirstMI, FirstMI->getDebugLoc(), + MachineInstrBuilder MIB = BuildMI(*MBB.getParent(), FirstMI->getDebugLoc(), TII->get(TargetOpcode::BUNDLE)); + Bundle.prepend(MIB); SmallVector<unsigned, 32> LocalDefs; SmallSet<unsigned, 32> LocalDefSet; @@ -177,7 +179,6 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB, } } - FirstMI->setIsInsideBundle(); Defs.clear(); } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index 8c2b4e6aa2..760cf8a516 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -780,7 +780,7 @@ MachineLICM::getRegisterClassIDAndCost(const MachineInstr *MI, unsigned Reg, unsigned OpIdx, unsigned &RCId, unsigned &RCCost) const { const TargetRegisterClass *RC = MRI->getRegClass(Reg); - EVT VT = *RC->vt_begin(); + MVT VT = *RC->vt_begin(); if (VT == MVT::Untyped) { RCId = RC->getID(); RCCost = 1; diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp index ad88c51118..26650468c4 100644 --- a/lib/CodeGen/MachineModuleInfo.cpp +++ b/lib/CodeGen/MachineModuleInfo.cpp @@ -270,8 +270,6 @@ MachineModuleInfo::~MachineModuleInfo() { } bool MachineModuleInfo::doInitialization(Module &M) { - - Context.doInitialization(); ObjFileMMI = 0; CompactUnwindEncoding = 0; @@ -294,7 +292,7 @@ bool MachineModuleInfo::doFinalization(Module &M) { delete AddrLabelSymbols; AddrLabelSymbols = 0; - Context.doFinalization(); + Context.reset(); return false; } diff --git a/lib/CodeGen/PeepholeOptimizer.cpp b/lib/CodeGen/PeepholeOptimizer.cpp index cc07d47150..a7439b5129 100644 --- a/lib/CodeGen/PeepholeOptimizer.cpp +++ b/lib/CodeGen/PeepholeOptimizer.cpp @@ -49,6 +49,11 @@ // v1 = bitcast v0 // = v0 // +// - Optimize Loads: +// +// Loads that can be folded into a later 
instruction. A load is foldable + if it loads into a virtual register and that virtual register has + a single use. //===----------------------------------------------------------------------===// #define DEBUG_TYPE "peephole-opt" @@ -61,6 +66,7 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" using namespace llvm; @@ -473,6 +479,9 @@ bool PeepholeOptimizer::foldImmediate(MachineInstr *MI, MachineBasicBlock *MBB, } bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { + DEBUG(dbgs() << "********** PEEPHOLE OPTIMIZER **********\n"); + DEBUG(dbgs() << "********** Function: " << MF.getName() << '\n'); + if (DisablePeephole) return false; @@ -547,6 +556,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) { FoldAsLoadDefReg, DefMI); if (FoldMI) { // Update LocalMIs since we replaced MI with FoldMI and deleted DefMI. + DEBUG(dbgs() << "Replacing: " << *MI); + DEBUG(dbgs() << " With: " << *FoldMI); LocalMIs.erase(MI); LocalMIs.erase(DefMI); LocalMIs.insert(FoldMI); diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1c28d6dcaf..7c54d17275 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2984,7 +2984,8 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) { SDValue N00 = N0.getOperand(0); SDValue N01 = N0.getOperand(1); - if (N1.getOpcode() == ISD::OR) { + if (N1.getOpcode() == ISD::OR && + N00.getNumOperands() == 2 && N01.getNumOperands() == 2) { // (or (or (and), (and)), (or (and), (and))) SDValue N000 = N00.getOperand(0); if (!isBSwapHWordElement(N000, Parts)) @@ -7433,7 +7434,8 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) { // start at the previous one. if (ShAmt % NewBW) ShAmt = (((ShAmt + NewBW - 1) / NewBW) * NewBW) - NewBW; - APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, ShAmt + NewBW); + APInt Mask = APInt::getBitsSet(BitWidth, ShAmt, + std::min(BitWidth, ShAmt + NewBW)); if ((Imm & Mask) == Imm) { APInt NewImm = (Imm & Mask).lshr(ShAmt).trunc(NewBW); if (Opc == ISD::AND) diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index e900c6b603..b7042532e8 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -737,15 +737,15 @@ bool FastISel::SelectBitCast(const User *I) { } // Bitcasts of other values become reg-reg copies or BITCAST operators. - EVT SrcVT = TLI.getValueType(I->getOperand(0)->getType()); - EVT DstVT = TLI.getValueType(I->getType()); - - if (SrcVT == MVT::Other || !SrcVT.isSimple() || - DstVT == MVT::Other || !DstVT.isSimple() || - !TLI.isTypeLegal(SrcVT) || !TLI.isTypeLegal(DstVT)) + EVT SrcEVT = TLI.getValueType(I->getOperand(0)->getType()); + EVT DstEVT = TLI.getValueType(I->getType()); + if (SrcEVT == MVT::Other || DstEVT == MVT::Other || + !TLI.isTypeLegal(SrcEVT) || !TLI.isTypeLegal(DstEVT)) // Unhandled type. Halt "fast" selection and bail. return false; + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DstVT = DstEVT.getSimpleVT(); unsigned Op0 = getRegForValue(I->getOperand(0)); if (Op0 == 0) // Unhandled operand. Halt "fast" selection and bail. @@ -755,7 +755,7 @@ bool FastISel::SelectBitCast(const User *I) { // First, try to perform the bitcast by inserting a reg-reg copy.
unsigned ResultReg = 0; - if (SrcVT.getSimpleVT() == DstVT.getSimpleVT()) { + if (SrcVT == DstVT) { const TargetRegisterClass* SrcClass = TLI.getRegClassFor(SrcVT); const TargetRegisterClass* DstClass = TLI.getRegClassFor(DstVT); // Don't attempt a cross-class copy. It will likely fail. @@ -768,8 +768,7 @@ bool FastISel::SelectBitCast(const User *I) { // If the reg-reg copy failed, select a BITCAST opcode. if (!ResultReg) - ResultReg = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), - ISD::BITCAST, Op0, Op0IsKill); + ResultReg = FastEmit_r(SrcVT, DstVT, ISD::BITCAST, Op0, Op0IsKill); if (!ResultReg) return false; diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index 8799effe20..05fd7af9fd 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -208,7 +208,7 @@ void FunctionLoweringInfo::clear() { } /// CreateReg - Allocate a single virtual register for the given type. -unsigned FunctionLoweringInfo::CreateReg(EVT VT) { +unsigned FunctionLoweringInfo::CreateReg(MVT VT) { return RegInfo->createVirtualRegister(TLI.getRegClassFor(VT)); } @@ -226,7 +226,7 @@ unsigned FunctionLoweringInfo::CreateRegs(Type *Ty) { unsigned FirstReg = 0; for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) { EVT ValueVT = ValueVTs[Value]; - EVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT); + MVT RegisterVT = TLI.getRegisterType(Ty->getContext(), ValueVT).getSimpleVT(); unsigned NumRegs = TLI.getNumRegisters(Ty->getContext(), ValueVT); for (unsigned i = 0; i != NumRegs; ++i) { diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp index ae10609db1..940cb2f3c2 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp @@ -99,7 +99,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, // the CopyToReg'd destination register instead of creating a new vreg. bool MatchReg = true; const TargetRegisterClass *UseRC = NULL; - EVT VT = Node->getValueType(ResNo); + MVT VT = Node->getSimpleValueType(ResNo); // Stick to the preferred register classes for legal types. if (TLI->isTypeLegal(VT)) @@ -124,7 +124,7 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned, SDValue Op = User->getOperand(i); if (Op.getNode() != Node || Op.getResNo() != ResNo) continue; - EVT VT = Node->getValueType(Op.getResNo()); + MVT VT = Node->getSimpleValueType(Op.getResNo()); if (VT == MVT::Other || VT == MVT::Glue) continue; Match = false; @@ -272,7 +272,8 @@ unsigned InstrEmitter::getVR(SDValue Op, // IMPLICIT_DEF can produce any type of result so its MCInstrDesc // does not include operand register class info. if (!VReg) { - const TargetRegisterClass *RC = TLI->getRegClassFor(Op.getValueType()); + const TargetRegisterClass *RC = + TLI->getRegClassFor(Op.getSimpleValueType()); VReg = MRI->createVirtualRegister(RC); } BuildMI(*MBB, InsertPos, Op.getDebugLoc(), @@ -426,7 +427,7 @@ void InstrEmitter::AddOperand(MachineInstr *MI, SDValue Op, } unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx, - EVT VT, DebugLoc DL) { + MVT VT, DebugLoc DL) { const TargetRegisterClass *VRC = MRI->getRegClass(VReg); const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx); @@ -477,7 +478,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, // constraints on the %dst register, COPY can target all legal register // classes. 
unsigned SubIdx = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue(); - const TargetRegisterClass *TRC = TLI->getRegClassFor(Node->getValueType(0)); + const TargetRegisterClass *TRC = + TLI->getRegClassFor(Node->getSimpleValueType(0)); unsigned VReg = getVR(Node->getOperand(0), VRBaseMap); MachineInstr *DefMI = MRI->getVRegDef(VReg); @@ -500,7 +502,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, // constrain its register class or issue a COPY to a compatible register // class. VReg = ConstrainForSubReg(VReg, SubIdx, - Node->getOperand(0).getValueType(), + Node->getOperand(0).getSimpleValueType(), Node->getDebugLoc()); // Create the destreg if it is missing. @@ -532,7 +534,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node, // // There is no constraint on the %src register class. // - const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getValueType(0)); + const TargetRegisterClass *SRC = TLI->getRegClassFor(Node->getSimpleValueType(0)); SRC = TRI->getSubClassWithSubReg(SRC, SubIdx); assert(SRC && "No register class supports VT and SubIdx for INSERT_SUBREG"); diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.h b/lib/CodeGen/SelectionDAG/InstrEmitter.h index 9bfb51db8c..8168bd96ae 100644 --- a/lib/CodeGen/SelectionDAG/InstrEmitter.h +++ b/lib/CodeGen/SelectionDAG/InstrEmitter.h @@ -81,7 +81,7 @@ class InstrEmitter { /// supports SubIdx sub-registers. Emit a copy if that isn't possible. /// Return the virtual register to use. unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx, - EVT VT, DebugLoc DL); + MVT VT, DebugLoc DL); /// EmitSubregNode - Generate machine code for subreg nodes. /// diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 2c249fcaf9..c7eef8cf9e 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1037,7 +1037,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) { Chain = Ch; } else { bool isCustom = false; - switch (TLI.getLoadExtAction(ExtType, SrcVT)) { + switch (TLI.getLoadExtAction(ExtType, SrcVT.getSimpleVT())) { default: llvm_unreachable("This action is not supported yet!"); case TargetLowering::Custom: isCustom = true; diff --git a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp index a9b6a2eca8..473e1384e3 100644 --- a/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp +++ b/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp @@ -94,9 +94,9 @@ ResourcePriorityQueue::numberRCValPredInSU(SUnit *SU, unsigned RCId) { continue; for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) { - EVT VT = ScegN->getValueType(i); + MVT VT = ScegN->getSimpleValueType(i); if (TLI->isTypeLegal(VT) - && (TLI->getRegClassFor(VT)->getID() == RCId)) { + && (TLI->getRegClassFor(VT)->getID() == RCId)) { NumberDeps++; break; } @@ -132,9 +132,9 @@ unsigned ResourcePriorityQueue::numberRCValSuccInSU(SUnit *SU, for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) { const SDValue &Op = ScegN->getOperand(i); - EVT VT = Op.getNode()->getValueType(Op.getResNo()); + MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo()); if (TLI->isTypeLegal(VT) - && (TLI->getRegClassFor(VT)->getID() == RCId)) { + && (TLI->getRegClassFor(VT)->getID() == RCId)) { NumberDeps++; break; } @@ -332,7 +332,7 @@ signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) { // Gen estimate. 
for (unsigned i = 0, e = SU->getNode()->getNumValues(); i != e; ++i) { - EVT VT = SU->getNode()->getValueType(i); + MVT VT = SU->getNode()->getSimpleValueType(i); if (TLI->isTypeLegal(VT) && TLI->getRegClassFor(VT) && TLI->getRegClassFor(VT)->getID() == RCId) @@ -341,7 +341,7 @@ signed ResourcePriorityQueue::rawRegPressureDelta(SUnit *SU, unsigned RCId) { // Kill estimate. for (unsigned i = 0, e = SU->getNode()->getNumOperands(); i != e; ++i) { const SDValue &Op = SU->getNode()->getOperand(i); - EVT VT = Op.getNode()->getValueType(Op.getResNo()); + MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo()); if (isa<ConstantSDNode>(Op.getNode())) continue; @@ -485,7 +485,7 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) { if (ScegN->isMachineOpcode()) { // Estimate generated regs. for (unsigned i = 0, e = ScegN->getNumValues(); i != e; ++i) { - EVT VT = ScegN->getValueType(i); + MVT VT = ScegN->getSimpleValueType(i); if (TLI->isTypeLegal(VT)) { const TargetRegisterClass *RC = TLI->getRegClassFor(VT); @@ -496,7 +496,7 @@ void ResourcePriorityQueue::scheduledNode(SUnit *SU) { // Estimate killed regs. for (unsigned i = 0, e = ScegN->getNumOperands(); i != e; ++i) { const SDValue &Op = ScegN->getOperand(i); - EVT VT = Op.getNode()->getValueType(Op.getResNo()); + MVT VT = Op.getNode()->getSimpleValueType(Op.getResNo()); if (TLI->isTypeLegal(VT)) { const TargetRegisterClass *RC = TLI->getRegClassFor(VT); diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index bab0c2764a..a890c99e0a 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -268,7 +268,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, const TargetRegisterInfo *TRI, unsigned &RegClass, unsigned &Cost, const MachineFunction &MF) { - EVT VT = RegDefPos.GetValue(); + MVT VT = RegDefPos.GetValue(); // Special handling for untyped values. These values can only come from // the expansion of custom DAG-to-DAG patterns. 
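For context, the EVT-to-MVT substitutions running through the scheduler and selection-DAG hunks here all apply one narrowing idiom: EVT can describe arbitrary, possibly illegal extended types, while MVT is the closed enum of machine value types, so code that only ever sees legalized values can use the cheaper MVT directly. A minimal sketch of the guard (illustrative only, not part of the patch; Ty and TLI stand for some IR Type* and TargetLowering reference in scope):

EVT VT = TLI.getValueType(Ty);        // may map to an extended type
if (VT.isSimple() && TLI.isTypeLegal(VT)) {
  MVT SimpleVT = VT.getSimpleVT();    // plain enum value, no IR Type pointer
  const TargetRegisterClass *RC = TLI.getRegClassFor(SimpleVT);
  // RC can now drive register-class and pressure queries.
}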
@@ -1939,7 +1939,7 @@ bool RegReductionPQBase::MayReduceRegPressure(SUnit *SU) const { unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); for (unsigned i = 0; i != NumDefs; ++i) { - EVT VT = N->getValueType(i); + MVT VT = N->getSimpleValueType(i); if (!N->hasAnyUseOfValue(i)) continue; unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); @@ -1973,7 +1973,7 @@ int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const { } for (ScheduleDAGSDNodes::RegDefIter RegDefPos(PredSU, scheduleDAG); RegDefPos.IsValid(); RegDefPos.Advance()) { - EVT VT = RegDefPos.GetValue(); + MVT VT = RegDefPos.GetValue(); unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); if (RegPressure[RCId] >= RegLimit[RCId]) ++PDiff; @@ -1986,7 +1986,7 @@ int RegReductionPQBase::RegPressureDiff(SUnit *SU, unsigned &LiveUses) const { unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); for (unsigned i = 0; i != NumDefs; ++i) { - EVT VT = N->getValueType(i); + MVT VT = N->getSimpleValueType(i); if (!N->hasAnyUseOfValue(i)) continue; unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); @@ -2097,7 +2097,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { const SDNode *PN = PredSU->getNode(); if (!PN->isMachineOpcode()) { if (PN->getOpcode() == ISD::CopyFromReg) { - EVT VT = PN->getValueType(0); + MVT VT = PN->getSimpleValueType(0); unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); } @@ -2109,14 +2109,14 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { if (POpc == TargetOpcode::EXTRACT_SUBREG || POpc == TargetOpcode::INSERT_SUBREG || POpc == TargetOpcode::SUBREG_TO_REG) { - EVT VT = PN->getValueType(0); + MVT VT = PN->getSimpleValueType(0); unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); RegPressure[RCId] += TLI->getRepRegClassCostFor(VT); continue; } unsigned NumDefs = TII->get(PN->getMachineOpcode()).getNumDefs(); for (unsigned i = 0; i != NumDefs; ++i) { - EVT VT = PN->getValueType(i); + MVT VT = PN->getSimpleValueType(i); if (!PN->hasAnyUseOfValue(i)) continue; unsigned RCId = TLI->getRepRegClassFor(VT)->getID(); @@ -2133,7 +2133,7 @@ void RegReductionPQBase::unscheduledNode(SUnit *SU) { if (SU->NumSuccs && N->isMachineOpcode()) { unsigned NumDefs = TII->get(N->getMachineOpcode()).getNumDefs(); for (unsigned i = NumDefs, e = N->getNumValues(); i != e; ++i) { - EVT VT = N->getValueType(i); + MVT VT = N->getSimpleValueType(i); if (VT == MVT::Glue || VT == MVT::Other) continue; if (!N->hasAnyUseOfValue(i)) diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 057450de2b..b22440daf1 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -562,7 +562,7 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() { for (;DefIdx < NodeNumDefs; ++DefIdx) { if (!Node->hasAnyUseOfValue(DefIdx)) continue; - ValueType = Node->getValueType(DefIdx); + ValueType = Node->getSimpleValueType(DefIdx); ++DefIdx; return; // Found a normal regdef. 
} diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h index 907356fd21..76067a186d 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h @@ -135,13 +135,13 @@ namespace llvm { const SDNode *Node; unsigned DefIdx; unsigned NodeNumDefs; - EVT ValueType; + MVT ValueType; public: RegDefIter(const SUnit *SU, const ScheduleDAGSDNodes *SD); bool IsValid() const { return Node != NULL; } - EVT GetValue() const { + MVT GetValue() const { assert(IsValid() && "bad iterator"); return ValueType; } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 29339405aa..2375182167 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -41,6 +41,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/Mutex.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetIntrinsicInfo.h" #include "llvm/Target/TargetLowering.h" @@ -3382,7 +3383,10 @@ static SDValue getMemsetStringVal(EVT VT, DebugLoc dl, SelectionDAG &DAG, Val |= (uint64_t)(unsigned char)Str[i] << (NumVTBytes-i-1)*8; } - if (TLI.isIntImmLegal(Val, VT)) + // If the "cost" of materializing the integer immediate is 1 or free, then + // it is cost effective to turn the load into the immediate. + if (DAG.getTarget().getScalarTargetTransformInfo()-> + getIntImmCost(Val, VT.getTypeForEVT(*DAG.getContext())) < 2) return DAG.getConstant(Val, VT); return SDValue(0, 0); } @@ -3422,7 +3426,8 @@ static bool isMemSrcFromString(SDValue Src, StringRef &Str) { static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, + bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, bool AllowOverlap, SelectionDAG &DAG, @@ -3437,7 +3442,7 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does // not need to be loaded. EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign, - IsZeroVal, MemcpyStrSrc, + IsMemset, ZeroMemset, MemcpyStrSrc, DAG.getMachineFunction()); if (VT == MVT::Other) { @@ -3464,39 +3469,43 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned NumMemOps = 0; while (Size != 0) { - if (++NumMemOps > Limit) - return false; - unsigned VTSize = VT.getSizeInBits() / 8; while (VTSize > Size) { // For now, only use non-vector load / store's for the left-over pieces. - EVT NewVT; + EVT NewVT = VT; unsigned NewVTSize; + + bool Found = false; if (VT.isVector() || VT.isFloatingPoint()) { NewVT = (VT.getSizeInBits() > 64) ? MVT::i64 : MVT::i32; - while (!TLI.isOperationLegalOrCustom(ISD::STORE, NewVT)) { - if (NewVT == MVT::i64 && - TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64)) { - // i64 is usually not legal on 32-bit targets, but f64 may be. - NewVT = MVT::f64; - break; - } - NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1); + if (TLI.isOperationLegalOrCustom(ISD::STORE, NewVT) && + TLI.isSafeMemOpType(NewVT.getSimpleVT())) + Found = true; + else if (NewVT == MVT::i64 && + TLI.isOperationLegalOrCustom(ISD::STORE, MVT::f64) && + TLI.isSafeMemOpType(MVT::f64)) { + // i64 is usually not legal on 32-bit targets, but f64 may be. 
+ NewVT = MVT::f64; + Found = true; } - NewVTSize = NewVT.getSizeInBits() / 8; - } else { - // This can result in a type that is not legal on the target, e.g. - // 1 or 2 bytes on PPC. - NewVT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1); - NewVTSize = VTSize >> 1; } + if (!Found) { + do { + NewVT = (MVT::SimpleValueType)(NewVT.getSimpleVT().SimpleTy - 1); + if (NewVT == MVT::i8) + break; + } while (!TLI.isSafeMemOpType(NewVT.getSimpleVT())); + } + NewVTSize = NewVT.getSizeInBits() / 8; + // If the new VT cannot cover all of the remaining bits, then consider // issuing a (or a pair of) unaligned and overlapping load / store. // FIXME: Only does this for 64-bit or more since we don't have proper // cost model for unaligned load / store. bool Fast; - if (AllowOverlap && VTSize >= 8 && NewVTSize < Size && + if (NumMemOps && AllowOverlap && + VTSize >= 8 && NewVTSize < Size && TLI.allowsUnalignedMemoryAccesses(VT, &Fast) && Fast) VTSize = Size; else { @@ -3505,6 +3514,9 @@ static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps, } } + if (++NumMemOps > Limit) + return false; + MemOps.push_back(VT); Size -= VTSize; } @@ -3549,7 +3561,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, if (!FindOptimalMemOpLowering(MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), (isZeroStr ? 0 : SrcAlign), - true, CopyFromStr, true, DAG, TLI)) + false, false, CopyFromStr, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { @@ -3650,8 +3662,8 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, DebugLoc dl, unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize); if (!FindOptimalMemOpLowering(MemOps, Limit, Size, - (DstAlignCanChange ? 0 : Align), - SrcAlign, true, false, false, DAG, TLI)) + (DstAlignCanChange ? 0 : Align), SrcAlign, + false, false, false, false, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { @@ -3727,7 +3739,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, DebugLoc dl, isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue(); if (!FindOptimalMemOpLowering(MemOps, TLI.getMaxStoresPerMemset(OptSize), Size, (DstAlignCanChange ? 0 : Align), 0, - IsZeroVal, false, true, DAG, TLI)) + true, IsZeroVal, false, true, DAG, TLI)) return SDValue(); if (DstAlignCanChange) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index ee8f272093..3a653474b0 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1760,8 +1760,8 @@ void SelectionDAGBuilder::visitBitTestHeader(BitTestBlock &B, Sub = DAG.getZExtOrTrunc(Sub, getCurDebugLoc(), VT); } - B.RegVT = VT; - B.Reg = FuncInfo.CreateReg(VT); + B.RegVT = VT.getSimpleVT(); + B.Reg = FuncInfo.CreateReg(B.RegVT.getSimpleVT()); SDValue CopyTo = DAG.getCopyToReg(getControlRoot(), getCurDebugLoc(), B.Reg, Sub); @@ -6145,7 +6145,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) { RegsForValue MatchedRegs; MatchedRegs.ValueVTs.push_back(InOperandVal.getValueType()); - EVT RegVT = AsmNodeOperands[CurOp+1].getValueType(); + MVT RegVT = AsmNodeOperands[CurOp+1].getSimpleValueType(); MatchedRegs.RegVTs.push_back(RegVT); MachineRegisterInfo &RegInfo = DAG.getMachineFunction().getRegInfo(); for (unsigned i = 0, e = InlineAsm::getNumOperandRegisters(OpFlag); @@ -6683,8 +6683,8 @@ void SelectionDAGISel::LowerArguments(const BasicBlock *LLVMBB) { // from the sret argument into it. 
SmallVector<EVT, 1> ValueVTs; ComputeValueVTs(TLI, PointerType::getUnqual(F.getReturnType()), ValueVTs); - EVT VT = ValueVTs[0]; - EVT RegVT = TLI.getRegisterType(*CurDAG->getContext(), VT); + MVT VT = ValueVTs[0].getSimpleVT(); + MVT RegVT = TLI.getRegisterType(*CurDAG->getContext(), VT).getSimpleVT(); ISD::NodeType AssertOp = ISD::DELETED_NODE; SDValue ArgValue = getCopyFromParts(DAG, dl, &InVals[0], 1, RegVT, VT, NULL, AssertOp); diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 35f1931494..2ec7e73bf1 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -772,7 +772,7 @@ void TargetLowering::computeRegisterProperties() { unsigned LegalIntReg = LargestIntReg; for (unsigned IntReg = LargestIntReg - 1; IntReg >= (unsigned)MVT::i1; --IntReg) { - EVT IVT = (MVT::SimpleValueType)IntReg; + MVT IVT = (MVT::SimpleValueType)IntReg; if (isTypeLegal(IVT)) { LegalIntReg = IntReg; } else { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index f348a33b38..c5b807b636 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -107,20 +107,18 @@ ObjectImage *RuntimeDyldImpl::loadObject(ObjectBuffer *InputBuffer) { SymType == object::SymbolRef::ST_Unknown) { uint64_t FileOffset; StringRef SectionData; + bool IsCode; section_iterator si = obj->end_sections(); Check(i->getFileOffset(FileOffset)); Check(i->getSection(si)); if (si == obj->end_sections()) continue; Check(si->getContents(SectionData)); + Check(si->isText(IsCode)); const uint8_t* SymPtr = (const uint8_t*)InputBuffer->getBufferStart() + (uintptr_t)FileOffset; uintptr_t SectOffset = (uintptr_t)(SymPtr - (const uint8_t*)SectionData.begin()); - unsigned SectionID = - findOrEmitSection(*obj, - *si, - SymType == object::SymbolRef::ST_Function, - LocalSections); + unsigned SectionID = findOrEmitSection(*obj, *si, IsCode, LocalSections); LocalSymbols[Name.data()] = SymbolLoc(SectionID, SectOffset); DEBUG(dbgs() << "\tFileOffset: " << format("%p", (uintptr_t)FileOffset) << " flags: " << flags diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 7ea0f3b85a..a6fa6582ab 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -37,6 +37,7 @@ MCAsmInfo::MCAsmInfo() { CommentColumn = 40; CommentString = "#"; LabelSuffix = ":"; + DebugLabelSuffix = ":"; GlobalPrefix = ""; PrivateGlobalPrefix = "."; LinkerPrivateGlobalPrefix = ""; diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 6537e4e685..6302f970d8 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -135,6 +135,8 @@ public: } virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); + virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); @@ -345,6 +347,14 @@ void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) { EmitEOL(); } +void MCAsmStreamer::EmitDebugLabel(MCSymbol *Symbol) { + assert(Symbol->isUndefined() && "Cannot define a symbol twice!"); + MCStreamer::EmitDebugLabel(Symbol); + + OS << *Symbol << MAI.getDebugLabelSuffix(); + EmitEOL(); +} + void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { switch (Flag) { case MCAF_SyntaxUnified: OS << "\t.syntax unified"; break; diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 8f8ec151e5..2558eff045 100644 --- a/lib/MC/MCAssembler.cpp +++ 
b/lib/MC/MCAssembler.cpp @@ -71,7 +71,7 @@ MCAsmLayout::MCAsmLayout(MCAssembler &Asm) SectionOrder.push_back(&*it); } -bool MCAsmLayout::isFragmentUpToDate(const MCFragment *F) const { +bool MCAsmLayout::isFragmentValid(const MCFragment *F) const { const MCSectionData &SD = *F->getParent(); const MCFragment *LastValid = LastValidFragment.lookup(&SD); if (!LastValid) @@ -81,8 +81,8 @@ bool MCAsmLayout::isFragmentUpToDate(const MCFragment *F) const { } void MCAsmLayout::invalidateFragmentsAfter(MCFragment *F) { - // If this fragment wasn't already up-to-date, we don't need to do anything. - if (!isFragmentUpToDate(F)) + // If this fragment wasn't already valid, we don't need to do anything. + if (!isFragmentValid(F)) return; // Otherwise, reset the last valid fragment to this fragment. @@ -90,7 +90,7 @@ void MCAsmLayout::invalidateFragmentsAfter(MCFragment *F) { LastValidFragment[&SD] = F; } -void MCAsmLayout::EnsureValid(const MCFragment *F) const { +void MCAsmLayout::ensureValid(const MCFragment *F) const { MCSectionData &SD = *F->getParent(); MCFragment *Cur = LastValidFragment[&SD]; @@ -99,15 +99,16 @@ void MCAsmLayout::EnsureValid(const MCFragment *F) const { else Cur = Cur->getNextNode(); - // Advance the layout position until the fragment is up-to-date. - while (!isFragmentUpToDate(F)) { - const_cast<MCAsmLayout*>(this)->LayoutFragment(Cur); + // Advance the layout position until the fragment is valid. + while (!isFragmentValid(F)) { + assert(Cur && "Layout bookkeeping error"); + const_cast<MCAsmLayout*>(this)->layoutFragment(Cur); Cur = Cur->getNextNode(); } } uint64_t MCAsmLayout::getFragmentOffset(const MCFragment *F) const { - EnsureValid(F); + ensureValid(F); assert(F->Offset != ~UINT64_C(0) && "Address not set!"); return F->Offset; } @@ -220,6 +221,24 @@ MCAssembler::MCAssembler(MCContext &Context_, MCAsmBackend &Backend_, MCAssembler::~MCAssembler() { } +void MCAssembler::reset() { + Sections.clear(); + Symbols.clear(); + SectionMap.clear(); + SymbolMap.clear(); + IndirectSymbols.clear(); + DataRegions.clear(); + ThumbFuncs.clear(); + RelaxAll = false; + NoExecStack = false; + SubsectionsViaSymbols = false; + + // reset objects owned by us + getBackend().reset(); + getEmitter().reset(); + getWriter().reset(); +} + bool MCAssembler::isSymbolLinkerVisible(const MCSymbol &Symbol) const { // Non-temporary labels should always be visible to the linker. if (!Symbol.isTemporary()) @@ -374,15 +393,15 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, llvm_unreachable("invalid fragment kind"); } -void MCAsmLayout::LayoutFragment(MCFragment *F) { +void MCAsmLayout::layoutFragment(MCFragment *F) { MCFragment *Prev = F->getPrevNode(); - // We should never try to recompute something which is up-to-date. - assert(!isFragmentUpToDate(F) && "Attempt to recompute up-to-date fragment!"); - // We should never try to compute the fragment layout if it's predecessor - // isn't up-to-date. - assert((!Prev || isFragmentUpToDate(Prev)) && - "Attempt to compute fragment before it's predecessor!"); + // We should never try to recompute something which is valid. + assert(!isFragmentValid(F) && "Attempt to recompute a valid fragment!"); + // We should never try to compute the fragment layout if its predecessor + // isn't valid. 
+ assert((!Prev || isFragmentValid(Prev)) && + "Attempt to compute fragment before its predecessor!"); ++stats::FragmentLayouts; @@ -605,9 +624,9 @@ void MCAssembler::Finish() { SD->setLayoutOrder(i); unsigned FragmentIndex = 0; - for (MCSectionData::iterator it2 = SD->begin(), - ie2 = SD->end(); it2 != ie2; ++it2) - it2->setLayoutOrder(FragmentIndex++); + for (MCSectionData::iterator iFrag = SD->begin(), iFragEnd = SD->end(); + iFrag != iFragEnd; ++iFrag) + iFrag->setLayoutOrder(FragmentIndex++); } // Layout until everything fits. @@ -657,9 +676,6 @@ void MCAssembler::Finish() { bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup, const MCInstFragment *DF, const MCAsmLayout &Layout) const { - if (getRelaxAll()) - return true; - // If we cannot resolve the fixup value, it requires relaxation. MCValue Target; uint64_t Value; @@ -770,9 +786,11 @@ bool MCAssembler::relaxDwarfCallFrameFragment(MCAsmLayout &Layout, bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSectionData &SD) { // Holds the first fragment which needed relaxing during this layout. It will // remain NULL if none were relaxed. - MCFragment *FirstInvalidFragment = NULL; + // When a fragment is relaxed, all the fragments following it should get + // invalidated because their offset is going to change. + MCFragment *FirstRelaxedFragment = NULL; - // Scan for fragments that need relaxation. + // Attempt to relax all the fragments in the section. for (MCSectionData::iterator I = SD.begin(), IE = SD.end(); I != IE; ++I) { // Check if this is a fragment that needs relaxation. bool RelaxedFrag = false; @@ -780,6 +798,8 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSectionData &SD) { default: break; case MCFragment::FT_Inst: + assert(!getRelaxAll() && + "Did not expect a MCInstFragment in RelaxAll mode"); RelaxedFrag = relaxInstruction(Layout, *cast<MCInstFragment>(I)); break; case MCFragment::FT_Dwarf: @@ -795,11 +815,11 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSectionData &SD) { RelaxedFrag = relaxLEB(Layout, *cast<MCLEBFragment>(I)); break; } - if (RelaxedFrag && !FirstInvalidFragment) - FirstInvalidFragment = I; + if (RelaxedFrag && !FirstRelaxedFragment) + FirstRelaxedFragment = I; } - if (FirstInvalidFragment) { - Layout.invalidateFragmentsAfter(FirstInvalidFragment); + if (FirstRelaxedFragment) { + Layout.invalidateFragmentsAfter(FirstRelaxedFragment); return true; } return false; diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp index 23ec0bb12d..d206dd9f50 100644 --- a/lib/MC/MCContext.cpp +++ b/lib/MC/MCContext.cpp @@ -21,6 +21,7 @@ #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ELF.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Signals.h" #include "llvm/Support/SourceMgr.h" using namespace llvm; @@ -32,13 +33,15 @@ typedef StringMap<const MCSectionCOFF*> COFFUniqueMapTy; MCContext::MCContext(const MCAsmInfo &mai, const MCRegisterInfo &mri, const MCObjectFileInfo *mofi, const SourceMgr *mgr, - bool DoAutoInitializationFinalization ) : + bool DoAutoReset ) : SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi), Allocator(), Symbols(Allocator), UsedNames(Allocator), NextUniqueID(0), - CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0), - AllowTemporaryLabels(true), - AutoInitializationFinalization(DoAutoInitializationFinalization) { + CompilationDir(llvm::sys::Path::GetCurrentDirectory().str()), + CurrentDwarfLoc(0,0,0,DWARF2_FLAG_IS_STMT,0,0), + DwarfLocSeen(false), GenDwarfForAssembly(false), 
GenDwarfFileNumber(0), + AllowTemporaryLabels(true), AutoReset(DoAutoReset) { + MachOUniquingMap = 0; ELFUniquingMap = 0; COFFUniquingMap = 0; @@ -47,14 +50,16 @@ MCContext::MCContext(const MCAsmInfo &mai, const MCRegisterInfo &mri, SecureLog = 0; SecureLogUsed = false; - if (AutoInitializationFinalization) - doInitialization(); + if (SrcMgr && SrcMgr->getNumBuffers() > 0) + MainFileName = SrcMgr->getMemoryBuffer(0)->getBufferIdentifier(); + else + MainFileName = ""; } MCContext::~MCContext() { - if (AutoInitializationFinalization) - doFinalization(); + if (AutoReset) + reset(); // NOTE: The symbols are all allocated out of a bump pointer allocator, // we don't need to free them here. @@ -67,15 +72,7 @@ MCContext::~MCContext() { // Module Lifetime Management //===----------------------------------------------------------------------===// -void MCContext::doInitialization() { - NextUniqueID = 0; - AllowTemporaryLabels = true; - DwarfLocSeen = false; - GenDwarfForAssembly = false; - GenDwarfFileNumber = 0; -} - -void MCContext::doFinalization() { +void MCContext::reset() { UsedNames.clear(); Symbols.clear(); Allocator.Reset(); @@ -95,6 +92,12 @@ void MCContext::doFinalization() { MachOUniquingMap = 0; ELFUniquingMap = 0; COFFUniquingMap = 0; + + NextUniqueID = 0; + AllowTemporaryLabels = true; + DwarfLocSeen = false; + GenDwarfForAssembly = false; + GenDwarfFileNumber = 0; } //===----------------------------------------------------------------------===// diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index 597ee1d691..d53d2fc0b7 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -627,8 +627,7 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS, MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_comp_dir, the working directory the assembly was done in. - llvm::sys::Path CWD = llvm::sys::Path::GetCurrentDirectory(); - MCOS->EmitBytes(StringRef(CWD.c_str()), 0); + MCOS->EmitBytes(context.getCompilationDir(), 0); MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string. // AT_APPLE_flags, the command line arguments of the assembler tool. diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 8b9bdb14a0..9771ef0549 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -86,6 +86,10 @@ void MCELFStreamer::EmitLabel(MCSymbol *Symbol) { MCELF::SetType(SD, ELF::STT_TLS); } +void MCELFStreamer::EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); +} + void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { switch (Flag) { case MCAF_SyntaxUnified: return; // no-op here. 
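For context, an illustrative sketch (not part of the patch) of what the new EmitDebugLabel hook enables. The binary streamers in this patch simply forward it to EmitLabel, while MCAsmStreamer prints the separately configurable DebugLabelSuffix, so a target whose assembler must spell debug labels differently from code labels only needs to set that suffix in its MCAsmInfo constructor. The target name and suffix below are hypothetical:

struct HypotheticalTargetMCAsmInfo : public MCAsmInfo {
  HypotheticalTargetMCAsmInfo() {
    LabelSuffix = ":";        // code labels are printed as "foo:"
    DebugLabelSuffix = "::";  // hypothetical: debug labels print as "foo::"
  }
};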
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 7a63bace6c..1a53934fef 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -54,14 +54,16 @@ void MCExpr::print(raw_ostream &OS) const { else OS << Sym; - if (SRE.getKind() == MCSymbolRefExpr::VK_ARM_PLT || + if (SRE.getKind() == MCSymbolRefExpr::VK_ARM_NONE || + SRE.getKind() == MCSymbolRefExpr::VK_ARM_PLT || SRE.getKind() == MCSymbolRefExpr::VK_ARM_TLSGD || SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOT || SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTOFF || SRE.getKind() == MCSymbolRefExpr::VK_ARM_TPOFF || SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF || SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1 || - SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2) + SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2 || + SRE.getKind() == MCSymbolRefExpr::VK_ARM_PREL31) OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); else if (SRE.getKind() != MCSymbolRefExpr::VK_None && SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 && @@ -193,6 +195,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_DTPOFF: return "DTPOFF"; case VK_TLVP: return "TLVP"; case VK_SECREL: return "SECREL"; + case VK_ARM_NONE: return "(NONE)"; case VK_ARM_PLT: return "(PLT)"; case VK_ARM_GOT: return "(GOT)"; case VK_ARM_GOTOFF: return "(GOTOFF)"; @@ -201,6 +204,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_ARM_TLSGD: return "(tlsgd)"; case VK_ARM_TARGET1: return "(target1)"; case VK_ARM_TARGET2: return "(target2)"; + case VK_ARM_PREL31: return "(prel31)"; case VK_PPC_TOC: return "tocbase"; case VK_PPC_TOC_ENTRY: return "toc"; case VK_PPC_DARWIN_HA16: return "ha16"; @@ -209,10 +213,19 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) { case VK_PPC_GAS_LO16: return "l"; case VK_PPC_TPREL16_HA: return "tprel@ha"; case VK_PPC_TPREL16_LO: return "tprel@l"; + case VK_PPC_DTPREL16_HA: return "dtprel@ha"; + case VK_PPC_DTPREL16_LO: return "dtprel@l"; case VK_PPC_TOC16_HA: return "toc@ha"; case VK_PPC_TOC16_LO: return "toc@l"; - case VK_PPC_GOT_TPREL16_DS: return "got@tprel"; + case VK_PPC_GOT_TPREL16_HA: return "got@tprel@ha"; + case VK_PPC_GOT_TPREL16_LO: return "got@tprel@l"; case VK_PPC_TLS: return "tls"; + case VK_PPC_GOT_TLSGD16_HA: return "got@tlsgd@ha"; + case VK_PPC_GOT_TLSGD16_LO: return "got@tlsgd@l"; + case VK_PPC_GOT_TLSLD16_HA: return "got@tlsld@ha"; + case VK_PPC_GOT_TLSLD16_LO: return "got@tlsld@l"; + case VK_PPC_TLSGD: return "tlsgd"; + case VK_PPC_TLSLD: return "tlsld"; case VK_Mips_GPREL: return "GPREL"; case VK_Mips_GOT_CALL: return "GOT_CALL"; case VK_Mips_GOT16: return "GOT16"; diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp index f279e74e38..82ccdd4ac0 100644 --- a/lib/MC/MCMachOStreamer.cpp +++ b/lib/MC/MCMachOStreamer.cpp @@ -43,6 +43,7 @@ public: virtual void InitSections(); virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); @@ -130,6 +131,9 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) { SD.setFlags(SD.getFlags() & ~SF_ReferenceTypeMask); } +void MCMachOStreamer::EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); +} void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) { if (!getAssembler().getBackend().hasDataInCodeSupport()) return; diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index f5ece10dc4..bb84c4f22f 100644 --- 
a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -35,7 +35,9 @@ namespace { assert(getCurrentSection() && "Cannot emit before setting section!"); Symbol->setSection(*getCurrentSection()); } - + virtual void EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); + } virtual void EmitAssemblerFlag(MCAssemblerFlag Flag) {} virtual void EmitThumbFunc(MCSymbol *Func) {} diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index c2171ffaa5..4d6900f7c4 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -44,6 +44,12 @@ MCObjectStreamer::~MCObjectStreamer() { delete Assembler; } +void MCObjectStreamer::reset() { + if (Assembler) + Assembler->reset(); + MCStreamer::reset(); +} + MCFragment *MCObjectStreamer::getCurrentFragment() const { assert(getCurrentSectionData() && "No current section!"); @@ -128,6 +134,10 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) { SD.setOffset(F->getContents().size()); } +void MCObjectStreamer::EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); +} + void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) { int64_t IntValue; if (Value->EvaluateAsAbsolute(IntValue, getAssembler())) { diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 2c06604be2..85d31872a7 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -46,7 +46,7 @@ static cl::opt<bool> FatalAssemblerWarnings("fatal-assembler-warnings", cl::desc("Consider warnings as error")); -MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {} +MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {} namespace { @@ -97,11 +97,14 @@ struct ParseStatementInfo { /// Opcode - The opcode from the last parsed instruction. unsigned Opcode; + /// Error - Was there an error parsing the inline assembly? + bool ParseError; + SmallVectorImpl<AsmRewrite> *AsmRewrites; - ParseStatementInfo() : Opcode(~0U), AsmRewrites(0) {} + ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(0) {} ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites) - : Opcode(~0), AsmRewrites(rewrites) {} + : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {} ~ParseStatementInfo() { // Free any parsed operands. @@ -189,9 +192,9 @@ public: virtual MCAsmLexer &getLexer() { return Lexer; } virtual MCContext &getContext() { return Ctx; } virtual MCStreamer &getStreamer() { return Out; } - virtual unsigned getAssemblerDialect() { + virtual unsigned getAssemblerDialect() { if (AssemblerDialect == ~0U) - return MAI.getAssemblerDialect(); + return MAI.getAssemblerDialect(); else return AssemblerDialect; } @@ -289,14 +292,15 @@ private: // Directive Parsing. - // ".ascii", ".asciiz", ".string" + // ".ascii", ".asciiz", ".string" bool ParseDirectiveAscii(StringRef IDVal, bool ZeroTerminated); bool ParseDirectiveValue(unsigned Size); // ".byte", ".long", ... bool ParseDirectiveRealValue(const fltSemantics &); // ".single", ... 
bool ParseDirectiveFill(); // ".fill" bool ParseDirectiveSpace(); // ".space" bool ParseDirectiveZero(); // ".zero" - bool ParseDirectiveSet(StringRef IDVal, bool allow_redef); // ".set", ".equ", ".equiv" + // ".set", ".equ", ".equiv" + bool ParseDirectiveSet(StringRef IDVal, bool allow_redef); bool ParseDirectiveOrg(); // ".org" // ".align{,32}", ".p2align{,w,l}" bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize); @@ -609,7 +613,8 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) { getStreamer().EmitLabel(SectionStartSym); getContext().setGenDwarfSectionStartSym(SectionStartSym); getStreamer().EmitDwarfFileDirective(getContext().nextGenDwarfFileNumber(), - StringRef(), SrcMgr.getMemoryBuffer(CurBuffer)->getBufferIdentifier()); + StringRef(), + getContext().getMainFileName()); } // While we have input, parse each statement. @@ -1385,6 +1390,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { ParseInstructionInfo IInfo(Info.AsmRewrites); bool HadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr.str(), IDLoc,Info.ParsedOperands); + Info.ParseError = HadError; // Dump the parsed representation, if requested. if (getShowParsedOperands()) { @@ -1405,7 +1411,7 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { // section is the initial text section then generate a .loc directive for // the instruction. if (!HadError && getContext().getGenDwarfForAssembly() && - getContext().getGenDwarfSection() == getStreamer().getCurrentSection() ) { + getContext().getGenDwarfSection() == getStreamer().getCurrentSection()) { unsigned Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer); @@ -1417,8 +1423,8 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { if (CppHashFilename.size() != 0) { if(MCDwarfFiles[getContext().getGenDwarfFileNumber()]->getName() != CppHashFilename) - getStreamer().EmitDwarfFileDirective( - getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename); + getStreamer().EmitDwarfFileDirective( + getContext().nextGenDwarfFileNumber(), StringRef(), CppHashFilename); unsigned CppHashLocLineNo = SrcMgr.FindLineNumber(CppHashLoc,CppHashBuf); Line = CppHashLineNumber - 1 + (Line - CppHashLocLineNo); @@ -1508,7 +1514,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS); } - // If we have not parsed a cpp hash line filename comment or the source + // If we have not parsed a cpp hash line filename comment or the source // manager changed or buffer changed (like in a nested include) then just // print the normal diagnostic using its Filename and LineNo. if (!Parser->CppHashLineNumber || @@ -1521,7 +1527,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) { return; } - // Use the CppHashFilename and calculate a line number based on the + // Use the CppHashFilename and calculate a line number based on the // CppHashLoc and CppHashLineNumber relative to this Diag's SMLoc for // the diagnostic. 
const std::string Filename = Parser->CppHashFilename; @@ -3705,6 +3711,9 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, if (ParseStatement(Info)) return true; + if (Info.ParseError) + return true; + if (Info.Opcode != ~0U) { const MCInstrDesc &Desc = MII->get(Info.Opcode); diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp index 20c949dbda..7b042df292 100644 --- a/lib/MC/MCParser/DarwinAsmParser.cpp +++ b/lib/MC/MCParser/DarwinAsmParser.cpp @@ -314,7 +314,7 @@ bool DarwinAsmParser::ParseSectionSwitch(const char *Segment, Lex(); // FIXME: Arch specific. - bool isText = StringRef(Segment) == "__TEXT"; // FIXME: Hack. + bool isText = TAA & MCSectionMachO::S_ATTR_PURE_INSTRUCTIONS; getStreamer().SwitchSection(getContext().getMachOSection( Segment, Section, TAA, StubSize, isText ? SectionKind::getText() diff --git a/lib/MC/MCPureStreamer.cpp b/lib/MC/MCPureStreamer.cpp index 1563bdd107..ca559b7fe1 100644 --- a/lib/MC/MCPureStreamer.cpp +++ b/lib/MC/MCPureStreamer.cpp @@ -37,6 +37,7 @@ public: virtual void InitSections(); virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitZerofill(const MCSection *Section, MCSymbol *Symbol = 0, uint64_t Size = 0, unsigned ByteAlignment = 0); virtual void EmitBytes(StringRef Data, unsigned AddrSpace); @@ -134,6 +135,11 @@ void MCPureStreamer::EmitLabel(MCSymbol *Symbol) { SD.setOffset(F->getContents().size()); } + +void MCPureStreamer::EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); +} + void MCPureStreamer::EmitZerofill(const MCSection *Section, MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment) { report_fatal_error("not yet implemented in pure streamer"); diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp index 96d6d691d2..6f0ada277c 100644 --- a/lib/MC/MCStreamer.cpp +++ b/lib/MC/MCStreamer.cpp @@ -34,6 +34,19 @@ MCStreamer::~MCStreamer() { delete W64UnwindInfos[i]; } +void MCStreamer::reset() { + for (unsigned i = 0; i < getNumW64UnwindInfos(); ++i) + delete W64UnwindInfos[i]; + EmitEHFrame = true; + EmitDebugFrame = false; + CurrentW64UnwindInfo = 0; + LastSymbol = 0; + AutoInitSections = false; + const MCSection *section = NULL; + SectionStack.clear(); + SectionStack.push_back(std::make_pair(section, section)); +} + const MCExpr *MCStreamer::BuildSymbolDiff(MCContext &Context, const MCSymbol *A, const MCSymbol *B) { @@ -182,6 +195,13 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol) { LastSymbol = Symbol; } +void MCStreamer::EmitDebugLabel(MCSymbol *Symbol) { + assert(!Symbol->isVariable() && "Cannot emit a variable symbol!"); + assert(getCurrentSection() && "Cannot emit before setting section!"); + Symbol->setSection(*getCurrentSection()); + LastSymbol = Symbol; +} + void MCStreamer::EmitCompactUnwindEncoding(uint32_t CompactUnwindEncoding) { EnsureValidFrame(); MCDwarfFrameInfo *CurFrame = getCurrentFrameInfo(); diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp index cc8d2fb477..0098bead45 100644 --- a/lib/MC/MachObjectWriter.cpp +++ b/lib/MC/MachObjectWriter.cpp @@ -27,6 +27,16 @@ using namespace llvm; using namespace llvm::object; +void MachObjectWriter::reset() { + Relocations.clear(); + IndirectSymBase.clear(); + StringTable.clear(); + LocalSymbolData.clear(); + ExternalSymbolData.clear(); + UndefinedSymbolData.clear(); + MCObjectWriter::reset(); +} + bool MachObjectWriter:: doesSymbolRequireExternRelocation(const MCSymbolData *SD) { // Undefined symbols are always extern. 
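For context, a rough usage sketch (not part of the patch) of the reset() lifecycle these MC hunks introduce: MCContext's doInitialization()/doFinalization() pair becomes a single reset(), and MCStreamer, MCObjectStreamer, MCAssembler and MachObjectWriter gain matching reset() methods, so one set of MC objects can be reused across modules instead of being torn down and rebuilt. emitModule() below is a hypothetical helper that lowers one llvm::Module:

void emitTwoModules(llvm::Module &M1, llvm::Module &M2,
                    llvm::MCContext &Ctx, llvm::MCObjectStreamer &Streamer) {
  emitModule(M1, Ctx, Streamer);
  // Drop all per-module state: fragments and symbol data in the streamer,
  // then (via MCAssembler::reset) the backend, emitter and object writer.
  Streamer.reset();
  // Clear symbols, sections and Dwarf bookkeeping kept by the context.
  Ctx.reset();
  emitModule(M2, Ctx, Streamer);
}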
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp index 359b388618..d12201a6ca 100644 --- a/lib/MC/WinCOFFStreamer.cpp +++ b/lib/MC/WinCOFFStreamer.cpp @@ -51,6 +51,7 @@ public: virtual void InitSections(); virtual void EmitLabel(MCSymbol *Symbol); + virtual void EmitDebugLabel(MCSymbol *Symbol); virtual void EmitAssemblerFlag(MCAssemblerFlag Flag); virtual void EmitThumbFunc(MCSymbol *Func); virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value); @@ -176,6 +177,9 @@ void WinCOFFStreamer::EmitLabel(MCSymbol *Symbol) { MCObjectStreamer::EmitLabel(Symbol); } +void WinCOFFStreamer::EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); +} void WinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) { llvm_unreachable("not implemented"); } diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp index da7615714e..40f5390a96 100644 --- a/lib/Object/MachOObjectFile.cpp +++ b/lib/Object/MachOObjectFile.cpp @@ -447,9 +447,7 @@ error_code MachOObjectFile::getSectionNext(DataRefImpl DRI, void MachOObjectFile::getSection(DataRefImpl DRI, InMemoryStruct<macho::Section> &Res) const { - InMemoryStruct<macho::SegmentLoadCommand> SLC; LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); - MachOObj->ReadSegmentLoadCommand(LCI, SLC); MachOObj->ReadSection(LCI, DRI.d.b, Res); } @@ -463,9 +461,7 @@ std::size_t MachOObjectFile::getSectionIndex(DataRefImpl Sec) const { void MachOObjectFile::getSection64(DataRefImpl DRI, InMemoryStruct<macho::Section64> &Res) const { - InMemoryStruct<macho::Segment64LoadCommand> SLC; LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); - MachOObj->ReadSegment64LoadCommand(LCI, SLC); MachOObj->ReadSection64(LCI, DRI.d.b, Res); } @@ -482,9 +478,7 @@ error_code MachOObjectFile::getSectionName(DataRefImpl DRI, // FIXME: thread safety. 
static char result[34]; if (is64BitLoadCommand(MachOObj.get(), DRI)) { - InMemoryStruct<macho::Segment64LoadCommand> SLC; LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); - MachOObj->ReadSegment64LoadCommand(LCI, SLC); InMemoryStruct<macho::Section64> Sect; MachOObj->ReadSection64(LCI, DRI.d.b, Sect); @@ -492,9 +486,7 @@ error_code MachOObjectFile::getSectionName(DataRefImpl DRI, strcat(result, ","); strcat(result, Sect->Name); } else { - InMemoryStruct<macho::SegmentLoadCommand> SLC; LoadCommandInfo LCI = MachOObj->getLoadCommandInfo(DRI.d.a); - MachOObj->ReadSegmentLoadCommand(LCI, SLC); InMemoryStruct<macho::Section> Sect; MachOObj->ReadSection(LCI, DRI.d.b, Sect); @@ -567,11 +559,11 @@ error_code MachOObjectFile::isSectionText(DataRefImpl DRI, if (is64BitLoadCommand(MachOObj.get(), DRI)) { InMemoryStruct<macho::Section64> Sect; getSection64(DRI, Sect); - Result = !strcmp(Sect->Name, "__text"); + Result = Sect->Flags & macho::SF_PureInstructions; } else { InMemoryStruct<macho::Section> Sect; getSection(DRI, Sect); - Result = !strcmp(Sect->Name, "__text"); + Result = Sect->Flags & macho::SF_PureInstructions; } return object_error::success; } diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 6af0f4a6c9..f294a175e7 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -50,6 +50,7 @@ add_llvm_library(LLVMSupport Triple.cpp Twine.cpp YAMLParser.cpp + YAMLTraits.cpp raw_os_ostream.cpp raw_ostream.cpp regcomp.c diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp index 35bfb49a1f..5ad5308c41 100644 --- a/lib/Support/Host.cpp +++ b/lib/Support/Host.cpp @@ -517,6 +517,64 @@ std::string sys::getHostCPUName() { } #endif +#if defined(__linux__) && defined(__arm__) +bool sys::getHostCPUFeatures(StringMap<bool> &Features) { + std::string Err; + DataStreamer *DS = getDataFileStreamer("/proc/cpuinfo", &Err); + if (!DS) { + DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << Err << "\n"); + return false; + } + + // Read 1024 bytes from /proc/cpuinfo, which should contain the Features line + // in all cases. + char buffer[1024]; + size_t CPUInfoSize = DS->GetBytes((unsigned char*) buffer, sizeof(buffer)); + delete DS; + + StringRef Str(buffer, CPUInfoSize); + + SmallVector<StringRef, 32> Lines; + Str.split(Lines, "\n"); + + // Look for the CPU implementer line. + StringRef Implementer; + for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("CPU implementer")) + Implementer = Lines[I].substr(15).ltrim("\t :"); + + if (Implementer == "0x41") { // ARM Ltd. + SmallVector<StringRef, 32> CPUFeatures; + + // Look for the CPU features. 
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) + if (Lines[I].startswith("Features")) { + Lines[I].split(CPUFeatures, " "); + break; + } + + for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) { + StringRef LLVMFeatureStr = StringSwitch<StringRef>(CPUFeatures[I]) + .Case("half", "fp16") + .Case("neon", "neon") + .Case("vfpv3", "vfp3") + .Case("vfpv3d16", "d16") + .Case("vfpv4", "vfp4") + .Case("idiva", "hwdiv-arm") + .Case("idivt", "hwdiv") + .Default(""); + + if (LLVMFeatureStr != "") + Features.GetOrCreateValue(LLVMFeatureStr).setValue(true); + } + + return true; + } + + return false; +} +#else bool sys::getHostCPUFeatures(StringMap<bool> &Features){ return false; } +#endif diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp new file mode 100644 index 0000000000..a31c31915a --- /dev/null +++ b/lib/Support/YAMLTraits.cpp @@ -0,0 +1,804 @@ +//===- lib/Support/YAMLTraits.cpp -----------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/YAMLTraits.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/YAMLParser.h" +#include "llvm/Support/raw_ostream.h" +#include <cstring> +using namespace llvm; +using namespace yaml; + +//===----------------------------------------------------------------------===// +// IO +//===----------------------------------------------------------------------===// + +IO::IO(void *Context) : Ctxt(Context) { +} + +IO::~IO() { +} + +void *IO::getContext() { + return Ctxt; +} + +void IO::setContext(void *Context) { + Ctxt = Context; +} + +//===----------------------------------------------------------------------===// +// Input +//===----------------------------------------------------------------------===// + +Input::Input(StringRef InputContent, void *Ctxt) : IO(Ctxt), CurrentNode(NULL) { + Strm = new Stream(InputContent, SrcMgr); + DocIterator = Strm->begin(); +} + +error_code Input::error() { + return EC; +} + +void Input::setDiagHandler(SourceMgr::DiagHandlerTy Handler, void *Ctxt) { + SrcMgr.setDiagHandler(Handler, Ctxt); +} + +bool Input::outputting() { + return false; +} + +bool Input::setCurrentDocument() { + if (DocIterator != Strm->end()) { + Node *N = DocIterator->getRoot(); + if (isa<NullNode>(N)) { + // Empty files are allowed and ignored + ++DocIterator; + return setCurrentDocument(); + } + CurrentNode = this->createHNodes(N); + return true; + } + return false; +} + +void Input::nextDocument() { + ++DocIterator; +} + +void Input::beginMapping() { + if (EC) + return; + MapHNode *MN = dyn_cast<MapHNode>(CurrentNode); + if (MN) { + MN->ValidKeys.clear(); + } +} + +bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault, + void *&SaveInfo) { + UseDefault = false; + if (EC) + return false; + MapHNode *MN = dyn_cast<MapHNode>(CurrentNode); + if (!MN) { + setError(CurrentNode, "not a mapping"); + return false; + } + MN->ValidKeys.push_back(Key); + HNode *Value = MN->Mapping[Key]; + if (!Value) { + if (Required) + setError(CurrentNode, Twine("missing required key '") + Key + "'"); + else + UseDefault = true; + return false; + } + SaveInfo = CurrentNode; + CurrentNode = Value; + return true; +} + +void Input::postflightKey(void *saveInfo) { + CurrentNode =
reinterpret_cast<HNode *>(saveInfo); +} + +void Input::endMapping() { + if (EC) + return; + MapHNode *MN = dyn_cast<MapHNode>(CurrentNode); + if (!MN) + return; + for (MapHNode::NameToNode::iterator i = MN->Mapping.begin(), + End = MN->Mapping.end(); i != End; ++i) { + if (!MN->isValidKey(i->first)) { + setError(i->second, Twine("unknown key '") + i->first + "'"); + break; + } + } +} + +unsigned Input::beginSequence() { + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + return SQ->Entries.size(); + } + return 0; +} + +void Input::endSequence() { +} + +bool Input::preflightElement(unsigned Index, void *&SaveInfo) { + if (EC) + return false; + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + SaveInfo = CurrentNode; + CurrentNode = SQ->Entries[Index]; + return true; + } + return false; +} + +void Input::postflightElement(void *SaveInfo) { + CurrentNode = reinterpret_cast<HNode *>(SaveInfo); +} + +unsigned Input::beginFlowSequence() { + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + return SQ->Entries.size(); + } + return 0; +} + +bool Input::preflightFlowElement(unsigned index, void *&SaveInfo) { + if (EC) + return false; + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + SaveInfo = CurrentNode; + CurrentNode = SQ->Entries[index]; + return true; + } + return false; +} + +void Input::postflightFlowElement(void *SaveInfo) { + CurrentNode = reinterpret_cast<HNode *>(SaveInfo); +} + +void Input::endFlowSequence() { +} + +void Input::beginEnumScalar() { + ScalarMatchFound = false; +} + +bool Input::matchEnumScalar(const char *Str, bool) { + if (ScalarMatchFound) + return false; + if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) { + if (SN->value().equals(Str)) { + ScalarMatchFound = true; + return true; + } + } + return false; +} + +void Input::endEnumScalar() { + if (!ScalarMatchFound) { + setError(CurrentNode, "unknown enumerated scalar"); + } +} + +bool Input::beginBitSetScalar(bool &DoClear) { + BitValuesUsed.clear(); + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + BitValuesUsed.insert(BitValuesUsed.begin(), SQ->Entries.size(), false); + } else { + setError(CurrentNode, "expected sequence of bit values"); + } + DoClear = true; + return true; +} + +bool Input::bitSetMatch(const char *Str, bool) { + if (EC) + return false; + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + unsigned Index = 0; + for (std::vector<HNode *>::iterator i = SQ->Entries.begin(), + End = SQ->Entries.end(); i != End; ++i) { + if (ScalarHNode *SN = dyn_cast<ScalarHNode>(*i)) { + if (SN->value().equals(Str)) { + BitValuesUsed[Index] = true; + return true; + } + } else { + setError(CurrentNode, "unexpected scalar in sequence of bit values"); + } + ++Index; + } + } else { + setError(CurrentNode, "expected sequence of bit values"); + } + return false; +} + +void Input::endBitSetScalar() { + if (EC) + return; + if (SequenceHNode *SQ = dyn_cast<SequenceHNode>(CurrentNode)) { + assert(BitValuesUsed.size() == SQ->Entries.size()); + for (unsigned i = 0; i < SQ->Entries.size(); ++i) { + if (!BitValuesUsed[i]) { + setError(SQ->Entries[i], "unknown bit value"); + return; + } + } + } +} + +void Input::scalarString(StringRef &S) { + if (ScalarHNode *SN = dyn_cast<ScalarHNode>(CurrentNode)) { + S = SN->value(); + } else { + setError(CurrentNode, "unexpected scalar"); + } +} + +void Input::setError(HNode *hnode, const Twine &message) { + this->setError(hnode->_node, message); +} + +void Input::setError(Node *node, const 
Twine &message) { + Strm->printError(node, message); + EC = make_error_code(errc::invalid_argument); +} + +Input::HNode *Input::createHNodes(Node *N) { + SmallString<128> StringStorage; + if (ScalarNode *SN = dyn_cast<ScalarNode>(N)) { + StringRef KeyStr = SN->getValue(StringStorage); + if (!StringStorage.empty()) { + // Copy string to permanent storage + unsigned Len = StringStorage.size(); + char *Buf = Allocator.Allocate<char>(Len); + memcpy(Buf, &StringStorage[0], Len); + KeyStr = StringRef(Buf, Len); + } + return new (Allocator) ScalarHNode(N, KeyStr); + } else if (SequenceNode *SQ = dyn_cast<SequenceNode>(N)) { + SequenceHNode *SQHNode = new (Allocator) SequenceHNode(N); + for (SequenceNode::iterator i = SQ->begin(), End = SQ->end(); i != End; + ++i) { + HNode *Entry = this->createHNodes(i); + if (EC) + break; + SQHNode->Entries.push_back(Entry); + } + return SQHNode; + } else if (MappingNode *Map = dyn_cast<MappingNode>(N)) { + MapHNode *mapHNode = new (Allocator) MapHNode(N); + for (MappingNode::iterator i = Map->begin(), End = Map->end(); i != End; + ++i) { + ScalarNode *KeyScalar = dyn_cast<ScalarNode>(i->getKey()); + StringStorage.clear(); + StringRef KeyStr = KeyScalar->getValue(StringStorage); + if (!StringStorage.empty()) { + // Copy string to permanent storage + unsigned Len = StringStorage.size(); + char *Buf = Allocator.Allocate<char>(Len); + memcpy(Buf, &StringStorage[0], Len); + KeyStr = StringRef(Buf, Len); + } + HNode *ValueHNode = this->createHNodes(i->getValue()); + if (EC) + break; + mapHNode->Mapping[KeyStr] = ValueHNode; + } + return mapHNode; + } else if (isa<NullNode>(N)) { + return new (Allocator) EmptyHNode(N); + } else { + setError(N, "unknown node kind"); + return NULL; + } +} + +bool Input::MapHNode::isValidKey(StringRef Key) { + for (SmallVector<const char *, 6>::iterator i = ValidKeys.begin(), + End = ValidKeys.end(); i != End; ++i) { + if (Key.equals(*i)) + return true; + } + return false; +} + +void Input::setError(const Twine &Message) { + this->setError(CurrentNode, Message); +} + +//===----------------------------------------------------------------------===// +// Output +//===----------------------------------------------------------------------===// + +Output::Output(raw_ostream &yout, void *context) + : IO(context), + Out(yout), + Column(0), + ColumnAtFlowStart(0), + NeedBitValueComma(false), + NeedFlowSequenceComma(false), + EnumerationMatchFound(false), + NeedsNewLine(false) { +} + +Output::~Output() { +} + +bool Output::outputting() { + return true; +} + +void Output::beginMapping() { + StateStack.push_back(inMapFirstKey); + NeedsNewLine = true; +} + +void Output::endMapping() { + StateStack.pop_back(); +} + +bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault, + bool &UseDefault, void *&) { + UseDefault = false; + if (Required || !SameAsDefault) { + this->newLineCheck(); + this->paddedKey(Key); + return true; + } + return false; +} + +void Output::postflightKey(void *) { + if (StateStack.back() == inMapFirstKey) { + StateStack.pop_back(); + StateStack.push_back(inMapOtherKey); + } +} + +void Output::beginDocuments() { + this->outputUpToEndOfLine("---"); +} + +bool Output::preflightDocument(unsigned index) { + if (index > 0) + this->outputUpToEndOfLine("\n---"); + return true; +} + +void Output::postflightDocument() { +} + +void Output::endDocuments() { + output("\n...\n"); +} + +unsigned Output::beginSequence() { + StateStack.push_back(inSeq); + NeedsNewLine = true; + return 0; +} + +void Output::endSequence() { + 
StateStack.pop_back(); +} + +bool Output::preflightElement(unsigned, void *&) { + return true; +} + +void Output::postflightElement(void *) { +} + +unsigned Output::beginFlowSequence() { + this->newLineCheck(); + StateStack.push_back(inFlowSeq); + ColumnAtFlowStart = Column; + output("[ "); + NeedFlowSequenceComma = false; + return 0; +} + +void Output::endFlowSequence() { + StateStack.pop_back(); + this->outputUpToEndOfLine(" ]"); +} + +bool Output::preflightFlowElement(unsigned, void *&) { + if (NeedFlowSequenceComma) + output(", "); + if (Column > 70) { + output("\n"); + for (int i = 0; i < ColumnAtFlowStart; ++i) + output(" "); + Column = ColumnAtFlowStart; + output(" "); + } + return true; +} + +void Output::postflightFlowElement(void *) { + NeedFlowSequenceComma = true; +} + +void Output::beginEnumScalar() { + EnumerationMatchFound = false; +} + +bool Output::matchEnumScalar(const char *Str, bool Match) { + if (Match && !EnumerationMatchFound) { + this->newLineCheck(); + this->outputUpToEndOfLine(Str); + EnumerationMatchFound = true; + } + return false; +} + +void Output::endEnumScalar() { + if (!EnumerationMatchFound) + llvm_unreachable("bad runtime enum value"); +} + +bool Output::beginBitSetScalar(bool &DoClear) { + this->newLineCheck(); + output("[ "); + NeedBitValueComma = false; + DoClear = false; + return true; +} + +bool Output::bitSetMatch(const char *Str, bool Matches) { + if (Matches) { + if (NeedBitValueComma) + output(", "); + this->output(Str); + NeedBitValueComma = true; + } + return false; +} + +void Output::endBitSetScalar() { + this->outputUpToEndOfLine(" ]"); +} + +void Output::scalarString(StringRef &S) { + this->newLineCheck(); + if (S.find('\n') == StringRef::npos) { + // No embedded new-line chars, just print string. + this->outputUpToEndOfLine(S); + return; + } + unsigned i = 0; + unsigned j = 0; + unsigned End = S.size(); + output("'"); // Starting single quote. + const char *Base = S.data(); + while (j < End) { + // Escape a single quote by doubling it. + if (S[j] == '\'') { + output(StringRef(&Base[i], j - i + 1)); + output("'"); + i = j + 1; + } + ++j; + } + output(StringRef(&Base[i], j - i)); + this->outputUpToEndOfLine("'"); // Ending single quote. 
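+  // Note: single-quoted YAML scalars have exactly one escape, a doubled
+  // single quote; every other character (including backslash) is literal,
+  // so the loop above only needs to handle the quote character.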
+} + +void Output::setError(const Twine &message) { +} + +void Output::output(StringRef s) { + Column += s.size(); + Out << s; +} + +void Output::outputUpToEndOfLine(StringRef s) { + this->output(s); + if (StateStack.back() != inFlowSeq) + NeedsNewLine = true; +} + +void Output::outputNewLine() { + Out << "\n"; + Column = 0; +} + +// if seq at top, indent as if map, then add "- " +// if seq in middle, use "- " if firstKey, else use " " +// + +void Output::newLineCheck() { + if (!NeedsNewLine) + return; + NeedsNewLine = false; + + this->outputNewLine(); + + assert(StateStack.size() > 0); + unsigned Indent = StateStack.size() - 1; + bool OutputDash = false; + + if (StateStack.back() == inSeq) { + OutputDash = true; + } else if ((StateStack.size() > 1) && (StateStack.back() == inMapFirstKey) && + (StateStack[StateStack.size() - 2] == inSeq)) { + --Indent; + OutputDash = true; + } + + for (unsigned i = 0; i < Indent; ++i) { + output(" "); + } + if (OutputDash) { + output("- "); + } + +} + +void Output::paddedKey(StringRef key) { + output(key); + output(":"); + const char *spaces = " "; + if (key.size() < strlen(spaces)) + output(&spaces[key.size()]); + else + output(" "); +} + +//===----------------------------------------------------------------------===// +// traits for built-in types +//===----------------------------------------------------------------------===// + +void ScalarTraits<bool>::output(const bool &Val, void *, raw_ostream &Out) { + Out << (Val ? "true" : "false"); +} + +StringRef ScalarTraits<bool>::input(StringRef Scalar, void *, bool &Val) { + if (Scalar.equals("true")) { + Val = true; + return StringRef(); + } else if (Scalar.equals("false")) { + Val = false; + return StringRef(); + } + return "invalid boolean"; +} + +void ScalarTraits<StringRef>::output(const StringRef &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<StringRef>::input(StringRef Scalar, void *, + StringRef &Val) { + Val = Scalar; + return StringRef(); +} + +void ScalarTraits<uint8_t>::output(const uint8_t &Val, void *, + raw_ostream &Out) { + // use temp uint32_t because ostream thinks uint8_t is a character + uint32_t Num = Val; + Out << Num; +} + +StringRef ScalarTraits<uint8_t>::input(StringRef Scalar, void *, uint8_t &Val) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 0, n)) + return "invalid number"; + if (n > 0xFF) + return "out of range number"; + Val = n; + return StringRef(); +} + +void ScalarTraits<uint16_t>::output(const uint16_t &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<uint16_t>::input(StringRef Scalar, void *, + uint16_t &Val) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 0, n)) + return "invalid number"; + if (n > 0xFFFF) + return "out of range number"; + Val = n; + return StringRef(); +} + +void ScalarTraits<uint32_t>::output(const uint32_t &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<uint32_t>::input(StringRef Scalar, void *, + uint32_t &Val) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 0, n)) + return "invalid number"; + if (n > 0xFFFFFFFFUL) + return "out of range number"; + Val = n; + return StringRef(); +} + +void ScalarTraits<uint64_t>::output(const uint64_t &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<uint64_t>::input(StringRef Scalar, void *, + uint64_t &Val) { + unsigned long long N; + if (getAsUnsignedInteger(Scalar, 0, N)) + return "invalid number"; + Val = N; + return StringRef(); +} + +void
ScalarTraits<int8_t>::output(const int8_t &Val, void *, raw_ostream &Out) { + // use temp int32_t because ostream thinks int8_t is a character + int32_t Num = Val; + Out << Num; +} + +StringRef ScalarTraits<int8_t>::input(StringRef Scalar, void *, int8_t &Val) { + long long N; + if (getAsSignedInteger(Scalar, 0, N)) + return "invalid number"; + if ((N > 127) || (N < -128)) + return "out of range number"; + Val = N; + return StringRef(); +} + +void ScalarTraits<int16_t>::output(const int16_t &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<int16_t>::input(StringRef Scalar, void *, int16_t &Val) { + long long N; + if (getAsSignedInteger(Scalar, 0, N)) + return "invalid number"; + if ((N > INT16_MAX) || (N < INT16_MIN)) + return "out of range number"; + Val = N; + return StringRef(); +} + +void ScalarTraits<int32_t>::output(const int32_t &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<int32_t>::input(StringRef Scalar, void *, int32_t &Val) { + long long N; + if (getAsSignedInteger(Scalar, 0, N)) + return "invalid number"; + if ((N > INT32_MAX) || (N < INT32_MIN)) + return "out of range number"; + Val = N; + return StringRef(); +} + +void ScalarTraits<int64_t>::output(const int64_t &Val, void *, + raw_ostream &Out) { + Out << Val; +} + +StringRef ScalarTraits<int64_t>::input(StringRef Scalar, void *, int64_t &Val) { + long long N; + if (getAsSignedInteger(Scalar, 0, N)) + return "invalid number"; + Val = N; + return StringRef(); +} + +void ScalarTraits<double>::output(const double &Val, void *, raw_ostream &Out) { + Out << format("%g", Val); +} + +StringRef ScalarTraits<double>::input(StringRef Scalar, void *, double &Val) { + SmallString<32> buff(Scalar.begin(), Scalar.end()); + char *end; + Val = strtod(buff.c_str(), &end); + if (*end != '\0') + return "invalid floating point number"; + return StringRef(); +} + +void ScalarTraits<float>::output(const float &Val, void *, raw_ostream &Out) { + Out << format("%g", Val); +} + +StringRef ScalarTraits<float>::input(StringRef Scalar, void *, float &Val) { + SmallString<32> buff(Scalar.begin(), Scalar.end()); + char *end; + Val = strtod(buff.c_str(), &end); + if (*end != '\0') + return "invalid floating point number"; + return StringRef(); +} + +void ScalarTraits<Hex8>::output(const Hex8 &Val, void *, raw_ostream &Out) { + uint8_t Num = Val; + Out << format("0x%02X", Num); +} + +StringRef ScalarTraits<Hex8>::input(StringRef Scalar, void *, Hex8 &Val) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 0, n)) + return "invalid hex8 number"; + if (n > 0xFF) + return "out of range hex8 number"; + Val = n; + return StringRef(); +} + +void ScalarTraits<Hex16>::output(const Hex16 &Val, void *, raw_ostream &Out) { + uint16_t Num = Val; + Out << format("0x%04X", Num); +} + +StringRef ScalarTraits<Hex16>::input(StringRef Scalar, void *, Hex16 &Val) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 0, n)) + return "invalid hex16 number"; + if (n > 0xFFFF) + return "out of range hex16 number"; + Val = n; + return StringRef(); +} + +void ScalarTraits<Hex32>::output(const Hex32 &Val, void *, raw_ostream &Out) { + uint32_t Num = Val; + Out << format("0x%08X", Num); +} + +StringRef ScalarTraits<Hex32>::input(StringRef Scalar, void *, Hex32 &Val) { + unsigned long long n; + if (getAsUnsignedInteger(Scalar, 0, n)) + return "invalid hex32 number"; + if (n > 0xFFFFFFFFUL) + return "out of range hex32 number"; + Val = n; + return StringRef(); +} + +void ScalarTraits<Hex64>::output(const
Hex64 &Val, void *, raw_ostream &Out) { + uint64_t Num = Val; + Out << format("0x%016llX", Num); +} + +StringRef ScalarTraits<Hex64>::input(StringRef Scalar, void *, Hex64 &Val) { + unsigned long long Num; + if (getAsUnsignedInteger(Scalar, 0, Num)) + return "invalid hex64 number"; + Val = Num; + return StringRef(); +} diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 827cb64d14..5a83bf7ee8 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -31,7 +31,6 @@ #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/LLVMContext.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" @@ -44,16 +43,6 @@ using namespace llvm; -static cl::opt<bool> -ForceAllBaseRegAlloc("arm-force-base-reg-alloc", cl::Hidden, cl::init(false), - cl::desc("Force use of virtual base registers for stack load/store")); -static cl::opt<bool> -EnableLocalStackAlloc("enable-local-stack-alloc", cl::init(true), cl::Hidden, - cl::desc("Enable pre-regalloc stack frame index allocation")); -static cl::opt<bool> -EnableBasePointer("arm-use-base-pointer", cl::Hidden, cl::init(true), - cl::desc("Enable use of a base pointer for complex stack frames")); - ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, const ARMSubtarget &sti) : ARMGenRegisterInfo(ARM::LR), TII(tii), STI(sti), @@ -280,9 +269,6 @@ bool ARMBaseRegisterInfo::hasBasePointer(const MachineFunction &MF) const { const ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); - if (!EnableBasePointer) - return false; - // When outgoing call frames are so large that we adjust the stack pointer // around the call, we can no longer use the stack pointer to reach the // emergency spill slot. @@ -328,8 +314,6 @@ bool ARMBaseRegisterInfo::canRealignStack(const MachineFunction &MF) const { // pointer adjustments around calls. if (MF.getTarget().getFrameLowering()->hasReservedCallFrame(MF)) return true; - if (!EnableBasePointer) - return false; // A base pointer is required and allowed. Check that it isn't too late to // reserve it. 
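As a usage note for the new YAMLTraits code above: a minimal sketch of driving yaml::Input and yaml::Output, assuming the accompanying llvm/Support/YAMLTraits.h header added by this patch (its MappingTraits and mapRequired hooks are not shown in this diff). The Info struct and its field names are hypothetical, for illustration only:

  #include "llvm/Support/YAMLTraits.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  // Hypothetical native type to round-trip through YAML.
  struct Info {
    StringRef Name;
    uint32_t Size;
  };

  // Traits specialization telling YAML I/O how Info maps to a YAML mapping.
  namespace llvm {
  namespace yaml {
  template <> struct MappingTraits<Info> {
    static void mapping(IO &io, Info &info) {
      io.mapRequired("name", info.Name);
      io.mapRequired("size", info.Size);
    }
  };
  } // end namespace yaml
  } // end namespace llvm

  int main() {
    Info I;
    yaml::Input yin("{ name: .text, size: 64 }");
    yin >> I;                 // parse; failures are reported via yin.error()
    if (!yin.error()) {
      yaml::Output yout(outs());
      yout << I;              // write it back out as a YAML document
    }
  }

Unknown mapping keys, out-of-range integers, and malformed scalars are all diagnosed through the setError paths shown above rather than by assertion.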
return MRI->canReserveReg(BasePtr); @@ -412,7 +396,7 @@ requiresFrameIndexScavenging(const MachineFunction &MF) const { bool ARMBaseRegisterInfo:: requiresVirtualBaseRegisters(const MachineFunction &MF) const { - return EnableLocalStackAlloc; + return true; } static void @@ -551,8 +535,6 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const { case ARM::VLDRS: case ARM::VLDRD: case ARM::VSTRS: case ARM::VSTRD: case ARM::tSTRspi: case ARM::tLDRspi: - if (ForceAllBaseRegAlloc) - return true; break; default: return false; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index c3a5e6472e..7d4cacd7de 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -178,24 +178,24 @@ class ARMFastISel : public FastISel { bool isLoadTypeLegal(Type *Ty, MVT &VT); bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt); - bool ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, + bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, unsigned Alignment = 0, bool isZExt = true, bool allocReg = true); - bool ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, + bool ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Alignment = 0); bool ARMComputeAddress(const Value *Obj, Address &Addr); - void ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3); + void ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3); bool ARMIsMemCpySmall(uint64_t Len); bool ARMTryEmitSmallMemCpy(Address Dest, Address Src, uint64_t Len, unsigned Alignment); - unsigned ARMEmitIntExt(EVT SrcVT, unsigned SrcReg, EVT DestVT, bool isZExt); - unsigned ARMMaterializeFP(const ConstantFP *CFP, EVT VT); - unsigned ARMMaterializeInt(const Constant *C, EVT VT); - unsigned ARMMaterializeGV(const GlobalValue *GV, EVT VT); - unsigned ARMMoveToFPReg(EVT VT, unsigned SrcReg); - unsigned ARMMoveToIntReg(EVT VT, unsigned SrcReg); + unsigned ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt); + unsigned ARMMaterializeFP(const ConstantFP *CFP, MVT VT); + unsigned ARMMaterializeInt(const Constant *C, MVT VT); + unsigned ARMMaterializeGV(const GlobalValue *GV, MVT VT); + unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg); + unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg); unsigned ARMSelectCallOp(bool UseReg); - unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, EVT VT); + unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT); // Call handling routines. private: @@ -221,7 +221,7 @@ class ARMFastISel : public FastISel { bool isARMNEONPred(const MachineInstr *MI); bool DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR); const MachineInstrBuilder &AddOptionalDefs(const MachineInstrBuilder &MIB); - void AddLoadStoreOperands(EVT VT, Address &Addr, + void AddLoadStoreOperands(MVT VT, Address &Addr, const MachineInstrBuilder &MIB, unsigned Flags, bool useAM3); }; @@ -487,7 +487,7 @@ unsigned ARMFastISel::FastEmitInst_extractsubreg(MVT RetVT, // TODO: Don't worry about 64-bit now, but when this is fixed remove the // checks from the various callers. 
-unsigned ARMFastISel::ARMMoveToFPReg(EVT VT, unsigned SrcReg) { +unsigned ARMFastISel::ARMMoveToFPReg(MVT VT, unsigned SrcReg) { if (VT == MVT::f64) return 0; unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); @@ -497,7 +497,7 @@ unsigned ARMFastISel::ARMMoveToFPReg(EVT VT, unsigned SrcReg) { return MoveReg; } -unsigned ARMFastISel::ARMMoveToIntReg(EVT VT, unsigned SrcReg) { +unsigned ARMFastISel::ARMMoveToIntReg(MVT VT, unsigned SrcReg) { if (VT == MVT::i64) return 0; unsigned MoveReg = createResultReg(TLI.getRegClassFor(VT)); @@ -510,7 +510,7 @@ unsigned ARMFastISel::ARMMoveToIntReg(EVT VT, unsigned SrcReg) { // For double width floating point we need to materialize two constants // (the high and the low) into integer registers then use a move to get // the combined constant into an FP reg. -unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) { +unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) { const APFloat Val = CFP->getValueAPF(); bool is64bit = VT == MVT::f64; @@ -554,7 +554,7 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, EVT VT) { return DestReg; } -unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, EVT VT) { +unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) { if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8 && VT != MVT::i1) return false; @@ -616,7 +616,7 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, EVT VT) { return DestReg; } -unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { +unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) { // For now 32-bit only. if (VT != MVT::i32) return 0; @@ -719,10 +719,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { } unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { - EVT VT = TLI.getValueType(C->getType(), true); + EVT CEVT = TLI.getValueType(C->getType(), true); // Only handle simple types. - if (!VT.isSimple()) return 0; + if (!CEVT.isSimple()) return 0; + MVT VT = CEVT.getSimpleVT(); if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) return ARMMaterializeFP(CFP, VT); @@ -898,12 +899,9 @@ bool ARMFastISel::ARMComputeAddress(const Value *Obj, Address &Addr) { return Addr.Base.Reg != 0; } -void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) { - - assert(VT.isSimple() && "Non-simple types are invalid here!"); - +void ARMFastISel::ARMSimplifyAddress(Address &Addr, MVT VT, bool useAM3) { bool needsLowering = false; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { default: llvm_unreachable("Unhandled load/store type!"); case MVT::i1: case MVT::i8: @@ -954,13 +952,12 @@ void ARMFastISel::ARMSimplifyAddress(Address &Addr, EVT VT, bool useAM3) { } } -void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, +void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr, const MachineInstrBuilder &MIB, unsigned Flags, bool useAM3) { // addrmode5 output depends on the selection dag addressing dividing the // offset by 4 that it then later multiplies. Do this here as well. - if (VT.getSimpleVT().SimpleTy == MVT::f32 || - VT.getSimpleVT().SimpleTy == MVT::f64) + if (VT.SimpleTy == MVT::f32 || VT.SimpleTy == MVT::f64) Addr.Offset /= 4; // Frame base works a bit differently. Handle it separately. 
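For reference, the recurring shape of this fast-isel type change: each entry point narrows the extended value type once, and the internal helpers now take a plain MVT and switch on SimpleTy directly. Schematic, condensed from the hunks above and below (not a standalone function):

  // At the boundary: verify the EVT wraps a simple type, then narrow it.
  EVT SrcEVT = TLI.getValueType(Src->getType(), true);
  if (!SrcEVT.isSimple())
    return false;                  // non-simple types bail out once, up front
  MVT SrcVT = SrcEVT.getSimpleVT();

  // In the helpers: no isSimple() assertion or getSimpleVT() call needed.
  switch (SrcVT.SimpleTy) {
  case MVT::i8:  /* ... */ break;
  case MVT::i16: /* ... */ break;
  default: return false;
  }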
@@ -1003,14 +1000,13 @@ void ARMFastISel::AddLoadStoreOperands(EVT VT, Address &Addr, AddOptionalDefs(MIB); } -bool ARMFastISel::ARMEmitLoad(EVT VT, unsigned &ResultReg, Address &Addr, +bool ARMFastISel::ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr, unsigned Alignment, bool isZExt, bool allocReg) { - assert(VT.isSimple() && "Non-simple types are invalid here!"); unsigned Opc; bool useAM3 = false; bool needVMOV = false; const TargetRegisterClass *RC; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { // This is mostly going to be Neon/vector support. default: return false; case MVT::i1: @@ -1127,11 +1123,11 @@ bool ARMFastISel::SelectLoad(const Instruction *I) { return true; } -bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr, +bool ARMFastISel::ARMEmitStore(MVT VT, unsigned SrcReg, Address &Addr, unsigned Alignment) { unsigned StrOpc; bool useAM3 = false; - switch (VT.getSimpleVT().SimpleTy) { + switch (VT.SimpleTy) { // This is mostly going to be Neon/vector support. default: return false; case MVT::i1: { @@ -1405,8 +1401,9 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) { bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, bool isZExt) { Type *Ty = Src1Value->getType(); - EVT SrcVT = TLI.getValueType(Ty, true); - if (!SrcVT.isSimple()) return false; + EVT SrcEVT = TLI.getValueType(Ty, true); + if (!SrcEVT.isSimple()) return false; + MVT SrcVT = SrcEVT.getSimpleVT(); bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy()); if (isFloat && !Subtarget->hasVFP2()) @@ -1443,7 +1440,7 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, unsigned CmpOpc; bool isICmp = true; bool needsExt = false; - switch (SrcVT.getSimpleVT().SimpleTy) { + switch (SrcVT.SimpleTy) { default: return false; // TODO: Verify compares. case MVT::f32: @@ -1595,7 +1592,10 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { return false; Value *Src = I->getOperand(0); - EVT SrcVT = TLI.getValueType(Src->getType(), true); + EVT SrcEVT = TLI.getValueType(Src->getType(), true); + if (!SrcEVT.isSimple()) + return false; + MVT SrcVT = SrcEVT.getSimpleVT(); if (SrcVT != MVT::i32 && SrcVT != MVT::i16 && SrcVT != MVT::i8) return false; @@ -1604,8 +1604,7 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) { // Handle sign-extension. if (SrcVT == MVT::i16 || SrcVT == MVT::i8) { - EVT DestVT = MVT::i32; - SrcReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, + SrcReg = ARMEmitIntExt(SrcVT, SrcReg, MVT::i32, /*isZExt*/!isSigned); if (SrcReg == 0) return false; } @@ -1811,7 +1810,9 @@ bool ARMFastISel::SelectBinaryIntOp(const Instruction *I, unsigned ISDOpcode) { } bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { - EVT VT = TLI.getValueType(I->getType(), true); + EVT FPVT = TLI.getValueType(I->getType(), true); + if (!FPVT.isSimple()) return false; + MVT VT = FPVT.getSimpleVT(); // We can get here in the case when we want to use NEON for our fp // operations, but can't figure out how to. 
Just use the vfp instructions @@ -1842,7 +1843,7 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) { unsigned Op2 = getRegForValue(I->getOperand(1)); if (Op2 == 0) return false; - unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) .addReg(Op1).addReg(Op2)); @@ -2055,7 +2056,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, if (RVLocs.size() == 2 && RetVT == MVT::f64) { // For this move we copy into two registers and then move into the // double fp reg we want. - EVT DestVT = RVLocs[0].getValVT(); + MVT DestVT = RVLocs[0].getValVT(); const TargetRegisterClass* DstRC = TLI.getRegClassFor(DestVT); unsigned ResultReg = createResultReg(DstRC); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, @@ -2070,7 +2071,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<unsigned> &UsedRegs, UpdateValueMap(I, ResultReg); } else { assert(RVLocs.size() == 1 &&"Can't handle non-double multi-reg retvals!"); - EVT CopyVT = RVLocs[0].getValVT(); + MVT CopyVT = RVLocs[0].getValVT(); // Special handling for extended integers. if (RetVT == MVT::i1 || RetVT == MVT::i8 || RetVT == MVT::i16) @@ -2129,8 +2130,10 @@ bool ARMFastISel::SelectRet(const Instruction *I) { return false; unsigned SrcReg = Reg + VA.getValNo(); - EVT RVVT = TLI.getValueType(RV->getType()); - EVT DestVT = VA.getValVT(); + EVT RVEVT = TLI.getValueType(RV->getType()); + if (!RVEVT.isSimple()) return false; + MVT RVVT = RVEVT.getSimpleVT(); + MVT DestVT = VA.getValVT(); // Special handling for extended integers. if (RVVT != DestVT) { if (RVVT != MVT::i1 && RVVT != MVT::i8 && RVVT != MVT::i16) @@ -2175,7 +2178,9 @@ unsigned ARMFastISel::ARMSelectCallOp(bool UseReg) { unsigned ARMFastISel::getLibcallReg(const Twine &Name) { GlobalValue *GV = new GlobalVariable(Type::getInt32Ty(*Context), false, GlobalValue::ExternalLinkage, 0, Name); - return ARMMaterializeGV(GV, TLI.getValueType(GV->getType())); + EVT LCREVT = TLI.getValueType(GV->getType()); + if (!LCREVT.isSimple()) return 0; + return ARMMaterializeGV(GV, LCREVT.getSimpleVT()); } // A quick function that will emit a call for a named libcall in F with the @@ -2587,7 +2592,7 @@ bool ARMFastISel::SelectTrunc(const Instruction *I) { return true; } -unsigned ARMFastISel::ARMEmitIntExt(EVT SrcVT, unsigned SrcReg, EVT DestVT, +unsigned ARMFastISel::ARMEmitIntExt(MVT SrcVT, unsigned SrcReg, MVT DestVT, bool isZExt) { if (DestVT != MVT::i32 && DestVT != MVT::i16 && DestVT != MVT::i8) return 0; @@ -2595,8 +2600,7 @@ unsigned ARMFastISel::ARMEmitIntExt(EVT SrcVT, unsigned SrcReg, EVT DestVT, unsigned Opc; bool isBoolZext = false; const TargetRegisterClass *RC = TLI.getRegClassFor(MVT::i32); - if (!SrcVT.isSimple()) return 0; - switch (SrcVT.getSimpleVT().SimpleTy) { + switch (SrcVT.SimpleTy) { default: return 0; case MVT::i16: if (!Subtarget->hasV6Ops()) return 0; @@ -2643,14 +2647,18 @@ bool ARMFastISel::SelectIntExt(const Instruction *I) { Value *Src = I->getOperand(0); Type *SrcTy = Src->getType(); - EVT SrcVT, DestVT; - SrcVT = TLI.getValueType(SrcTy, true); - DestVT = TLI.getValueType(DestTy, true); - bool isZExt = isa<ZExtInst>(I); unsigned SrcReg = getRegForValue(Src); if (!SrcReg) return false; + EVT SrcEVT, DestEVT; + SrcEVT = TLI.getValueType(SrcTy, true); + DestEVT = TLI.getValueType(DestTy, true); + if (!SrcEVT.isSimple()) return false; + if 
(!DestEVT.isSimple()) return false; + + MVT SrcVT = SrcEVT.getSimpleVT(); + MVT DestVT = DestEVT.getSimpleVT(); unsigned ResultReg = ARMEmitIntExt(SrcVT, SrcReg, DestVT, isZExt); if (ResultReg == 0) return false; UpdateValueMap(I, ResultReg); @@ -2830,7 +2838,7 @@ bool ARMFastISel::TryToFoldLoad(MachineInstr *MI, unsigned OpNo, } unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, - unsigned Align, EVT VT) { + unsigned Align, MVT VT) { bool UseGOTOFF = GV->hasLocalLinkage() || GV->hasHiddenVisibility(); ARMConstantPoolConstant *CPV = ARMConstantPoolConstant::Create(GV, UseGOTOFF ? ARMCP::GOTOFF : ARMCP::GOT); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index a0fe215e94..67a9998d0a 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -1046,7 +1046,7 @@ EVT ARMTargetLowering::getSetCCResultType(EVT VT) const { /// getRegClassFor - Return the register class that should be used for the /// specified value type. -const TargetRegisterClass *ARMTargetLowering::getRegClassFor(EVT VT) const { +const TargetRegisterClass *ARMTargetLowering::getRegClassFor(MVT VT) const { // Map v4i64 to QQ registers but do not make the type legal. Similarly map // v8i64 to QQQQ registers. v4i64 and v8i64 are only used for REG_SEQUENCE to // load / store 4 to 8 consecutive D registers. @@ -9450,33 +9450,32 @@ static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign, EVT ARMTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { const Function *F = MF.getFunction(); // See if we can use NEON instructions for this... - if (IsZeroVal && + if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() && !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) { bool Fast; - if (Size >= 16 && (memOpAlign(SrcAlign, DstAlign, 16) || - (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && - Fast))) { + if (Size >= 16 && + (memOpAlign(SrcAlign, DstAlign, 16) || + (allowsUnalignedMemoryAccesses(MVT::v2f64, &Fast) && Fast))) { return MVT::v2f64; - } else if (Size >= 8 && (memOpAlign(SrcAlign, DstAlign, 8) || - (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && - Fast))) { + } else if (Size >= 8 && + (memOpAlign(SrcAlign, DstAlign, 8) || + (allowsUnalignedMemoryAccesses(MVT::f64, &Fast) && Fast))) { return MVT::f64; } } // Lowering to i32/i16 if the size permits. - if (Size >= 4) { + if (Size >= 4) return MVT::i32; - } else if (Size >= 2) { + else if (Size >= 2) return MVT::i16; - } // Let the target-independent logic figure it out. return MVT::Other; @@ -10261,24 +10260,6 @@ bool ARMTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return false; } -bool ARMTargetLowering::isIntImmLegal(const APInt &Imm, EVT VT) const { - if (VT.getSizeInBits() > 32) - return false; - - int32_t ImmVal = Imm.getSExtValue(); - if (!Subtarget->isThumb()) { - return (ImmVal >= 0 && ImmVal < 65536) || - (ARM_AM::getSOImmVal(ImmVal) != -1) || - (ARM_AM::getSOImmVal(~ImmVal) != -1); - } else if (Subtarget->isThumb2()) { - return (ImmVal >= 0 && ImmVal < 65536) || - (ARM_AM::getT2SOImmVal(ImmVal) != -1) || - (ARM_AM::getT2SOImmVal(~ImmVal) != -1); - } else /*Thumb1*/ { - return (ImmVal >= 0 && ImmVal < 256); - } -} - /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as /// MemIntrinsicNodes. The associated MachineMemOperands record the alignment /// specified in the intrinsic calls. 
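The getOptimalMemOpType change above replaces the single IsZeroVal flag with the (IsMemset, ZeroMemset) pair, so the NEON wide-store path is considered for any memcpy, and for memsets only when the stored value is zero. Condensed from the hunk above:

  // New guard: memcpy always qualifies (IsMemset == false); a memset
  // qualifies only when it stores zeros (ZeroMemset == true).
  if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
      !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
    // ... pick MVT::v2f64 or MVT::f64 when size and alignment allow ...
  }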
@@ -10360,3 +10341,36 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return false; } + +unsigned +ARMScalarTargetTransformImpl::getIntImmCost(const APInt &Imm, Type *Ty) const { + assert(Ty->isIntegerTy()); + + unsigned Bits = Ty->getPrimitiveSizeInBits(); + if (Bits == 0 || Bits > 32) + return 4; + + int32_t SImmVal = Imm.getSExtValue(); + uint32_t ZImmVal = Imm.getZExtValue(); + if (!Subtarget->isThumb()) { + if ((SImmVal >= 0 && SImmVal < 65536) || + (ARM_AM::getSOImmVal(ZImmVal) != -1) || + (ARM_AM::getSOImmVal(~ZImmVal) != -1)) + return 1; + return Subtarget->hasV6T2Ops() ? 2 : 3; + } else if (Subtarget->isThumb2()) { + if ((SImmVal >= 0 && SImmVal < 65536) || + (ARM_AM::getT2SOImmVal(ZImmVal) != -1) || + (ARM_AM::getT2SOImmVal(~ZImmVal) != -1)) + return 1; + return Subtarget->hasV6T2Ops() ? 2 : 3; + } else /*Thumb1*/ { + if (SImmVal >= 0 && SImmVal < 256) + return 1; + if ((~ZImmVal < 256) || ARM_AM::isThumbImmShiftedVal(ZImmVal)) + return 2; + // Load from constantpool. + return 3; + } + return 2; +} diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 5cf40236c5..fa0e5e4fc4 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -22,6 +22,7 @@ #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/Target/TargetTransformImpl.h" #include <vector> namespace llvm { @@ -291,7 +292,7 @@ namespace llvm { virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const; @@ -366,7 +367,7 @@ namespace llvm { /// getRegClassFor - Return the register class that should be used for the /// specified value type. - virtual const TargetRegisterClass *getRegClassFor(EVT VT) const; + virtual const TargetRegisterClass *getRegClassFor(MVT VT) const; /// getMaximalGlobalOffset - Returns the maximal possible offset which can /// be used for loads / stores from the global. @@ -387,8 +388,6 @@ namespace llvm { /// materialize the FP immediate as a load from a constant pool. 
virtual bool isFPImmLegal(const APFloat &Imm, EVT VT) const; - virtual bool isIntImmLegal(const APInt &Imm, EVT VT) const; - virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, unsigned Intrinsic) const; @@ -575,6 +574,16 @@ namespace llvm { FastISel *createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo); } + + class ARMScalarTargetTransformImpl : public ScalarTargetTransformImpl { + const ARMSubtarget *Subtarget; + public: + explicit ARMScalarTargetTransformImpl(const TargetLowering *TL) : + ScalarTargetTransformImpl(TL), + Subtarget(&TL->getTargetMachine().getSubtarget<ARMSubtarget>()) {}; + + virtual unsigned getIntImmCost(const APInt &Imm, Type *Ty) const; + }; } #endif // ARMISELLOWERING_H diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index 834eb3c66a..6f5f50d3c6 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -66,7 +66,7 @@ class ARMTargetMachine : public ARMBaseTargetMachine { ARMTargetLowering TLInfo; ARMSelectionDAGInfo TSInfo; ARMFrameLowering FrameLowering; - ScalarTargetTransformImpl STTI; + ARMScalarTargetTransformImpl STTI; VectorTargetTransformImpl VTTI; public: ARMTargetMachine(const Target &T, StringRef TT, @@ -112,7 +112,7 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { ARMSelectionDAGInfo TSInfo; // Either Thumb1FrameLowering or ARMFrameLowering. OwningPtr<ARMFrameLowering> FrameLowering; - ScalarTargetTransformImpl STTI; + ARMScalarTargetTransformImpl STTI; VectorTargetTransformImpl VTTI; public: ThumbTargetMachine(const Target &T, StringRef TT, diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 4685b1d193..6b42239747 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -5723,7 +5723,12 @@ processInstruction(MCInst &Inst, } // Aliases for alternate PC+imm syntax of LDR instructions. case ARM::t2LDRpcrel: - Inst.setOpcode(ARM::t2LDRpci); + // Select the narrow version if the immediate will fit. 
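+    // tLDRpci is the narrow 16-bit encoding of the PC-relative load;
+    // t2LDRpci, the wide 32-bit encoding, is kept for immediates outside
+    // the narrow range checked below.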
+ if (Inst.getOperand(1).getImm() > 0 && + Inst.getOperand(1).getImm() <= 0xff) + Inst.setOpcode(ARM::tLDRpci); + else + Inst.setOpcode(ARM::t2LDRpci); return true; case ARM::t2LDRBpcrel: Inst.setOpcode(ARM::t2LDRBpci); diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 813c7844ec..9193e40bed 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -133,6 +133,7 @@ const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm, switch (RelocType) { default: EmitThisSym = true; break; case ELF::R_ARM_ABS32: EmitThisSym = false; break; + case ELF::R_ARM_PREL31: EmitThisSym = false; break; } } @@ -225,6 +226,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case FK_Data_4: switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_ARM_NONE: + Type = ELF::R_ARM_NONE; + break; case MCSymbolRefExpr::VK_ARM_GOT: Type = ELF::R_ARM_GOT_BREL; break; @@ -249,7 +253,10 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_ARM_TARGET2: Type = ELF::R_ARM_TARGET2; break; - } + case MCSymbolRefExpr::VK_ARM_PREL31: + Type = ELF::R_ARM_PREL31; + break; + } break; case ARM::fixup_arm_ldst_pcrel_12: case ARM::fixup_arm_pcrel_10: diff --git a/lib/Target/ARM/Thumb2SizeReduction.cpp b/lib/Target/ARM/Thumb2SizeReduction.cpp index d1cde1bf98..7e3b55273e 100644 --- a/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -175,6 +175,11 @@ namespace { bool LiveCPSR, MachineInstr *CPSRDef, bool IsSelfLoop); + /// ReduceMI - Attempt to reduce MI, return true on success. + bool ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, + bool LiveCPSR, MachineInstr *CPSRDef, + bool IsSelfLoop); + /// ReduceMBB - Reduce width of instructions in the specified basic block. bool ReduceMBB(MachineBasicBlock &MBB); }; @@ -841,6 +846,32 @@ static bool UpdateCPSRUse(MachineInstr &MI, bool LiveCPSR) { return LiveCPSR; } +bool Thumb2SizeReduce::ReduceMI(MachineBasicBlock &MBB, MachineInstr *MI, + bool LiveCPSR, MachineInstr *CPSRDef, + bool IsSelfLoop) { + unsigned Opcode = MI->getOpcode(); + DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); + if (OPI == ReduceOpcodeMap.end()) + return false; + const ReduceEntry &Entry = ReduceTable[OPI->second]; + + // Don't attempt normal reductions on "special" cases for now. + if (Entry.Special) + return ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop); + + // Try to transform to a 16-bit two-address instruction. + if (Entry.NarrowOpc2 && + ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) + return true; + + // Try to transform to a 16-bit non-two-address instruction. + if (Entry.NarrowOpc1 && + ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) + return true; + + return false; +} + bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { bool Modified = false; @@ -865,39 +896,12 @@ bool Thumb2SizeReduce::ReduceMBB(MachineBasicBlock &MBB) { LiveCPSR = UpdateCPSRUse(*MI, LiveCPSR); - unsigned Opcode = MI->getOpcode(); - DenseMap<unsigned, unsigned>::iterator OPI = ReduceOpcodeMap.find(Opcode); - if (OPI != ReduceOpcodeMap.end()) { - const ReduceEntry &Entry = ReduceTable[OPI->second]; - // Ignore "special" cases for now. 
- if (Entry.Special) { - if (ReduceSpecial(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) { - Modified = true; - MachineBasicBlock::instr_iterator I = prior(NextMII); - MI = &*I; - } - goto ProcessNext; - } - - // Try to transform to a 16-bit two-address instruction. - if (Entry.NarrowOpc2 && - ReduceTo2Addr(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) { - Modified = true; - MachineBasicBlock::instr_iterator I = prior(NextMII); - MI = &*I; - goto ProcessNext; - } - - // Try to transform to a 16-bit non-two-address instruction. - if (Entry.NarrowOpc1 && - ReduceToNarrow(MBB, MI, Entry, LiveCPSR, CPSRDef, IsSelfLoop)) { - Modified = true; - MachineBasicBlock::instr_iterator I = prior(NextMII); - MI = &*I; - } + if (ReduceMI(MBB, MI, LiveCPSR, CPSRDef, IsSelfLoop)) { + Modified = true; + MachineBasicBlock::instr_iterator I = prior(NextMII); + MI = &*I; } - ProcessNext: if (NextMII != E && MI->isInsideBundle() && !NextMII->isInsideBundle()) { // FIXME: Since post-ra scheduler operates on bundles, the CPSR kill // marker is only on the BUNDLE instruction. Process the BUNDLE diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td index ae407db029..c2e1282224 100644 --- a/lib/Target/Hexagon/HexagonInstrInfoV4.td +++ b/lib/Target/Hexagon/HexagonInstrInfoV4.td @@ -2425,16 +2425,56 @@ let addrMode = BaseImmOffset, validSubTargets = HasV4SubT in { u6_2Ext, 13, 8>, AddrModeRel; } -// Store new-value byte. +// multiclass for new-value store instructions with base + immediate offset. +// and MEMri operand. +multiclass ST_MEMri_Pbase_nv<string mnemonic, RegisterClass RC, bit isNot, + bit isPredNew> { + let PNewValue = #!if(isPredNew, "new", "") in + def #NAME#_nv_V4 : NVInst_V4<(outs), + (ins PredRegs:$src1, MEMri:$addr, RC: $src2), + !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ", + ") ")#mnemonic#"($addr) = $src2.new", + []>, + Requires<[HasV4T]>; +} + +multiclass ST_MEMri_Pred_nv<string mnemonic, RegisterClass RC, bit PredNot> { + let PredSense = #!if(PredNot, "false", "true") in { + defm _c#NAME# : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 0>; -// memb(Re=#U6)=Nt.new -// memb(Rs+#s11:0)=Nt.new -let mayStore = 1, isPredicable = 1 in -def STrib_nv_V4 : NVInst_V4<(outs), (ins MEMri:$addr, IntRegs:$src1), - "memb($addr) = $src1.new", + // Predicate new + defm _cdn#NAME# : ST_MEMri_Pbase_nv<mnemonic, RC, PredNot, 1>; + } +} + +let mayStore = 1, isNVStore = 1, isExtendable = 1, neverHasSideEffects = 1 in +multiclass ST_MEMri_nv<string mnemonic, string CextOp, RegisterClass RC, + bits<5> ImmBits, bits<5> PredImmBits> { + + let CextOpcode = CextOp, BaseOpcode = CextOp in { + let opExtendable = 1, isExtentSigned = 1, opExtentBits = ImmBits, + isPredicable = 1 in + def #NAME#_nv_V4 : NVInst_V4<(outs), + (ins MEMri:$addr, RC:$src), + #mnemonic#"($addr) = $src.new", []>, Requires<[HasV4T]>; + let opExtendable = 2, isExtentSigned = 0, opExtentBits = PredImmBits, + neverHasSideEffects = 1, isPredicated = 1 in { + defm Pt : ST_MEMri_Pred_nv<mnemonic, RC, 0>; + defm NotPt : ST_MEMri_Pred_nv<mnemonic, RC, 1>; + } + } +} + +let addrMode = BaseImmOffset, isMEMri = "true", validSubTargets = HasV4SubT, +mayStore = 1 in { + defm STrib: ST_MEMri_nv<"memb", "STrib", IntRegs, 11, 6>, AddrModeRel; + defm STrih: ST_MEMri_nv<"memh", "STrih", IntRegs, 12, 7>, AddrModeRel; + defm STriw: ST_MEMri_nv<"memw", "STriw", IntRegs, 13, 8>, AddrModeRel; +} + // memb(Ru<<#u2+#U6)=Nt.new let mayStore = 1, AddedComplexity = 10 in def STrib_shl_nv_V4 : NVInst_V4<(outs), @@ -2473,44 
+2513,6 @@ def STb_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; -// Store new-value byte conditionally. -// if ([!]Pv[.new]) memb(#u6)=Nt.new -// if (Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrib_cPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if ($src1) memb($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrib_cdnPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if ($src1.new) memb($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (!Pv) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrib_cNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if (!$src1) memb($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) memb(Rs+#u6:0)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrib_cdnNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if (!$src1.new) memb($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - // if ([!]Pv[.new]) memb(Rx++#s4:0)=Nt.new // if (Pv) memb(Rx++#s4:0)=Nt.new let mayStore = 1, hasCtrlDep = 1, @@ -2548,16 +2550,6 @@ def POST_STbri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), [],"$src3 = $dst">, Requires<[HasV4T]>; - -// Store new-value halfword. -// memh(Re=#U6)=Nt.new -// memh(Rs+#s11:1)=Nt.new -let mayStore = 1, isPredicable = 1 in -def STrih_nv_V4 : NVInst_V4<(outs), (ins MEMri:$addr, IntRegs:$src1), - "memh($addr) = $src1.new", - []>, - Requires<[HasV4T]>; - // memh(Ru<<#u2+#U6)=Nt.new let mayStore = 1, AddedComplexity = 10 in def STrih_shl_nv_V4 : NVInst_V4<(outs), @@ -2597,47 +2589,6 @@ def STh_GP_nv_V4 : NVInst_V4<(outs), Requires<[HasV4T]>; -// Store new-value halfword conditionally. - -// if ([!]Pv[.new]) memh(#u6)=Nt.new - -// if ([!]Pv[.new]) memh(Rs+#u6:1)=Nt.new -// if (Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrih_cPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if ($src1) memh($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrih_cdnPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if ($src1.new) memh($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (!Pv) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrih_cNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if (!$src1) memh($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) memh(Rs+#u6:1)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STrih_cdnNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if (!$src1.new) memh($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - // if ([!]Pv[]) memh(Rx++#s4:1)=Nt.new // if (Pv) memh(Rx++#s4:1)=Nt.new let mayStore = 1, hasCtrlDep = 1, @@ -2675,18 +2626,6 @@ def POST_SThri_cdnNotPt_nv_V4 : NVInstPI_V4<(outs IntRegs:$dst), [],"$src3 = $dst">, Requires<[HasV4T]>; - -// Store new-value word. 
- -// memw(Re=#U6)=Nt.new -// memw(Rs+#s11:2)=Nt.new -let mayStore = 1, isPredicable = 1 in -def STriw_nv_V4 : NVInst_V4<(outs), - (ins MEMri:$addr, IntRegs:$src1), - "memw($addr) = $src1.new", - []>, - Requires<[HasV4T]>; - // memw(Ru<<#u2+#U6)=Nt.new let mayStore = 1, AddedComplexity = 10 in def STriw_shl_nv_V4 : NVInst_V4<(outs), @@ -2723,47 +2662,6 @@ def STw_GP_nv_V4 : NVInst_V4<(outs), []>, Requires<[HasV4T]>; -// Store new-value word conditionally. - -// if ([!]Pv[.new]) memw(#u6)=Nt.new - -// if ([!]Pv[.new]) memw(Rs+#u6:2)=Nt.new -// if (Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STriw_cPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if ($src1) memw($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STriw_cdnPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if ($src1.new) memw($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (!Pv) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STriw_cNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if (!$src1) memw($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - -// if (!Pv.new) memw(Rs+#u6:2)=Nt.new -let mayStore = 1, neverHasSideEffects = 1, - isPredicated = 1 in -def STriw_cdnNotPt_nv_V4 : NVInst_V4<(outs), - (ins PredRegs:$src1, MEMri:$addr, IntRegs:$src2), - "if (!$src1.new) memw($addr) = $src2.new", - []>, - Requires<[HasV4T]>; - // if ([!]Pv[.new]) memw(Rx++#s4:2)=Nt.new // if (Pv) memw(Rx++#s4:2)=Nt.new let mayStore = 1, hasCtrlDep = 1, diff --git a/lib/Target/LLVMBuild.txt b/lib/Target/LLVMBuild.txt index eb6c779f45..f3a9c1c3e7 100644 --- a/lib/Target/LLVMBuild.txt +++ b/lib/Target/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC Sparc X86 XCore +subdirectories = ARM CppBackend Hexagon MBlaze MSP430 NVPTX Mips PowerPC R600 Sparc X86 XCore ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt index bb2c31a33a..586e2d3eef 100644 --- a/lib/Target/MBlaze/InstPrinter/CMakeLists.txt +++ b/lib/Target/MBlaze/InstPrinter/CMakeLists.txt @@ -5,4 +5,4 @@ add_llvm_library(LLVMMBlazeAsmPrinter MBlazeInstPrinter.cpp ) -add_dependencies(LLVMMBlazeAsmPrinter intrinsics_gen MBlazeCommonTableGen) +add_dependencies(LLVMMBlazeAsmPrinter MBlazeCommonTableGen) diff --git a/lib/Target/MSP430/InstPrinter/CMakeLists.txt b/lib/Target/MSP430/InstPrinter/CMakeLists.txt index 99457b924c..64ac994b7f 100644 --- a/lib/Target/MSP430/InstPrinter/CMakeLists.txt +++ b/lib/Target/MSP430/InstPrinter/CMakeLists.txt @@ -4,4 +4,4 @@ add_llvm_library(LLVMMSP430AsmPrinter MSP430InstPrinter.cpp ) -add_dependencies(LLVMMSP430AsmPrinter intrinsics_gen MSP430CommonTableGen) +add_dependencies(LLVMMSP430AsmPrinter MSP430CommonTableGen) diff --git a/lib/Target/Mips/InstPrinter/CMakeLists.txt b/lib/Target/Mips/InstPrinter/CMakeLists.txt index c3f4a6e1be..3e9fbf1c55 100644 --- a/lib/Target/Mips/InstPrinter/CMakeLists.txt +++ b/lib/Target/Mips/InstPrinter/CMakeLists.txt @@ -4,4 +4,4 @@ add_llvm_library(LLVMMipsAsmPrinter 
MipsInstPrinter.cpp ) -add_dependencies(LLVMMipsAsmPrinter intrinsics_gen MipsCommonTableGen) +add_dependencies(LLVMMipsAsmPrinter MipsCommonTableGen) diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp index 9d67aa1856..d5ed8b13f5 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp @@ -34,7 +34,7 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Target &T, StringRef TT) { GPRel32Directive = "\t.gpword\t"; GPRel64Directive = "\t.gpdword\t"; WeakRefDirective = "\t.weak\t"; - + DebugLabelSuffix = "=."; SupportsDebugInformation = true; ExceptionsType = ExceptionHandling::DwarfCFI; HasLEB128 = true; diff --git a/lib/Target/Mips/MipsCondMov.td b/lib/Target/Mips/MipsCondMov.td index 457d238a01..5e21d4dc2b 100644 --- a/lib/Target/Mips/MipsCondMov.td +++ b/lib/Target/Mips/MipsCondMov.td @@ -25,33 +25,29 @@ class CondMovIntInt<RegisterClass CRC, RegisterClass DRC, bits<6> funct, } // cond:int, data:float -class CondMovIntFP<RegisterClass CRC, RegisterClass DRC, bits<5> fmt, - bits<6> func, string instr_asm> : - FFR<0x11, func, fmt, (outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F), - !strconcat(instr_asm, "\t$fd, $fs, $rt"), []> { - bits<5> rt; - let ft = rt; +class CMov_I_F_FT<string opstr, RegisterClass CRC, RegisterClass DRC, + InstrItinClass Itin> : + InstSE<(outs DRC:$fd), (ins DRC:$fs, CRC:$rt, DRC:$F), + !strconcat(opstr, "\t$fd, $fs, $rt"), [], Itin, FrmFR> { let Constraints = "$F = $fd"; } // cond:float, data:int -class CondMovFPInt<RegisterClass RC, SDNode cmov, bits<1> tf, - string instr_asm> : - FCMOV<tf, (outs RC:$rd), (ins RC:$rs, RC:$F), - !strconcat(instr_asm, "\t$rd, $rs, $$fcc0"), - [(set RC:$rd, (cmov RC:$rs, RC:$F))]> { - let cc = 0; +class CMov_F_I_FT<string opstr, RegisterClass RC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> : + InstSE<(outs RC:$rd), (ins RC:$rs, RC:$F), + !strconcat(opstr, "\t$rd, $rs, $$fcc0"), + [(set RC:$rd, (OpNode RC:$rs, RC:$F))], Itin, FrmFR> { let Uses = [FCR31]; let Constraints = "$F = $rd"; } // cond:float, data:float -class CondMovFPFP<RegisterClass RC, SDNode cmov, bits<5> fmt, bits<1> tf, - string instr_asm> : - FFCMOV<fmt, tf, (outs RC:$fd), (ins RC:$fs, RC:$F), - !strconcat(instr_asm, "\t$fd, $fs, $$fcc0"), - [(set RC:$fd, (cmov RC:$fs, RC:$F))]> { - let cc = 0; +class CMov_F_F_FT<string opstr, RegisterClass RC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> : + InstSE<(outs RC:$fd), (ins RC:$fs, RC:$F), + !strconcat(opstr, "\t$fd, $fs, $$fcc0"), + [(set RC:$fd, (OpNode RC:$fs, RC:$F))], Itin, FrmFR> { let Uses = [FCR31]; let Constraints = "$F = $fd"; } @@ -130,57 +126,71 @@ let Predicates = [HasStdEnc], } } -def MOVZ_I_S : CondMovIntFP<CPURegs, FGR32, 16, 18, "movz.s">; -def MOVZ_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 18, "movz.s">, - Requires<[HasMips64, HasStdEnc]> { +def MOVZ_I_S : CMov_I_F_FT<"movz.s", CPURegs, FGR32, IIFmove>, + CMov_I_F_FM<18, 16>; +def MOVZ_I64_S : CMov_I_F_FT<"movz.s", CPU64Regs, FGR32, IIFmove>, + CMov_I_F_FM<18, 16>, Requires<[HasMips64, HasStdEnc]> { let DecoderNamespace = "Mips64"; } -def MOVN_I_S : CondMovIntFP<CPURegs, FGR32, 16, 19, "movn.s">; -def MOVN_I64_S : CondMovIntFP<CPU64Regs, FGR32, 16, 19, "movn.s">, - Requires<[HasMips64, HasStdEnc]> { +def MOVN_I_S : CMov_I_F_FT<"movn.s", CPURegs, FGR32, IIFmove>, + CMov_I_F_FM<19, 16>; +def MOVN_I64_S : CMov_I_F_FT<"movn.s", CPU64Regs, FGR32, IIFmove>, + CMov_I_F_FM<19, 16>, Requires<[HasMips64, HasStdEnc]> { let DecoderNamespace 
= "Mips64"; } let Predicates = [NotFP64bit, HasStdEnc] in { - def MOVZ_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 18, "movz.d">; - def MOVN_I_D32 : CondMovIntFP<CPURegs, AFGR64, 17, 19, "movn.d">; + def MOVZ_I_D32 : CMov_I_F_FT<"movz.d", CPURegs, AFGR64, IIFmove>, + CMov_I_F_FM<18, 17>; + def MOVN_I_D32 : CMov_I_F_FT<"movn.d", CPURegs, AFGR64, IIFmove>, + CMov_I_F_FM<19, 17>; } let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { - def MOVZ_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 18, "movz.d">; - def MOVZ_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 18, "movz.d"> { + def MOVZ_I_D64 : CMov_I_F_FT<"movz.d", CPURegs, FGR64, IIFmove>, + CMov_I_F_FM<18, 17>; + def MOVZ_I64_D64 : CMov_I_F_FT<"movz.d", CPU64Regs, FGR64, IIFmove>, + CMov_I_F_FM<18, 17> { let isCodeGenOnly = 1; } - def MOVN_I_D64 : CondMovIntFP<CPURegs, FGR64, 17, 19, "movn.d">; - def MOVN_I64_D64 : CondMovIntFP<CPU64Regs, FGR64, 17, 19, "movn.d"> { + def MOVN_I_D64 : CMov_I_F_FT<"movn.d", CPURegs, FGR64, IIFmove>, + CMov_I_F_FM<19, 17>; + def MOVN_I64_D64 : CMov_I_F_FT<"movn.d", CPU64Regs, FGR64, IIFmove>, + CMov_I_F_FM<19, 17> { let isCodeGenOnly = 1; } } -def MOVT_I : CondMovFPInt<CPURegs, MipsCMovFP_T, 1, "movt">; -def MOVT_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_T, 1, "movt">, - Requires<[HasMips64, HasStdEnc]> { +def MOVT_I : CMov_F_I_FT<"movt", CPURegs, IIAlu, MipsCMovFP_T>, CMov_F_I_FM<1>; +def MOVT_I64 : CMov_F_I_FT<"movt", CPU64Regs, IIAlu, MipsCMovFP_T>, + CMov_F_I_FM<1>, Requires<[HasMips64, HasStdEnc]> { let DecoderNamespace = "Mips64"; } -def MOVF_I : CondMovFPInt<CPURegs, MipsCMovFP_F, 0, "movf">; -def MOVF_I64 : CondMovFPInt<CPU64Regs, MipsCMovFP_F, 0, "movf">, - Requires<[HasMips64, HasStdEnc]> { +def MOVF_I : CMov_F_I_FT<"movf", CPURegs, IIAlu, MipsCMovFP_F>, CMov_F_I_FM<0>; +def MOVF_I64 : CMov_F_I_FT<"movf", CPU64Regs, IIAlu, MipsCMovFP_F>, + CMov_F_I_FM<0>, Requires<[HasMips64, HasStdEnc]> { let DecoderNamespace = "Mips64"; } -def MOVT_S : CondMovFPFP<FGR32, MipsCMovFP_T, 16, 1, "movt.s">; -def MOVF_S : CondMovFPFP<FGR32, MipsCMovFP_F, 16, 0, "movf.s">; +def MOVT_S : CMov_F_F_FT<"movt.s", FGR32, IIFmove, MipsCMovFP_T>, + CMov_F_F_FM<16, 1>; +def MOVF_S : CMov_F_F_FT<"movf.s", FGR32, IIFmove, MipsCMovFP_F>, + CMov_F_F_FM<16, 0>; let Predicates = [NotFP64bit, HasStdEnc] in { - def MOVT_D32 : CondMovFPFP<AFGR64, MipsCMovFP_T, 17, 1, "movt.d">; - def MOVF_D32 : CondMovFPFP<AFGR64, MipsCMovFP_F, 17, 0, "movf.d">; + def MOVT_D32 : CMov_F_F_FT<"movt.d", AFGR64, IIFmove, MipsCMovFP_T>, + CMov_F_F_FM<17, 1>; + def MOVF_D32 : CMov_F_F_FT<"movf.d", AFGR64, IIFmove, MipsCMovFP_F>, + CMov_F_F_FM<17, 0>; } let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { - def MOVT_D64 : CondMovFPFP<FGR64, MipsCMovFP_T, 17, 1, "movt.d">; - def MOVF_D64 : CondMovFPFP<FGR64, MipsCMovFP_F, 17, 0, "movf.d">; + def MOVT_D64 : CMov_F_F_FT<"movt.d", FGR64, IIFmove, MipsCMovFP_T>, + CMov_F_F_FM<17, 1>; + def MOVF_D64 : CMov_F_F_FT<"movf.d", FGR64, IIFmove, MipsCMovFP_F>, + CMov_F_F_FM<17, 0>; } // Instantiation of conditional move patterns. diff --git a/lib/Target/Mips/MipsDSPInstrInfo.td b/lib/Target/Mips/MipsDSPInstrInfo.td index ef9402865b..e28a1389b4 100644 --- a/lib/Target/Mips/MipsDSPInstrInfo.td +++ b/lib/Target/Mips/MipsDSPInstrInfo.td @@ -75,10 +75,6 @@ def MipsMSUB_DSP : MipsDSPBase<"MSUB_DSP", SDT_MipsDPA>; def MipsMSUBU_DSP : MipsDSPBase<"MSUBU_DSP", SDT_MipsDPA>; // Flags. 
-class IsCommutable { - bit isCommutable = 1; -} - class UseAC { list<Register> Uses = [AC0]; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 619ae077b3..3fad6eec92 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -50,6 +50,13 @@ static cl::opt<bool> LargeGOT("mxgot", cl::Hidden, cl::desc("MIPS: Enable GOT larger than 64k."), cl::init(false)); +static cl::opt<bool> +Mips16HardFloat("mips16-hard-float", cl::NotHidden, + cl::desc("MIPS: mips16 hard float enable."), + cl::init(false)); + + + static const uint16_t O32IntRegs[4] = { Mips::A0, Mips::A1, Mips::A2, Mips::A3 }; @@ -198,6 +205,41 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const { } } +void MipsTargetLowering::setMips16HardFloatLibCalls() { + setLibcallName(RTLIB::ADD_F32, "__mips16_addsf3"); + setLibcallName(RTLIB::ADD_F64, "__mips16_adddf3"); + setLibcallName(RTLIB::SUB_F32, "__mips16_subsf3"); + setLibcallName(RTLIB::SUB_F64, "__mips16_subdf3"); + setLibcallName(RTLIB::MUL_F32, "__mips16_mulsf3"); + setLibcallName(RTLIB::MUL_F64, "__mips16_muldf3"); + setLibcallName(RTLIB::DIV_F32, "__mips16_divsf3"); + setLibcallName(RTLIB::DIV_F64, "__mips16_divdf3"); + setLibcallName(RTLIB::FPEXT_F32_F64, "__mips16_extendsfdf2"); + setLibcallName(RTLIB::FPROUND_F64_F32, "__mips16_truncdfsf2"); + setLibcallName(RTLIB::FPTOSINT_F32_I32, "__mips16_fix_truncsfsi"); + setLibcallName(RTLIB::FPTOSINT_F64_I32, "__mips16_fix_truncdfsi"); + setLibcallName(RTLIB::SINTTOFP_I32_F32, "__mips16_floatsisf"); + setLibcallName(RTLIB::SINTTOFP_I32_F64, "__mips16_floatsidf"); + setLibcallName(RTLIB::UINTTOFP_I32_F32, "__mips16_floatunsisf"); + setLibcallName(RTLIB::UINTTOFP_I32_F64, "__mips16_floatunsidf"); + setLibcallName(RTLIB::OEQ_F32, "__mips16_eqsf2"); + setLibcallName(RTLIB::OEQ_F64, "__mips16_eqdf2"); + setLibcallName(RTLIB::UNE_F32, "__mips16_nesf2"); + setLibcallName(RTLIB::UNE_F64, "__mips16_nedf2"); + setLibcallName(RTLIB::OGE_F32, "__mips16_gesf2"); + setLibcallName(RTLIB::OGE_F64, "__mips16_gedf2"); + setLibcallName(RTLIB::OLT_F32, "__mips16_ltsf2"); + setLibcallName(RTLIB::OLT_F64, "__mips16_ltdf2"); + setLibcallName(RTLIB::OLE_F32, "__mips16_lesf2"); + setLibcallName(RTLIB::OLE_F64, "__mips16_ledf2"); + setLibcallName(RTLIB::OGT_F32, "__mips16_gtsf2"); + setLibcallName(RTLIB::OGT_F64, "__mips16_gtdf2"); + setLibcallName(RTLIB::UO_F32, "__mips16_unordsf2"); + setLibcallName(RTLIB::UO_F64, "__mips16_unorddf2"); + setLibcallName(RTLIB::O_F32, "__mips16_unordsf2"); + setLibcallName(RTLIB::O_F64, "__mips16_unorddf2"); +} + MipsTargetLowering:: MipsTargetLowering(MipsTargetMachine &TM) : TargetLowering(TM, new MipsTargetObjectFile()), @@ -218,6 +260,8 @@ MipsTargetLowering(MipsTargetMachine &TM) if (Subtarget->inMips16Mode()) { addRegisterClass(MVT::i32, &Mips::CPU16RegsRegClass); + if (Mips16HardFloat) + setMips16HardFloatLibCalls(); } if (Subtarget->hasDSP()) { @@ -2155,7 +2199,7 @@ SDValue MipsTargetLowering::LowerRETURNADDR(SDValue Op, MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); - EVT VT = Op.getValueType(); + MVT VT = Op.getSimpleValueType(); unsigned RA = IsN64 ? Mips::RA_64 : Mips::RA; MFI->setReturnAddressIsTaken(true); @@ -2849,12 +2893,14 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // direct call is) turn it into a TargetGlobalAddress/TargetExternalSymbol // node so that legalize doesn't hack it. 
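Before the call-lowering code below resumes, a note on the setMips16HardFloatLibCalls hook added above: it changes no calling convention or selection logic directly; it only renames the runtime entry points, so every FP operation that gets expanded to a soft-float libcall now lands in a __mips16_* helper that is free to use the FPU. A self-contained sketch of that mechanism (plain C++, not the LLVM API; the enum and table here are illustrative):

    #include <cstdio>

    enum Libcall { ADD_F32, ADD_F64, MUL_F32 };
    static const char *LibcallNames[] = {"__addsf3", "__adddf3", "__mulsf3"};

    // Same idea as TargetLowering::setLibcallName: later lowering just
    // reads the table, so overwriting an entry redirects the call.
    static void setLibcallName(Libcall LC, const char *Name) {
      LibcallNames[LC] = Name;
    }

    int main() {
      setLibcallName(ADD_F32, "__mips16_addsf3");  // as in the hunk above
      std::printf("f32 add now lowers to a call to %s\n",
                  LibcallNames[ADD_F32]);
    }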
bool IsPICCall = (IsN64 || IsPIC); // true if calls are translated to jalr $25 - bool GlobalOrExternal = false; + bool GlobalOrExternal = false, InternalLinkage = false; SDValue CalleeLo; if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { if (IsPICCall) { - if (G->getGlobal()->hasInternalLinkage()) + InternalLinkage = G->getGlobal()->hasInternalLinkage(); + + if (InternalLinkage) Callee = getAddrLocal(Callee, DAG, HasMips64); else if (LargeGOT) Callee = getAddrGlobalLargeGOT(Callee, DAG, MipsII::MO_CALL_HI16, @@ -2901,8 +2947,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } // Insert node "GP copy globalreg" before call to function. - // Lazy-binding stubs require GP to point to the GOT. - if (IsPICCall) { + // + // R_MIPS_CALL* operators (emitted when non-internal functions are called + // in PIC mode) allow symbols to be resolved via lazy binding. + // The lazy binding stub requires GP to point to the GOT. + if (IsPICCall && !InternalLinkage) { unsigned GPReg = IsN64 ? Mips::GP_64 : Mips::GP; EVT Ty = IsN64 ? MVT::i64 : MVT::i32; RegsToPass.push_back(std::make_pair(GPReg, GetGlobalReg(DAG, Ty))); @@ -3476,7 +3525,8 @@ MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { } EVT MipsTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsZeroVal, + unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { if (Subtarget->hasMips64()) @@ -3655,7 +3705,7 @@ copyByValRegs(SDValue Chain, DebugLoc DL, std::vector<SDValue> &OutChains, return; // Copy arg registers. - EVT RegTy = MVT::getIntegerVT(CC.regSize() * 8); + MVT RegTy = MVT::getIntegerVT(CC.regSize() * 8); const TargetRegisterClass *RC = getRegClassFor(RegTy); for (unsigned I = 0; I < ByVal.NumRegs; ++I) { @@ -3777,7 +3827,7 @@ MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains, const CCState &CCInfo = CC.getCCInfo(); unsigned Idx = CCInfo.getFirstUnallocated(ArgRegs, NumRegs); unsigned RegSize = CC.regSize(); - EVT RegTy = MVT::getIntegerVT(RegSize * 8); + MVT RegTy = MVT::getIntegerVT(RegSize * 8); const TargetRegisterClass *RC = getRegClassFor(RegTy); MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo *MFI = MF.getFrameInfo(); diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 4b318dc16f..c4b38c66ee 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -174,6 +174,8 @@ namespace llvm { virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const; private: + void setMips16HardFloatLibCalls(); + /// ByValArgInfo - Byval argument information. struct ByValArgInfo { unsigned FirstIdx; // Index of the first register used. @@ -362,7 +364,8 @@ namespace llvm { virtual bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const; virtual EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, - unsigned SrcAlign, bool IsZeroVal, + unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const; diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 2a19583535..3abc986ab3 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -86,272 +86,310 @@ def fpimm0neg : PatLeaf<(fpimm), [{ // Only S32 and D32 are supported right now. //===----------------------------------------------------------------------===// -// FP load. 
-let DecoderMethod = "DecodeFMem" in { -class FPLoad<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>: - FMem<op, (outs RC:$ft), (ins MemOpnd:$addr), - !strconcat(opstr, "\t$ft, $addr"), [(set RC:$ft, (load addr:$addr))], - IILoad>; - -// FP store. -class FPStore<bits<6> op, string opstr, RegisterClass RC, Operand MemOpnd>: - FMem<op, (outs), (ins RC:$ft, MemOpnd:$addr), - !strconcat(opstr, "\t$ft, $addr"), [(store RC:$ft, addr:$addr)], - IIStore>; -} -// FP indexed load. -class FPIdxLoad<bits<6> funct, string opstr, RegisterClass DRC, - RegisterClass PRC, SDPatternOperator FOp = null_frag>: - FFMemIdx<funct, (outs DRC:$fd), (ins PRC:$base, PRC:$index), - !strconcat(opstr, "\t$fd, ${index}(${base})"), - [(set DRC:$fd, (FOp (add PRC:$base, PRC:$index)))]> { - let fs = 0; -} - -// FP indexed store. -class FPIdxStore<bits<6> funct, string opstr, RegisterClass DRC, - RegisterClass PRC, SDPatternOperator FOp= null_frag>: - FFMemIdx<funct, (outs), (ins DRC:$fs, PRC:$base, PRC:$index), - !strconcat(opstr, "\t$fs, ${index}(${base})"), - [(FOp DRC:$fs, (add PRC:$base, PRC:$index))]> { - let fd = 0; -} - -// Instructions that convert an FP value to 32-bit fixed point. -multiclass FFR1_W_M<bits<6> funct, string opstr> { - def _S : FFR1<funct, 16, opstr, "w.s", FGR32, FGR32>; - def _D32 : FFR1<funct, 17, opstr, "w.d", FGR32, AFGR64>, +class ADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin, bit IsComm, + SDPatternOperator OpNode= null_frag> : + InstSE<(outs RC:$fd), (ins RC:$fs, RC:$ft), + !strconcat(opstr, "\t$fd, $fs, $ft"), + [(set RC:$fd, (OpNode RC:$fs, RC:$ft))], Itin, FrmFR> { + let isCommutable = IsComm; +} + +multiclass ADDS_M<string opstr, InstrItinClass Itin, bit IsComm, + SDPatternOperator OpNode = null_frag> { + def _D32 : ADDS_FT<opstr, AFGR64, Itin, IsComm, OpNode>, Requires<[NotFP64bit, HasStdEnc]>; - def _D64 : FFR1<funct, 17, opstr, "w.d", FGR32, FGR64>, + def _D64 : ADDS_FT<opstr, FGR64, Itin, IsComm, OpNode>, Requires<[IsFP64bit, HasStdEnc]> { - let DecoderNamespace = "Mips64"; + string DecoderNamespace = "Mips64"; } } -// Instructions that convert an FP value to 64-bit fixed point. -let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in -multiclass FFR1_L_M<bits<6> funct, string opstr> { - def _S : FFR1<funct, 16, opstr, "l.s", FGR64, FGR32>; - def _D64 : FFR1<funct, 17, opstr, "l.d", FGR64, FGR64>; -} +class ABSS_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode= null_frag> : + InstSE<(outs DstRC:$fd), (ins SrcRC:$fs), !strconcat(opstr, "\t$fd, $fs"), + [(set DstRC:$fd, (OpNode SrcRC:$fs))], Itin, FrmFR>; -// FP-to-FP conversion instructions. 
-multiclass FFR1P_M<bits<6> funct, string opstr, SDNode OpNode> { - def _S : FFR1P<funct, 16, opstr, "s", FGR32, FGR32, OpNode>; - def _D32 : FFR1P<funct, 17, opstr, "d", AFGR64, AFGR64, OpNode>, +multiclass ABSS_M<string opstr, InstrItinClass Itin, + SDPatternOperator OpNode= null_frag> { + def _D32 : ABSS_FT<opstr, AFGR64, AFGR64, Itin, OpNode>, Requires<[NotFP64bit, HasStdEnc]>; - def _D64 : FFR1P<funct, 17, opstr, "d", FGR64, FGR64, OpNode>, + def _D64 : ABSS_FT<opstr, FGR64, FGR64, Itin, OpNode>, Requires<[IsFP64bit, HasStdEnc]> { - let DecoderNamespace = "Mips64"; + string DecoderNamespace = "Mips64"; } } -multiclass FFR2P_M<bits<6> funct, string opstr, SDNode OpNode, bit isComm = 0> { - let isCommutable = isComm in { - def _S : FFR2P<funct, 16, opstr, "s", FGR32, OpNode>; - def _D32 : FFR2P<funct, 17, opstr, "d", AFGR64, OpNode>, +multiclass ROUND_M<string opstr, InstrItinClass Itin> { + def _D32 : ABSS_FT<opstr, FGR32, AFGR64, Itin>, Requires<[NotFP64bit, HasStdEnc]>; - def _D64 : FFR2P<funct, 17, opstr, "d", FGR64, OpNode>, + def _D64 : ABSS_FT<opstr, FGR32, FGR64, Itin>, Requires<[IsFP64bit, HasStdEnc]> { let DecoderNamespace = "Mips64"; } } -} -// FP madd/msub/nmadd/nmsub instruction classes. -class FMADDSUB<bits<3> funct, bits<3> fmt, string opstr, string fmtstr, - SDNode OpNode, RegisterClass RC> : - FFMADDSUB<funct, fmt, (outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft), - !strconcat(opstr, ".", fmtstr, "\t$fd, $fr, $fs, $ft"), - [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))]>; - -class FNMADDSUB<bits<3> funct, bits<3> fmt, string opstr, string fmtstr, - SDNode OpNode, RegisterClass RC> : - FFMADDSUB<funct, fmt, (outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft), - !strconcat(opstr, ".", fmtstr, "\t$fd, $fr, $fs, $ft"), - [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))]>; +class MFC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode= null_frag> : + InstSE<(outs DstRC:$rt), (ins SrcRC:$fs), !strconcat(opstr, "\t$rt, $fs"), + [(set DstRC:$rt, (OpNode SrcRC:$fs))], Itin, FrmFR>; + +class MTC1_FT<string opstr, RegisterClass DstRC, RegisterClass SrcRC, + InstrItinClass Itin, SDPatternOperator OpNode= null_frag> : + InstSE<(outs DstRC:$fs), (ins SrcRC:$rt), !strconcat(opstr, "\t$rt, $fs"), + [(set DstRC:$fs, (OpNode SrcRC:$rt))], Itin, FrmFR>; + +class LW_FT<string opstr, RegisterClass RC, InstrItinClass Itin, + Operand MemOpnd, SDPatternOperator OpNode= null_frag> : + InstSE<(outs RC:$rt), (ins MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"), + [(set RC:$rt, (OpNode addr:$addr))], Itin, FrmFI> { + let DecoderMethod = "DecodeFMem"; +} + +class SW_FT<string opstr, RegisterClass RC, InstrItinClass Itin, + Operand MemOpnd, SDPatternOperator OpNode= null_frag> : + InstSE<(outs), (ins RC:$rt, MemOpnd:$addr), !strconcat(opstr, "\t$rt, $addr"), + [(OpNode RC:$rt, addr:$addr)], Itin, FrmFI> { + let DecoderMethod = "DecodeFMem"; +} + +class MADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> : + InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft), + !strconcat(opstr, "\t$fd, $fr, $fs, $ft"), + [(set RC:$fd, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr))], Itin, FrmFR>; + +class NMADDS_FT<string opstr, RegisterClass RC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> : + InstSE<(outs RC:$fd), (ins RC:$fr, RC:$fs, RC:$ft), + !strconcat(opstr, "\t$fd, $fr, $fs, $ft"), + [(set RC:$fd, (fsub fpimm0, (OpNode (fmul RC:$fs, RC:$ft), RC:$fr)))], + Itin, FrmFR>; + +class 
LWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC, + InstrItinClass Itin, SDPatternOperator OpNode = null_frag> : + InstSE<(outs DRC:$fd), (ins PRC:$base, PRC:$index), + !strconcat(opstr, "\t$fd, ${index}(${base})"), + [(set DRC:$fd, (OpNode (add PRC:$base, PRC:$index)))], Itin, FrmFI>; + +class SWXC1_FT<string opstr, RegisterClass DRC, RegisterClass PRC, + InstrItinClass Itin, SDPatternOperator OpNode = null_frag> : + InstSE<(outs), (ins DRC:$fs, PRC:$base, PRC:$index), + !strconcat(opstr, "\t$fs, ${index}(${base})"), + [(OpNode DRC:$fs, (add PRC:$base, PRC:$index))], Itin, FrmFI>; + +class BC1F_FT<string opstr, InstrItinClass Itin, + SDPatternOperator Op = null_frag> : + InstSE<(outs), (ins brtarget:$offset), !strconcat(opstr, "\t$offset"), + [(MipsFPBrcond Op, bb:$offset)], Itin, FrmFI> { + let isBranch = 1; + let isTerminator = 1; + let hasDelaySlot = 1; + let Defs = [AT]; + let Uses = [FCR31]; +} + +class CEQS_FT<string typestr, RegisterClass RC, InstrItinClass Itin, + SDPatternOperator OpNode = null_frag> : + InstSE<(outs), (ins RC:$fs, RC:$ft, condcode:$cond), + !strconcat("c.$cond.", typestr, "\t$fs, $ft"), + [(OpNode RC:$fs, RC:$ft, imm:$cond)], Itin, FrmFR> { + let Defs = [FCR31]; +} //===----------------------------------------------------------------------===// // Floating Point Instructions //===----------------------------------------------------------------------===// -defm ROUND_W : FFR1_W_M<0xc, "round">; -defm ROUND_L : FFR1_L_M<0x8, "round">; -defm TRUNC_W : FFR1_W_M<0xd, "trunc">; -defm TRUNC_L : FFR1_L_M<0x9, "trunc">; -defm CEIL_W : FFR1_W_M<0xe, "ceil">; -defm CEIL_L : FFR1_L_M<0xa, "ceil">; -defm FLOOR_W : FFR1_W_M<0xf, "floor">; -defm FLOOR_L : FFR1_L_M<0xb, "floor">; -defm CVT_W : FFR1_W_M<0x24, "cvt">, NeverHasSideEffects; -//defm CVT_L : FFR1_L_M<0x25, "cvt">; - -def CVT_S_W : FFR1<0x20, 20, "cvt", "s.w", FGR32, FGR32>, NeverHasSideEffects; -def CVT_L_S : FFR1<0x25, 16, "cvt", "l.s", FGR64, FGR32>, NeverHasSideEffects; -def CVT_L_D64: FFR1<0x25, 17, "cvt", "l.d", FGR64, FGR64>, NeverHasSideEffects; +def ROUND_W_S : ABSS_FT<"round.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xc, 16>; +def TRUNC_W_S : ABSS_FT<"trunc.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xd, 16>; +def CEIL_W_S : ABSS_FT<"ceil.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xe, 16>; +def FLOOR_W_S : ABSS_FT<"floor.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0xf, 16>; +def CVT_W_S : ABSS_FT<"cvt.w.s", FGR32, FGR32, IIFcvt>, ABSS_FM<0x24, 16>, + NeverHasSideEffects; + +defm ROUND_W : ROUND_M<"round.w.d", IIFcvt>, ABSS_FM<0xc, 17>; +defm TRUNC_W : ROUND_M<"trunc.w.d", IIFcvt>, ABSS_FM<0xd, 17>; +defm CEIL_W : ROUND_M<"ceil.w.d", IIFcvt>, ABSS_FM<0xe, 17>; +defm FLOOR_W : ROUND_M<"floor.w.d", IIFcvt>, ABSS_FM<0xf, 17>; +defm CVT_W : ROUND_M<"cvt.w.d", IIFcvt>, ABSS_FM<0x24, 17>, + NeverHasSideEffects; + +let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64" in { + def ROUND_L_S : ABSS_FT<"round.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x8, 16>; + def ROUND_L_D64 : ABSS_FT<"round.l.d", FGR64, FGR64, IIFcvt>, + ABSS_FM<0x8, 17>; + def TRUNC_L_S : ABSS_FT<"trunc.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x9, 16>; + def TRUNC_L_D64 : ABSS_FT<"trunc.l.d", FGR64, FGR64, IIFcvt>, + ABSS_FM<0x9, 17>; + def CEIL_L_S : ABSS_FT<"ceil.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0xa, 16>; + def CEIL_L_D64 : ABSS_FT<"ceil.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0xa, 17>; + def FLOOR_L_S : ABSS_FT<"floor.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0xb, 16>; + def FLOOR_L_D64 : ABSS_FT<"floor.l.d", FGR64, FGR64, IIFcvt>, + ABSS_FM<0xb, 17>; +} 
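The pattern running through this whole MipsInstrFPU.td rewrite: each *_FT class carries only the assembly string, operand list, pattern, and itinerary, while a matching *_FM class (defined in the MipsInstrFormats.td hunk later in this patch) contributes the bit-level encoding, and each def mixes one of each. A C++ sketch of the ABSS_FM field packing, checked against the round.w.s def above (illustrative code, not part of the patch):

    #include <cassert>
    #include <cstdint>

    // ABSS_FM<funct, fmt> lays out |010001|fmt|00000|fs|fd|funct|.
    static uint32_t encodeABSS(uint32_t fmt, uint32_t fs, uint32_t fd,
                               uint32_t funct) {
      return (0x11u << 26) | (fmt << 21) | (fs << 11) | (fd << 6) | funct;
    }

    int main() {
      // round.w.s $f0, $f2 uses ABSS_FM<0xc, 16> with fd = 0, fs = 2.
      assert(encodeABSS(16, 2, 0, 0xc) == 0x4600100c);
    }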
+ +def CVT_S_W : ABSS_FT<"cvt.s.w", FGR32, FGR32, IIFcvt>, ABSS_FM<0x20, 20>; +def CVT_L_S : ABSS_FT<"cvt.l.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x25, 16>, + NeverHasSideEffects; +def CVT_L_D64: ABSS_FT<"cvt.l.d", FGR64, FGR64, IIFcvt>, ABSS_FM<0x25, 17>, + NeverHasSideEffects; let Predicates = [NotFP64bit, HasStdEnc], neverHasSideEffects = 1 in { - def CVT_S_D32 : FFR1<0x20, 17, "cvt", "s.d", FGR32, AFGR64>; - def CVT_D32_W : FFR1<0x21, 20, "cvt", "d.w", AFGR64, FGR32>; - def CVT_D32_S : FFR1<0x21, 16, "cvt", "d.s", AFGR64, FGR32>; + def CVT_S_D32 : ABSS_FT<"cvt.s.d", FGR32, AFGR64, IIFcvt>, ABSS_FM<0x20, 17>; + def CVT_D32_W : ABSS_FT<"cvt.d.w", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>; + def CVT_D32_S : ABSS_FT<"cvt.d.s", AFGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>; } let Predicates = [IsFP64bit, HasStdEnc], DecoderNamespace = "Mips64", neverHasSideEffects = 1 in { - def CVT_S_D64 : FFR1<0x20, 17, "cvt", "s.d", FGR32, FGR64>; - def CVT_S_L : FFR1<0x20, 21, "cvt", "s.l", FGR32, FGR64>; - def CVT_D64_W : FFR1<0x21, 20, "cvt", "d.w", FGR64, FGR32>; - def CVT_D64_S : FFR1<0x21, 16, "cvt", "d.s", FGR64, FGR32>; - def CVT_D64_L : FFR1<0x21, 21, "cvt", "d.l", FGR64, FGR64>; + def CVT_S_D64 : ABSS_FT<"cvt.s.d", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 17>; + def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32, FGR64, IIFcvt>, ABSS_FM<0x20, 21>; + def CVT_D64_W : ABSS_FT<"cvt.d.w", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 20>; + def CVT_D64_S : ABSS_FT<"cvt.d.s", FGR64, FGR32, IIFcvt>, ABSS_FM<0x21, 16>; + def CVT_D64_L : ABSS_FT<"cvt.d.l", FGR64, FGR64, IIFcvt>, ABSS_FM<0x21, 21>; } let Predicates = [NoNaNsFPMath, HasStdEnc] in { - defm FABS : FFR1P_M<0x5, "abs", fabs>; - defm FNEG : FFR1P_M<0x7, "neg", fneg>; + def FABS_S : ABSS_FT<"abs.s", FGR32, FGR32, IIFcvt, fabs>, ABSS_FM<0x5, 16>; + def FNEG_S : ABSS_FT<"neg.s", FGR32, FGR32, IIFcvt, fneg>, ABSS_FM<0x7, 16>; + defm FABS : ABSS_M<"abs.d", IIFcvt, fabs>, ABSS_FM<0x5, 17>; + defm FNEG : ABSS_M<"neg.d", IIFcvt, fneg>, ABSS_FM<0x7, 17>; } -defm FSQRT : FFR1P_M<0x4, "sqrt", fsqrt>; + +def FSQRT_S : ABSS_FT<"sqrt.s", FGR32, FGR32, IIFsqrtSingle, fsqrt>, + ABSS_FM<0x4, 16>; +defm FSQRT : ABSS_M<"sqrt.d", IIFsqrtDouble, fsqrt>, ABSS_FM<0x4, 17>; // The odd-numbered registers are only referenced when doing loads, // stores, and moves between floating-point and integer registers. // When defining instructions, we reference all 32-bit registers, // regardless of register aliasing. 
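The MFC1/MTC1 family defined just below uses bitconvert patterns: the move copies raw bits between an integer register and an FP register, with no numeric conversion involved (hence the register-aliasing caveat above). For illustration, the C++ analogue of that DAG node is a memcpy of the representation (sketch only):

    #include <cstdint>
    #include <cstring>

    // mfc1 rt, fs: copy the bit pattern of an f32 into a GPR.
    static uint32_t mfc1_bits(float fs) {
      uint32_t rt;
      std::memcpy(&rt, &fs, sizeof rt);
      return rt;  // mfc1_bits(1.0f) == 0x3f800000
    }

    // mtc1 rt, fs: the opposite direction, same raw-bit copy.
    static float mtc1_bits(uint32_t rt) {
      float fs;
      std::memcpy(&fs, &rt, sizeof fs);
      return fs;
    }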
-class FFRGPR<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern>: - FFR<0x11, 0x0, _fmt, outs, ins, asmstr, pattern> { - bits<5> rt; - let ft = rt; - let fd = 0; -} - /// Move Control Registers From/To CPU Registers -def CFC1 : FFRGPR<0x2, (outs CPURegs:$rt), (ins CCR:$fs), - "cfc1\t$rt, $fs", []>; - -def CTC1 : FFRGPR<0x6, (outs CCR:$fs), (ins CPURegs:$rt), - "ctc1\t$rt, $fs", []>; - -def MFC1 : FFRGPR<0x00, (outs CPURegs:$rt), (ins FGR32:$fs), - "mfc1\t$rt, $fs", - [(set CPURegs:$rt, (bitconvert FGR32:$fs))]>; - -def MTC1 : FFRGPR<0x04, (outs FGR32:$fs), (ins CPURegs:$rt), - "mtc1\t$rt, $fs", - [(set FGR32:$fs, (bitconvert CPURegs:$rt))]>; - -def DMFC1 : FFRGPR<0x01, (outs CPU64Regs:$rt), (ins FGR64:$fs), - "dmfc1\t$rt, $fs", - [(set CPU64Regs:$rt, (bitconvert FGR64:$fs))]>; - -def DMTC1 : FFRGPR<0x05, (outs FGR64:$fs), (ins CPU64Regs:$rt), - "dmtc1\t$rt, $fs", - [(set FGR64:$fs, (bitconvert CPU64Regs:$rt))]>; - -def FMOV_S : FFR1<0x6, 16, "mov", "s", FGR32, FGR32>; -def FMOV_D32 : FFR1<0x6, 17, "mov", "d", AFGR64, AFGR64>, +def CFC1 : MFC1_FT<"cfc1", CPURegs, CCR, IIFmove>, MFC1_FM<2>; +def CTC1 : MTC1_FT<"ctc1", CCR, CPURegs, IIFmove>, MFC1_FM<6>; +def MFC1 : MFC1_FT<"mfc1", CPURegs, FGR32, IIFmove, bitconvert>, MFC1_FM<0>; +def MTC1 : MTC1_FT<"mtc1", FGR32, CPURegs, IIFmove, bitconvert>, MFC1_FM<4>; +def DMFC1 : MFC1_FT<"dmfc1", CPU64Regs, FGR64, IIFmove, bitconvert>, MFC1_FM<1>; +def DMTC1 : MTC1_FT<"dmtc1", FGR64, CPU64Regs, IIFmove, bitconvert>, MFC1_FM<5>; + +def FMOV_S : ABSS_FT<"mov.s", FGR32, FGR32, IIFmove>, ABSS_FM<0x6, 16>; +def FMOV_D32 : ABSS_FT<"mov.d", AFGR64, AFGR64, IIFmove>, ABSS_FM<0x6, 17>, Requires<[NotFP64bit, HasStdEnc]>; -def FMOV_D64 : FFR1<0x6, 17, "mov", "d", FGR64, FGR64>, +def FMOV_D64 : ABSS_FT<"mov.d", FGR64, FGR64, IIFmove>, ABSS_FM<0x6, 17>, Requires<[IsFP64bit, HasStdEnc]> { let DecoderNamespace = "Mips64"; } /// Floating Point Memory Instructions let Predicates = [IsN64, HasStdEnc], DecoderNamespace = "Mips64" in { - def LWC1_P8 : FPLoad<0x31, "lwc1", FGR32, mem64>; - def SWC1_P8 : FPStore<0x39, "swc1", FGR32, mem64>; - def LDC164_P8 : FPLoad<0x35, "ldc1", FGR64, mem64> { + def LWC1_P8 : LW_FT<"lwc1", FGR32, IILoad, mem64, load>, LW_FM<0x31>; + def SWC1_P8 : SW_FT<"swc1", FGR32, IIStore, mem64, store>, LW_FM<0x39>; + def LDC164_P8 : LW_FT<"ldc1", FGR64, IILoad, mem64, load>, LW_FM<0x35> { let isCodeGenOnly =1; } - def SDC164_P8 : FPStore<0x3d, "sdc1", FGR64, mem64> { + def SDC164_P8 : SW_FT<"sdc1", FGR64, IIStore, mem64, store>, LW_FM<0x3d> { let isCodeGenOnly =1; } } let Predicates = [NotN64, HasStdEnc] in { - def LWC1 : FPLoad<0x31, "lwc1", FGR32, mem>; - def SWC1 : FPStore<0x39, "swc1", FGR32, mem>; + def LWC1 : LW_FT<"lwc1", FGR32, IILoad, mem, load>, LW_FM<0x31>; + def SWC1 : SW_FT<"swc1", FGR32, IIStore, mem, store>, LW_FM<0x39>; } let Predicates = [NotN64, HasMips64, HasStdEnc], DecoderNamespace = "Mips64" in { - def LDC164 : FPLoad<0x35, "ldc1", FGR64, mem>; - def SDC164 : FPStore<0x3d, "sdc1", FGR64, mem>; + def LDC164 : LW_FT<"ldc1", FGR64, IILoad, mem, load>, LW_FM<0x35>; + def SDC164 : SW_FT<"sdc1", FGR64, IIStore, mem, store>, LW_FM<0x3d>; } let Predicates = [NotN64, NotMips64, HasStdEnc] in { - def LDC1 : FPLoad<0x35, "ldc1", AFGR64, mem>; - def SDC1 : FPStore<0x3d, "sdc1", AFGR64, mem>; + def LDC1 : LW_FT<"ldc1", AFGR64, IILoad, mem, load>, LW_FM<0x35>; + def SDC1 : SW_FT<"sdc1", AFGR64, IIStore, mem, store>, LW_FM<0x3d>; } // Indexed loads and stores. 
let Predicates = [HasFPIdx, HasStdEnc] in { - def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load>; - def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store>; + def LWXC1 : LWXC1_FT<"lwxc1", FGR32, CPURegs, IILoad, load>, LWXC1_FM<0>; + def SWXC1 : SWXC1_FT<"swxc1", FGR32, CPURegs, IIStore, store>, SWXC1_FM<8>; } let Predicates = [HasMips32r2, NotMips64, HasStdEnc] in { - def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load>; - def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store>; + def LDXC1 : LWXC1_FT<"ldxc1", AFGR64, CPURegs, IILoad, load>, LWXC1_FM<1>; + def SDXC1 : SWXC1_FT<"sdxc1", AFGR64, CPURegs, IIStore, store>, SWXC1_FM<9>; } let Predicates = [HasMips64, NotN64, HasStdEnc], DecoderNamespace="Mips64" in { - def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load>; - def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store>; + def LDXC164 : LWXC1_FT<"ldxc1", FGR64, CPURegs, IILoad, load>, LWXC1_FM<1>; + def SDXC164 : SWXC1_FT<"sdxc1", FGR64, CPURegs, IIStore, store>, SWXC1_FM<9>; } // n64 let Predicates = [IsN64, HasStdEnc], isCodeGenOnly=1 in { - def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load>; - def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load>; - def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store>; - def SDXC164_P8 : FPIdxStore<0x9, "sdxc1", FGR64, CPU64Regs, store>; + def LWXC1_P8 : LWXC1_FT<"lwxc1", FGR32, CPU64Regs, IILoad, load>, LWXC1_FM<0>; + def LDXC164_P8 : LWXC1_FT<"ldxc1", FGR64, CPU64Regs, IILoad, load>, + LWXC1_FM<1>; + def SWXC1_P8 : SWXC1_FT<"swxc1", FGR32, CPU64Regs, IIStore, store>, + SWXC1_FM<8>; + def SDXC164_P8 : SWXC1_FT<"sdxc1", FGR64, CPU64Regs, IIStore, store>, + SWXC1_FM<9>; } // Load/store doubleword indexed unaligned. let Predicates = [NotMips64, HasStdEnc] in { - def LUXC1 : FPIdxLoad<0x5, "luxc1", AFGR64, CPURegs>; - def SUXC1 : FPIdxStore<0xd, "suxc1", AFGR64, CPURegs>; + def LUXC1 : LWXC1_FT<"luxc1", AFGR64, CPURegs, IILoad>, LWXC1_FM<0x5>; + def SUXC1 : SWXC1_FT<"suxc1", AFGR64, CPURegs, IIStore>, SWXC1_FM<0xd>; } let Predicates = [HasMips64, HasStdEnc], DecoderNamespace="Mips64" in { - def LUXC164 : LWXC1_FT<"luxc1", FGR64, CPURegs, IILoad>, LWXC1_FM<0x5>; - def SUXC164 : FPIdxStore<0xd, "suxc1", FGR64, CPURegs>; + def LUXC164 : LWXC1_FT<"luxc1", FGR64, CPURegs, IILoad>, LWXC1_FM<0x5>; + def SUXC164 : SWXC1_FT<"suxc1", FGR64, CPURegs, IIStore>, SWXC1_FM<0xd>; } /// Floating-point Arithmetic -defm FADD : FFR2P_M<0x00, "add", fadd, 1>; -defm FDIV : FFR2P_M<0x03, "div", fdiv>; -defm FMUL : FFR2P_M<0x02, "mul", fmul, 1>; -defm FSUB : FFR2P_M<0x01, "sub", fsub>; +def FADD_S : ADDS_FT<"add.s", FGR32, IIFadd, 1, fadd>, ADDS_FM<0x00, 16>; +defm FADD : ADDS_M<"add.d", IIFadd, 1, fadd>, ADDS_FM<0x00, 17>; +def FDIV_S : ADDS_FT<"div.s", FGR32, IIFdivSingle, 0, fdiv>, ADDS_FM<0x03, 16>; +defm FDIV : ADDS_M<"div.d", IIFdivDouble, 0, fdiv>, ADDS_FM<0x03, 17>; +def FMUL_S : ADDS_FT<"mul.s", FGR32, IIFmulSingle, 1, fmul>, ADDS_FM<0x02, 16>; +defm FMUL : ADDS_M<"mul.d", IIFmulDouble, 1, fmul>, ADDS_FM<0x02, 17>; +def FSUB_S : ADDS_FT<"sub.s", FGR32, IIFadd, 0, fsub>, ADDS_FM<0x01, 16>; +defm FSUB : ADDS_M<"sub.d", IIFadd, 0, fsub>, ADDS_FM<0x01, 17>; let Predicates = [HasMips32r2, HasStdEnc] in { - def MADD_S : FMADDSUB<0x4, 0, "madd", "s", fadd, FGR32>; - def MSUB_S : FMADDSUB<0x5, 0, "msub", "s", fsub, FGR32>; + def MADD_S : MADDS_FT<"madd.s", FGR32, IIFmulSingle, fadd>, MADDS_FM<4, 0>; + def MSUB_S : MADDS_FT<"msub.s", FGR32, IIFmulSingle, fsub>, MADDS_FM<5, 0>; } let Predicates = [HasMips32r2, NoNaNsFPMath,
HasStdEnc] in { - def NMADD_S : FNMADDSUB<0x6, 0, "nmadd", "s", fadd, FGR32>; - def NMSUB_S : FNMADDSUB<0x7, 0, "nmsub", "s", fsub, FGR32>; + def NMADD_S : NMADDS_FT<"nmadd.s", FGR32, IIFmulSingle, fadd>, MADDS_FM<6, 0>; + def NMSUB_S : NMADDS_FT<"nmsub.s", FGR32, IIFmulSingle, fsub>, MADDS_FM<7, 0>; } let Predicates = [HasMips32r2, NotFP64bit, HasStdEnc] in { - def MADD_D32 : FMADDSUB<0x4, 1, "madd", "d", fadd, AFGR64>; - def MSUB_D32 : FMADDSUB<0x5, 1, "msub", "d", fsub, AFGR64>; + def MADD_D32 : MADDS_FT<"madd.d", AFGR64, IIFmulDouble, fadd>, MADDS_FM<4, 1>; + def MSUB_D32 : MADDS_FT<"msub.d", AFGR64, IIFmulDouble, fsub>, MADDS_FM<5, 1>; } let Predicates = [HasMips32r2, NotFP64bit, NoNaNsFPMath, HasStdEnc] in { - def NMADD_D32 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, AFGR64>; - def NMSUB_D32 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, AFGR64>; + def NMADD_D32 : NMADDS_FT<"nmadd.d", AFGR64, IIFmulDouble, fadd>, + MADDS_FM<6, 1>; + def NMSUB_D32 : NMADDS_FT<"nmsub.d", AFGR64, IIFmulDouble, fsub>, + MADDS_FM<7, 1>; } let Predicates = [HasMips32r2, IsFP64bit, HasStdEnc], isCodeGenOnly=1 in { - def MADD_D64 : FMADDSUB<0x4, 1, "madd", "d", fadd, FGR64>; - def MSUB_D64 : FMADDSUB<0x5, 1, "msub", "d", fsub, FGR64>; + def MADD_D64 : MADDS_FT<"madd.d", FGR64, IIFmulDouble, fadd>, MADDS_FM<4, 1>; + def MSUB_D64 : MADDS_FT<"msub.d", FGR64, IIFmulDouble, fsub>, MADDS_FM<5, 1>; } let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc], isCodeGenOnly=1 in { - def NMADD_D64 : FNMADDSUB<0x6, 1, "nmadd", "d", fadd, FGR64>; - def NMSUB_D64 : FNMADDSUB<0x7, 1, "nmsub", "d", fsub, FGR64>; + def NMADD_D64 : NMADDS_FT<"nmadd.d", FGR64, IIFmulDouble, fadd>, + MADDS_FM<6, 1>; + def NMSUB_D64 : NMADDS_FT<"nmsub.d", FGR64, IIFmulDouble, fsub>, + MADDS_FM<7, 1>; } //===----------------------------------------------------------------------===// @@ -362,19 +400,9 @@ let Predicates = [HasMips32r2, IsFP64bit, NoNaNsFPMath, HasStdEnc], def MIPS_BRANCH_F : PatLeaf<(i32 0)>; def MIPS_BRANCH_T : PatLeaf<(i32 1)>; -/// Floating Point Branch of False/True (Likely) -let isBranch=1, isTerminator=1, hasDelaySlot=1, base=0x8, Uses=[FCR31] in - class FBRANCH<bits<1> nd, bits<1> tf, PatLeaf op, string asmstr> : - FFI<0x11, (outs), (ins brtarget:$dst), !strconcat(asmstr, "\t$dst"), - [(MipsFPBrcond op, bb:$dst)]> { - let Inst{20-18} = 0; - let Inst{17} = nd; - let Inst{16} = tf; -} - let DecoderMethod = "DecodeBC1" in { -def BC1F : FBRANCH<0, 0, MIPS_BRANCH_F, "bc1f">; -def BC1T : FBRANCH<0, 1, MIPS_BRANCH_T, "bc1t">; +def BC1F : BC1F_FT<"bc1f", IIBranch, MIPS_BRANCH_F>, BC1F_FM<0, 0>; +def BC1T : BC1F_FT<"bc1t", IIBranch, MIPS_BRANCH_T>, BC1F_FM<0, 1>; } //===----------------------------------------------------------------------===// // Floating Point Flag Conditions @@ -398,21 +426,13 @@ def MIPS_FCOND_NGE : PatLeaf<(i32 13)>; def MIPS_FCOND_LE : PatLeaf<(i32 14)>; def MIPS_FCOND_NGT : PatLeaf<(i32 15)>; -class FCMP<bits<5> fmt, RegisterClass RC, string typestr> : - FCC<fmt, (outs), (ins RC:$fs, RC:$ft, condcode:$cc), - !strconcat("c.$cc.", typestr, "\t$fs, $ft"), - [(MipsFPCmp RC:$fs, RC:$ft, imm:$cc)]>; - /// Floating Point Compare -let Defs=[FCR31] in { - def FCMP_S32 : FCMP<0x10, FGR32, "s">; - def FCMP_D32 : FCMP<0x11, AFGR64, "d">, - Requires<[NotFP64bit, HasStdEnc]>; - def FCMP_D64 : FCMP<0x11, FGR64, "d">, - Requires<[IsFP64bit, HasStdEnc]> { - let DecoderNamespace = "Mips64"; - } -} +def FCMP_S32 : CEQS_FT<"s", FGR32, IIFcmp, MipsFPCmp>, CEQS_FM<16>; +def FCMP_D32 : CEQS_FT<"d", AFGR64, IIFcmp, MipsFPCmp>, 
CEQS_FM<17>, + Requires<[NotFP64bit, HasStdEnc]>; +let DecoderNamespace = "Mips64" in +def FCMP_D64 : CEQS_FT<"d", FGR64, IIFcmp, MipsFPCmp>, CEQS_FM<17>, + Requires<[IsFP64bit, HasStdEnc]>; //===----------------------------------------------------------------------===// // Floating Point Pseudo-Instructions diff --git a/lib/Target/Mips/MipsInstrFormats.td b/lib/Target/Mips/MipsInstrFormats.td index c3c12c1fea..986580bf9f 100644 --- a/lib/Target/Mips/MipsInstrFormats.td +++ b/lib/Target/Mips/MipsInstrFormats.td @@ -206,31 +206,6 @@ class MFC3OP<bits<6> op, bits<5> _mfmt, dag outs, dag ins, string asmstr>: //===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// -// Format FR instruction class in Mips : <|opcode|fmt|ft|fs|fd|funct|> -//===----------------------------------------------------------------------===// - -class FFR<bits<6> op, bits<6> _funct, bits<5> _fmt, dag outs, dag ins, - string asmstr, list<dag> pattern> : - InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFR> -{ - bits<5> fd; - bits<5> fs; - bits<5> ft; - bits<5> fmt; - bits<6> funct; - - let Opcode = op; - let funct = _funct; - let fmt = _fmt; - - let Inst{25-21} = fmt; - let Inst{20-16} = ft; - let Inst{15-11} = fs; - let Inst{10-6} = fd; - let Inst{5-0} = funct; -} - -//===----------------------------------------------------------------------===// // Format FI instruction class in Mips : <|opcode|base|ft|immediate|> //===----------------------------------------------------------------------===// @@ -248,130 +223,179 @@ class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>: let Inst{15-0} = imm16; } -//===----------------------------------------------------------------------===// -// Compare instruction class in Mips : <|010001|fmt|ft|fs|0000011|condcode|> -//===----------------------------------------------------------------------===// - -class FCC<bits<5> _fmt, dag outs, dag ins, string asmstr, list<dag> pattern> : - InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther> -{ - bits<5> fs; - bits<5> ft; - bits<4> cc; - bits<5> fmt; +class ADDS_FM<bits<6> funct, bits<5> fmt> { + bits<5> fd; + bits<5> fs; + bits<5> ft; - let Opcode = 0x11; - let fmt = _fmt; + bits<32> Inst; + let Inst{31-26} = 0x11; let Inst{25-21} = fmt; let Inst{20-16} = ft; let Inst{15-11} = fs; - let Inst{10-6} = 0; - let Inst{5-4} = 0b11; - let Inst{3-0} = cc; + let Inst{10-6} = fd; + let Inst{5-0} = funct; } +class ABSS_FM<bits<6> funct, bits<5> fmt> { + bits<5> fd; + bits<5> fs; -class FCMOV<bits<1> _tf, dag outs, dag ins, string asmstr, - list<dag> pattern> : - InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther> -{ - bits<5> rd; - bits<5> rs; - bits<3> cc; - bits<1> tf; - - let Opcode = 0; - let tf = _tf; + bits<32> Inst; - let Inst{25-21} = rs; - let Inst{20-18} = cc; - let Inst{17} = 0; - let Inst{16} = tf; - let Inst{15-11} = rd; - let Inst{10-6} = 0; - let Inst{5-0} = 1; + let Inst{31-26} = 0x11; + let Inst{25-21} = fmt; + let Inst{20-16} = 0; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5-0} = funct; } -class FFCMOV<bits<5> _fmt, bits<1> _tf, dag outs, dag ins, string asmstr, - list<dag> pattern> : - InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther> -{ - bits<5> fd; - bits<5> fs; - bits<3> cc; - bits<5> fmt; - bits<1> tf; +class MFC1_FM<bits<5> funct> { + bits<5> rt; + bits<5> fs; - let Opcode = 17; - let fmt = _fmt; - let tf = _tf; + bits<32> Inst; - let Inst{25-21} = fmt; - let 
Inst{20-18} = cc; - let Inst{17} = 0; - let Inst{16} = tf; + let Inst{31-26} = 0x11; + let Inst{25-21} = funct; + let Inst{20-16} = rt; let Inst{15-11} = fs; - let Inst{10-6} = fd; - let Inst{5-0} = 17; + let Inst{10-0} = 0; } -// FP unary instructions without patterns. -class FFR1<bits<6> funct, bits<5> fmt, string opstr, string fmtstr, - RegisterClass DstRC, RegisterClass SrcRC> : - FFR<0x11, funct, fmt, (outs DstRC:$fd), (ins SrcRC:$fs), - !strconcat(opstr, ".", fmtstr, "\t$fd, $fs"), []> { - let ft = 0; -} +class LW_FM<bits<6> op> { + bits<5> rt; + bits<21> addr; -// FP unary instructions with patterns. -class FFR1P<bits<6> funct, bits<5> fmt, string opstr, string fmtstr, - RegisterClass DstRC, RegisterClass SrcRC, SDNode OpNode> : - FFR<0x11, funct, fmt, (outs DstRC:$fd), (ins SrcRC:$fs), - !strconcat(opstr, ".", fmtstr, "\t$fd, $fs"), - [(set DstRC:$fd, (OpNode SrcRC:$fs))]> { - let ft = 0; -} + bits<32> Inst; -class FFR2P<bits<6> funct, bits<5> fmt, string opstr, - string fmtstr, RegisterClass RC, SDNode OpNode> : - FFR<0x11, funct, fmt, (outs RC:$fd), (ins RC:$fs, RC:$ft), - !strconcat(opstr, ".", fmtstr, "\t$fd, $fs, $ft"), - [(set RC:$fd, (OpNode RC:$fs, RC:$ft))]>; + let Inst{31-26} = op; + let Inst{25-21} = addr{20-16}; + let Inst{20-16} = rt; + let Inst{15-0} = addr{15-0}; +} -// Floating point madd/msub/nmadd/nmsub. -class FFMADDSUB<bits<3> funct, bits<3> fmt, dag outs, dag ins, string asmstr, - list<dag> pattern> - : InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther> { +class MADDS_FM<bits<3> funct, bits<3> fmt> { bits<5> fd; bits<5> fr; bits<5> fs; bits<5> ft; - let Opcode = 0x13; + bits<32> Inst; + + let Inst{31-26} = 0x13; let Inst{25-21} = fr; let Inst{20-16} = ft; let Inst{15-11} = fs; - let Inst{10-6} = fd; - let Inst{5-3} = funct; - let Inst{2-0} = fmt; + let Inst{10-6} = fd; + let Inst{5-3} = funct; + let Inst{2-0} = fmt; } -// FP indexed load/store instructions. 
-class FFMemIdx<bits<6> funct, dag outs, dag ins, string asmstr, - list<dag> pattern> : - InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmOther> -{ - bits<5> base; - bits<5> index; - bits<5> fs; - bits<5> fd; +class LWXC1_FM<bits<6> funct> { + bits<5> fd; + bits<5> base; + bits<5> index; - let Opcode = 0x13; + bits<32> Inst; + let Inst{31-26} = 0x13; let Inst{25-21} = base; let Inst{20-16} = index; + let Inst{15-11} = 0; + let Inst{10-6} = fd; + let Inst{5-0} = funct; +} + +class SWXC1_FM<bits<6> funct> { + bits<5> fs; + bits<5> base; + bits<5> index; + + bits<32> Inst; + + let Inst{31-26} = 0x13; + let Inst{25-21} = base; + let Inst{20-16} = index; + let Inst{15-11} = fs; + let Inst{10-6} = 0; + let Inst{5-0} = funct; +} + +class BC1F_FM<bit nd, bit tf> { + bits<16> offset; + + bits<32> Inst; + + let Inst{31-26} = 0x11; + let Inst{25-21} = 0x8; + let Inst{20-18} = 0; // cc + let Inst{17} = nd; + let Inst{16} = tf; + let Inst{15-0} = offset; +} + +class CEQS_FM<bits<5> fmt> { + bits<5> fs; + bits<5> ft; + bits<4> cond; + + bits<32> Inst; + + let Inst{31-26} = 0x11; + let Inst{25-21} = fmt; + let Inst{20-16} = ft; + let Inst{15-11} = fs; + let Inst{10-8} = 0; // cc + let Inst{7-4} = 0x3; + let Inst{3-0} = cond; +} + +class CMov_I_F_FM<bits<6> funct, bits<5> fmt> { + bits<5> fd; + bits<5> fs; + bits<5> rt; + + bits<32> Inst; + + let Inst{31-26} = 0x11; + let Inst{25-21} = fmt; + let Inst{20-16} = rt; let Inst{15-11} = fs; let Inst{10-6} = fd; let Inst{5-0} = funct; } + +class CMov_F_I_FM<bit tf> { + bits<5> rd; + bits<5> rs; + + bits<32> Inst; + + let Inst{31-26} = 0; + let Inst{25-21} = rs; + let Inst{20-18} = 0; // cc + let Inst{17} = 0; + let Inst{16} = tf; + let Inst{15-11} = rd; + let Inst{10-6} = 0; + let Inst{5-0} = 1; +} + +class CMov_F_F_FM<bits<5> fmt, bit tf> { + bits<5> fd; + bits<5> fs; + + bits<32> Inst; + + let Inst{31-26} = 0x11; + let Inst{25-21} = fmt; + let Inst{20-18} = 0; // cc + let Inst{17} = 0; + let Inst{16} = tf; + let Inst{15-11} = fs; + let Inst{10-6} = fd; + let Inst{5-0} = 0x11; +} diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 309f9d23e5..dc694716a3 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -173,6 +173,10 @@ class MipsPat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [HasStdEnc]; } +class IsCommutable { + bit isCommutable = 1; +} + class IsBranch { bit isBranch = 1; } diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 8b1a06f320..ddf15010dc 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -20,6 +20,7 @@ #include "MipsJITInfo.h" #include "MipsSelectionDAGInfo.h" #include "MipsSubtarget.h" +#include "llvm/ADT/OwningPtr.h" #include "llvm/DataLayout.h" #include "llvm/Target/TargetFrameLowering.h" #include "llvm/Target/TargetMachine.h" @@ -32,8 +33,8 @@ class MipsRegisterInfo; class MipsTargetMachine : public LLVMTargetMachine { MipsSubtarget Subtarget; const DataLayout DL; // Calculates type size & alignment - const MipsInstrInfo *InstrInfo; - const MipsFrameLowering *FrameLowering; + OwningPtr<const MipsInstrInfo> InstrInfo; + OwningPtr<const MipsFrameLowering> FrameLowering; MipsTargetLowering TLInfo; MipsSelectionDAGInfo TSInfo; MipsJITInfo JITInfo; @@ -47,12 +48,12 @@ public: CodeGenOpt::Level OL, bool isLittle); - virtual ~MipsTargetMachine() { delete InstrInfo; } + virtual ~MipsTargetMachine() {} virtual const MipsInstrInfo *getInstrInfo() const - { return 
InstrInfo; } + { return InstrInfo.get(); } virtual const TargetFrameLowering *getFrameLowering() const - { return FrameLowering; } + { return FrameLowering.get(); } virtual const MipsSubtarget *getSubtargetImpl() const { return &Subtarget; } virtual const DataLayout *getDataLayout() const diff --git a/lib/Target/NVPTX/NVPTXSection.h b/lib/Target/NVPTX/NVPTXSection.h index 9c832e1b51..04c2da8171 100644 --- a/lib/Target/NVPTX/NVPTXSection.h +++ b/lib/Target/NVPTX/NVPTXSection.h @@ -38,6 +38,8 @@ public: virtual bool isBaseAddressKnownZero() const { return true; } virtual bool UseCodeAlign() const { return false; } virtual bool isVirtualSection() const { return false; } + virtual std::string getLabelBeginName() const { return ""; } + virtual std::string getLabelEndName() const { return ""; } }; } // end namespace llvm diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 631f749581..f9a1ebfd4a 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -32,6 +32,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case FK_Data_8: case PPC::fixup_ppc_toc: case PPC::fixup_ppc_tlsreg: + case PPC::fixup_ppc_nofixup: return Value; case PPC::fixup_ppc_lo14: case PPC::fixup_ppc_toc16_ds: @@ -85,7 +86,8 @@ public: { "fixup_ppc_toc", 0, 64, 0 }, { "fixup_ppc_toc16", 16, 16, 0 }, { "fixup_ppc_toc16_ds", 16, 14, 0 }, - { "fixup_ppc_tlsreg", 0, 0, 0 } + { "fixup_ppc_tlsreg", 0, 0, 0 }, + { "fixup_ppc_nofixup", 0, 0, 0 } }; if (Kind < FirstTargetFixupKind) diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index 462d7072b5..7f4d9a28d0 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "MCTargetDesc/PPCFixupKinds.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCValue.h" @@ -33,9 +34,25 @@ namespace { const MCFixup &Fixup, bool IsPCRel) const; virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset); + + virtual void sortRelocs(const MCAssembler &Asm, + std::vector<ELFRelocationEntry> &Relocs); + }; + + class PPCELFRelocationEntry : public ELFRelocationEntry { + public: + PPCELFRelocationEntry(const ELFRelocationEntry &RE); + bool operator<(const PPCELFRelocationEntry &RE) const { + return (RE.r_offset < r_offset || + (RE.r_offset == r_offset && RE.Type > Type)); + } }; } +PPCELFRelocationEntry::PPCELFRelocationEntry(const ELFRelocationEntry &RE) + : ELFRelocationEntry(RE.r_offset, RE.Index, RE.Type, RE.Symbol, + RE.r_addend, *RE.Fixup) {} + PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) : MCELFObjectTargetWriter(Is64Bit, OSABI, Is64Bit ? 
ELF::EM_PPC64 : ELF::EM_PPC, @@ -79,12 +96,24 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_PPC_TPREL16_HA: Type = ELF::R_PPC_TPREL16_HA; break; + case MCSymbolRefExpr::VK_PPC_DTPREL16_HA: + Type = ELF::R_PPC64_DTPREL16_HA; + break; case MCSymbolRefExpr::VK_None: Type = ELF::R_PPC_ADDR16_HA; break; case MCSymbolRefExpr::VK_PPC_TOC16_HA: Type = ELF::R_PPC64_TOC16_HA; break; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA: + Type = ELF::R_PPC64_GOT_TPREL16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA: + Type = ELF::R_PPC64_GOT_TLSGD16_HA; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA: + Type = ELF::R_PPC64_GOT_TLSLD16_HA; + break; } break; case PPC::fixup_ppc_lo16: @@ -93,12 +122,21 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_PPC_TPREL16_LO: Type = ELF::R_PPC_TPREL16_LO; break; + case MCSymbolRefExpr::VK_PPC_DTPREL16_LO: + Type = ELF::R_PPC64_DTPREL16_LO; + break; case MCSymbolRefExpr::VK_None: Type = ELF::R_PPC_ADDR16_LO; break; case MCSymbolRefExpr::VK_PPC_TOC16_LO: Type = ELF::R_PPC64_TOC16_LO; break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO: + Type = ELF::R_PPC64_GOT_TLSGD16_LO; + break; + case MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO: + Type = ELF::R_PPC64_GOT_TLSLD16_LO; + break; } break; case PPC::fixup_ppc_lo14: @@ -119,14 +157,25 @@ unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target, case MCSymbolRefExpr::VK_PPC_TOC16_LO: Type = ELF::R_PPC64_TOC16_LO_DS; break; - case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_DS: - Type = ELF::R_PPC64_GOT_TPREL16_DS; + case MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO: + Type = ELF::R_PPC64_GOT_TPREL16_LO_DS; break; } break; case PPC::fixup_ppc_tlsreg: Type = ELF::R_PPC64_TLS; break; + case PPC::fixup_ppc_nofixup: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_PPC_TLSGD: + Type = ELF::R_PPC64_TLSGD; + break; + case MCSymbolRefExpr::VK_PPC_TLSLD: + Type = ELF::R_PPC64_TLSLD; + break; + } + break; case FK_Data_8: switch (Modifier) { default: llvm_unreachable("Unsupported Modifier"); @@ -191,6 +240,34 @@ adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { } } +// The standard sorter only sorts on the r_offset field, but PowerPC can +// have multiple relocations at the same offset. Sort secondarily on the +// relocation type to avoid nondeterminism. +void PPCELFObjectWriter::sortRelocs(const MCAssembler &Asm, + std::vector<ELFRelocationEntry> &Relocs) { + + // Copy to a temporary vector of relocation entries having a different + // sort function. + std::vector<PPCELFRelocationEntry> TmpRelocs; + + for (std::vector<ELFRelocationEntry>::iterator R = Relocs.begin(); + R != Relocs.end(); ++R) { + TmpRelocs.push_back(PPCELFRelocationEntry(*R)); + } + + // Sort in place by ascending r_offset and descending r_type. + array_pod_sort(TmpRelocs.begin(), TmpRelocs.end()); + + // Copy back to the original vector. 
+ unsigned I = 0; + for (std::vector<PPCELFRelocationEntry>::iterator R = TmpRelocs.begin(); + R != TmpRelocs.end(); ++R, ++I) { + Relocs[I] = ELFRelocationEntry(R->r_offset, R->Index, R->Type, + R->Symbol, R->r_addend, *R->Fixup); + } +} + + MCObjectWriter *llvm::createPPCELFObjectWriter(raw_ostream &OS, bool Is64Bit, uint8_t OSABI) { diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index 75bb851630..7917f7736e 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -47,6 +47,10 @@ enum Fixups { /// fixup_ppc_tlsreg - Insert thread-pointer register number. fixup_ppc_tlsreg, + + /// fixup_ppc_nofixup - Not a true fixup, but ties a symbol to a call + /// to __tls_get_addr for the TLS general and local dynamic models. + fixup_ppc_nofixup, // Marker LastTargetFixupKind, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 5b208d41f4..d048426d43 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -17,6 +17,7 @@ #include "MCTargetDesc/PPCFixupKinds.h" #include "llvm/ADT/Statistic.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCSubtargetInfo.h" @@ -62,8 +63,6 @@ public: SmallVectorImpl<MCFixup> &Fixups) const; unsigned getMemRIXEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const; - unsigned getTLSOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups) const; unsigned getTLSRegEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const; unsigned get_crbitm_encoding(const MCInst &MI, unsigned OpNo, @@ -82,11 +81,12 @@ public: SmallVectorImpl<MCFixup> &Fixups) const { uint64_t Bits = getBinaryCodeForInstr(MI, Fixups); - // BL8_NOPELF and BLA8_NOP_ELF is both size of 8 bacause of the + // BL8_NOP_ELF, BLA8_NOP_ELF, etc., all have a size of 8 because of the // following 'nop'. unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value! unsigned Opcode = MI.getOpcode(); - if (Opcode == PPC::BL8_NOP_ELF || Opcode == PPC::BLA8_NOP_ELF) + if (Opcode == PPC::BL8_NOP_ELF || Opcode == PPC::BLA8_NOP_ELF || + Opcode == PPC::BL8_NOP_ELF_TLSGD || Opcode == PPC::BL8_NOP_ELF_TLSLD) Size = 8; // Output the constant in big endian byte order. @@ -119,6 +119,17 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo, // Add a fixup for the branch target. Fixups.push_back(MCFixup::Create(0, MO.getExpr(), (MCFixupKind)PPC::fixup_ppc_br24)); + + // For special TLS calls, add another fixup for the symbol. Apparently + // BL8_NOP_ELF, BL8_NOP_ELF_TLSGD, and BL8_NOP_ELF_TLSLD are sufficiently + // similar that TblGen will not generate a separate case for the latter + // two, so this is the only way to get the extra fixup generated. 
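The two fixups the code just below attaches to one TLS call land at the same instruction offset: the branch fixup becomes R_PPC64_REL24 against __tls_get_addr, while fixup_ppc_nofixup becomes R_PPC64_TLSGD or R_PPC64_TLSLD against the variable. That collision is exactly why sortRelocs above needs a secondary key; sorting by descending type then happens to place the TLS marker (type 107/108) ahead of the call relocation (type 10). A self-contained sketch of the resulting order (simplified entry struct; relocation numbers are the 64-bit PowerPC ELF values):

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <vector>

    struct Reloc { uint64_t Offset; unsigned Type; };

    int main() {
      // R_PPC64_TLSGD = 107 and R_PPC64_REL24 = 10 share offset 0x10.
      std::vector<Reloc> Relocs = {{0x10, 10}, {0x10, 107}, {0x8, 38}};
      std::sort(Relocs.begin(), Relocs.end(),
                [](const Reloc &A, const Reloc &B) {
                  return A.Offset < B.Offset ||
                         (A.Offset == B.Offset && A.Type > B.Type);
                });
      assert(Relocs[1].Type == 107 && Relocs[2].Type == 10);  // TLSGD first
    }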
+ unsigned Opcode = MI.getOpcode(); + if (Opcode == PPC::BL8_NOP_ELF_TLSGD || Opcode == PPC::BL8_NOP_ELF_TLSLD) { + const MCOperand &MO2 = MI.getOperand(OpNo+1); + Fixups.push_back(MCFixup::Create(0, MO2.getExpr(), + (MCFixupKind)PPC::fixup_ppc_nofixup)); + } return 0; } @@ -199,17 +210,6 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, } -unsigned PPCMCCodeEmitter::getTLSOffsetEncoding(const MCInst &MI, unsigned OpNo, - SmallVectorImpl<MCFixup> &Fixups) const { - const MCOperand &MO = MI.getOperand(OpNo); - - // Add a fixup for the GOT displacement to the TLS block offset. - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_toc16_ds)); - return 0; -} - - unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { const MCOperand &MO = MI.getOperand(OpNo); @@ -223,7 +223,6 @@ unsigned PPCMCCodeEmitter::getTLSRegEncoding(const MCInst &MI, unsigned OpNo, return getPPCRegisterNumbering(PPC::X13); } - unsigned PPCMCCodeEmitter:: get_crbitm_encoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { diff --git a/lib/Target/PowerPC/PPC.h b/lib/Target/PowerPC/PPC.h index bbd247f20f..e6d38ebf21 100644 --- a/lib/Target/PowerPC/PPC.h +++ b/lib/Target/PowerPC/PPC.h @@ -72,9 +72,7 @@ namespace llvm { MO_HA16 = 2 << 5, MO_TPREL16_HA = 3 << 5, - MO_TPREL16_LO = 4 << 5, - MO_GOT_TPREL16_DS = 5 << 5, - MO_TLS = 6 << 5 + MO_TPREL16_LO = 4 << 5 }; } // end namespace PPCII diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index 4315bc1a07..35b6234a34 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -513,8 +513,24 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { OutStreamer.EmitInstruction(TmpInst); return; } - case PPC::LDgotTPREL: { - // Transform %Xd = LDgotTPREL <ga:@sym>, %Xs + case PPC::ADDISgotTprelHA: { + // Transform: %Xd = ADDISgotTprelHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tprel@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymGotTprel = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_HA, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X2) + .addExpr(SymGotTprel)); + return; + } + case PPC::LDgotTprelL: { + // Transform %Xd = LDgotTprelL <ga:@sym>, %Xs LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, Subtarget.isDarwin()); // Change the opcode to LDrs, which is a form of LD with the offset @@ -524,12 +540,148 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) { const GlobalValue *GValue = MO.getGlobal(); MCSymbol *MOSymbol = Mang->getSymbol(GValue); const MCExpr *Exp = - MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_DS, + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TPREL16_LO, OutContext); TmpInst.getOperand(1) = MCOperand::CreateExpr(Exp); OutStreamer.EmitInstruction(TmpInst); return; } + case PPC::ADDIStlsgdHA: { + // Transform: %Xd = ADDIStlsgdHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsgd@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol =
Mang->getSymbol(GValue); + const MCExpr *SymGotTlsGD = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_HA, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X2) + .addExpr(SymGotTlsGD)); + return; + } + case PPC::ADDItlsgdL: { + // Transform: %Xd = ADDItlsgdL %Xs, <ga:@sym> + // Into: %Xd = ADDI8L %Xs, sym@got@tlsgd@l + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymGotTlsGD = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSGD16_LO, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8L) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsGD)); + return; + } + case PPC::GETtlsADDR: { + // Transform: %X3 = GETtlsADDR %X3, <ga:@sym> + // Into: BL8_NOP_ELF_TLSGD __tls_get_addr(sym@tlsgd) + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + + StringRef Name = "__tls_get_addr"; + MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name); + const MCSymbolRefExpr *TlsRef = + MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymVar = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_TLSGD, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_ELF_TLSGD) + .addExpr(TlsRef) + .addExpr(SymVar)); + return; + } + case PPC::ADDIStlsldHA: { + // Transform: %Xd = ADDIStlsldHA %X2, <ga:@sym> + // Into: %Xd = ADDIS8 %X2, sym@got@tlsld@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymGotTlsLD = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_HA, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X2) + .addExpr(SymGotTlsLD)); + return; + } + case PPC::ADDItlsldL: { + // Transform: %Xd = ADDItlsldL %Xs, <ga:@sym> + // Into: %Xd = ADDI8L %Xs, sym@got@tlsld@l + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymGotTlsLD = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_GOT_TLSLD16_LO, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8L) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymGotTlsLD)); + return; + } + case PPC::GETtlsldADDR: { + // Transform: %X3 = GETtlsldADDR %X3, <ga:@sym> + // Into: BL8_NOP_ELF_TLSLD __tls_get_addr(sym@tlsld) + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + + StringRef Name = "__tls_get_addr"; + MCSymbol *TlsGetAddr = OutContext.GetOrCreateSymbol(Name); + const MCSymbolRefExpr *TlsRef = + MCSymbolRefExpr::Create(TlsGetAddr, MCSymbolRefExpr::VK_None, OutContext); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymVar = + MCSymbolRefExpr::Create(MOSymbol, 
MCSymbolRefExpr::VK_PPC_TLSLD, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::BL8_NOP_ELF_TLSLD) + .addExpr(TlsRef) + .addExpr(SymVar)); + return; + } + case PPC::ADDISdtprelHA: { + // Transform: %Xd = ADDISdtprelHA %X3, <ga:@sym> + // Into: %Xd = ADDIS8 %X3, sym@dtprel@ha + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymDtprel = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL16_HA, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDIS8) + .addReg(MI->getOperand(0).getReg()) + .addReg(PPC::X3) + .addExpr(SymDtprel)); + return; + } + case PPC::ADDIdtprelL: { + // Transform: %Xd = ADDIdtprelL %Xs, <ga:@sym> + // Into: %Xd = ADDI8L %Xs, sym@dtprel@l + assert(Subtarget.isPPC64() && "Not supported for 32-bit PowerPC"); + const MachineOperand &MO = MI->getOperand(2); + const GlobalValue *GValue = MO.getGlobal(); + MCSymbol *MOSymbol = Mang->getSymbol(GValue); + const MCExpr *SymDtprel = + MCSymbolRefExpr::Create(MOSymbol, MCSymbolRefExpr::VK_PPC_DTPREL16_LO, + OutContext); + OutStreamer.EmitInstruction(MCInstBuilder(PPC::ADDI8L) + .addReg(MI->getOperand(0).getReg()) + .addReg(MI->getOperand(1).getReg()) + .addExpr(SymDtprel)); + return; + } case PPC::MFCRpseud: case PPC::MFCR8pseud: // Transform: %R3 = MFCRpseud %CR7 diff --git a/lib/Target/PowerPC/PPCCodeEmitter.cpp b/lib/Target/PowerPC/PPCCodeEmitter.cpp index 0dcb5deb5d..d013b0f7e6 100644 --- a/lib/Target/PowerPC/PPCCodeEmitter.cpp +++ b/lib/Target/PowerPC/PPCCodeEmitter.cpp @@ -68,7 +68,6 @@ namespace { unsigned getLO16Encoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getMemRIEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getMemRIXEncoding(const MachineInstr &MI, unsigned OpNo) const; - unsigned getTLSOffsetEncoding(const MachineInstr &MI, unsigned OpNo) const; unsigned getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const; const char *getPassName() const { return "PowerPC Machine Code Emitter"; } @@ -245,13 +244,6 @@ unsigned PPCCodeEmitter::getMemRIXEncoding(const MachineInstr &MI, } -unsigned PPCCodeEmitter::getTLSOffsetEncoding(const MachineInstr &MI, - unsigned OpNo) const { - llvm_unreachable("TLS not supported on the old JIT."); - return 0; -} - - unsigned PPCCodeEmitter::getTLSRegEncoding(const MachineInstr &MI, unsigned OpNo) const { llvm_unreachable("TLS not supported on the old JIT."); diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index abcaa9cab0..c7c265ce78 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -1311,11 +1311,6 @@ SDNode *PPCDAGToDAGISel::Select(SDNode *N) { return CurDAG->getMachineNode(PPC::ADDItocL, dl, MVT::i64, SDValue(Tmp, 0), GA); } - case PPCISD::LD_GOT_TPREL: { - assert (PPCSubTarget.isPPC64() && "Only supported for 64-bit ABI"); - return CurDAG->getMachineNode(PPC::LDgotTPREL, dl, MVT::i64, - N->getOperand(0), N->getOperand(1)); - } } return SelectCode(N); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 1168171b23..4a1b388a59 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -578,8 +578,17 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::ADDIS_TOC_HA: return "PPCISD::ADDIS_TOC_HA"; case PPCISD::LD_TOC_L: 
return "PPCISD::LD_TOC_L"; case PPCISD::ADDI_TOC_L: return "PPCISD::ADDI_TOC_L"; - case PPCISD::LD_GOT_TPREL: return "PPCISD::LD_GOT_TPREL"; + case PPCISD::ADDIS_GOT_TPREL_HA: return "PPCISD::ADDIS_GOT_TPREL_HA"; + case PPCISD::LD_GOT_TPREL_L: return "PPCISD::LD_GOT_TPREL_L"; case PPCISD::ADD_TLS: return "PPCISD::ADD_TLS"; + case PPCISD::ADDIS_TLSGD_HA: return "PPCISD::ADDIS_TLSGD_HA"; + case PPCISD::ADDI_TLSGD_L: return "PPCISD::ADDI_TLSGD_L"; + case PPCISD::GET_TLS_ADDR: return "PPCISD::GET_TLS_ADDR"; + case PPCISD::ADDIS_TLSLD_HA: return "PPCISD::ADDIS_TLSLD_HA"; + case PPCISD::ADDI_TLSLD_L: return "PPCISD::ADDI_TLSLD_L"; + case PPCISD::GET_TLSLD_ADDR: return "PPCISD::GET_TLSLD_ADDR"; + case PPCISD::ADDIS_DTPREL_HA: return "PPCISD::ADDIS_DTPREL_HA"; + case PPCISD::ADDI_DTPREL_L: return "PPCISD::ADDI_DTPREL_L"; } } @@ -1342,18 +1351,65 @@ SDValue PPCTargetLowering::LowerGlobalTLSAddress(SDValue Op, if (!is64bit) llvm_unreachable("only local-exec is currently supported for ppc32"); - if (Model != TLSModel::InitialExec) - llvm_unreachable("only local-exec and initial-exec TLS modes supported"); - - SDValue GOTOffset = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_GOT_TPREL16_DS); - SDValue TPReg = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, - PPCII::MO_TLS); - SDValue GOTReg = DAG.getRegister(is64bit ? PPC::X2 : PPC::R2, - is64bit ? MVT::i64 : MVT::i32); - SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL, dl, PtrVT, - GOTOffset, GOTReg); - return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TPReg); + if (Model == TLSModel::InitialExec) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + SDValue TPOffsetHi = DAG.getNode(PPCISD::ADDIS_GOT_TPREL_HA, dl, + PtrVT, GOTReg, TGA); + SDValue TPOffset = DAG.getNode(PPCISD::LD_GOT_TPREL_L, dl, + PtrVT, TGA, TPOffsetHi); + return DAG.getNode(PPCISD::ADD_TLS, dl, PtrVT, TPOffset, TGA); + } + + if (Model == TLSModel::GeneralDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSGD_HA, dl, PtrVT, + GOTReg, TGA); + SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSGD_L, dl, PtrVT, + GOTEntryHi, TGA); + + // We need a chain node, and don't have one handy. The underlying + // call has no side effects, so using the function entry node + // suffices. + SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); + SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLS_ADDR, dl, + PtrVT, ParmReg, TGA); + // The return value from GET_TLS_ADDR really is in X3 already, but + // some hacks are needed here to tie everything together. The extra + // copies dissolve during subsequent transforms. + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + return DAG.getCopyFromReg(Chain, dl, PPC::X3, PtrVT); + } + + if (Model == TLSModel::LocalDynamic) { + SDValue TGA = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, 0); + SDValue GOTReg = DAG.getRegister(PPC::X2, MVT::i64); + SDValue GOTEntryHi = DAG.getNode(PPCISD::ADDIS_TLSLD_HA, dl, PtrVT, + GOTReg, TGA); + SDValue GOTEntry = DAG.getNode(PPCISD::ADDI_TLSLD_L, dl, PtrVT, + GOTEntryHi, TGA); + + // We need a chain node, and don't have one handy. The underlying + // call has no side effects, so using the function entry node + // suffices. 
+ SDValue Chain = DAG.getEntryNode(); + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, GOTEntry); + SDValue ParmReg = DAG.getRegister(PPC::X3, MVT::i64); + SDValue TLSAddr = DAG.getNode(PPCISD::GET_TLSLD_ADDR, dl, + PtrVT, ParmReg, TGA); + // The return value from GET_TLSLD_ADDR really is in X3 already, but + // some hacks are needed here to tie everything together. The extra + // copies dissolve during subsequent transforms. + Chain = DAG.getCopyToReg(Chain, dl, PPC::X3, TLSAddr); + SDValue DtvOffsetHi = DAG.getNode(PPCISD::ADDIS_DTPREL_HA, dl, PtrVT, + Chain, ParmReg, TGA); + return DAG.getNode(PPCISD::ADDI_DTPREL_L, dl, PtrVT, DtvOffsetHi, TGA); + } + + llvm_unreachable("Unknown TLS model!"); } SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op, @@ -6787,16 +6843,15 @@ PPCTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const { /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, -/// probably because the source does not need to be loaded. If -/// 'IsZeroVal' is true, that means it's safe to return a -/// non-scalar-integer type, e.g. empty string source, constant, or loaded -/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is -/// constant so it does not need to be loaded. +/// probably because the source does not need to be loaded. If 'IsMemset' is +/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that +/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy +/// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. EVT PPCTargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { if (this->PPCSubTarget.isPPC64()) { diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index f54a8b77a4..12b3df7c9a 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -178,11 +178,16 @@ namespace llvm { CR6SET, CR6UNSET, - /// G8RC = LD_GOT_TPREL Symbol, G8RReg - Used by the initial-exec + /// G8RC = ADDIS_GOT_TPREL_HA %X2, Symbol - Used by the initial-exec + /// TLS model, produces an ADDIS8 instruction that adds the GOT + /// base to sym@got@tprel@ha. + ADDIS_GOT_TPREL_HA, + + /// G8RC = LD_GOT_TPREL_L Symbol, G8RReg - Used by the initial-exec /// TLS model, produces a LD instruction with base register G8RReg - /// and offset sym@got@tprel. The latter identifies the GOT entry - /// containing the offset of "sym" relative to the thread pointer. - LD_GOT_TPREL, + /// and offset sym@got@tprel@l. This completes the addition that + /// finds the offset of "sym" relative to the thread pointer. + LD_GOT_TPREL_L, /// G8RC = ADD_TLS G8RReg, Symbol - Used by the initial-exec TLS /// model, produces an ADD instruction that adds the contents of @@ -192,6 +197,46 @@ namespace llvm { /// TLS sequence. ADD_TLS, + /// G8RC = ADDIS_TLSGD_HA %X2, Symbol - For the general-dynamic TLS + /// model, produces an ADDIS8 instruction that adds the GOT base + /// register to sym@got@tlsgd@ha. + ADDIS_TLSGD_HA, + + /// G8RC = ADDI_TLSGD_L G8RReg, Symbol - For the general-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym@got@tlsgd@l. 
+ ADDI_TLSGD_L, + + /// G8RC = GET_TLS_ADDR %X3, Symbol - For the general-dynamic TLS + /// model, produces a call to __tls_get_addr(sym@tlsgd). + GET_TLS_ADDR, + + /// G8RC = ADDIS_TLSLD_HA %X2, Symbol - For the local-dynamic TLS + /// model, produces an ADDIS8 instruction that adds the GOT base + /// register to sym@got@tlsld@ha. + ADDIS_TLSLD_HA, + + /// G8RC = ADDI_TLSLD_L G8RReg, Symbol - For the local-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym@got@tlsld@l. + ADDI_TLSLD_L, + + /// G8RC = GET_TLSLD_ADDR %X3, Symbol - For the local-dynamic TLS + /// model, produces a call to __tls_get_addr(sym@tlsld). + GET_TLSLD_ADDR, + + /// G8RC = ADDIS_DTPREL_HA %X3, Symbol, Chain - For the + /// local-dynamic TLS model, produces an ADDIS8 instruction + /// that adds X3 to sym@dtprel@ha. The Chain operand is needed + /// to tie this in place following a copy to %X3 from the result + /// of a GET_TLSLD_ADDR. + ADDIS_DTPREL_HA, + + /// G8RC = ADDI_DTPREL_L G8RReg, Symbol - For the local-dynamic TLS + /// model, produces an ADDI8 instruction that adds G8RReg to + /// sym@dtprel@l. + ADDI_DTPREL_L, + /// STD_32 - This is the STD instruction for use with "32-bit" registers. STD_32 = ISD::FIRST_TARGET_MEMORY_OPCODE, @@ -386,16 +431,15 @@ namespace llvm { /// lowering. If DstAlign is zero that means it's safe to destination /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it /// means there isn't a need to check it against alignment requirement, - /// probably because the source does not need to be loaded. If - /// 'IsZeroVal' is true, that means it's safe to return a - /// non-scalar-integer type, e.g. empty string source, constant, or loaded - /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is - /// constant so it does not need to be loaded. + /// probably because the source does not need to be loaded. If 'IsMemset' is + /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that + /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy + /// source is constant so it does not need to be loaded. /// It returns EVT::Other if the type should be determined using generic /// target-independent logic. virtual EVT - getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, - bool IsZeroVal, bool MemcpyStrSrc, + getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const; /// isFMAFasterThanMulAndAdd - Return true if an FMA operation is faster than diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index dff15664a3..1dd5415733 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -37,12 +37,10 @@ def memrs : Operand<iPTR> { // memri where the immediate is a symbolLo64 let EncoderMethod = "getMemRIXEncoding"; let MIOperandInfo = (ops symbolLo64:$off, ptr_rc:$reg); } -def tlsaddr : Operand<i64> { - let EncoderMethod = "getTLSOffsetEncoding"; -} def tlsreg : Operand<i64> { let EncoderMethod = "getTLSRegEncoding"; } +def tlsgd : Operand<i64> {} //===----------------------------------------------------------------------===// // 64-bit transformation functions.
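For context on the four TLS models these nodes implement, a minimal sketch of which model a compiler typically picks, assuming a PPC64 ELF target built with GCC or Clang; the variable names are invented:

    __thread int le_var;          // local-exec: defined in the executable being built
    extern __thread int ie_var;   // initial-exec: non-PIC code, definition elsewhere
    static __thread int ld_var;   // local-dynamic: -fPIC, variable private to this module
    extern __thread int gd_var;   // general-dynamic: -fPIC, definition may be anywhere
    int sum() { return le_var + ie_var + ld_var + gd_var; }

The exact choice also depends on symbol visibility and on linker relaxation.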
@@ -110,6 +108,16 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in { (outs), (ins calltarget:$func), "bl $func\n\tnop", BrB, []>; + let isCodeGenOnly = 1 in + def BL8_NOP_ELF_TLSGD : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func, tlsgd:$sym), + "bl $func($sym)\n\tnop", BrB, []>; + + let isCodeGenOnly = 1 in + def BL8_NOP_ELF_TLSLD : IForm_and_DForm_4_zero<18, 0, 1, 24, + (outs), (ins calltarget:$func, tlsgd:$sym), + "bl $func($sym)\n\tnop", BrB, []>; + def BLA8_ELF : IForm<18, 1, 1, (outs), (ins aaddr:$func), "bla $func", BrB, [(PPCcall_SVR4 (i64 imm:$func))]>; @@ -373,7 +381,7 @@ def ADD8 : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, G8RC:$rB), // ADD8 has a special form: reg = ADD8(reg, sym@tls) for use by the // initial-exec thread-local storage model. def ADD8TLS : XOForm_1<31, 266, 0, (outs G8RC:$rT), (ins G8RC:$rA, tlsreg:$rB), - "add $rT, $rA, $rB", IntSimple, + "add $rT, $rA, $rB@tls", IntSimple, [(set G8RC:$rT, (add G8RC:$rA, tglobaltlsaddr:$rB))]>; let Defs = [CARRY] in { @@ -709,13 +717,60 @@ def ADDItocL: Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, tocentry:$disp), (PPCaddiTocL G8RC:$reg, tglobaladdr:$disp))]>, isPPC64; // Support for thread-local storage. -def LDgotTPREL: Pseudo<(outs G8RC:$rD), (ins tlsaddr:$disp, G8RC:$reg), - "#LDgotTPREL", - [(set G8RC:$rD, - (PPCldGotTprel G8RC:$reg, tglobaltlsaddr:$disp))]>, - isPPC64; +def ADDISgotTprelHA: Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolHi64:$disp), + "#ADDISgotTprelHA", + [(set G8RC:$rD, + (PPCaddisGotTprelHA G8RC:$reg, + tglobaltlsaddr:$disp))]>, + isPPC64; +def LDgotTprelL: Pseudo<(outs G8RC:$rD), (ins symbolLo64:$disp, G8RC:$reg), + "#LDgotTprelL", + [(set G8RC:$rD, + (PPCldGotTprelL tglobaltlsaddr:$disp, G8RC:$reg))]>, + isPPC64; def : Pat<(PPCaddTls G8RC:$in, tglobaltlsaddr:$g), (ADD8TLS G8RC:$in, tglobaltlsaddr:$g)>; +def ADDIStlsgdHA: Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolHi64:$disp), + "#ADDIStlsgdHA", + [(set G8RC:$rD, + (PPCaddisTlsgdHA G8RC:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def ADDItlsgdL : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolLo64:$disp), + "#ADDItlsgdL", + [(set G8RC:$rD, + (PPCaddiTlsgdL G8RC:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def GETtlsADDR : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, tlsgd:$sym), + "#GETtlsADDR", + [(set G8RC:$rD, + (PPCgetTlsAddr G8RC:$reg, tglobaltlsaddr:$sym))]>, + isPPC64; +def ADDIStlsldHA: Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolHi64:$disp), + "#ADDIStlsldHA", + [(set G8RC:$rD, + (PPCaddisTlsldHA G8RC:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def ADDItlsldL : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolLo64:$disp), + "#ADDItlsldL", + [(set G8RC:$rD, + (PPCaddiTlsldL G8RC:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; +def GETtlsldADDR : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, tlsgd:$sym), + "#GETtlsldADDR", + [(set G8RC:$rD, + (PPCgetTlsldAddr G8RC:$reg, tglobaltlsaddr:$sym))]>, + isPPC64; +def ADDISdtprelHA: Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolHi64:$disp), + "#ADDISdtprelHA", + [(set G8RC:$rD, + (PPCaddisDtprelHA G8RC:$reg, + tglobaltlsaddr:$disp))]>, + isPPC64; +def ADDIdtprelL : Pseudo<(outs G8RC:$rD), (ins G8RC:$reg, symbolLo64:$disp), + "#ADDIdtprelL", + [(set G8RC:$rD, + (PPCaddiDtprelL G8RC:$reg, tglobaltlsaddr:$disp))]>, + isPPC64; let PPC970_Unit = 2 in { // Truncating stores. 
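Both GETtlsADDR and GETtlsldADDR above expand to a call to __tls_get_addr. A minimal C++ sketch of that runtime contract, assuming the standard ELF TLS ABI (the tls_index layout and the function name come from that ABI, not from this patch):

    extern "C" void *__tls_get_addr(void *ti);  // provided by libc/the dynamic linker

    struct tls_index {          // GOT-resident record that the @tlsgd relocations locate
      unsigned long ti_module;  // module ID, filled in by the dynamic linker
      unsigned long ti_offset;  // offset of the variable within its module's TLS block
    };

    // bl __tls_get_addr(sym@tlsgd) computes &sym for the current thread:
    void *resolve(tls_index *ti) { return __tls_get_addr(ti); }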
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index a29f40ad53..8c077b7517 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -91,8 +91,19 @@ def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp, [SDNPMayLoad]>; def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>; def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>; -def PPCldGotTprel : SDNode<"PPCISD::LD_GOT_TPREL", SDTIntBinOp, [SDNPMayLoad]>; +def PPCaddisGotTprelHA : SDNode<"PPCISD::ADDIS_GOT_TPREL_HA", SDTIntBinOp>; +def PPCldGotTprelL : SDNode<"PPCISD::LD_GOT_TPREL_L", SDTIntBinOp, + [SDNPMayLoad]>; def PPCaddTls : SDNode<"PPCISD::ADD_TLS", SDTIntBinOp, []>; +def PPCaddisTlsgdHA : SDNode<"PPCISD::ADDIS_TLSGD_HA", SDTIntBinOp>; +def PPCaddiTlsgdL : SDNode<"PPCISD::ADDI_TLSGD_L", SDTIntBinOp>; +def PPCgetTlsAddr : SDNode<"PPCISD::GET_TLS_ADDR", SDTIntBinOp>; +def PPCaddisTlsldHA : SDNode<"PPCISD::ADDIS_TLSLD_HA", SDTIntBinOp>; +def PPCaddiTlsldL : SDNode<"PPCISD::ADDI_TLSLD_L", SDTIntBinOp>; +def PPCgetTlsldAddr : SDNode<"PPCISD::GET_TLSLD_ADDR", SDTIntBinOp>; +def PPCaddisDtprelHA : SDNode<"PPCISD::ADDIS_DTPREL_HA", SDTIntBinOp, + [SDNPHasChain]>; +def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>; def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>; diff --git a/lib/Target/PowerPC/PPCMCInstLower.cpp b/lib/Target/PowerPC/PPCMCInstLower.cpp index 1c4af901f7..73f7a2cfd5 100644 --- a/lib/Target/PowerPC/PPCMCInstLower.cpp +++ b/lib/Target/PowerPC/PPCMCInstLower.cpp @@ -114,12 +114,6 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol, break; case PPCII::MO_TPREL16_LO: RefKind = MCSymbolRefExpr::VK_PPC_TPREL16_LO; break; - case PPCII::MO_GOT_TPREL16_DS: - RefKind = MCSymbolRefExpr::VK_PPC_GOT_TPREL16_DS; - break; - case PPCII::MO_TLS: - RefKind = MCSymbolRefExpr::VK_PPC_TLS; - break; } // FIXME: This isn't right, but we don't have a good way to express this in diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h new file mode 100644 index 0000000000..40864b09dd --- /dev/null +++ b/lib/Target/R600/AMDGPU.h @@ -0,0 +1,48 @@ +//===-- AMDGPU.h - MachineFunction passes hw codegen --------------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_H +#define AMDGPU_H + +#include "AMDGPUTargetMachine.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class FunctionPass; +class AMDGPUTargetMachine; + +// R600 Passes +FunctionPass* createR600KernelParametersPass(const DataLayout *TD); +FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm); + +// SI Passes +FunctionPass *createSIAssignInterpRegsPass(TargetMachine &tm); +FunctionPass *createSILowerControlFlowPass(TargetMachine &tm); +FunctionPass *createSICodeEmitterPass(formatted_raw_ostream &OS); +FunctionPass *createSILowerLiteralConstantsPass(TargetMachine &tm); +FunctionPass *createSIFixSGPRLivenessPass(TargetMachine &tm); + +// Passes common to R600 and SI +FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); + +} // End namespace llvm + +namespace ShaderType { + enum Type { + PIXEL = 0, + VERTEX = 1, + GEOMETRY = 2, + COMPUTE = 3 + }; +} + +#endif // AMDGPU_H diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td new file mode 100644 index 0000000000..40f474161a --- /dev/null +++ b/lib/Target/R600/AMDGPU.td @@ -0,0 +1,40 @@ +//===-- AMDGPU.td - AMDGPU Tablegen files --*- tablegen -*-----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// + +// Include AMDIL TD files +include "AMDILBase.td" + + +def AMDGPUInstrInfo : InstrInfo { + let guessInstructionProperties = 1; +} + +//===----------------------------------------------------------------------===// +// Declare the target which we are implementing +//===----------------------------------------------------------------------===// +def AMDGPUAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + int Variant = 0; + bit isMCAsmWriter = 1; +} + +def AMDGPU : Target { + // Pull in Instruction Info: + let InstructionSet = AMDGPUInstrInfo; + let AssemblyWriters = [AMDGPUAsmWriter]; +} + +// Include AMDGPU TD files +include "R600Schedule.td" +include "SISchedule.td" +include "Processors.td" +include "AMDGPUInstrInfo.td" +include "AMDGPUIntrinsics.td" +include "AMDGPURegisterInfo.td" +include "AMDGPUInstructions.td" diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp new file mode 100644 index 0000000000..4553c4556c --- /dev/null +++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp @@ -0,0 +1,138 @@ +//===-- AMDGPUAsmPrinter.cpp - AMDGPU Assembly printer --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// The AMDGPUAsmPrinter is used to print both assembly strings and binary +/// code. When passed an MCAsmStreamer it prints assembly and when passed +/// an MCObjectStreamer it outputs binary code.
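+/// For Southern Islands targets, runOnMachineFunction() also calls +/// EmitProgramInfo() below to emit register usage counts ahead of the code.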
+// +//===----------------------------------------------------------------------===// +// + + +#include "AMDGPUAsmPrinter.h" +#include "AMDGPU.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Target/TargetLoweringObjectFile.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + + +static AsmPrinter *createAMDGPUAsmPrinterPass(TargetMachine &tm, + MCStreamer &Streamer) { + return new AMDGPUAsmPrinter(tm, Streamer); +} + +extern "C" void LLVMInitializeR600AsmPrinter() { + TargetRegistry::RegisterAsmPrinter(TheAMDGPUTarget, createAMDGPUAsmPrinterPass); +} + +/// We need to override this function so we can avoid +/// the call to EmitFunctionHeader(), which the MCPureStreamer can't handle. +bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUSubtarget &STM = TM.getSubtarget<AMDGPUSubtarget>(); + if (STM.dumpCode()) { +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + MF.dump(); +#endif + } + SetupMachineFunction(MF); + OutStreamer.SwitchSection(getObjFileLowering().getTextSection()); + if (STM.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + EmitProgramInfo(MF); + } + EmitFunctionBody(); + return false; +} + +void AMDGPUAsmPrinter::EmitProgramInfo(MachineFunction &MF) { + unsigned MaxSGPR = 0; + unsigned MaxVGPR = 0; + bool VCCUsed = false; + const SIRegisterInfo * RI = + static_cast<const SIRegisterInfo*>(TM.getRegisterInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + + unsigned numOperands = MI.getNumOperands(); + for (unsigned op_idx = 0; op_idx < numOperands; op_idx++) { + MachineOperand & MO = MI.getOperand(op_idx); + unsigned maxUsed; + unsigned width = 0; + bool isSGPR = false; + unsigned reg; + unsigned hwReg; + if (!MO.isReg()) { + continue; + } + reg = MO.getReg(); + if (reg == AMDGPU::VCC) { + VCCUsed = true; + continue; + } + switch (reg) { + default: break; + case AMDGPU::EXEC: + case AMDGPU::SI_LITERAL_CONSTANT: + case AMDGPU::SREG_LIT_0: + case AMDGPU::M0: + continue; + } + + if (AMDGPU::SReg_32RegClass.contains(reg)) { + isSGPR = true; + width = 1; + } else if (AMDGPU::VReg_32RegClass.contains(reg)) { + isSGPR = false; + width = 1; + } else if (AMDGPU::SReg_64RegClass.contains(reg)) { + isSGPR = true; + width = 2; + } else if (AMDGPU::VReg_64RegClass.contains(reg)) { + isSGPR = false; + width = 2; + } else if (AMDGPU::SReg_128RegClass.contains(reg)) { + isSGPR = true; + width = 4; + } else if (AMDGPU::VReg_128RegClass.contains(reg)) { + isSGPR = false; + width = 4; + } else if (AMDGPU::SReg_256RegClass.contains(reg)) { + isSGPR = true; + width = 8; + } else { + assert(!"Unknown register class"); + } + hwReg = RI->getEncodingValue(reg); + maxUsed = hwReg + width - 1; + if (isSGPR) { + MaxSGPR = maxUsed > MaxSGPR ? maxUsed : MaxSGPR; + } else { + MaxVGPR = maxUsed > MaxVGPR ? 
maxUsed : MaxVGPR; + } + } + } + if (VCCUsed) { + MaxSGPR += 2; + } + SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>(); + OutStreamer.EmitIntValue(MaxSGPR + 1, 4); + OutStreamer.EmitIntValue(MaxVGPR + 1, 4); + OutStreamer.EmitIntValue(MFI->SPIPSInputAddr, 4); +} diff --git a/lib/Target/R600/AMDGPUAsmPrinter.h b/lib/Target/R600/AMDGPUAsmPrinter.h new file mode 100644 index 0000000000..3812282b17 --- /dev/null +++ b/lib/Target/R600/AMDGPUAsmPrinter.h @@ -0,0 +1,44 @@ +//===-- AMDGPUAsmPrinter.h - Print AMDGPU assembly code -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU Assembly printer class. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_ASMPRINTER_H +#define AMDGPU_ASMPRINTER_H + +#include "llvm/CodeGen/AsmPrinter.h" + +namespace llvm { + +class AMDGPUAsmPrinter : public AsmPrinter { + +public: + explicit AMDGPUAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) + : AsmPrinter(TM, Streamer) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "AMDGPU Assembly Printer"; + } + + /// \brief Emit register usage information so that the GPU driver + /// can correctly set up the GPU state. + void EmitProgramInfo(MachineFunction &MF); + + /// Implemented in AMDGPUMCInstLower.cpp + virtual void EmitInstruction(const MachineInstr *MI); +}; + +} // End namespace llvm + +#endif // AMDGPU_ASMPRINTER_H diff --git a/lib/Target/R600/AMDGPUCodeEmitter.h b/lib/Target/R600/AMDGPUCodeEmitter.h new file mode 100644 index 0000000000..84f3588496 --- /dev/null +++ b/lib/Target/R600/AMDGPUCodeEmitter.h @@ -0,0 +1,49 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen.
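+/// The TableGen'd getBinaryCodeForInstr() calls back into these virtual +/// hooks while encoding operands; subclasses override only the hooks they +/// need, and the defaults below are no-ops.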
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUCODEEMITTER_H +#define AMDGPUCODEEMITTER_H + +namespace llvm { + +class AMDGPUCodeEmitter { +public: + uint64_t getBinaryCodeForInstr(const MachineInstr &MI) const; + virtual uint64_t getMachineOpValue(const MachineInstr &MI, + const MachineOperand &MO) const { return 0; } + virtual unsigned GPR4AlignEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual unsigned GPR2AlignEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual uint64_t VOPPostEncode(const MachineInstr &MI, + uint64_t Value) const { + return Value; + } + virtual uint64_t i32LiteralEncode(const MachineInstr &MI, + unsigned OpNo) const { + return 0; + } + virtual uint32_t SMRDmemriEncode(const MachineInstr &MI, unsigned OpNo) + const { + return 0; + } +}; + +} // End namespace llvm + +#endif // AMDGPUCODEEMITTER_H diff --git a/lib/Target/R600/AMDGPUConvertToISA.cpp b/lib/Target/R600/AMDGPUConvertToISA.cpp new file mode 100644 index 0000000000..50297d1f60 --- /dev/null +++ b/lib/Target/R600/AMDGPUConvertToISA.cpp @@ -0,0 +1,62 @@ +//===-- AMDGPUConvertToISA.cpp - Lower AMDIL to HW ISA --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This pass lowers AMDIL machine instructions to the appropriate +/// hardware instructions. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" + +using namespace llvm; + +namespace { + +class AMDGPUConvertToISAPass : public MachineFunctionPass { + +private: + static char ID; + TargetMachine &TM; + +public: + AMDGPUConvertToISAPass(TargetMachine &tm) : + MachineFunctionPass(ID), TM(tm) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const {return "AMDGPU Convert to ISA";} + +}; + +} // End anonymous namespace + +char AMDGPUConvertToISAPass::ID = 0; + +FunctionPass *llvm::createAMDGPUConvertToISAPass(TargetMachine &tm) { + return new AMDGPUConvertToISAPass(tm); +} + +bool AMDGPUConvertToISAPass::runOnMachineFunction(MachineFunction &MF) { + const AMDGPUInstrInfo * TII = + static_cast<const AMDGPUInstrInfo*>(TM.getInstrInfo()); + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); + I != E; ++I) { + MachineInstr &MI = *I; + TII->convertToISA(MI, MF, MBB.findDebugLoc(I)); + } + } + return false; +} diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp new file mode 100644 index 0000000000..473dac4ddc --- /dev/null +++ b/lib/Target/R600/AMDGPUISelLowering.cpp @@ -0,0 +1,417 @@ +//===-- AMDGPUISelLowering.cpp - AMDGPU Common DAG lowering functions -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This is the parent TargetLowering class for hardware code gen +/// targets. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDILIntrinsicInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" + +using namespace llvm; + +AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) : + TargetLowering(TM, new TargetLoweringObjectFileELF()) { + + // Initialize target lowering borrowed from AMDIL + InitAMDILLowering(); + + // We need to custom lower some of the intrinsics + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // Library functions. These default to Expand, but we have instructions + // for them. + setOperationAction(ISD::FCEIL, MVT::f32, Legal); + setOperationAction(ISD::FEXP2, MVT::f32, Legal); + setOperationAction(ISD::FPOW, MVT::f32, Legal); + setOperationAction(ISD::FLOG2, MVT::f32, Legal); + setOperationAction(ISD::FABS, MVT::f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::f32, Legal); + setOperationAction(ISD::FRINT, MVT::f32, Legal); + + // Lower floating point store/load to integer store/load to reduce the number + // of patterns in tablegen. + setOperationAction(ISD::STORE, MVT::f32, Promote); + AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32); + + setOperationAction(ISD::STORE, MVT::v4f32, Promote); + AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::LOAD, MVT::f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32); + + setOperationAction(ISD::LOAD, MVT::v4f32, Promote); + AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setOperationAction(ISD::UDIVREM, MVT::i32, Custom); + setOperationAction(ISD::UREM, MVT::i32, Expand); +} + +//===---------------------------------------------------------------------===// +// TargetLowering Callbacks +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + for (unsigned i = 0, e = Ins.size(); i < e; ++i) { + InVals.push_back(SDValue()); + } + return Chain; +} + +SDValue AMDGPUTargetLowering::LowerReturn( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc DL, SelectionDAG &DAG) const { + return DAG.getNode(AMDGPUISD::RET_FLAG, DL, MVT::Other, Chain); +} + +//===---------------------------------------------------------------------===// +// Target specific lowering +//===---------------------------------------------------------------------===// + +SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) + const { + switch (Op.getOpcode()) { + default: + Op.getNode()->dump(); + assert(0 && "Custom lowering code for this " + "instruction is not implemented yet!"); + break; + // AMDIL DAG lowering + case ISD::SDIV: return LowerSDIV(Op, DAG); + case ISD::SREM: return LowerSREM(Op, DAG); + case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG); + case ISD::BRCOND: return LowerBRCOND(Op, DAG); + // AMDGPU DAG lowering + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); + case ISD::UDIVREM: return LowerUDIVREM(Op, DAG); + } + return Op; +} + +SDValue
AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { + unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + switch (IntrinsicID) { + default: return Op; + case AMDGPUIntrinsic::AMDIL_abs: + return LowerIntrinsicIABS(Op, DAG); + case AMDGPUIntrinsic::AMDIL_exp: + return DAG.getNode(ISD::FEXP2, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDGPU_lrp: + return LowerIntrinsicLRP(Op, DAG); + case AMDGPUIntrinsic::AMDIL_fraction: + return DAG.getNode(AMDGPUISD::FRACT, DL, VT, Op.getOperand(1)); + case AMDGPUIntrinsic::AMDIL_mad: + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), + Op.getOperand(2), Op.getOperand(3)); + case AMDGPUIntrinsic::AMDIL_max: + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imax: + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umax: + return DAG.getNode(AMDGPUISD::UMAX, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_min: + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_imin: + return DAG.getNode(AMDGPUISD::SMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_umin: + return DAG.getNode(AMDGPUISD::UMIN, DL, VT, Op.getOperand(1), + Op.getOperand(2)); + case AMDGPUIntrinsic::AMDIL_round_nearest: + return DAG.getNode(ISD::FRINT, DL, VT, Op.getOperand(1)); + } +} + +/// IABS(a) = SMAX(sub(0, a), a) +SDValue AMDGPUTargetLowering::LowerIntrinsicIABS(SDValue Op, + SelectionDAG &DAG) const { + + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue Neg = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + Op.getOperand(1)); + + return DAG.getNode(AMDGPUISD::SMAX, DL, VT, Neg, Op.getOperand(1)); +} + +/// Linear Interpolation +/// LRP(a, b, c) = muladd(a, b, (1 - a) * c) +SDValue AMDGPUTargetLowering::LowerIntrinsicLRP(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue OneSubA = DAG.getNode(ISD::FSUB, DL, VT, + DAG.getConstantFP(1.0f, MVT::f32), + Op.getOperand(1)); + SDValue OneSubAC = DAG.getNode(ISD::FMUL, DL, VT, OneSubA, + Op.getOperand(3)); + return DAG.getNode(AMDGPUISD::MAD, DL, VT, Op.getOperand(1), + Op.getOperand(2), + OneSubAC); +} + +/// \brief Generate Min/Max node +SDValue AMDGPUTargetLowering::LowerMinMax(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + + if (VT != MVT::f32 || + !((LHS == True && RHS == False) || (LHS == False && RHS == True))) { + return SDValue(); + } + + ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get(); + switch (CCOpcode) { + case ISD::SETOEQ: + case ISD::SETONE: + case ISD::SETUNE: + case ISD::SETNE: + case ISD::SETUEQ: + case ISD::SETEQ: + case ISD::SETFALSE: + case ISD::SETFALSE2: + case ISD::SETTRUE: + case ISD::SETTRUE2: + case ISD::SETUO: + case ISD::SETO: + assert(0 && "Operation should already be optimised!"); + case ISD::SETULE: + case ISD::SETULT: + case ISD::SETOLE: + case ISD::SETOLT: + case ISD::SETLE: + case ISD::SETLT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + else + 
return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + } + case ISD::SETGT: + case ISD::SETGE: + case ISD::SETUGE: + case ISD::SETOGE: + case ISD::SETUGT: + case ISD::SETOGT: { + if (LHS == True) + return DAG.getNode(AMDGPUISD::FMAX, DL, VT, LHS, RHS); + else + return DAG.getNode(AMDGPUISD::FMIN, DL, VT, LHS, RHS); + } + case ISD::SETCC_INVALID: + assert(0 && "Invalid setcc condcode!"); + } + return Op; +} + + + +SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + + SDValue Num = Op.getOperand(0); + SDValue Den = Op.getOperand(1); + + SmallVector<SDValue, 8> Results; + + // RCP = URECIP(Den) = 2^32 / Den + e + // e is rounding error. + SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den); + + // RCP_LO = umulo(RCP, Den) + SDValue RCP_LO = DAG.getNode(ISD::UMULO, DL, VT, RCP, Den); + + // RCP_HI = mulhu(RCP, Den) + SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den); + + // NEG_RCP_LO = -RCP_LO + SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, VT), + RCP_LO); + + // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO) + SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + NEG_RCP_LO, RCP_LO, + ISD::SETEQ); + // Calculate the rounding error from the URECIP instruction + // E = mulhu(ABS_RCP_LO, RCP) + SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP); + + // RCP_A_E = RCP + E + SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E); + + // RCP_S_E = RCP - E + SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E); + + // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_S_E) + SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, VT), + RCP_A_E, RCP_S_E, + ISD::SETEQ); + // Quotient = mulhu(Tmp0, Num) + SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num); + + // Num_S_Remainder = Quotient * Den + SDValue Num_S_Remainder = DAG.getNode(ISD::UMULO, DL, VT, Quotient, Den); + + // Remainder = Num - Num_S_Remainder + SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder); + + // Remainder_GE_Den = (Remainder >= Den ? -1 : 0) + SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den, + DAG.getConstant(-1, VT), + DAG.getConstant(0, VT), + ISD::SETGE); + // Remainder_GE_Zero = (Remainder >= 0 ? -1 : 0) + SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Remainder, + DAG.getConstant(0, VT), + DAG.getConstant(-1, VT), + DAG.getConstant(0, VT), + ISD::SETGE); + // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero + SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den, + Remainder_GE_Zero); + + // Calculate Division result: + + // Quotient_A_One = Quotient + 1 + SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient, + DAG.getConstant(1, VT)); + + // Quotient_S_One = Quotient - 1 + SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient, + DAG.getConstant(1, VT)); + + // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One) + SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + Quotient, Quotient_A_One, ISD::SETEQ); + + // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div) + Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Quotient_S_One, Div, ISD::SETEQ); + + // Calculate Rem result: + + // Remainder_S_Den = Remainder - Den + SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den); + + // Remainder_A_Den = Remainder + Den + SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den); + + // Rem = (Tmp1 == 0 ?
Remainder : Remainder_S_Den) + SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, VT), + Remainder, Remainder_S_Den, ISD::SETEQ); + + // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem) + Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, VT), + Remainder_A_Den, Rem, ISD::SETEQ); + SDValue Ops[2]; + Ops[0] = Div; + Ops[1] = Rem; + return DAG.getMergeValues(Ops, 2, DL); +} + +//===----------------------------------------------------------------------===// +// Helper functions +//===----------------------------------------------------------------------===// + +bool AMDGPUTargetLowering::isHWTrueValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->isExactlyValue(1.0); + } + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + return C->isAllOnesValue(); + } + return false; +} + +bool AMDGPUTargetLowering::isHWFalseValue(SDValue Op) const { + if (ConstantFPSDNode * CFP = dyn_cast<ConstantFPSDNode>(Op)) { + return CFP->getValueAPF().isZero(); + } + if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) { + return C->isNullValue(); + } + return false; +} + +SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG, + const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const { + MachineFunction &MF = DAG.getMachineFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + unsigned VirtualRegister; + if (!MRI.isLiveIn(Reg)) { + VirtualRegister = MRI.createVirtualRegister(RC); + MRI.addLiveIn(Reg, VirtualRegister); + } else { + VirtualRegister = MRI.getLiveInVirtReg(Reg); + } + return DAG.getRegister(VirtualRegister, VT); +} + +#define NODE_NAME_CASE(node) case AMDGPUISD::node: return #node; + +const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return 0; + // AMDIL DAG nodes + NODE_NAME_CASE(MAD); + NODE_NAME_CASE(CALL); + NODE_NAME_CASE(UMUL); + NODE_NAME_CASE(DIV_INF); + NODE_NAME_CASE(RET_FLAG); + NODE_NAME_CASE(BRANCH_COND); + + // AMDGPU DAG nodes + NODE_NAME_CASE(DWORDADDR) + NODE_NAME_CASE(FRACT) + NODE_NAME_CASE(FMAX) + NODE_NAME_CASE(SMAX) + NODE_NAME_CASE(UMAX) + NODE_NAME_CASE(FMIN) + NODE_NAME_CASE(SMIN) + NODE_NAME_CASE(UMIN) + NODE_NAME_CASE(URECIP) + NODE_NAME_CASE(INTERP) + NODE_NAME_CASE(INTERP_P0) + NODE_NAME_CASE(EXPORT) + } +} diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h new file mode 100644 index 0000000000..c7abaf69b4 --- /dev/null +++ b/lib/Target/R600/AMDGPUISelLowering.h @@ -0,0 +1,144 @@ +//===-- AMDGPUISelLowering.h - AMDGPU Lowering Interface --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Interface definition of the TargetLowering class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUISELLOWERING_H +#define AMDGPUISELLOWERING_H + +#include "llvm/Target/TargetLowering.h" + +namespace llvm { + +class MachineRegisterInfo; + +class AMDGPUTargetLowering : public TargetLowering { +private: + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerUDIVREM(SDValue Op, SelectionDAG &DAG) const; + +protected: + + /// \brief Helper function that adds Reg to the LiveIn list of the DAG's + /// MachineFunction. 
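+ /// If \p Reg already has a live-in virtual register, that one is reused + /// instead of creating a second copy.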
+ /// + /// \returns a RegisterSDNode representing Reg. + SDValue CreateLiveInRegister(SelectionDAG &DAG, const TargetRegisterClass *RC, + unsigned Reg, EVT VT) const; + + bool isHWTrueValue(SDValue Op) const; + bool isHWFalseValue(SDValue Op) const; + +public: + AMDGPUTargetLowering(TargetMachine &TM); + + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const; + + virtual SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::OutputArg> &Outs, + const SmallVectorImpl<SDValue> &OutVals, + DebugLoc DL, SelectionDAG &DAG) const; + + virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicIABS(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerIntrinsicLRP(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerMinMax(SDValue Op, SelectionDAG &DAG) const; + virtual const char* getTargetNodeName(unsigned Opcode) const; + +// Functions defined in AMDILISelLowering.cpp +public: + + /// \brief Determine which of the bits specified in \p Mask are known to be + /// either zero or one and return them in the \p KnownZero and \p KnownOne + /// bitsets. + virtual void computeMaskedBitsForTargetNode(const SDValue Op, + APInt &KnownZero, + APInt &KnownOne, + const SelectionDAG &DAG, + unsigned Depth = 0) const; + + virtual bool getTgtMemIntrinsic(IntrinsicInfo &Info, + const CallInst &I, unsigned Intrinsic) const; + + /// We want to mark f32/f64 floating point values as legal. + bool isFPImmLegal(const APFloat &Imm, EVT VT) const; + + /// We don't want to shrink f64/f32 constants. + bool ShouldShrinkFPConstant(EVT VT) const; + +private: + void InitAMDILLowering(); + SDValue LowerSREM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM8(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM16(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSREM64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV24(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSDIV64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + EVT genIntType(uint32_t size = 32, uint32_t numEle = 1) const; + SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const; +}; + +namespace AMDGPUISD { + +enum { + // AMDIL ISD Opcodes + FIRST_NUMBER = ISD::BUILTIN_OP_END, + MAD, // 32bit Fused Multiply Add instruction + CALL, // Function call based on a single integer + UMUL, // 32bit unsigned multiplication + DIV_INF, // Divide with infinity returned on zero divisor + RET_FLAG, + BRANCH_COND, + // End AMDIL ISD Opcodes + BITALIGN, + DWORDADDR, + FRACT, + FMAX, + SMAX, + UMAX, + FMIN, + SMIN, + UMIN, + URECIP, + INTERP, + INTERP_P0, + EXPORT, + LAST_AMDGPU_ISD_NUMBER +}; + + +} // End namespace AMDGPUISD + +namespace SIISD { + +enum { + SI_FIRST = AMDGPUISD::LAST_AMDGPU_ISD_NUMBER, + VCC_AND, + VCC_BITCAST +}; + +} // End namespace SIISD + +} // End namespace llvm + +#endif // AMDGPUISELLOWERING_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.cpp b/lib/Target/R600/AMDGPUInstrInfo.cpp new file mode 100644 index 0000000000..e42a46d839 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.cpp @@ 
-0,0 +1,257 @@ +//===-- AMDGPUInstrInfo.cpp - Base class for AMD GPU InstrInfo ------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implementation of the TargetInstrInfo class that is common to all +/// AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstrInfo.h" +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "AMDIL.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +#define GET_INSTRINFO_CTOR +#include "AMDGPUGenInstrInfo.inc" + +using namespace llvm; + +AMDGPUInstrInfo::AMDGPUInstrInfo(TargetMachine &tm) + : AMDGPUGenInstrInfo(0,0), RI(tm, *this), TM(tm) { } + +const AMDGPURegisterInfo &AMDGPUInstrInfo::getRegisterInfo() const { + return RI; +} + +bool AMDGPUInstrInfo::isCoalescableExtInstr(const MachineInstr &MI, + unsigned &SrcReg, unsigned &DstReg, + unsigned &SubIdx) const { +// TODO: Implement this function + return false; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +unsigned AMDGPUInstrInfo::isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlot(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +unsigned AMDGPUInstrInfo::isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const { +// TODO: Implement this function + return 0; +} +bool AMDGPUInstrInfo::hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const { +// TODO: Implement this function + return false; +} + +MachineInstr * +AMDGPUInstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const { +// TODO: Implement this function + return NULL; +} +bool AMDGPUInstrInfo::getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const { + while (iter != MBB.end()) { + switch (iter->getOpcode()) { + default: + break; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: + case AMDGPU::BRANCH: + return true; + }; + ++iter; + } + return false; +} + +MachineBasicBlock::iterator skipFlowControl(MachineBasicBlock *MBB) { + MachineBasicBlock::iterator tmp = MBB->end(); + if (!MBB->size()) { + return MBB->end(); + } + while (--tmp) { + if (tmp->getOpcode() == AMDGPU::ENDLOOP + || tmp->getOpcode() == AMDGPU::ENDIF + || tmp->getOpcode() == AMDGPU::ELSE) { + if (tmp == MBB->begin()) { + return tmp; + } else { + continue; + } + } else { + return ++tmp; + } + } + return MBB->end(); +} + +void +AMDGPUInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, + int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert(!"Not Implemented"); +} + +void +AMDGPUInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const { + assert(!"Not Implemented"); +} + +MachineInstr * +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const { +// TODO: Implement this function + return 0; +} +MachineInstr* +AMDGPUInstrInfo::foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const { + // TODO: Implement this function + return 0; +} +bool +AMDGPUInstrInfo::canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, + bool UnfoldStore, + SmallVectorImpl<MachineInstr*> &NewMIs) const { + // TODO: Implement this function + return false; +} + +bool +AMDGPUInstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode*> &NewNodes) const { + // TODO: Implement this function + return false; +} + +unsigned +AMDGPUInstrInfo::getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex) const { + // TODO: Implement this function + return 0; +} + +bool AMDGPUInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const { + assert(Offset2 > Offset1 + && "Second offset should be larger than first offset!"); + // If we have less than 16 loads in a row, and the offsets are within 16, + // then schedule together. + // TODO: Make the loads schedule near if it fits in a cacheline + return (NumLoads < 16 && (Offset2 - Offset1) < 16); +} + +bool +AMDGPUInstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) + const { + // TODO: Implement this function + return true; +} +void AMDGPUInstrInfo::insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const { + // TODO: Implement this function +} + +bool AMDGPUInstrInfo::isPredicated(const MachineInstr *MI) const { + // TODO: Implement this function + return false; +} +bool +AMDGPUInstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) + const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + // TODO: Implement this function + return false; +} + +bool AMDGPUInstrInfo::isPredicable(MachineInstr *MI) const { + // TODO: Implement this function + return MI->getDesc().isPredicable(); +} + +bool +AMDGPUInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const { + // TODO: Implement this function + return true; +} + +void AMDGPUInstrInfo::convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + const AMDGPURegisterInfo & RI = getRegisterInfo(); + + for (unsigned i = 0; i < MI.getNumOperands(); i++) { + MachineOperand &MO = MI.getOperand(i); + // Convert dst regclass to one that is supported by the ISA + if (MO.isReg() && MO.isDef()) { + if (TargetRegisterInfo::isVirtualRegister(MO.getReg())) { + const TargetRegisterClass * oldRegClass = MRI.getRegClass(MO.getReg()); + const TargetRegisterClass * newRegClass = RI.getISARegClass(oldRegClass); + + assert(newRegClass); + + 
MRI.setRegClass(MO.getReg(), newRegClass); + } + } + } +} diff --git a/lib/Target/R600/AMDGPUInstrInfo.h b/lib/Target/R600/AMDGPUInstrInfo.h new file mode 100644 index 0000000000..32ac691fe0 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.h @@ -0,0 +1,149 @@ +//===-- AMDGPUInstrInfo.h - AMDGPU Instruction Information ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Contains the definition of a TargetInstrInfo class that is common +/// to all AMD GPUs. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUINSTRUCTIONINFO_H +#define AMDGPUINSTRUCTIONINFO_H + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUInstrInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +#include <map> + +#define GET_INSTRINFO_HEADER +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define OPCODE_IS_ZERO_INT AMDGPU::PRED_SETE_INT +#define OPCODE_IS_NOT_ZERO_INT AMDGPU::PRED_SETNE_INT +#define OPCODE_IS_ZERO AMDGPU::PRED_SETE +#define OPCODE_IS_NOT_ZERO AMDGPU::PRED_SETNE + +namespace llvm { + +class AMDGPUTargetMachine; +class MachineFunction; +class MachineInstr; +class MachineInstrBuilder; + +class AMDGPUInstrInfo : public AMDGPUGenInstrInfo { +private: + const AMDGPURegisterInfo RI; + TargetMachine &TM; + bool getNextBranchInstr(MachineBasicBlock::iterator &iter, + MachineBasicBlock &MBB) const; +public: + explicit AMDGPUInstrInfo(TargetMachine &tm); + + virtual const AMDGPURegisterInfo &getRegisterInfo() const = 0; + + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const; + + unsigned isLoadFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isLoadFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasLoadFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + unsigned isStoreFromStackSlot(const MachineInstr *MI, int &FrameIndex) const; + unsigned isStoreFromStackSlotPostFE(const MachineInstr *MI, + int &FrameIndex) const; + bool hasStoreFromStackSlot(const MachineInstr *MI, + const MachineMemOperand *&MMO, + int &FrameIndex) const; + + MachineInstr * + convertToThreeAddress(MachineFunction::iterator &MFI, + MachineBasicBlock::iterator &MBBI, + LiveVariables *LV) const; + + + virtual void copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const = 0; + + void storeRegToStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned SrcReg, bool isKill, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + void loadRegFromStackSlot(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + unsigned DestReg, int FrameIndex, + const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) const; + +protected: + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + int FrameIndex) const; + MachineInstr *foldMemoryOperandImpl(MachineFunction &MF, + MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops, + MachineInstr *LoadMI) const; +public: + bool canFoldMemoryOperand(const MachineInstr *MI, + const SmallVectorImpl<unsigned> &Ops) const; + bool 
unfoldMemoryOperand(MachineFunction &MF, MachineInstr *MI, + unsigned Reg, bool UnfoldLoad, bool UnfoldStore, + SmallVectorImpl<MachineInstr *> &NewMIs) const; + bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, + SmallVectorImpl<SDNode *> &NewNodes) const; + unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, + bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = 0) const; + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, + int64_t Offset1, int64_t Offset2, + unsigned NumLoads) const; + + bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const; + void insertNoop(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI) const; + bool isPredicated(const MachineInstr *MI) const; + bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const; + bool DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const; + bool isPredicable(MachineInstr *MI) const; + bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + + // Helper functions that check the opcode for status information + bool isLoadInst(llvm::MachineInstr *MI) const; + bool isExtLoadInst(llvm::MachineInstr *MI) const; + bool isSWSExtLoadInst(llvm::MachineInstr *MI) const; + bool isSExtLoadInst(llvm::MachineInstr *MI) const; + bool isZExtLoadInst(llvm::MachineInstr *MI) const; + bool isAExtLoadInst(llvm::MachineInstr *MI) const; + bool isStoreInst(llvm::MachineInstr *MI) const; + bool isTruncStoreInst(llvm::MachineInstr *MI) const; + + virtual MachineInstr* getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const = 0; + virtual unsigned getIEQOpcode() const = 0; + virtual bool isMov(unsigned opcode) const = 0; + + /// \brief Convert the AMDIL MachineInstr to a supported ISA + /// MachineInstr + virtual void convertToISA(MachineInstr & MI, MachineFunction &MF, + DebugLoc DL) const; + +}; + +} // End llvm namespace + +#endif // AMDGPUINSTRUCTIONINFO_H diff --git a/lib/Target/R600/AMDGPUInstrInfo.td b/lib/Target/R600/AMDGPUInstrInfo.td new file mode 100644 index 0000000000..96368e8541 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstrInfo.td @@ -0,0 +1,74 @@ +//===-- AMDGPUInstrInfo.td - AMDGPU DAG nodes --------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains DAG node definitions for the AMDGPU target. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Profiles +//===----------------------------------------------------------------------===// + +def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [ + SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3> +]>; + +//===----------------------------------------------------------------------===// +// AMDGPU DAG Nodes +// + +// out = ((a << 32) | b) >> c +// +// Can be used to optimize rotl: +// rotl(a, b) = bitalign(a, a, 32 - b) +def AMDGPUbitalign : SDNode<"AMDGPUISD::BITALIGN", AMDGPUDTIntTernaryOp>; + +// The argument to this node is a dword address.
+def AMDGPUdwordaddr : SDNode<"AMDGPUISD::DWORDADDR", SDTIntUnaryOp>; + +// out = a - floor(a) +def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; + +// out = max(a, b) a and b are floats +def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are signed ints +def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = max(a, b) a and b are unsigned ints +def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are floats +def AMDGPUfmin : SDNode<"AMDGPUISD::FMIN", SDTFPBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a snd b are signed ints +def AMDGPUsmin : SDNode<"AMDGPUISD::SMIN", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// out = min(a, b) a and b are unsigned ints +def AMDGPUumin : SDNode<"AMDGPUISD::UMIN", SDTIntBinOp, + [SDNPCommutative, SDNPAssociative] +>; + +// urecip - This operation is a helper for integer division, it returns the +// result of 1 / a as a fractional unsigned integer. +// out = (2^32 / a) + e +// e is rounding error +def AMDGPUurecip : SDNode<"AMDGPUISD::URECIP", SDTIntUnaryOp>; + +def fpow : SDNode<"ISD::FPOW", SDTFPBinOp>; diff --git a/lib/Target/R600/AMDGPUInstructions.td b/lib/Target/R600/AMDGPUInstructions.td new file mode 100644 index 0000000000..e634d20b61 --- /dev/null +++ b/lib/Target/R600/AMDGPUInstructions.td @@ -0,0 +1,190 @@ +//===-- AMDGPUInstructions.td - Common instruction defs ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains instruction defs that are common to all hw codegen +// targets. 
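The BITALIGN node documented in AMDGPUInstrInfo.td above is a funnel shift, and the rotl identity it mentions is easy to check in isolation. A standalone sketch (illustrative only, with bitalign written out in plain C++):

#include <cassert>
#include <stdint.h>

// out = ((a << 32) | b) >> c, keeping the low 32 bits; valid for 0 <= c <= 32.
static uint32_t bitalign(uint32_t a, uint32_t b, uint32_t c) {
  return (uint32_t)((((uint64_t)a << 32) | b) >> c);
}

// rotl(a, b) = bitalign(a, a, 32 - b); the mask keeps b == 0 in range.
static uint32_t rotl(uint32_t a, uint32_t b) {
  return bitalign(a, a, (32 - b) & 31);
}

int main() {
  assert(rotl(0x80000001u, 1) == 0x00000003u);
  assert(rotl(0x12345678u, 8) == 0x34567812u);
  return 0;
}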
+// +//===----------------------------------------------------------------------===// + +class AMDGPUInst <dag outs, dag ins, string asm, list<dag> pattern> : Instruction { + field bits<16> AMDILOp = 0; + field bits<3> Gen = 0; + + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = NullALU; + let TSFlags{42-40} = Gen; + let TSFlags{63-48} = AMDILOp; +} + +class AMDGPUShaderInst <dag outs, dag ins, string asm, list<dag> pattern> + : AMDGPUInst<outs, ins, asm, pattern> { + + field bits<32> Inst = 0xffffffff; + +} + +def InstFlag : OperandWithDefaultOps <i32, (ops (i32 0))>; + +def COND_EQ : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOEQ: case ISD::SETUEQ: + case ISD::SETEQ: return true;}}}] +>; + +def COND_NE : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETONE: case ISD::SETUNE: + case ISD::SETNE: return true;}}}] +>; +def COND_GT : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOGT: case ISD::SETUGT: + case ISD::SETGT: return true;}}}] +>; + +def COND_GE : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOGE: case ISD::SETUGE: + case ISD::SETGE: return true;}}}] +>; + +def COND_LT : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOLT: case ISD::SETULT: + case ISD::SETLT: return true;}}}] +>; + +def COND_LE : PatLeaf < + (cond), + [{switch(N->get()){{default: return false; + case ISD::SETOLE: case ISD::SETULE: + case ISD::SETLE: return true;}}}] +>; + +//===----------------------------------------------------------------------===// +// Load/Store Pattern Fragments +//===----------------------------------------------------------------------===// + +def zextloadi8_global : PatFrag<(ops node:$ptr), (zextloadi8 node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; + +class Constants { + int TWO_PI = 0x40c90fdb; + int PI = 0x40490fdb; + int TWO_PI_INV = 0x3e22f983; +} +def CONST : Constants; + +def FP_ZERO : PatLeaf < + (fpimm), + [{return N->getValueAPF().isZero();}] +>; + +def FP_ONE : PatLeaf < + (fpimm), + [{return N->isExactlyValue(1.0);}] +>; + +let isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 in { + +class CLAMP <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "CLAMP $dst, $src0", + [(set rc:$dst, (int_AMDIL_clamp rc:$src0, (f32 FP_ZERO), (f32 FP_ONE)))] +>; + +class FABS <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FABS $dst, $src0", + [(set rc:$dst, (fabs rc:$src0))] +>; + +class FNEG <RegisterClass rc> : AMDGPUShaderInst < + (outs rc:$dst), + (ins rc:$src0), + "FNEG $dst, $src0", + [(set rc:$dst, (fneg rc:$src0))] +>; + +def SHADER_TYPE : AMDGPUShaderInst < + (outs), + (ins i32imm:$type), + "SHADER_TYPE $type", + [(int_AMDGPU_shader_type imm:$type)] +>; + +} // End isCodeGenOnly = 1, isPseudo = 1, usesCustomInserter = 1 + +/* Generic helper patterns for intrinsics */ +/* -------------------------------------- */ + +class POW_Common <AMDGPUInst log_ieee, AMDGPUInst exp_ieee, AMDGPUInst mul, + RegisterClass rc> : Pat < + (fpow rc:$src0, rc:$src1), + (exp_ieee (mul rc:$src1, (log_ieee rc:$src0))) +>; + +/* Other helper patterns */ +/* --------------------- */ + +/* Extract element pattern */ +class Extract_Element <ValueType sub_type, ValueType vec_type, + RegisterClass vec_class, int sub_idx, + SubRegIndex sub_reg>: Pat< + (sub_type
(vector_extract (vec_type vec_class:$src), sub_idx)), + (EXTRACT_SUBREG vec_class:$src, sub_reg) +>; + +/* Insert element pattern */ +class Insert_Element <ValueType elem_type, ValueType vec_type, + RegisterClass elem_class, RegisterClass vec_class, + int sub_idx, SubRegIndex sub_reg> : Pat < + + (vec_type (vector_insert (vec_type vec_class:$vec), + (elem_type elem_class:$elem), sub_idx)), + (INSERT_SUBREG vec_class:$vec, elem_class:$elem, sub_reg) +>; + +// Vector Build pattern +class Vector_Build <ValueType vecType, RegisterClass vectorClass, + ValueType elemType, RegisterClass elemClass> : Pat < + (vecType (build_vector (elemType elemClass:$x), (elemType elemClass:$y), + (elemType elemClass:$z), (elemType elemClass:$w))), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG + (vecType (IMPLICIT_DEF)), elemClass:$x, sel_x), elemClass:$y, sel_y), + elemClass:$z, sel_z), elemClass:$w, sel_w) +>; + +// bitconvert pattern +class BitConvert <ValueType dt, ValueType st, RegisterClass rc> : Pat < + (dt (bitconvert (st rc:$src0))), + (dt rc:$src0) +>; + +class DwordAddrPat<ValueType vt, RegisterClass rc> : Pat < + (vt (AMDGPUdwordaddr (vt rc:$addr))), + (vt rc:$addr) +>; + +include "R600Instructions.td" + +include "SIInstrInfo.td" + diff --git a/lib/Target/R600/AMDGPUIntrinsics.td b/lib/Target/R600/AMDGPUIntrinsics.td new file mode 100644 index 0000000000..2ba2d4b90d --- /dev/null +++ b/lib/Target/R600/AMDGPUIntrinsics.td @@ -0,0 +1,62 @@ +//===-- AMDGPUIntrinsics.td - Common intrinsics -*- tablegen -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines intrinsics that are used by all hw codegen targets. 
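The Constants class in AMDGPUInstructions.td above stores float immediates as raw IEEE-754 bit patterns. A quick standalone check (illustrative only) that the three values decode as expected:

#include <cstdio>
#include <cstring>
#include <stdint.h>

static float bitsToFloat(uint32_t bits) {
  float f;
  std::memcpy(&f, &bits, sizeof(f)); // bit-cast without aliasing issues
  return f;
}

int main() {
  std::printf("TWO_PI     = %.8f\n", bitsToFloat(0x40c90fdb)); // 6.28318548
  std::printf("PI         = %.8f\n", bitsToFloat(0x40490fdb)); // 3.14159274
  std::printf("TWO_PI_INV = %.8f\n", bitsToFloat(0x3e22f983)); // 0.15915494
  return 0;
}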
+// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "AMDGPU", isTarget = 1 in { + + def int_AMDGPU_load_const : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_load_imm : Intrinsic<[llvm_v4f32_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_reserve_reg : Intrinsic<[], [llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_store_output : Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_AMDGPU_swizzle : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>; + + def int_AMDGPU_arl : Intrinsic<[llvm_i32_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_cndlt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_div : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_dp4 : Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; + def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>; + def int_AMDGPU_kilp : Intrinsic<[], [], []>; + def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sge : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sle : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_sne : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_mullit : Intrinsic<[llvm_v4f32_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_tex : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txb : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txf : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txq : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txd : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_txl : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_trunc : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_ddx : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_ddy : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_imin : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umax : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_umin : Intrinsic<[llvm_i32_ty], 
[llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>; + def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; + + def int_AMDGPU_shader_type : Intrinsic<[], [llvm_i32_ty], []>; +} + +let TargetPrefix = "TGSI", isTarget = 1 in { + + def int_TGSI_lit_z : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty],[IntrNoMem]>; +} + +include "SIIntrinsics.td" diff --git a/lib/Target/R600/AMDGPUMCInstLower.cpp b/lib/Target/R600/AMDGPUMCInstLower.cpp new file mode 100644 index 0000000000..32275a2b04 --- /dev/null +++ b/lib/Target/R600/AMDGPUMCInstLower.cpp @@ -0,0 +1,83 @@ +//===- AMDGPUMCInstLower.cpp - Lower AMDGPU MachineInstr to an MCInst -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Code to lower AMDGPU MachineInstrs to their corresponding MCInst. +// +//===----------------------------------------------------------------------===// +// + +#include "AMDGPUMCInstLower.h" +#include "AMDGPUAsmPrinter.h" +#include "R600InstrInfo.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Constants.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace llvm; + +AMDGPUMCInstLower::AMDGPUMCInstLower(MCContext &ctx): + Ctx(ctx) +{ } + +void AMDGPUMCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumExplicitOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + + MCOperand MCOp; + switch (MO.getType()) { + default: + llvm_unreachable("unknown operand type"); + case MachineOperand::MO_FPImmediate: { + const APFloat &FloatValue = MO.getFPImm()->getValueAPF(); + assert(&FloatValue.getSemantics() == &APFloat::IEEEsingle && + "Only floating point immediates are supported at the moment."); + MCOp = MCOperand::CreateFPImm(FloatValue.convertToFloat()); + break; + } + case MachineOperand::MO_Immediate: + MCOp = MCOperand::CreateImm(MO.getImm()); + break; + case MachineOperand::MO_Register: + MCOp = MCOperand::CreateReg(MO.getReg()); + break; + case MachineOperand::MO_MachineBasicBlock: + MCOp = MCOperand::CreateExpr(MCSymbolRefExpr::Create( + MO.getMBB()->getSymbol(), Ctx)); + } + OutMI.addOperand(MCOp); + } +} + +void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) { + AMDGPUMCInstLower MCInstLowering(OutContext); + + if (MI->isBundle()) { + const MachineBasicBlock *MBB = MI->getParent(); + MachineBasicBlock::const_instr_iterator I = MI; + ++I; + while (I != MBB->end() && I->isInsideBundle()) { + MCInst MCBundleInst; + const MachineInstr *BundledInst = I; + MCInstLowering.lower(BundledInst, MCBundleInst); + OutStreamer.EmitInstruction(MCBundleInst); + ++I; + } + } else { + MCInst TmpInst; + MCInstLowering.lower(MI, TmpInst); + OutStreamer.EmitInstruction(TmpInst); + } +} diff --git a/lib/Target/R600/AMDGPUMCInstLower.h b/lib/Target/R600/AMDGPUMCInstLower.h new file mode 100644 index 0000000000..d7d538e925 --- /dev/null +++ b/lib/Target/R600/AMDGPUMCInstLower.h @@ -0,0 +1,34 @@ +//===- AMDGPUMCInstLower.h MachineInstr Lowering Interface ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_MCINSTLOWER_H +#define AMDGPU_MCINSTLOWER_H + +namespace llvm { + +class MCInst; +class MCContext; +class MachineInstr; + +class AMDGPUMCInstLower { + + MCContext &Ctx; + +public: + AMDGPUMCInstLower(MCContext &ctx); + + /// \brief Lower a MachineInstr to an MCInst + void lower(const MachineInstr *MI, MCInst &OutMI) const; + +}; + +} // End namespace llvm + +#endif //AMDGPU_MCINSTLOWER_H diff --git a/lib/Target/R600/AMDGPURegisterInfo.cpp b/lib/Target/R600/AMDGPURegisterInfo.cpp new file mode 100644 index 0000000000..eeafec898d --- /dev/null +++ b/lib/Target/R600/AMDGPURegisterInfo.cpp @@ -0,0 +1,51 @@ +//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Parent TargetRegisterInfo class common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPURegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +AMDGPURegisterInfo::AMDGPURegisterInfo(TargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPUGenRegisterInfo(0), + TM(tm), + TII(tii) + { } + +//===----------------------------------------------------------------------===// +// Function handling callbacks - Functions are a seldom-used feature of GPUs, so +// they are not supported at this time. +//===----------------------------------------------------------------------===// + +const uint16_t AMDGPURegisterInfo::CalleeSavedReg = AMDGPU::NoRegister; + +const uint16_t* AMDGPURegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) + const { + return &CalleeSavedReg; +} + +void AMDGPURegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI, + int SPAdj, + RegScavenger *RS) const { + assert(!"Subroutines not supported yet"); +} + +unsigned AMDGPURegisterInfo::getFrameRegister(const MachineFunction &MF) const { + assert(!"Subroutines not supported yet"); + return 0; +} + +#define GET_REGINFO_TARGET_DESC +#include "AMDGPUGenRegisterInfo.inc" diff --git a/lib/Target/R600/AMDGPURegisterInfo.h b/lib/Target/R600/AMDGPURegisterInfo.h new file mode 100644 index 0000000000..76ee7ae06a --- /dev/null +++ b/lib/Target/R600/AMDGPURegisterInfo.h @@ -0,0 +1,63 @@ +//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetRegisterInfo interface that is implemented by all hw codegen +/// targets.
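A note on getCalleeSavedRegs() above: LLVM expects the returned array to be zero-terminated, and AMDGPU::NoRegister is register number 0, so the single-entry CalleeSavedReg array is the idiomatic encoding of an empty callee-saved list, equivalent to:

static const uint16_t EmptyCalleeSavedList[] = { 0 }; // terminator only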
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUREGISTERINFO_H +#define AMDGPUREGISTERINFO_H + +#include "llvm/ADT/BitVector.h" +#include "llvm/Target/TargetRegisterInfo.h" + +#define GET_REGINFO_HEADER +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +namespace llvm { + +class AMDGPUTargetMachine; +class TargetInstrInfo; + +struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo { + TargetMachine &TM; + const TargetInstrInfo &TII; + static const uint16_t CalleeSavedReg; + + AMDGPURegisterInfo(TargetMachine &tm, const TargetInstrInfo &tii); + + virtual BitVector getReservedRegs(const MachineFunction &MF) const { + assert(!"Unimplemented"); return BitVector(); + } + + /// \param RC is an AMDIL reg class. + /// + /// \returns The ISA reg class that is equivalent to \p RC. + virtual const TargetRegisterClass * getISARegClass( + const TargetRegisterClass * RC) const { + assert(!"Unimplemented"); return NULL; + } + + virtual const TargetRegisterClass* getCFGStructurizerRegClass(MVT VT) const { + assert(!"Unimplemented"); return NULL; + } + + const uint16_t* getCalleeSavedRegs(const MachineFunction *MF) const; + void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj, + RegScavenger *RS) const; + unsigned getFrameRegister(const MachineFunction &MF) const; + +}; + +} // End namespace llvm + +#endif // AMDGPUREGISTERINFO_H diff --git a/lib/Target/R600/AMDGPURegisterInfo.td b/lib/Target/R600/AMDGPURegisterInfo.td new file mode 100644 index 0000000000..8181e023aa --- /dev/null +++ b/lib/Target/R600/AMDGPURegisterInfo.td @@ -0,0 +1,22 @@ +//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tablegen register definitions common to all hw codegen targets. +// +//===----------------------------------------------------------------------===// + +let Namespace = "AMDGPU" in { + def sel_x : SubRegIndex; + def sel_y : SubRegIndex; + def sel_z : SubRegIndex; + def sel_w : SubRegIndex; +} + +include "R600RegisterInfo.td" +include "SIRegisterInfo.td" diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp new file mode 100644 index 0000000000..0f356a1c3f --- /dev/null +++ b/lib/Target/R600/AMDGPUSubtarget.cpp @@ -0,0 +1,87 @@ +//===-- AMDGPUSubtarget.cpp - AMDGPU Subtarget Information ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Implements the AMDGPU specific subclass of TargetSubtarget.
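The sel_x through sel_w subregister indices defined above are what the Extract_Element, Insert_Element, and Vector_Build patterns earlier key on: a four-element vector register is modeled as four scalar subregisters, so lane accesses become subregister copies rather than real shuffles. A rough C++ analogy of that register model (illustrative only):

// vec_class: one 4-wide register made of four scalar lanes (sel_x..sel_w).
struct Vec4 { float x, y, z, w; };

// Extract_Element: reading a lane is just a subregister (member) copy.
static float extractX(const Vec4 &v) { return v.x; }

// Insert_Element: writing a lane leaves the other lanes untouched.
static void insertY(Vec4 &v, float s) { v.y = s; }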
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUSubtarget.h" + +using namespace llvm; + +#define GET_SUBTARGETINFO_ENUM +#define GET_SUBTARGETINFO_TARGET_DESC +#define GET_SUBTARGETINFO_CTOR +#include "AMDGPUGenSubtargetInfo.inc" + +AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) : + AMDGPUGenSubtargetInfo(TT, CPU, FS), DumpCode(false) { + InstrItins = getInstrItineraryForCPU(CPU); + + memset(CapsOverride, 0, sizeof(*CapsOverride) + * AMDGPUDeviceInfo::MaxNumberCapabilities); + // Default card + StringRef GPU = CPU; + Is64bit = false; + DefaultSize[0] = 64; + DefaultSize[1] = 1; + DefaultSize[2] = 1; + ParseSubtargetFeatures(GPU, FS); + DevName = GPU; + Device = AMDGPUDeviceInfo::getDeviceFromName(DevName, this, Is64bit); +} + +AMDGPUSubtarget::~AMDGPUSubtarget() { + delete Device; +} + +bool +AMDGPUSubtarget::isOverride(AMDGPUDeviceInfo::Caps caps) const { + assert(caps < AMDGPUDeviceInfo::MaxNumberCapabilities && + "Caps index is out of bounds!"); + return CapsOverride[caps]; +} +bool +AMDGPUSubtarget::is64bit() const { + return Is64bit; +} +bool +AMDGPUSubtarget::isTargetELF() const { + return false; +} +size_t +AMDGPUSubtarget::getDefaultSize(uint32_t dim) const { + // DefaultSize has three entries, so only dims 0-2 are valid. + if (dim > 2) { + return 1; + } else { + return DefaultSize[dim]; + } +} + +std::string +AMDGPUSubtarget::getDataLayout() const { + if (!Device) { + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048-a0:0:64"); + } + return Device->getDataLayout(); +} + +std::string +AMDGPUSubtarget::getDeviceName() const { + return DevName; +} +const AMDGPUDevice * +AMDGPUSubtarget::device() const { + return Device; +} diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h new file mode 100644 index 0000000000..cab7884ea3 --- /dev/null +++ b/lib/Target/R600/AMDGPUSubtarget.h @@ -0,0 +1,65 @@ +//=====-- AMDGPUSubtarget.h - Define Subtarget for the AMDIL ---*- C++ -*-====// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief AMDGPU specific subclass of TargetSubtarget.
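The default layout string returned by getDataLayout() above can be handed straight to llvm::DataLayout. A sketch of querying it (3.2-era API, header path as used elsewhere in this patch; the layout string is shortened here but valid):

#include "llvm/DataLayout.h"

static void inspectLayout() {
  llvm::DataLayout DL("e-p:32:32:32-i32:32:32-f32:32:32-f64:64:64");
  bool LittleEndian = DL.isLittleEndian(); // true, from the leading "e"
  unsigned PtrBytes = DL.getPointerSize(); // 4, from "p:32:32:32"
  (void)LittleEndian;
  (void)PtrBytes;
}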
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUSUBTARGET_H +#define AMDGPUSUBTARGET_H +#include "AMDILDevice.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Target/TargetSubtargetInfo.h" + +#define GET_SUBTARGETINFO_HEADER +#include "AMDGPUGenSubtargetInfo.inc" + +#define MAX_CB_SIZE (1 << 16) + +namespace llvm { + +class AMDGPUSubtarget : public AMDGPUGenSubtargetInfo { +private: + bool CapsOverride[AMDGPUDeviceInfo::MaxNumberCapabilities]; + const AMDGPUDevice *Device; + size_t DefaultSize[3]; + std::string DevName; + bool Is64bit; + bool Is32on64bit; + bool DumpCode; + bool R600ALUInst; + + InstrItineraryData InstrItins; + +public: + AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS); + virtual ~AMDGPUSubtarget(); + + const InstrItineraryData &getInstrItineraryData() const { return InstrItins; } + virtual void ParseSubtargetFeatures(llvm::StringRef CPU, llvm::StringRef FS); + + bool isOverride(AMDGPUDeviceInfo::Caps) const; + bool is64bit() const; + + // Helper functions to simplify if statements + bool isTargetELF() const; + const AMDGPUDevice* device() const; + std::string getDataLayout() const; + std::string getDeviceName() const; + virtual size_t getDefaultSize(uint32_t dim) const; + bool dumpCode() const { return DumpCode; } + bool r600ALUEncoding() const { return R600ALUInst; } + +}; + +} // End namespace llvm + +#endif // AMDGPUSUBTARGET_H diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp new file mode 100644 index 0000000000..98a3064f7e --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -0,0 +1,141 @@ +//===-- AMDGPUTargetMachine.cpp - TargetMachine for hw codegen targets-----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU target machine contains all of the hardware specific +/// information needed to emit code for R600 and SI GPUs. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUTargetMachine.h" +#include "AMDGPU.h" +#include "R600ISelLowering.h" +#include "R600InstrInfo.h" +#include "SIISelLowering.h" +#include "SIInstrInfo.h" +#include "llvm/Analysis/Passes.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/MC/MCAsmInfo.h" +#include "llvm/PassManager.h" +#include "llvm/Support/TargetRegistry.h" +#include "llvm/Support/raw_os_ostream.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include <llvm/CodeGen/Passes.h> + +using namespace llvm; + +extern "C" void LLVMInitializeR600Target() { + // Register the target + RegisterTargetMachine<AMDGPUTargetMachine> X(TheAMDGPUTarget); +} + +AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, StringRef TT, + StringRef CPU, StringRef FS, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OptLevel +) +: + LLVMTargetMachine(T, TT, CPU, FS, Options, RM, CM, OptLevel), + Subtarget(TT, CPU, FS), + Layout(Subtarget.getDataLayout()), + FrameLowering(TargetFrameLowering::StackGrowsUp, + Subtarget.device()->getStackAlignment(), 0), + IntrinsicInfo(this), + InstrItins(&Subtarget.getInstrItineraryData()) { + // TLInfo uses InstrInfo so it must be initialized after. + if (Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + InstrInfo = new R600InstrInfo(*this); + TLInfo = new R600TargetLowering(*this); + } else { + InstrInfo = new SIInstrInfo(*this); + TLInfo = new SITargetLowering(*this); + } +} + +AMDGPUTargetMachine::~AMDGPUTargetMachine() { +} + +namespace { +class AMDGPUPassConfig : public TargetPassConfig { +public: + AMDGPUPassConfig(AMDGPUTargetMachine *TM, PassManagerBase &PM) + : TargetPassConfig(TM, PM) {} + + AMDGPUTargetMachine &getAMDGPUTargetMachine() const { + return getTM<AMDGPUTargetMachine>(); + } + + virtual bool addPreISel(); + virtual bool addInstSelector(); + virtual bool addPreRegAlloc(); + virtual bool addPostRegAlloc(); + virtual bool addPreSched2(); + virtual bool addPreEmitPass(); +}; +} // End of anonymous namespace + +TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { + return new AMDGPUPassConfig(this, PM); +} + +bool +AMDGPUPassConfig::addPreISel() { + return false; +} + +bool AMDGPUPassConfig::addInstSelector() { + addPass(createAMDGPUPeepholeOpt(*TM)); + addPass(createAMDGPUISelDag(getAMDGPUTargetMachine())); + return false; +} + +bool AMDGPUPassConfig::addPreRegAlloc() { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIAssignInterpRegsPass(*TM)); + } + addPass(createAMDGPUConvertToISAPass(*TM)); + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + addPass(createSIFixSGPRLivenessPass(*TM)); + } + return false; +} + +bool AMDGPUPassConfig::addPostRegAlloc() { + return false; +} + +bool AMDGPUPassConfig::addPreSched2() { + + addPass(&IfConverterID); + return false; +} + +bool AMDGPUPassConfig::addPreEmitPass() { + addPass(createAMDGPUCFGPreparationPass(*TM)); + addPass(createAMDGPUCFGStructurizerPass(*TM)); + + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(); + if (ST.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX) { + addPass(createR600ExpandSpecialInstrsPass(*TM)); + addPass(&FinalizeMachineBundlesID); + } else { + 
addPass(createSILowerLiteralConstantsPass(*TM)); + addPass(createSILowerControlFlowPass(*TM)); + } + + return false; +} + diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h new file mode 100644 index 0000000000..399e55c8ad --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -0,0 +1,70 @@ +//===-- AMDGPUTargetMachine.h - AMDGPU TargetMachine Interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The AMDGPU TargetMachine interface definition for hw codegen targets. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPU_TARGET_MACHINE_H +#define AMDGPU_TARGET_MACHINE_H + +#include "AMDGPUInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "AMDILFrameLowering.h" +#include "AMDILIntrinsicInfo.h" +#include "R600ISelLowering.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/DataLayout.h" + +namespace llvm { + +MCAsmInfo* createMCAsmInfo(const Target &T, StringRef TT); + +class AMDGPUTargetMachine : public LLVMTargetMachine { + + AMDGPUSubtarget Subtarget; + const DataLayout Layout; + AMDGPUFrameLowering FrameLowering; + AMDGPUIntrinsicInfo IntrinsicInfo; + const AMDGPUInstrInfo * InstrInfo; + AMDGPUTargetLowering * TLInfo; + const InstrItineraryData* InstrItins; + +public: + AMDGPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, + StringRef FS, + TargetOptions Options, + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); + ~AMDGPUTargetMachine(); + virtual const AMDGPUFrameLowering* getFrameLowering() const { + return &FrameLowering; + } + virtual const AMDGPUIntrinsicInfo* getIntrinsicInfo() const { + return &IntrinsicInfo; + } + virtual const AMDGPUInstrInfo *getInstrInfo() const {return InstrInfo;} + virtual const AMDGPUSubtarget *getSubtargetImpl() const {return &Subtarget; } + virtual const AMDGPURegisterInfo *getRegisterInfo() const { + return &InstrInfo->getRegisterInfo(); + } + virtual AMDGPUTargetLowering * getTargetLowering() const { + return TLInfo; + } + virtual const InstrItineraryData* getInstrItineraryData() const { + return InstrItins; + } + virtual const DataLayout* getDataLayout() const { return &Layout; } + virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); +}; + +} // End namespace llvm + +#endif // AMDGPU_TARGET_MACHINE_H diff --git a/lib/Target/R600/AMDIL.h b/lib/Target/R600/AMDIL.h new file mode 100644 index 0000000000..4e577dc234 --- /dev/null +++ b/lib/Target/R600/AMDIL.h @@ -0,0 +1,106 @@ +//===-- AMDIL.h - Top-level interface for AMDIL representation --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// This file contains the entry points for global functions defined in the LLVM +/// AMDGPU back-end.
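Once LLVMInitializeR600Target() (defined in AMDGPUTargetMachine.cpp above) has run, a tool can look the target up by triple and construct this TargetMachine through the generic registry. A hypothetical driver sketch using the 3.2-era API; the "r600--" triple and "redwood" CPU are illustrative values, not taken from the patch:

#include <string>
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"

static llvm::TargetMachine *createR600TM() {
  std::string Error;
  const llvm::Target *T = llvm::TargetRegistry::lookupTarget("r600--", Error);
  if (!T)
    return 0; // target not linked in or not initialized
  llvm::TargetOptions Options;
  return T->createTargetMachine("r600--", "redwood", "", Options,
                                llvm::Reloc::Default,
                                llvm::CodeModel::Default,
                                llvm::CodeGenOpt::Default);
}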
+// +//===----------------------------------------------------------------------===// + +#ifndef AMDIL_H +#define AMDIL_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/Target/TargetMachine.h" + +#define ARENA_SEGMENT_RESERVED_UAVS 12 +#define DEFAULT_ARENA_UAV_ID 8 +#define DEFAULT_RAW_UAV_ID 7 +#define GLOBAL_RETURN_RAW_UAV_ID 11 +#define HW_MAX_NUM_CB 8 +#define MAX_NUM_UNIQUE_UAVS 8 +#define OPENCL_MAX_NUM_ATOMIC_COUNTERS 8 +#define OPENCL_MAX_READ_IMAGES 128 +#define OPENCL_MAX_WRITE_IMAGES 8 +#define OPENCL_MAX_SAMPLERS 16 + +// The next two values can never be zero, as zero is the ID that is +// used to assert against. +#define DEFAULT_LDS_ID 1 +#define DEFAULT_GDS_ID 1 +#define DEFAULT_SCRATCH_ID 1 +#define DEFAULT_VEC_SLOTS 8 + +#define OCL_DEVICE_RV710 0x0001 +#define OCL_DEVICE_RV730 0x0002 +#define OCL_DEVICE_RV770 0x0004 +#define OCL_DEVICE_CEDAR 0x0008 +#define OCL_DEVICE_REDWOOD 0x0010 +#define OCL_DEVICE_JUNIPER 0x0020 +#define OCL_DEVICE_CYPRESS 0x0040 +#define OCL_DEVICE_CAICOS 0x0080 +#define OCL_DEVICE_TURKS 0x0100 +#define OCL_DEVICE_BARTS 0x0200 +#define OCL_DEVICE_CAYMAN 0x0400 +#define OCL_DEVICE_ALL 0x3FFF + +/// The number of function IDs that are reserved for +/// internal compiler usage. +const unsigned int RESERVED_FUNCS = 1024; + +namespace llvm { +class AMDGPUInstrPrinter; +class FunctionPass; +class MCAsmInfo; +class raw_ostream; +class Target; +class TargetMachine; + +// Instruction selection passes. +FunctionPass* + createAMDGPUISelDag(TargetMachine &TM); +FunctionPass* + createAMDGPUPeepholeOpt(TargetMachine &TM); + +// Pre-emit passes. +FunctionPass* + createAMDGPUCFGPreparationPass(TargetMachine &TM); +FunctionPass* + createAMDGPUCFGStructurizerPass(TargetMachine &TM); + +extern Target TheAMDGPUTarget; +} // end namespace llvm; + +// Include device information enumerations +#include "AMDILDeviceInfo.h" + +namespace llvm { +/// OpenCL uses address spaces to differentiate between +/// various memory regions on the hardware. On the CPU +/// all of the address spaces point to the same memory, +/// however on the GPU, each address space points to +/// a separate piece of memory that is unique from other +/// memory locations. +namespace AMDGPUAS { +enum AddressSpaces { + PRIVATE_ADDRESS = 0, ///< Address space for private memory. + GLOBAL_ADDRESS = 1, ///< Address space for global memory (RAT0, VTX0). + CONSTANT_ADDRESS = 2, ///< Address space for constant memory. + LOCAL_ADDRESS = 3, ///< Address space for local memory. + REGION_ADDRESS = 4, ///< Address space for region memory. + ADDRESS_NONE = 5, ///< Address space for unknown memory. + PARAM_D_ADDRESS = 6, ///< Address space for direct addressable parameter memory (CONST0) + PARAM_I_ADDRESS = 7, ///< Address space for indirect addressable parameter memory (VTX1) + USER_SGPR_ADDRESS = 8, ///< Address space for USER_SGPRS on SI + LAST_ADDRESS = 9 +}; + +} // namespace AMDGPUAS + +} // end namespace llvm +#endif // AMDIL_H diff --git a/lib/Target/R600/AMDIL7XXDevice.cpp b/lib/Target/R600/AMDIL7XXDevice.cpp new file mode 100644 index 0000000000..ea6ac34f57 --- /dev/null +++ b/lib/Target/R600/AMDIL7XXDevice.cpp @@ -0,0 +1,115 @@ +//===-- AMDIL7XXDevice.cpp - Device Info for 7XX GPUs ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
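Since the AMDGPUAS numbering above is carried on IR pointer types, a pass can test a value's memory region directly off its type. A minimal sketch (pre-3.3 header layout, matching this patch; the helper name is hypothetical):

#include "llvm/DerivedTypes.h"
#include "llvm/Value.h"
#include "llvm/Support/Casting.h"

// True if V is a pointer into global memory (AMDGPUAS::GLOBAL_ADDRESS == 1).
static bool isGlobalPointer(const llvm::Value *V) {
  const llvm::PointerType *PT = llvm::dyn_cast<llvm::PointerType>(V->getType());
  return PT && PT->getAddressSpace() == 1;
}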
+// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDIL7XXDevice.h" +#include "AMDGPUSubtarget.h" +#include "AMDILDevice.h" + +using namespace llvm; + +AMDGPU7XXDevice::AMDGPU7XXDevice(AMDGPUSubtarget *ST) : AMDGPUDevice(ST) { + setCaps(); + std::string name = mSTM->getDeviceName(); + if (name == "rv710") { + DeviceFlag = OCL_DEVICE_RV710; + } else if (name == "rv730") { + DeviceFlag = OCL_DEVICE_RV730; + } else { + DeviceFlag = OCL_DEVICE_RV770; + } +} + +AMDGPU7XXDevice::~AMDGPU7XXDevice() { +} + +void AMDGPU7XXDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::LocalMem); +} + +size_t AMDGPU7XXDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_700; + } + return 0; +} + +size_t AMDGPU7XXDevice::getWavefrontSize() const { + return AMDGPUDevice::HalfWavefrontSize; +} + +uint32_t AMDGPU7XXDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD4XXX; +} + +uint32_t AMDGPU7XXDevice::getResourceID(uint32_t DeviceID) const { + switch (DeviceID) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case GLOBAL_ID: + case CONSTANT_ID: + case RAW_UAV_ID: + case ARENA_UAV_ID: + break; + case LDS_ID: + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } + break; + case SCRATCH_ID: + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } + break; + case GDS_ID: + assert(0 && "GDS UAV ID is not supported on this chip"); + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } + break; + }; + + return 0; +} + +uint32_t AMDGPU7XXDevice::getMaxNumUAVs() const { + return 1; +} + +AMDGPU770Device::AMDGPU770Device(AMDGPUSubtarget *ST): AMDGPU7XXDevice(ST) { + setCaps(); +} + +AMDGPU770Device::~AMDGPU770Device() { +} + +void AMDGPU770Device::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mSWBits.set(AMDGPUDeviceInfo::FMA); + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + } + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); + mHWBits.reset(AMDGPUDeviceInfo::LongOps); + mSWBits.set(AMDGPUDeviceInfo::LongOps); + mSWBits.set(AMDGPUDeviceInfo::LocalMem); +} + +size_t AMDGPU770Device::getWavefrontSize() const { + return AMDGPUDevice::WavefrontSize; +} + +AMDGPU710Device::AMDGPU710Device(AMDGPUSubtarget *ST) : AMDGPU7XXDevice(ST) { +} + +AMDGPU710Device::~AMDGPU710Device() { +} + +size_t AMDGPU710Device::getWavefrontSize() const { + return AMDGPUDevice::QuarterWavefrontSize; +} diff --git a/lib/Target/R600/AMDIL7XXDevice.h b/lib/Target/R600/AMDIL7XXDevice.h new file mode 100644 index 0000000000..1cf4ca415a --- /dev/null +++ b/lib/Target/R600/AMDIL7XXDevice.h @@ -0,0 +1,72 @@ +//==-- AMDIL7XXDevice.h - Define the 7XX Device for AMDIL ---*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +/// \file +/// \brief Interface for the subtarget data classes. +/// +/// This file will define the interface that each generation needs to +/// implement in order to correctly answer queries on the capabilities of the +/// specific hardware.
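The three getWavefrontSize() overrides above differ only in which AMDGPUDevice constant they return. Summarized as a standalone helper; the widths 64/32/16 are assumed values for the Full/Half/QuarterWavefrontSize constants, which are not visible in this hunk:

#include <cstddef>
#include <string>

static std::size_t wavefrontSizeFor(const std::string &Dev) {
  if (Dev == "rv770") return 64; // AMDGPU770Device: full wavefront
  if (Dev == "rv710") return 16; // AMDGPU710Device: quarter wavefront
  return 32;                     // generic 7XX default: half wavefront
}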
+//===----------------------------------------------------------------------===// +#ifndef AMDIL7XXDEVICEIMPL_H +#define AMDIL7XXDEVICEIMPL_H +#include "AMDILDevice.h" + +namespace llvm { +class AMDGPUSubtarget; + +//===----------------------------------------------------------------------===// +// 7XX generation of devices and their respective sub classes +//===----------------------------------------------------------------------===// + +/// \brief The AMDGPU7XXDevice class represents the generic 7XX device. +/// +/// All 7XX devices are derived from this class. The AMDGPU7XX device will only +/// support the minimal features that are required to be considered OpenCL 1.0 +/// compliant and nothing more. +class AMDGPU7XXDevice : public AMDGPUDevice { +public: + AMDGPU7XXDevice(AMDGPUSubtarget *ST); + virtual ~AMDGPU7XXDevice(); + virtual size_t getMaxLDSSize() const; + virtual size_t getWavefrontSize() const; + virtual uint32_t getGeneration() const; + virtual uint32_t getResourceID(uint32_t DeviceID) const; + virtual uint32_t getMaxNumUAVs() const; + +protected: + virtual void setCaps(); +}; + +/// \brief The AMDGPU770Device class represents the RV770 chip and its +/// derivative cards. +/// +/// The difference between this device and the base class is that this device +/// adds support for double precision and has a larger wavefront size. +class AMDGPU770Device : public AMDGPU7XXDevice { +public: + AMDGPU770Device(AMDGPUSubtarget *ST); + virtual ~AMDGPU770Device(); + virtual size_t getWavefrontSize() const; +private: + virtual void setCaps(); +}; + +/// \brief The AMDGPU710Device class derives from the 7XX base class. +/// +/// This class is a smaller derivative, so we need to overload some of the +/// functions in order to correctly specify this information. +class AMDGPU710Device : public AMDGPU7XXDevice { +public: + AMDGPU710Device(AMDGPUSubtarget *ST); + virtual ~AMDGPU710Device(); + virtual size_t getWavefrontSize() const; +}; + +} // namespace llvm +#endif // AMDIL7XXDEVICEIMPL_H diff --git a/lib/Target/R600/AMDILBase.td b/lib/Target/R600/AMDILBase.td new file mode 100644 index 0000000000..c12cedcf7f --- /dev/null +++ b/lib/Target/R600/AMDILBase.td @@ -0,0 +1,85 @@ +//===- AMDIL.td - AMDIL Target Machine -------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// Target-independent interfaces which we are implementing +//===----------------------------------------------------------------------===// + +include "llvm/Target/Target.td" + +// Dummy Instruction itineraries for pseudo instructions +def ALU_NULL : FuncUnit; +def NullALU : InstrItinClass; + +//===----------------------------------------------------------------------===// +// AMDIL Subtarget features.
+//===----------------------------------------------------------------------===// +def FeatureFP64 : SubtargetFeature<"fp64", + "CapsOverride[AMDGPUDeviceInfo::DoubleOps]", + "true", + "Enable 64bit double precision operations">; +def FeatureByteAddress : SubtargetFeature<"byte_addressable_store", + "CapsOverride[AMDGPUDeviceInfo::ByteStores]", + "true", + "Enable byte addressable stores">; +def FeatureBarrierDetect : SubtargetFeature<"barrier_detect", + "CapsOverride[AMDGPUDeviceInfo::BarrierDetect]", + "true", + "Enable duplicate barrier detection (HD5XXX or later).">; +def FeatureImages : SubtargetFeature<"images", + "CapsOverride[AMDGPUDeviceInfo::Images]", + "true", + "Enable image functions">; +def FeatureMultiUAV : SubtargetFeature<"multi_uav", + "CapsOverride[AMDGPUDeviceInfo::MultiUAV]", + "true", + "Generate multiple UAV code (HD5XXX family or later)">; +def FeatureMacroDB : SubtargetFeature<"macrodb", + "CapsOverride[AMDGPUDeviceInfo::MacroDB]", + "true", + "Use internal macrodb, instead of macrodb in driver">; +def FeatureNoAlias : SubtargetFeature<"noalias", + "CapsOverride[AMDGPUDeviceInfo::NoAlias]", + "true", + "assert that all kernel argument pointers are not aliased">; +def FeatureNoInline : SubtargetFeature<"no-inline", + "CapsOverride[AMDGPUDeviceInfo::NoInline]", + "true", + "specify whether to not inline functions">; + +def Feature64BitPtr : SubtargetFeature<"64BitPtr", + "Is64bit", + "false", + "Specify if 64bit addressing should be used.">; + +def Feature32on64BitPtr : SubtargetFeature<"64on32BitPtr", + "Is32on64bit", + "false", + "Specify if 64bit sized pointers with 32bit addressing should be used.">; +def FeatureDebug : SubtargetFeature<"debug", + "CapsOverride[AMDGPUDeviceInfo::Debug]", + "true", + "Debug mode is enabled, so disable hardware accelerated address spaces.">; +def FeatureDumpCode : SubtargetFeature <"DumpCode", + "DumpCode", + "true", + "Dump MachineInstrs in the CodeEmitter">; + +def FeatureR600ALUInst : SubtargetFeature<"R600ALUInst", + "R600ALUInst", + "false", + "Older version of ALU instructions encoding.">; + + +//===----------------------------------------------------------------------===// +// Register File, Calling Conv, Instruction Descriptions +//===----------------------------------------------------------------------===// + + +include "AMDILRegisterInfo.td" +include "AMDILInstrInfo.td" + diff --git a/lib/Target/R600/AMDILCFGStructurizer.cpp b/lib/Target/R600/AMDILCFGStructurizer.cpp new file mode 100644 index 0000000000..1f276dc570 --- /dev/null +++ b/lib/Target/R600/AMDILCFGStructurizer.cpp @@ -0,0 +1,3049 @@ +//===-- AMDILCFGStructurizer.cpp - CFG Structurizer -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details.
+// +/// \file +//==-----------------------------------------------------------------------===// + +#define DEBUGME 0 +#define DEBUG_TYPE "structcfg" + +#include "AMDGPUInstrInfo.h" +#include "AMDIL.h" +#include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/DominatorInternals.h" +#include "llvm/Analysis/Dominators.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionAnalysis.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/Target/TargetInstrInfo.h" + +using namespace llvm; + +// TODO: move-begin. + +//===----------------------------------------------------------------------===// +// +// Statistics for CFGStructurizer. +// +//===----------------------------------------------------------------------===// + +STATISTIC(numSerialPatternMatch, "CFGStructurizer number of serial pattern " + "matched"); +STATISTIC(numIfPatternMatch, "CFGStructurizer number of if pattern " + "matched"); +STATISTIC(numLoopbreakPatternMatch, "CFGStructurizer number of loop-break " + "pattern matched"); +STATISTIC(numLoopcontPatternMatch, "CFGStructurizer number of loop-continue " + "pattern matched"); +STATISTIC(numLoopPatternMatch, "CFGStructurizer number of loop pattern " + "matched"); +STATISTIC(numClonedBlock, "CFGStructurizer cloned blocks"); +STATISTIC(numClonedInstr, "CFGStructurizer cloned instructions"); + +//===----------------------------------------------------------------------===// +// +// Miscellaneous utility for CFGStructurizer. +// +//===----------------------------------------------------------------------===// +namespace llvmCFGStruct { +#define SHOWNEWINSTR(i) \ + if (DEBUGME) errs() << "New instr: " << *i << "\n" + +#define SHOWNEWBLK(b, msg) \ +if (DEBUGME) { \ + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + errs() << "\n"; \ +} + +#define SHOWBLK_DETAIL(b, msg) \ +if (DEBUGME) { \ + if (b) { \ + errs() << msg << "BB" << b->getNumber() << "size " << b->size(); \ + b->print(errs()); \ + errs() << "\n"; \ + } \ +} + +#define INVALIDSCCNUM -1 +#define INVALIDREGNUM 0 + +template<class LoopinfoT> +void PrintLoopinfo(const LoopinfoT &LoopInfo, llvm::raw_ostream &OS) { + for (typename LoopinfoT::iterator iter = LoopInfo.begin(), + iterEnd = LoopInfo.end(); + iter != iterEnd; ++iter) { + (*iter)->print(OS, 0); + } +} + +template<class NodeT> +void ReverseVector(SmallVector<NodeT *, DEFAULT_VEC_SLOTS> &Src) { + size_t sz = Src.size(); + for (size_t i = 0; i < sz/2; ++i) { + NodeT *t = Src[i]; + Src[i] = Src[sz - i - 1]; + Src[sz - i - 1] = t; + } +} + +} //end namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// supporting data structure for CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +template<class PassT> +struct CFGStructTraits { +}; + +template <class InstrT> +class BlockInformation { +public: + bool isRetired; + int sccNum; + //SmallVector<InstrT*, DEFAULT_VEC_SLOTS> succInstr; + //Instructions defining the corresponding successor. 
+ BlockInformation() : isRetired(false), sccNum(INVALIDSCCNUM) {} +}; + +template <class BlockT, class InstrT, class RegiT> +class LandInformation { +public: + BlockT *landBlk; + std::set<RegiT> breakInitRegs; //Registers that need the "reg = 0" + //initialization before entering + //WHILELOOP(thisloop). + std::set<RegiT> contInitRegs; //Registers that need the "reg = 0" + //initialization after entering + //WHILELOOP(thisloop). + std::set<RegiT> endbranchInitRegs; //Registers initialized before entering + //this loop; the loop-land block + //branches on these registers. + std::set<RegiT> breakOnRegs; //Registers that need an "if (reg) break + //endif" after ENDLOOP(thisloop) to break + //out of outerLoopOf(thisLoop). + std::set<RegiT> contOnRegs; //Registers that need an "if (reg) continue + //endif" after ENDLOOP(thisloop) to continue + //with outerLoopOf(thisLoop). + LandInformation() : landBlk(NULL) {} +}; + +} //end of namespace llvmCFGStruct + +//===----------------------------------------------------------------------===// +// +// CFGStructurizer +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +// bixia TODO: port it to BasicBlock, not just MachineBasicBlock. +template<class PassT> +class CFGStructurizer { +public: + typedef enum { + Not_SinglePath = 0, + SinglePath_InPath = 1, + SinglePath_NotInPath = 2 + } PathToKind; + +public: + typedef typename PassT::InstructionType InstrT; + typedef typename PassT::FunctionType FuncT; + typedef typename PassT::DominatortreeType DomTreeT; + typedef typename PassT::PostDominatortreeType PostDomTreeT; + typedef typename PassT::DomTreeNodeType DomTreeNodeT; + typedef typename PassT::LoopinfoType LoopInfoT; + + typedef GraphTraits<FuncT *> FuncGTraits; + //typedef FuncGTraits::nodes_iterator BlockIterator; + typedef typename FuncT::iterator BlockIterator; + + typedef typename FuncGTraits::NodeType BlockT; + typedef GraphTraits<BlockT *> BlockGTraits; + typedef GraphTraits<Inverse<BlockT *> > InvBlockGTraits; + //typedef BlockGTraits::succ_iterator InstructionIterator; + typedef typename BlockT::iterator InstrIterator; + + typedef CFGStructTraits<PassT> CFGTraits; + typedef BlockInformation<InstrT> BlockInfo; + typedef std::map<BlockT *, BlockInfo *> BlockInfoMap; + + typedef int RegiT; + typedef typename PassT::LoopType LoopT; + typedef LandInformation<BlockT, InstrT, RegiT> LoopLandInfo; + typedef std::map<LoopT *, LoopLandInfo *> LoopLandInfoMap; + //landing info for loop break + typedef SmallVector<BlockT *, 32> BlockTSmallerVector; + +public: + CFGStructurizer(); + ~CFGStructurizer(); + + /// Perform the CFG structurization + bool run(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); + + /// Perform the CFG preparation + bool prepare(FuncT &Func, PassT &Pass, const AMDGPURegisterInfo *tri); + +private: + void reversePredicateSetter(typename BlockT::iterator); + void orderBlocks(); + void printOrderedBlocks(llvm::raw_ostream &OS); + int patternMatch(BlockT *CurBlock); + int patternMatchGroup(BlockT *CurBlock); + + int serialPatternMatch(BlockT *CurBlock); + int ifPatternMatch(BlockT *CurBlock); + int switchPatternMatch(BlockT *CurBlock); + int loopendPatternMatch(BlockT *CurBlock); + int loopPatternMatch(BlockT *CurBlock); + + int loopbreakPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + int loopcontPatternMatch(LoopT *LoopRep, BlockT *LoopHeader); + //int loopWithoutBreak(BlockT *); + + void handleLoopbreak (BlockT *ExitingBlock, LoopT *ExitingLoop, + BlockT *ExitBlock, LoopT *exitLoop, BlockT
*landBlock); + void handleLoopcontBlock(BlockT *ContingBlock, LoopT *contingLoop, + BlockT *ContBlock, LoopT *contLoop); + bool isSameloopDetachedContbreak(BlockT *Src1Block, BlockT *Src2Block); + int handleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int handleJumpintoIfImp(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock); + int improveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT **LandBlockPtr); + void showImproveSimpleJumpintoIf(BlockT *HeadBlock, BlockT *TrueBlock, + BlockT *FalseBlock, BlockT *LandBlock, + bool Detail = false); + PathToKind singlePathTo(BlockT *SrcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + BlockT *singlePathEnd(BlockT *srcBlock, BlockT *DstBlock, + bool AllowSideEntry = true); + int cloneOnSideEntryTo(BlockT *PreBlock, BlockT *SrcBlock, BlockT *DstBlock); + void mergeSerialBlock(BlockT *DstBlock, BlockT *srcBlock); + + void mergeIfthenelseBlock(InstrT *BranchInstr, BlockT *CurBlock, + BlockT *TrueBlock, BlockT *FalseBlock, + BlockT *LandBlock); + void mergeLooplandBlock(BlockT *DstBlock, LoopLandInfo *LoopLand); + void mergeLoopbreakBlock(BlockT *ExitingBlock, BlockT *ExitBlock, + BlockT *ExitLandBlock, RegiT SetReg); + void settleLoopcontBlock(BlockT *ContingBlock, BlockT *ContBlock, + RegiT SetReg); + BlockT *relocateLoopcontBlock(LoopT *ParentLoopRep, LoopT *LoopRep, + std::set<BlockT*> &ExitBlockSet, + BlockT *ExitLandBlk); + BlockT *addLoopEndbranchBlock(LoopT *LoopRep, + BlockTSmallerVector &ExitingBlocks, + BlockTSmallerVector &ExitBlocks); + BlockT *normalizeInfiniteLoopExit(LoopT *LoopRep); + void removeUnconditionalBranch(BlockT *SrcBlock); + void removeRedundantConditionalBranch(BlockT *SrcBlock); + void addDummyExitBlock(SmallVector<BlockT *, DEFAULT_VEC_SLOTS> &RetBlocks); + + void removeSuccessor(BlockT *SrcBlock); + BlockT *cloneBlockForPredecessor(BlockT *CurBlock, BlockT *PredBlock); + BlockT *exitingBlock2ExitBlock (LoopT *LoopRep, BlockT *exitingBlock); + + void migrateInstruction(BlockT *SrcBlock, BlockT *DstBlock, + InstrIterator InsertPos); + + void recordSccnum(BlockT *SrcBlock, int SCCNum); + int getSCCNum(BlockT *srcBlk); + + void retireBlock(BlockT *DstBlock, BlockT *SrcBlock); + bool isRetiredBlock(BlockT *SrcBlock); + bool isActiveLoophead(BlockT *CurBlock); + bool needMigrateBlock(BlockT *Block); + + BlockT *recordLoopLandBlock(LoopT *LoopRep, BlockT *LandBlock, + BlockTSmallerVector &exitBlocks, + std::set<BlockT*> &ExitBlockSet); + void setLoopLandBlock(LoopT *LoopRep, BlockT *Block = NULL); + BlockT *getLoopLandBlock(LoopT *LoopRep); + LoopLandInfo *getLoopLandInfo(LoopT *LoopRep); + + void addLoopBreakOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContOnReg(LoopT *LoopRep, RegiT RegNum); + void addLoopBreakInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopContInitReg(LoopT *LoopRep, RegiT RegNum); + void addLoopEndbranchInitReg(LoopT *LoopRep, RegiT RegNum); + + bool hasBackEdge(BlockT *curBlock); + unsigned getLoopDepth (LoopT *LoopRep); + int countActiveBlock( + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterStart, + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator IterEnd); + BlockT *findNearestCommonPostDom(std::set<BlockT *>&); + BlockT *findNearestCommonPostDom(BlockT *Block1, BlockT *Block2); + +private: + DomTreeT *domTree; + PostDomTreeT *postDomTree; + LoopInfoT *loopInfo; + PassT *passRep; + FuncT *funcRep; + + BlockInfoMap blockInfoMap; + LoopLandInfoMap loopLandInfoMap; + 
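Of the pattern matchers declared above, the serial pattern is the simplest: a block with exactly one successor whose only predecessor is that block can be folded into it. A sketch of that invariant against the MachineBasicBlock API (the helper name is hypothetical, not from the patch):

#include "llvm/CodeGen/MachineBasicBlock.h"

static bool isSerialPair(const llvm::MachineBasicBlock *A) {
  if (A->succ_size() != 1)
    return false;
  const llvm::MachineBasicBlock *B = *A->succ_begin();
  return B->pred_size() == 1; // B follows A and nothing else reaches it
}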
SmallVector<BlockT *, DEFAULT_VEC_SLOTS> orderedBlks; + const AMDGPURegisterInfo *TRI; + +}; //template class CFGStructurizer + +template<class PassT> CFGStructurizer<PassT>::CFGStructurizer() + : domTree(NULL), postDomTree(NULL), loopInfo(NULL) { +} + +template<class PassT> CFGStructurizer<PassT>::~CFGStructurizer() { + for (typename BlockInfoMap::iterator I = blockInfoMap.begin(), + E = blockInfoMap.end(); I != E; ++I) { + delete I->second; + } +} + +template<class PassT> +bool CFGStructurizer<PassT>::prepare(FuncT &func, PassT &pass, + const AMDGPURegisterInfo * tri) { + passRep = &pass; + funcRep = &func; + TRI = tri; + + bool changed = false; + + //FIXME: if not reducible flow graph, make it so ??? + + if (DEBUGME) { + errs() << "AMDGPUCFGStructurizer::prepare\n"; + } + + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> retBlks; + + for (typename LoopInfoT::iterator iter = loopInfo->begin(), + iterEnd = loopInfo->end(); + iter != iterEnd; ++iter) { + LoopT* loopRep = (*iter); + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (exitingBlks.size() == 0) { + BlockT* dummyExitBlk = normalizeInfiniteLoopExit(loopRep); + if (dummyExitBlk != NULL) + retBlks.push_back(dummyExitBlk); + } + } + + // Remove unconditional branch instr. + // Add dummy exit block iff there are multiple returns. + + for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(), iterEndBlk = orderedBlks.end(); + iterBlk != iterEndBlk; + ++iterBlk) { + BlockT *curBlk = *iterBlk; + removeUnconditionalBranch(curBlk); + removeRedundantConditionalBranch(curBlk); + if (CFGTraits::isReturnBlock(curBlk)) { + retBlks.push_back(curBlk); + } + assert(curBlk->succ_size() <= 2); + } //for + + if (retBlks.size() >= 2) { + addDummyExitBlock(retBlks); + changed = true; + } + + return changed; +} //CFGStructurizer::prepare + +template<class PassT> +bool CFGStructurizer<PassT>::run(FuncT &func, PassT &pass, + const AMDGPURegisterInfo * tri) { + passRep = &pass; + funcRep = &func; + TRI = tri; + + //Assume reducible CFG... + if (DEBUGME) { + errs() << "AMDGPUCFGStructurizer::run\n"; + func.viewCFG(); + } + + domTree = CFGTraits::getDominatorTree(pass); + if (DEBUGME) { + domTree->print(errs(), (const llvm::Module*)0); + } + + postDomTree = CFGTraits::getPostDominatorTree(pass); + if (DEBUGME) { + postDomTree->print(errs()); + } + + loopInfo = CFGTraits::getLoopInfo(pass); + if (DEBUGME) { + errs() << "LoopInfo:\n"; + PrintLoopinfo(*loopInfo, errs()); + } + + orderBlocks(); +#ifdef STRESSTEST + //Use the worse block ordering to test the algorithm. 
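+  // (orderBlocks() filled orderedBlks in the preferred SCC visit order;
+  // reversing it exercises the pattern matcher on a roughly worst-case
+  // ordering.)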
+ ReverseVector(orderedBlks); +#endif + + if (DEBUGME) { + errs() << "Ordered blocks:\n"; + printOrderedBlocks(errs()); + } + int numIter = 0; + bool finish = false; + BlockT *curBlk; + bool makeProgress = false; + int numRemainedBlk = countActiveBlock(orderedBlks.begin(), + orderedBlks.end()); + + do { + ++numIter; + if (DEBUGME) { + errs() << "numIter = " << numIter + << ", numRemaintedBlk = " << numRemainedBlk << "\n"; + } + + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlk = orderedBlks.begin(); + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + iterBlkEnd = orderedBlks.end(); + + typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator + sccBeginIter = iterBlk; + BlockT *sccBeginBlk = NULL; + int sccNumBlk = 0; // The number of active blocks, init to a + // maximum possible number. + int sccNumIter; // Number of iteration in this SCC. + + while (iterBlk != iterBlkEnd) { + curBlk = *iterBlk; + + if (sccBeginBlk == NULL) { + sccBeginIter = iterBlk; + sccBeginBlk = curBlk; + sccNumIter = 0; + sccNumBlk = numRemainedBlk; // Init to maximum possible number. + if (DEBUGME) { + errs() << "start processing SCC" << getSCCNum(sccBeginBlk); + errs() << "\n"; + } + } + + if (!isRetiredBlock(curBlk)) { + patternMatch(curBlk); + } + + ++iterBlk; + + bool contNextScc = true; + if (iterBlk == iterBlkEnd + || getSCCNum(sccBeginBlk) != getSCCNum(*iterBlk)) { + // Just finish one scc. + ++sccNumIter; + int sccRemainedNumBlk = countActiveBlock(sccBeginIter, iterBlk); + if (sccRemainedNumBlk != 1 && sccRemainedNumBlk >= sccNumBlk) { + if (DEBUGME) { + errs() << "Can't reduce SCC " << getSCCNum(curBlk) + << ", sccNumIter = " << sccNumIter; + errs() << "doesn't make any progress\n"; + } + contNextScc = true; + } else if (sccRemainedNumBlk != 1 && sccRemainedNumBlk < sccNumBlk) { + sccNumBlk = sccRemainedNumBlk; + iterBlk = sccBeginIter; + contNextScc = false; + if (DEBUGME) { + errs() << "repeat processing SCC" << getSCCNum(curBlk) + << "sccNumIter = " << sccNumIter << "\n"; + func.viewCFG(); + } + } else { + // Finish the current scc. + contNextScc = true; + } + } else { + // Continue on next component in the current scc. + contNextScc = false; + } + + if (contNextScc) { + sccBeginBlk = NULL; + } + } //while, "one iteration" over the function. + + BlockT *entryBlk = FuncGTraits::nodes_begin(&func); + if (entryBlk->succ_size() == 0) { + finish = true; + if (DEBUGME) { + errs() << "Reduce to one block\n"; + } + } else { + int newnumRemainedBlk + = countActiveBlock(orderedBlks.begin(), orderedBlks.end()); + // consider cloned blocks ?? + if (newnumRemainedBlk == 1 || newnumRemainedBlk < numRemainedBlk) { + makeProgress = true; + numRemainedBlk = newnumRemainedBlk; + } else { + makeProgress = false; + if (DEBUGME) { + errs() << "No progress\n"; + } + } + } + } while (!finish && makeProgress); + + // Misc wrap up to maintain the consistency of the Function representation. + CFGTraits::wrapup(FuncGTraits::nodes_begin(&func)); + + // Detach retired Block, release memory. + for (typename BlockInfoMap::iterator iterMap = blockInfoMap.begin(), + iterEndMap = blockInfoMap.end(); iterMap != iterEndMap; ++iterMap) { + if ((*iterMap).second && (*iterMap).second->isRetired) { + assert(((*iterMap).first)->getNumber() != -1); + if (DEBUGME) { + errs() << "Erase BB" << ((*iterMap).first)->getNumber() << "\n"; + } + (*iterMap).first->eraseFromParent(); //Remove from the parent Function. 
+    }
+    delete (*iterMap).second;
+  }
+  blockInfoMap.clear();
+
+  // clear loopLandInfoMap
+  for (typename LoopLandInfoMap::iterator iterMap = loopLandInfoMap.begin(),
+       iterEndMap = loopLandInfoMap.end(); iterMap != iterEndMap; ++iterMap) {
+    delete (*iterMap).second;
+  }
+  loopLandInfoMap.clear();
+
+  if (DEBUGME) {
+    func.viewCFG();
+  }
+
+  if (!finish) {
+    assert(!"IRREDUCIBLE_CF");
+  }
+
+  return true;
+} //CFGStructurizer::run
+
+/// Print the ordered Blocks.
+///
+template<class PassT>
+void CFGStructurizer<PassT>::printOrderedBlocks(llvm::raw_ostream &os) {
+  size_t i = 0;
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::const_iterator
+       iterBlk = orderedBlks.begin(), iterBlkEnd = orderedBlks.end();
+       iterBlk != iterBlkEnd;
+       ++iterBlk, ++i) {
+    os << "BB" << (*iterBlk)->getNumber();
+    os << "(" << getSCCNum(*iterBlk) << "," << (*iterBlk)->size() << ")";
+    if (i != 0 && i % 10 == 0) {
+      os << "\n";
+    } else {
+      os << " ";
+    }
+  }
+} //printOrderedBlocks
+
+/// Compute the reversed DFS post order of Blocks
+///
+template<class PassT> void CFGStructurizer<PassT>::orderBlocks() {
+  int sccNum = 0;
+  BlockT *bb;
+  for (scc_iterator<FuncT *> sccIter = scc_begin(funcRep),
+       sccEnd = scc_end(funcRep); sccIter != sccEnd; ++sccIter, ++sccNum) {
+    std::vector<BlockT *> &sccNext = *sccIter;
+    for (typename std::vector<BlockT *>::const_iterator
+         blockIter = sccNext.begin(), blockEnd = sccNext.end();
+         blockIter != blockEnd; ++blockIter) {
+      bb = *blockIter;
+      orderedBlks.push_back(bb);
+      recordSccnum(bb, sccNum);
+    }
+  }
+
+  //walk through all the blocks in func to check for unreachable ones
+  for (BlockIterator blockIter1 = FuncGTraits::nodes_begin(funcRep),
+       blockEnd1 = FuncGTraits::nodes_end(funcRep);
+       blockIter1 != blockEnd1; ++blockIter1) {
+    BlockT *bb = &(*blockIter1);
+    sccNum = getSCCNum(bb);
+    if (sccNum == INVALIDSCCNUM) {
+      errs() << "unreachable block BB" << bb->getNumber() << "\n";
+    }
+  }
+} //orderBlocks
+
+template<class PassT> int CFGStructurizer<PassT>::patternMatch(BlockT *curBlk) {
+  int numMatch = 0;
+  int curMatch;
+
+  if (DEBUGME) {
+    errs() << "Begin patternMatch BB" << curBlk->getNumber() << "\n";
+  }
+
+  while ((curMatch = patternMatchGroup(curBlk)) > 0) {
+    numMatch += curMatch;
+  }
+
+  if (DEBUGME) {
+    errs() << "End patternMatch BB" << curBlk->getNumber()
+           << ", numMatch = " << numMatch << "\n";
+  }
+
+  return numMatch;
+} //patternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::patternMatchGroup(BlockT *curBlk) {
+  int numMatch = 0;
+  numMatch += serialPatternMatch(curBlk);
+  numMatch += ifPatternMatch(curBlk);
+  numMatch += loopendPatternMatch(curBlk);
+  numMatch += loopPatternMatch(curBlk);
+  return numMatch;
+}//patternMatchGroup
+
+template<class PassT>
+int CFGStructurizer<PassT>::serialPatternMatch(BlockT *curBlk) {
+  if (curBlk->succ_size() != 1) {
+    return 0;
+  }
+
+  BlockT *childBlk = *curBlk->succ_begin();
+  if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk)) {
+    return 0;
+  }
+
+  mergeSerialBlock(curBlk, childBlk);
+  ++numSerialPatternMatch;
+  return 1;
+} //serialPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::ifPatternMatch(BlockT *curBlk) {
+  //two edges
+  if (curBlk->succ_size() != 2) {
+    return 0;
+  }
+
+  if (hasBackEdge(curBlk)) {
+    return 0;
+  }
+
+  InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(curBlk);
+  if (branchInstr == NULL) {
+    return 0;
+  }
+
+  assert(CFGTraits::isCondBranch(branchInstr));
+
+  BlockT *trueBlk = CFGTraits::getTrueBranch(branchInstr);
+  BlockT *falseBlk = 
CFGTraits::getFalseBranch(curBlk, branchInstr);
+  BlockT *landBlk;
+  int cloned = 0;
+
+  // TODO: Simplify
+  if (trueBlk->succ_size() == 1 && falseBlk->succ_size() == 1
+      && *trueBlk->succ_begin() == *falseBlk->succ_begin()) {
+    landBlk = *trueBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 0 && falseBlk->succ_size() == 0) {
+    landBlk = NULL;
+  } else if (trueBlk->succ_size() == 1 && *trueBlk->succ_begin() == falseBlk) {
+    landBlk = falseBlk;
+    falseBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && *falseBlk->succ_begin() == trueBlk) {
+    landBlk = trueBlk;
+    trueBlk = NULL;
+  } else if (falseBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(trueBlk, falseBlk)) {
+    landBlk = *falseBlk->succ_begin();
+  } else if (trueBlk->succ_size() == 1
+             && isSameloopDetachedContbreak(falseBlk, trueBlk)) {
+    landBlk = *trueBlk->succ_begin();
+  } else {
+    return handleJumpintoIf(curBlk, trueBlk, falseBlk);
+  }
+
+  // improveSimpleJumpintoIf can handle the case where landBlk == NULL, but the
+  // new BB created for landBlk == NULL may introduce new challenges to the
+  // reduction process.
+  if (landBlk != NULL &&
+      ((trueBlk && trueBlk->pred_size() > 1)
+       || (falseBlk && falseBlk->pred_size() > 1))) {
+    cloned += improveSimpleJumpintoIf(curBlk, trueBlk, falseBlk, &landBlk);
+  }
+
+  if (trueBlk && trueBlk->pred_size() > 1) {
+    trueBlk = cloneBlockForPredecessor(trueBlk, curBlk);
+    ++cloned;
+  }
+
+  if (falseBlk && falseBlk->pred_size() > 1) {
+    falseBlk = cloneBlockForPredecessor(falseBlk, curBlk);
+    ++cloned;
+  }
+
+  mergeIfthenelseBlock(branchInstr, curBlk, trueBlk, falseBlk, landBlk);
+
+  ++numIfPatternMatch;
+
+  numClonedBlock += cloned;
+
+  return 1 + cloned;
+} //ifPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::switchPatternMatch(BlockT *curBlk) {
+  return 0;
+} //switchPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopendPatternMatch(BlockT *curBlk) {
+  LoopT *loopRep = loopInfo->getLoopFor(curBlk);
+  typename std::vector<LoopT *> nestedLoops;
+  while (loopRep) {
+    nestedLoops.push_back(loopRep);
+    loopRep = loopRep->getParentLoop();
+  }
+
+  if (nestedLoops.size() == 0) {
+    return 0;
+  }
+
+  // Process nested loops outside->inside, so a "continue" to an outside loop
+  // won't be mistaken for a "break" of the current loop.
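+  // For example, with nesting L0 { L1 { bb } }, an edge from bb back to
+  // header(L0) is a continue of L0; visiting L0 before L1 records it as such
+  // instead of treating it as a break out of L1.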
+ int num = 0; + for (typename std::vector<LoopT *>::reverse_iterator + iter = nestedLoops.rbegin(), iterEnd = nestedLoops.rend(); + iter != iterEnd; ++iter) { + loopRep = *iter; + + if (getLoopLandBlock(loopRep) != NULL) { + continue; + } + + BlockT *loopHeader = loopRep->getHeader(); + + int numBreak = loopbreakPatternMatch(loopRep, loopHeader); + + if (numBreak == -1) { + break; + } + + int numCont = loopcontPatternMatch(loopRep, loopHeader); + num += numBreak + numCont; + } + + return num; +} //loopendPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopPatternMatch(BlockT *curBlk) { + if (curBlk->succ_size() != 0) { + return 0; + } + + int numLoop = 0; + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + if (loopLand) { + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + mergeLooplandBlock(curBlk, loopLand); + ++numLoop; + } + } + loopRep = loopRep->getParentLoop(); + } + + numLoopPatternMatch += numLoop; + + return numLoop; +} //loopPatternMatch + +template<class PassT> +int CFGStructurizer<PassT>::loopbreakPatternMatch(LoopT *loopRep, + BlockT *loopHeader) { + BlockTSmallerVector exitingBlks; + loopRep->getExitingBlocks(exitingBlks); + + if (DEBUGME) { + errs() << "Loop has " << exitingBlks.size() << " exiting blocks\n"; + } + + if (exitingBlks.size() == 0) { + setLoopLandBlock(loopRep); + return 0; + } + + // Compute the corresponding exitBlks and exit block set. + BlockTSmallerVector exitBlks; + std::set<BlockT *> exitBlkSet; + for (typename BlockTSmallerVector::const_iterator iter = exitingBlks.begin(), + iterEnd = exitingBlks.end(); iter != iterEnd; ++iter) { + BlockT *exitingBlk = *iter; + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); + exitBlks.push_back(exitBlk); + exitBlkSet.insert(exitBlk); //non-duplicate insert + } + + assert(exitBlkSet.size() > 0); + assert(exitBlks.size() == exitingBlks.size()); + + if (DEBUGME) { + errs() << "Loop has " << exitBlkSet.size() << " exit blocks\n"; + } + + // Find exitLandBlk. + BlockT *exitLandBlk = NULL; + int numCloned = 0; + int numSerial = 0; + + if (exitBlkSet.size() == 1) { + exitLandBlk = *exitBlkSet.begin(); + } else { + exitLandBlk = findNearestCommonPostDom(exitBlkSet); + + if (exitLandBlk == NULL) { + return -1; + } + + bool allInPath = true; + bool allNotInPath = true; + for (typename std::set<BlockT*>::const_iterator + iter = exitBlkSet.begin(), + iterEnd = exitBlkSet.end(); + iter != iterEnd; ++iter) { + BlockT *exitBlk = *iter; + + PathToKind pathKind = singlePathTo(exitBlk, exitLandBlk, true); + if (DEBUGME) { + errs() << "BB" << exitBlk->getNumber() + << " to BB" << exitLandBlk->getNumber() << " PathToKind=" + << pathKind << "\n"; + } + + allInPath = allInPath && (pathKind == SinglePath_InPath); + allNotInPath = allNotInPath && (pathKind == SinglePath_NotInPath); + + if (!allInPath && !allNotInPath) { + if (DEBUGME) { + errs() << "singlePath check fail\n"; + } + return -1; + } + } // check all exit blocks + + if (allNotInPath) { + + // TODO: Simplify, maybe separate function? 
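+      // Two fallback strategies are tried below: if the common post-dominator
+      // is the parent loop's header, the exits really continue the parent
+      // loop and the continue block is relocated; otherwise a new endbranch
+      // block is added that dispatches on a register to the individual exit
+      // paths.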
+ LoopT *parentLoopRep = loopRep->getParentLoop(); + BlockT *parentLoopHeader = NULL; + if (parentLoopRep) + parentLoopHeader = parentLoopRep->getHeader(); + + if (exitLandBlk == parentLoopHeader && + (exitLandBlk = relocateLoopcontBlock(parentLoopRep, + loopRep, + exitBlkSet, + exitLandBlk)) != NULL) { + if (DEBUGME) { + errs() << "relocateLoopcontBlock success\n"; + } + } else if ((exitLandBlk = addLoopEndbranchBlock(loopRep, + exitingBlks, + exitBlks)) != NULL) { + if (DEBUGME) { + errs() << "insertEndbranchBlock success\n"; + } + } else { + if (DEBUGME) { + errs() << "loop exit fail\n"; + } + return -1; + } + } + + // Handle side entry to exit path. + exitBlks.clear(); + exitBlkSet.clear(); + for (typename BlockTSmallerVector::iterator iterExiting = + exitingBlks.begin(), + iterExitingEnd = exitingBlks.end(); + iterExiting != iterExitingEnd; ++iterExiting) { + BlockT *exitingBlk = *iterExiting; + BlockT *exitBlk = exitingBlock2ExitBlock(loopRep, exitingBlk); + BlockT *newExitBlk = exitBlk; + + if (exitBlk != exitLandBlk && exitBlk->pred_size() > 1) { + newExitBlk = cloneBlockForPredecessor(exitBlk, exitingBlk); + ++numCloned; + } + + numCloned += cloneOnSideEntryTo(exitingBlk, newExitBlk, exitLandBlk); + + exitBlks.push_back(newExitBlk); + exitBlkSet.insert(newExitBlk); + } + + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), + iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit) { + BlockT *exitBlk = *iterExit; + numSerial += serialPatternMatch(exitBlk); + } + + for (typename BlockTSmallerVector::iterator iterExit = exitBlks.begin(), + iterExitEnd = exitBlks.end(); + iterExit != iterExitEnd; ++iterExit) { + BlockT *exitBlk = *iterExit; + if (exitBlk->pred_size() > 1) { + if (exitBlk != exitLandBlk) { + return -1; + } + } else { + if (exitBlk != exitLandBlk && + (exitBlk->succ_size() != 1 || + *exitBlk->succ_begin() != exitLandBlk)) { + return -1; + } + } + } + } // else + + exitLandBlk = recordLoopLandBlock(loopRep, exitLandBlk, exitBlks, exitBlkSet); + + // Fold break into the breaking block. Leverage across level breaks. 
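+  // (handleLoopbreak below inserts the break into each exiting block; for a
+  // break that leaves more than one loop level it also sets up the
+  // break-on-register bookkeeping in the intermediate loops.)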
+  assert(exitingBlks.size() == exitBlks.size());
+  for (typename BlockTSmallerVector::const_iterator iterExit = exitBlks.begin(),
+       iterExiting = exitingBlks.begin(), iterExitEnd = exitBlks.end();
+       iterExit != iterExitEnd; ++iterExit, ++iterExiting) {
+    BlockT *exitBlk = *iterExit;
+    BlockT *exitingBlk = *iterExiting;
+    assert(exitBlk->pred_size() == 1 || exitBlk == exitLandBlk);
+    LoopT *exitingLoop = loopInfo->getLoopFor(exitingBlk);
+    handleLoopbreak(exitingBlk, exitingLoop, exitBlk, loopRep, exitLandBlk);
+  }
+
+  int numBreak = static_cast<int>(exitingBlks.size());
+  numLoopbreakPatternMatch += numBreak;
+  numClonedBlock += numCloned;
+  return numBreak + numSerial + numCloned;
+} //loopbreakPatternMatch
+
+template<class PassT>
+int CFGStructurizer<PassT>::loopcontPatternMatch(LoopT *loopRep,
+                                                 BlockT *loopHeader) {
+  int numCont = 0;
+  SmallVector<BlockT *, DEFAULT_VEC_SLOTS> contBlk;
+  for (typename InvBlockGTraits::ChildIteratorType iter =
+       InvBlockGTraits::child_begin(loopHeader),
+       iterEnd = InvBlockGTraits::child_end(loopHeader);
+       iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    if (loopRep->contains(curBlk)) {
+      handleLoopcontBlock(curBlk, loopInfo->getLoopFor(curBlk),
+                          loopHeader, loopRep);
+      contBlk.push_back(curBlk);
+      ++numCont;
+    }
+  }
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator
+       iter = contBlk.begin(), iterEnd = contBlk.end();
+       iter != iterEnd; ++iter) {
+    (*iter)->removeSuccessor(loopHeader);
+  }
+
+  numLoopcontPatternMatch += numCont;
+
+  return numCont;
+} //loopcontPatternMatch
+
+
+template<class PassT>
+bool CFGStructurizer<PassT>::isSameloopDetachedContbreak(BlockT *src1Blk,
+                                                         BlockT *src2Blk) {
+  // Return true iff src1Blk->succ_size() == 0 and src1Blk and src2Blk are in
+  // the same loop that already has a LoopLandInfo entry. Without explicitly
+  // keeping track of loopContBlks and loopBreakBlks, this is a way to recover
+  // that information.
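+  // (Typically src1Blk is a detached continue/break block: its outgoing edge
+  // was removed when the pattern was matched, and the loop's LoopLandInfo
+  // entry was created at that point.)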
+ // + if (src1Blk->succ_size() == 0) { + LoopT *loopRep = loopInfo->getLoopFor(src1Blk); + if (loopRep != NULL && loopRep == loopInfo->getLoopFor(src2Blk)) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + if (theEntry != NULL) { + if (DEBUGME) { + errs() << "isLoopContBreakBlock yes src1 = BB" + << src1Blk->getNumber() + << " src2 = BB" << src2Blk->getNumber() << "\n"; + } + return true; + } + } + } + return false; +} //isSameloopDetachedContbreak + +template<class PassT> +int CFGStructurizer<PassT>::handleJumpintoIf(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = handleJumpintoIfImp(headBlk, trueBlk, falseBlk); + if (num == 0) { + if (DEBUGME) { + errs() << "handleJumpintoIf swap trueBlk and FalseBlk" << "\n"; + } + num = handleJumpintoIfImp(headBlk, falseBlk, trueBlk); + } + return num; +} + +template<class PassT> +int CFGStructurizer<PassT>::handleJumpintoIfImp(BlockT *headBlk, + BlockT *trueBlk, + BlockT *falseBlk) { + int num = 0; + BlockT *downBlk; + + //trueBlk could be the common post dominator + downBlk = trueBlk; + + if (DEBUGME) { + errs() << "handleJumpintoIfImp head = BB" << headBlk->getNumber() + << " true = BB" << trueBlk->getNumber() + << ", numSucc=" << trueBlk->succ_size() + << " false = BB" << falseBlk->getNumber() << "\n"; + } + + while (downBlk) { + if (DEBUGME) { + errs() << "check down = BB" << downBlk->getNumber(); + } + + if (singlePathTo(falseBlk, downBlk) == SinglePath_InPath) { + if (DEBUGME) { + errs() << " working\n"; + } + + num += cloneOnSideEntryTo(headBlk, trueBlk, downBlk); + num += cloneOnSideEntryTo(headBlk, falseBlk, downBlk); + + numClonedBlock += num; + num += serialPatternMatch(*headBlk->succ_begin()); + num += serialPatternMatch(*(++headBlk->succ_begin())); + num += ifPatternMatch(headBlk); + assert(num > 0); + + break; + } + if (DEBUGME) { + errs() << " not working\n"; + } + downBlk = (downBlk->succ_size() == 1) ? 
(*downBlk->succ_begin()) : NULL;
+  } // walk down the postDomTree
+
+  return num;
+} //handleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::showImproveSimpleJumpintoIf(BlockT *headBlk,
+                                                         BlockT *trueBlk,
+                                                         BlockT *falseBlk,
+                                                         BlockT *landBlk,
+                                                         bool detail) {
+  errs() << "head = BB" << headBlk->getNumber()
+         << " size = " << headBlk->size();
+  if (detail) {
+    errs() << "\n";
+    headBlk->print(errs());
+    errs() << "\n";
+  }
+
+  if (trueBlk) {
+    errs() << ", true = BB" << trueBlk->getNumber() << " size = "
+           << trueBlk->size() << " numPred = " << trueBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      trueBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (falseBlk) {
+    errs() << ", false = BB" << falseBlk->getNumber() << " size = "
+           << falseBlk->size() << " numPred = " << falseBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      falseBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+  if (landBlk) {
+    errs() << ", land = BB" << landBlk->getNumber() << " size = "
+           << landBlk->size() << " numPred = " << landBlk->pred_size();
+    if (detail) {
+      errs() << "\n";
+      landBlk->print(errs());
+      errs() << "\n";
+    }
+  }
+
+  errs() << "\n";
+} //showImproveSimpleJumpintoIf
+
+template<class PassT>
+int CFGStructurizer<PassT>::improveSimpleJumpintoIf(BlockT *headBlk,
+                                                    BlockT *trueBlk,
+                                                    BlockT *falseBlk,
+                                                    BlockT **plandBlk) {
+  bool migrateTrue = false;
+  bool migrateFalse = false;
+
+  BlockT *landBlk = *plandBlk;
+
+  assert((trueBlk == NULL || trueBlk->succ_size() <= 1)
+         && (falseBlk == NULL || falseBlk->succ_size() <= 1));
+
+  if (trueBlk == falseBlk) {
+    return 0;
+  }
+
+  migrateTrue = needMigrateBlock(trueBlk);
+  migrateFalse = needMigrateBlock(falseBlk);
+
+  if (!migrateTrue && !migrateFalse) {
+    return 0;
+  }
+
+  // If we need to migrate either trueBlk or falseBlk, also migrate the other
+  // one if it has more than one predecessor. Without doing this, a
+  // predecessor other than headBlk would leave an undefined value in initReg.
+  if (!migrateTrue && trueBlk && trueBlk->pred_size() > 1) {
+    migrateTrue = true;
+  }
+  if (!migrateFalse && falseBlk && falseBlk->pred_size() > 1) {
+    migrateFalse = true;
+  }
+
+  if (DEBUGME) {
+    errs() << "before improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+
+  // org: headBlk => if () {trueBlk} else {falseBlk} => landBlk
+  //
+  // new: headBlk => if () {initReg = 1; org trueBlk branch} else
+  //      {initReg = 0; org falseBlk branch }
+  //      => landBlk => if (initReg) {org trueBlk} else {org falseBlk}
+  //      => org landBlk
+  //      if landBlk->pred_size() > 2, put the above if-else inside
+  //      if (initReg != 2) {...}
+  //
+  // add initReg = initVal to headBlk
+
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+  unsigned initReg =
+    funcRep->getRegInfo().createVirtualRegister(I32RC);
+  if (!migrateTrue || !migrateFalse) {
+    int initVal = migrateTrue ? 
0 : 1;
+    CFGTraits::insertAssignInstrBefore(headBlk, passRep, initReg, initVal);
+  }
+
+  int numNewBlk = 0;
+
+  if (landBlk == NULL) {
+    landBlk = funcRep->CreateMachineBasicBlock();
+    funcRep->push_back(landBlk);  //insert to function
+
+    if (trueBlk) {
+      trueBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    if (falseBlk) {
+      falseBlk->addSuccessor(landBlk);
+    } else {
+      headBlk->addSuccessor(landBlk);
+    }
+
+    numNewBlk ++;
+  }
+
+  bool landBlkHasOtherPred = (landBlk->pred_size() > 2);
+
+  //insert AMDGPU::ENDIF to avoid special case "input landBlk == NULL"
+  typename BlockT::iterator insertPos =
+    CFGTraits::getInstrPos
+    (landBlk, CFGTraits::insertInstrBefore(landBlk, AMDGPU::ENDIF, passRep));
+
+  if (landBlkHasOtherPred) {
+    unsigned immReg =
+      funcRep->getRegInfo().createVirtualRegister(I32RC);
+    CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 2);
+    unsigned cmpResReg =
+      funcRep->getRegInfo().createVirtualRegister(I32RC);
+
+    CFGTraits::insertCompareInstrBefore(landBlk, insertPos, passRep, cmpResReg,
+                                        initReg, immReg);
+    CFGTraits::insertCondBranchBefore(landBlk, insertPos,
+                                      AMDGPU::IF_PREDICATE_SET, passRep,
+                                      cmpResReg, DebugLoc());
+  }
+
+  CFGTraits::insertCondBranchBefore(landBlk, insertPos, AMDGPU::IF_PREDICATE_SET,
+                                    passRep, initReg, DebugLoc());
+
+  if (migrateTrue) {
+    migrateInstruction(trueBlk, landBlk, insertPos);
+    // need to unconditionally insert the assignment to ensure a path from a
+    // predecessor other than headBlk has a valid value in initReg if
+    // (initVal != 1).
+    CFGTraits::insertAssignInstrBefore(trueBlk, passRep, initReg, 1);
+  }
+  CFGTraits::insertInstrBefore(insertPos, AMDGPU::ELSE, passRep);
+
+  if (migrateFalse) {
+    migrateInstruction(falseBlk, landBlk, insertPos);
+    // need to unconditionally insert the assignment to ensure a path from a
+    // predecessor other than headBlk has a valid value in initReg if
+    // (initVal != 0)
+    CFGTraits::insertAssignInstrBefore(falseBlk, passRep, initReg, 0);
+  }
+
+  if (landBlkHasOtherPred) {
+    // add endif
+    CFGTraits::insertInstrBefore(insertPos, AMDGPU::ENDIF, passRep);
+
+    // put initReg = 2 to other predecessors of landBlk
+    for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(),
+         predIterEnd = landBlk->pred_end(); predIter != predIterEnd;
+         ++predIter) {
+      BlockT *curBlk = *predIter;
+      if (curBlk != trueBlk && curBlk != falseBlk) {
+        CFGTraits::insertAssignInstrBefore(curBlk, passRep, initReg, 2);
+      }
+    } //for
+  }
+  if (DEBUGME) {
+    errs() << "result from improveSimpleJumpintoIf: ";
+    showImproveSimpleJumpintoIf(headBlk, trueBlk, falseBlk, landBlk, 0);
+  }
+
+  // update landBlk
+  *plandBlk = landBlk;
+
+  return numNewBlk;
+} //improveSimpleJumpintoIf
+
+template<class PassT>
+void CFGStructurizer<PassT>::handleLoopbreak(BlockT *exitingBlk,
+                                             LoopT *exitingLoop,
+                                             BlockT *exitBlk,
+                                             LoopT *exitLoop,
+                                             BlockT *landBlk) {
+  if (DEBUGME) {
+    errs() << "Trying to break loop-depth = " << getLoopDepth(exitLoop)
+           << " from loop-depth = " << getLoopDepth(exitingLoop) << "\n";
+  }
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  RegiT initReg = INVALIDREGNUM;
+  if (exitingLoop != exitLoop) {
+    initReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    assert(initReg != INVALIDREGNUM);
+    addLoopBreakInitReg(exitLoop, initReg);
+    while (exitingLoop != exitLoop && exitingLoop) {
+      addLoopBreakOnReg(exitingLoop, initReg);
+      exitingLoop = exitingLoop->getParentLoop();
+    }
+    assert(exitingLoop == 
exitLoop); + } + + mergeLoopbreakBlock(exitingBlk, exitBlk, landBlk, initReg); + +} //handleLoopbreak + +template<class PassT> +void CFGStructurizer<PassT>::handleLoopcontBlock(BlockT *contingBlk, + LoopT *contingLoop, + BlockT *contBlk, + LoopT *contLoop) { + if (DEBUGME) { + errs() << "loopcontPattern cont = BB" << contingBlk->getNumber() + << " header = BB" << contBlk->getNumber() << "\n"; + + errs() << "Trying to continue loop-depth = " + << getLoopDepth(contLoop) + << " from loop-depth = " << getLoopDepth(contingLoop) << "\n"; + } + + RegiT initReg = INVALIDREGNUM; + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (contingLoop != contLoop) { + initReg = static_cast<int> + (funcRep->getRegInfo().createVirtualRegister(I32RC)); + assert(initReg != INVALIDREGNUM); + addLoopContInitReg(contLoop, initReg); + while (contingLoop && contingLoop->getParentLoop() != contLoop) { + addLoopBreakOnReg(contingLoop, initReg); //not addLoopContOnReg + contingLoop = contingLoop->getParentLoop(); + } + assert(contingLoop && contingLoop->getParentLoop() == contLoop); + addLoopContOnReg(contingLoop, initReg); + } + + settleLoopcontBlock(contingBlk, contBlk, initReg); +} //handleLoopcontBlock + +template<class PassT> +void CFGStructurizer<PassT>::mergeSerialBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "serialPattern BB" << dstBlk->getNumber() + << " <= BB" << srcBlk->getNumber() << "\n"; + } + dstBlk->splice(dstBlk->end(), srcBlk, srcBlk->begin(), srcBlk->end()); + + dstBlk->removeSuccessor(srcBlk); + CFGTraits::cloneSuccessorList(dstBlk, srcBlk); + + removeSuccessor(srcBlk); + retireBlock(dstBlk, srcBlk); +} //mergeSerialBlock + +template<class PassT> +void CFGStructurizer<PassT>::mergeIfthenelseBlock(InstrT *branchInstr, + BlockT *curBlk, + BlockT *trueBlk, + BlockT *falseBlk, + BlockT *landBlk) { + if (DEBUGME) { + errs() << "ifPattern BB" << curBlk->getNumber(); + errs() << "{ "; + if (trueBlk) { + errs() << "BB" << trueBlk->getNumber(); + } + errs() << " } else "; + errs() << "{ "; + if (falseBlk) { + errs() << "BB" << falseBlk->getNumber(); + } + errs() << " }\n "; + errs() << "landBlock: "; + if (landBlk == NULL) { + errs() << "NULL"; + } else { + errs() << "BB" << landBlk->getNumber(); + } + errs() << "\n"; + } + + int oldOpcode = branchInstr->getOpcode(); + DebugLoc branchDL = branchInstr->getDebugLoc(); + +// transform to +// if cond +// trueBlk +// else +// falseBlk +// endif +// landBlk + + typename BlockT::iterator branchInstrPos = + CFGTraits::getInstrPos(curBlk, branchInstr); + CFGTraits::insertCondBranchBefore(branchInstrPos, + CFGTraits::getBranchNzeroOpcode(oldOpcode), + passRep, + branchDL); + + if (trueBlk) { + curBlk->splice(branchInstrPos, trueBlk, trueBlk->begin(), trueBlk->end()); + curBlk->removeSuccessor(trueBlk); + if (landBlk && trueBlk->succ_size()!=0) { + trueBlk->removeSuccessor(landBlk); + } + retireBlock(curBlk, trueBlk); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ELSE, passRep); + + if (falseBlk) { + curBlk->splice(branchInstrPos, falseBlk, falseBlk->begin(), + falseBlk->end()); + curBlk->removeSuccessor(falseBlk); + if (landBlk && falseBlk->succ_size() != 0) { + falseBlk->removeSuccessor(landBlk); + } + retireBlock(curBlk, falseBlk); + } + CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::ENDIF, passRep); + + branchInstr->eraseFromParent(); + + if (landBlk && trueBlk && falseBlk) { + curBlk->addSuccessor(landBlk); + } + +} //mergeIfthenelseBlock + +template<class PassT> +void 
CFGStructurizer<PassT>::mergeLooplandBlock(BlockT *dstBlk,
+                                           LoopLandInfo *loopLand) {
+  BlockT *landBlk = loopLand->landBlk;
+
+  if (DEBUGME) {
+    errs() << "loopPattern header = BB" << dstBlk->getNumber()
+           << " land = BB" << landBlk->getNumber() << "\n";
+  }
+
+  // Loop contInitRegs are initialized at the beginning of the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->contInitRegs.begin(),
+       iterEnd = loopLand->contInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the BREAK_LOGICALZ_i32 or
+   * AMDGPU::BREAK_LOGICALNZ statement in the current dstBlk.
+   * Search for the DebugLoc in that statement; if not found, insert the
+   * empty/default DebugLoc. */
+  InstrT *loopBreakInstr = CFGTraits::getLoopBreakInstr(dstBlk);
+  DebugLoc DLBreak = (loopBreakInstr) ? loopBreakInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrBefore(dstBlk, AMDGPU::WHILELOOP, passRep, DLBreak);
+  // Loop breakInitRegs are initialized before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakInitRegs.begin(),
+       iterEnd = loopLand->breakInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+  // Loop endbranchInitRegs are initialized before entering the loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->endbranchInitRegs.begin(),
+       iterEnd = loopLand->endbranchInitRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertAssignInstrBefore(dstBlk, passRep, *iter, 0);
+  }
+
+  /* We last inserted the DebugLoc in the continue statement in the current
+   * dstBlk. Search for the DebugLoc in the continue statement; if not found,
+   * insert the empty/default DebugLoc. */
+  InstrT *continueInstr = CFGTraits::getContinueInstr(dstBlk);
+  DebugLoc DLContinue = (continueInstr) ? continueInstr->getDebugLoc() : DebugLoc();
+
+  CFGTraits::insertInstrEnd(dstBlk, AMDGPU::ENDLOOP, passRep, DLContinue);
+  // Loop breakOnRegs are checked after the ENDLOOP: break the loop outside
+  // this loop.
+  for (typename std::set<RegiT>::const_iterator iter =
+         loopLand->breakOnRegs.begin(),
+       iterEnd = loopLand->breakOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::PREDICATED_BREAK, passRep,
+                                   *iter);
+  }
+
+  // Loop contOnRegs are checked after the ENDLOOP: continue the loop outside
+  // this loop.
+  for (std::set<RegiT>::const_iterator iter = loopLand->contOnRegs.begin(),
+       iterEnd = loopLand->contOnRegs.end(); iter != iterEnd; ++iter) {
+    CFGTraits::insertCondBranchEnd(dstBlk, AMDGPU::CONTINUE_LOGICALNZ_i32,
+                                   passRep, *iter);
+  }
+
+  dstBlk->splice(dstBlk->end(), landBlk, landBlk->begin(), landBlk->end());
+
+  for (typename BlockT::succ_iterator iter = landBlk->succ_begin(),
+       iterEnd = landBlk->succ_end(); iter != iterEnd; ++iter) {
+    dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of.
+  }
+
+  removeSuccessor(landBlk);
+  retireBlock(dstBlk, landBlk);
+} //mergeLooplandBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::reversePredicateSetter(typename BlockT::iterator I) {
+  while (I--) {
+    if (I->getOpcode() == AMDGPU::PRED_X) {
+      switch (static_cast<MachineInstr *>(I)->getOperand(2).getImm()) {
+      case OPCODE_IS_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO_INT);
+        return;
+      case OPCODE_IS_NOT_ZERO_INT:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO_INT);
+        return;
+      case OPCODE_IS_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_NOT_ZERO);
+        return;
+      case OPCODE_IS_NOT_ZERO:
+        static_cast<MachineInstr *>(I)->getOperand(2).setImm(OPCODE_IS_ZERO);
+        return;
+      default:
+        assert(0 && "PRED_X Opcode invalid!");
+      }
+    }
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::mergeLoopbreakBlock(BlockT *exitingBlk,
+                                                 BlockT *exitBlk,
+                                                 BlockT *exitLandBlk,
+                                                 RegiT setReg) {
+  if (DEBUGME) {
+    errs() << "loopbreakPattern exiting = BB" << exitingBlk->getNumber()
+           << " exit = BB" << exitBlk->getNumber()
+           << " land = BB" << exitLandBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(exitingBlk);
+  assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+
+  DebugLoc DL = branchInstr->getDebugLoc();
+
+  BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+
+  // transform exitingBlk to
+  //   if ( ) {
+  //      exitBlk (if exitBlk != exitLandBlk)
+  //      setReg = 1
+  //      break
+  //   }endif
+  //   successor = {orgSuccessor(exitingBlk) - exitBlk}
+
+  typename BlockT::iterator branchInstrPos =
+    CFGTraits::getInstrPos(exitingBlk, branchInstr);
+
+  if (exitBlk == exitLandBlk && setReg == INVALIDREGNUM) {
+    //break_logical
+
+    if (trueBranch != exitBlk) {
+      reversePredicateSetter(branchInstrPos);
+    }
+    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
+  } else {
+    if (trueBranch != exitBlk) {
+      reversePredicateSetter(branchInstr);
+    }
+    CFGTraits::insertCondBranchBefore(branchInstrPos, AMDGPU::PREDICATED_BREAK, passRep, DL);
+    if (exitBlk != exitLandBlk) {
+      //splice is insert-before ...
+      exitingBlk->splice(branchInstrPos, exitBlk, exitBlk->begin(),
+                         exitBlk->end());
+    }
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+    }
+    CFGTraits::insertInstrBefore(branchInstrPos, AMDGPU::BREAK, passRep);
+  } //if_logical
+
+  //now branchInstr can be erased safely
+  branchInstr->eraseFromParent();
+
+  //now take care of successors, retire blocks
+  exitingBlk->removeSuccessor(exitBlk);
+  if (exitBlk != exitLandBlk) {
+    //exitBlk's instructions were spliced into exitingBlk above; detach the
+    //emptied block so it can be retired.
+    exitBlk->removeSuccessor(exitLandBlk);
+    retireBlock(exitingBlk, exitBlk);
+  }
+
+} //mergeLoopbreakBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::settleLoopcontBlock(BlockT *contingBlk,
+                                                 BlockT *contBlk,
+                                                 RegiT setReg) {
+  if (DEBUGME) {
+    errs() << "settleLoopcontBlock conting = BB"
+           << contingBlk->getNumber()
+           << ", cont = BB" << contBlk->getNumber() << "\n";
+  }
+
+  InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(contingBlk);
+  if (branchInstr) {
+    assert(CFGTraits::isCondBranch(branchInstr));
+    typename BlockT::iterator branchInstrPos =
+      CFGTraits::getInstrPos(contingBlk, branchInstr);
+    BlockT *trueBranch = CFGTraits::getTrueBranch(branchInstr);
+    int oldOpcode = branchInstr->getOpcode();
+    DebugLoc DL = branchInstr->getDebugLoc();
+
+    // transform contingBlk to
+    //     if () {
+    //          move instr after branchInstr
+    //          continue
+    //        or
+    //          setReg = 1
+    //          break
+    //     }endif
+    //     successor = {orgSuccessor(contingBlk) - loopHeader}
+
+    bool useContinueLogical =
+      (setReg == INVALIDREGNUM && (&*contingBlk->rbegin()) == branchInstr);
+
+    if (useContinueLogical == false) {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getBranchNzeroOpcode(oldOpcode)
+                              : CFGTraits::getBranchZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+
+      if (setReg != INVALIDREGNUM) {
+        CFGTraits::insertAssignInstrBefore(branchInstrPos, passRep, setReg, 1);
+        // insertEnd to ensure phi-moves, if they exist, go before the
+        // continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, DL);
+      } else {
+        // insertEnd to ensure phi-moves, if they exist, go before the
+        // continue-instr.
+        CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, DL);
+      }
+
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::ENDIF, passRep, DL);
+    } else {
+      int branchOpcode =
+        trueBranch == contBlk ? CFGTraits::getContinueNzeroOpcode(oldOpcode)
+                              : CFGTraits::getContinueZeroOpcode(oldOpcode);
+
+      CFGTraits::insertCondBranchBefore(branchInstrPos, branchOpcode, passRep, DL);
+    }
+
+    branchInstr->eraseFromParent();
+  } else {
+    // If we've arrived here, the branch instruction has already been erased.
+    // Walk back up the basic block to find the last recorded debug location;
+    // it was inserted just before this point, so it should be representative.
+    if (setReg != INVALIDREGNUM) {
+      CFGTraits::insertAssignInstrBefore(contingBlk, passRep, setReg, 1);
+      // insertEnd to ensure phi-moves, if they exist, go before the
+      // continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::BREAK, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    } else {
+      // insertEnd to ensure phi-moves, if they exist, go before the
+      // continue-instr.
+      CFGTraits::insertInstrEnd(contingBlk, AMDGPU::CONTINUE, passRep, CFGTraits::getLastDebugLocInBB(contingBlk));
+    }
+  } //else
+
+} //settleLoopcontBlock
+
+// BBs in exitBlkSet have been determined to be on the break path for loopRep.
+// Before we can treat those BBs as part of loopRep's loop body, check whether
+// any of them were determined earlier to be continue BBs for parentLoopRep.
+// If so, generate a new BB newBlk
+//   (1) set newBlk common successor of BBs in exitBlkSet
+//   (2) change the continue-instr in BBs in exitBlkSet to break-instr
+//   (3) generate continue-instr in newBlk
+//
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::relocateLoopcontBlock(LoopT *parentLoopRep,
+                                              LoopT *loopRep,
+                                              std::set<BlockT *> &exitBlkSet,
+                                              BlockT *exitLandBlk) {
+  std::set<BlockT *> endBlkSet;
+
+  for (typename std::set<BlockT *>::const_iterator iter = exitBlkSet.begin(),
+       iterEnd = exitBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *exitBlk = *iter;
+    BlockT *endBlk = singlePathEnd(exitBlk, exitLandBlk);
+
+    if (endBlk == NULL || CFGTraits::getContinueInstr(endBlk) == NULL)
+      return NULL;
+
+    endBlkSet.insert(endBlk);
+  }
+
+  BlockT *newBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(newBlk);  //insert to function
+  CFGTraits::insertInstrEnd(newBlk, AMDGPU::CONTINUE, passRep);
+  SHOWNEWBLK(newBlk, "New continue block: ");
+
+  for (typename std::set<BlockT*>::const_iterator iter = endBlkSet.begin(),
+       iterEnd = endBlkSet.end();
+       iter != iterEnd; ++iter) {
+    BlockT *endBlk = *iter;
+    InstrT *contInstr = CFGTraits::getContinueInstr(endBlk);
+    if (contInstr) {
+      contInstr->eraseFromParent();
+    }
+    endBlk->addSuccessor(newBlk);
+    if (DEBUGME) {
+      errs() << "Add new continue Block to BB"
+             << endBlk->getNumber() << " successors\n";
+    }
+  }
+
+  return newBlk;
+} //relocateLoopcontBlock
+
+
+// LoopEndbranchBlock is a BB created by the CFGStructurizer to use as
+// LoopLandBlock. This BB branches on the loop endBranchInit register to the
+// paths corresponding to the loop exiting branches.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::addLoopEndbranchBlock(LoopT *loopRep,
+                                              BlockTSmallerVector &exitingBlks,
+                                              BlockTSmallerVector &exitBlks) {
+  const AMDGPUInstrInfo *tii =
+    static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+  const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32);
+
+  RegiT endBranchReg = static_cast<int>
+    (funcRep->getRegInfo().createVirtualRegister(I32RC));
+  assert(endBranchReg >= 0);
+
+  // reg = 0 before entering the loop
+  addLoopEndbranchInitReg(loopRep, endBranchReg);
+
+  uint32_t numBlks = static_cast<uint32_t>(exitingBlks.size());
+  assert(numBlks >= 2 && numBlks == exitBlks.size());
+
+  BlockT *preExitingBlk = exitingBlks[0];
+  BlockT *preExitBlk = exitBlks[0];
+  BlockT *preBranchBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(preBranchBlk);  //insert to function
+  SHOWNEWBLK(preBranchBlk, "New loopEndbranch block: ");
+
+  BlockT *newLandBlk = preBranchBlk;
+
+  CFGTraits::replaceInstrUseOfBlockWith(preExitingBlk, preExitBlk,
+                                        newLandBlk);
+  preExitingBlk->removeSuccessor(preExitBlk);
+  preExitingBlk->addSuccessor(newLandBlk);
+
+  //it is redundant to add reg = 0 to exitingBlks[0]
+
+  // For the 1..n-th exiting path (the last iteration handles two paths),
+  // create the branch to the previous path and the current path.
+  for (uint32_t i = 1; i < numBlks; ++i) {
+    BlockT *curExitingBlk = exitingBlks[i];
+    BlockT *curExitBlk = exitBlks[i];
+    BlockT *curBranchBlk;
+
+    if (i == numBlks - 1) {
+      curBranchBlk = curExitBlk;
+    } else {
+      curBranchBlk = funcRep->CreateMachineBasicBlock();
+      funcRep->push_back(curBranchBlk);  //insert to function
+      SHOWNEWBLK(curBranchBlk, "New loopEndbranch block: ");
+    }
+
+    // Add reg = i to exitingBlks[i].
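+    // (The branch-block chain built below compares endBranchReg against i-1
+    // after the loop, selecting which exit path to continue on.)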
+    CFGTraits::insertAssignInstrBefore(curExitingBlk, passRep,
+                                       endBranchReg, i);
+
+    // Remove the edge (exitingBlks[i], exitBlks[i]); add the new edge
+    // (exitingBlks[i], newLandBlk).
+    CFGTraits::replaceInstrUseOfBlockWith(curExitingBlk, curExitBlk,
+                                          newLandBlk);
+    curExitingBlk->removeSuccessor(curExitBlk);
+    curExitingBlk->addSuccessor(newLandBlk);
+
+    // add to preBranchBlk the branch instruction:
+    // if (endBranchReg == preVal)
+    //    preExitBlk
+    // else
+    //    curBranchBlk
+    //
+    // preValReg = i - 1
+
+    DebugLoc DL;
+    RegiT preValReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+
+    preBranchBlk->insert(preBranchBlk->begin(),
+                         tii->getMovImmInstr(preBranchBlk->getParent(), preValReg,
+                                             i - 1));
+
+    // condResReg = (endBranchReg == preValReg)
+    RegiT condResReg = static_cast<int>
+      (funcRep->getRegInfo().createVirtualRegister(I32RC));
+    BuildMI(preBranchBlk, DL, tii->get(tii->getIEQOpcode()), condResReg)
+      .addReg(endBranchReg).addReg(preValReg);
+
+    BuildMI(preBranchBlk, DL, tii->get(AMDGPU::BRANCH_COND_i32))
+      .addMBB(preExitBlk).addReg(condResReg);
+
+    preBranchBlk->addSuccessor(preExitBlk);
+    preBranchBlk->addSuccessor(curBranchBlk);
+
+    // Update preExitingBlk, preExitBlk, preBranchBlk.
+    preExitingBlk = curExitingBlk;
+    preExitBlk = curExitBlk;
+    preBranchBlk = curBranchBlk;
+
+  } //end for 1 .. n blocks
+
+  return newLandBlk;
+} //addLoopEndbranchBlock
+
+template<class PassT>
+typename CFGStructurizer<PassT>::PathToKind
+CFGStructurizer<PassT>::singlePathTo(BlockT *srcBlk, BlockT *dstBlk,
+                                     bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return SinglePath_InPath;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == dstBlk) {
+      return SinglePath_InPath;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return Not_SinglePath;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size() == 0) {
+    return SinglePath_NotInPath;
+  }
+
+  return Not_SinglePath;
+} //singlePathTo
+
+// If there is a single path from srcBlk to dstBlk, return the last block
+// before dstBlk.  If there is a single path from srcBlk to the function end
+// without passing dstBlk, return the last block in that path.  Otherwise,
+// return NULL.
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::singlePathEnd(BlockT *srcBlk, BlockT *dstBlk,
+                                      bool allowSideEntry) {
+  assert(dstBlk);
+
+  if (srcBlk == dstBlk) {
+    return srcBlk;
+  }
+
+  if (srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  while (srcBlk && srcBlk->succ_size() == 1) {
+    BlockT *preBlk = srcBlk;
+
+    srcBlk = *srcBlk->succ_begin();
+    if (srcBlk == NULL) {
+      return preBlk;
+    }
+
+    if (!allowSideEntry && srcBlk->pred_size() > 1) {
+      return NULL;
+    }
+  }
+
+  if (srcBlk && srcBlk->succ_size() == 0) {
+    return srcBlk;
+  }
+
+  return NULL;
+
+} //singlePathEnd
+
+template<class PassT>
+int CFGStructurizer<PassT>::cloneOnSideEntryTo(BlockT *preBlk, BlockT *srcBlk,
+                                               BlockT *dstBlk) {
+  int cloned = 0;
+  assert(preBlk->isSuccessor(srcBlk));
+  while (srcBlk && srcBlk != dstBlk) {
+    assert(srcBlk->succ_size() == 1);
+    if (srcBlk->pred_size() > 1) {
+      srcBlk = cloneBlockForPredecessor(srcBlk, preBlk);
+      ++cloned;
+    }
+
+    preBlk = srcBlk;
+    srcBlk = *srcBlk->succ_begin();
+  }
+
+  return cloned;
+} //cloneOnSideEntryTo
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::cloneBlockForPredecessor(BlockT *curBlk,
+                                                 BlockT *predBlk) {
+  assert(predBlk->isSuccessor(curBlk) &&
+         "predBlk is not a predecessor of curBlk");
+
+  
BlockT *cloneBlk = CFGTraits::clone(curBlk); //clone instructions + CFGTraits::replaceInstrUseOfBlockWith(predBlk, curBlk, cloneBlk); + //srcBlk, oldBlk, newBlk + + predBlk->removeSuccessor(curBlk); + predBlk->addSuccessor(cloneBlk); + + // add all successor to cloneBlk + CFGTraits::cloneSuccessorList(cloneBlk, curBlk); + + numClonedInstr += curBlk->size(); + + if (DEBUGME) { + errs() << "Cloned block: " << "BB" + << curBlk->getNumber() << "size " << curBlk->size() << "\n"; + } + + SHOWNEWBLK(cloneBlk, "result of Cloned block: "); + + return cloneBlk; +} //cloneBlockForPredecessor + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::exitingBlock2ExitBlock(LoopT *loopRep, + BlockT *exitingBlk) { + BlockT *exitBlk = NULL; + + for (typename BlockT::succ_iterator iterSucc = exitingBlk->succ_begin(), + iterSuccEnd = exitingBlk->succ_end(); + iterSucc != iterSuccEnd; ++iterSucc) { + BlockT *curBlk = *iterSucc; + if (!loopRep->contains(curBlk)) { + assert(exitBlk == NULL); + exitBlk = curBlk; + } + } + + assert(exitBlk != NULL); + + return exitBlk; +} //exitingBlock2ExitBlock + +template<class PassT> +void CFGStructurizer<PassT>::migrateInstruction(BlockT *srcBlk, + BlockT *dstBlk, + InstrIterator insertPos) { + InstrIterator spliceEnd; + //look for the input branchinstr, not the AMDGPU branchinstr + InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk); + if (branchInstr == NULL) { + if (DEBUGME) { + errs() << "migrateInstruction don't see branch instr\n" ; + } + spliceEnd = srcBlk->end(); + } else { + if (DEBUGME) { + errs() << "migrateInstruction see branch instr\n" ; + branchInstr->dump(); + } + spliceEnd = CFGTraits::getInstrPos(srcBlk, branchInstr); + } + if (DEBUGME) { + errs() << "migrateInstruction before splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } + + //splice insert before insertPos + dstBlk->splice(insertPos, srcBlk, srcBlk->begin(), spliceEnd); + + if (DEBUGME) { + errs() << "migrateInstruction after splice dstSize = " << dstBlk->size() + << "srcSize = " << srcBlk->size() << "\n"; + } +} //migrateInstruction + +// normalizeInfiniteLoopExit change +// B1: +// uncond_br LoopHeader +// +// to +// B1: +// cond_br 1 LoopHeader dummyExit +// and return the newly added dummy exit block +// +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::normalizeInfiniteLoopExit(LoopT* LoopRep) { + BlockT *loopHeader; + BlockT *loopLatch; + loopHeader = LoopRep->getHeader(); + loopLatch = LoopRep->getLoopLatch(); + BlockT *dummyExitBlk = NULL; + const TargetRegisterClass * I32RC = TRI->getCFGStructurizerRegClass(MVT::i32); + if (loopHeader!=NULL && loopLatch!=NULL) { + InstrT *branchInstr = CFGTraits::getLoopendBlockBranchInstr(loopLatch); + if (branchInstr!=NULL && CFGTraits::isUncondBranch(branchInstr)) { + dummyExitBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(dummyExitBlk); //insert to function + SHOWNEWBLK(dummyExitBlk, "DummyExitBlock to normalize infiniteLoop: "); + + if (DEBUGME) errs() << "Old branch instr: " << *branchInstr << "\n"; + + typename BlockT::iterator insertPos = + CFGTraits::getInstrPos(loopLatch, branchInstr); + unsigned immReg = + funcRep->getRegInfo().createVirtualRegister(I32RC); + CFGTraits::insertAssignInstrBefore(insertPos, passRep, immReg, 1); + InstrT *newInstr = + CFGTraits::insertInstrBefore(insertPos, AMDGPU::BRANCH_COND_i32, passRep); + MachineInstrBuilder(newInstr).addMBB(loopHeader).addReg(immReg, false); + + 
SHOWNEWINSTR(newInstr);
+
+      branchInstr->eraseFromParent();
+      loopLatch->addSuccessor(dummyExitBlk);
+    }
+  }
+
+  return dummyExitBlk;
+} //normalizeInfiniteLoopExit
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeUnconditionalBranch(BlockT *srcBlk) {
+  InstrT *branchInstr;
+
+  // I have seen two unconditional branches in one basic block in the example
+  // test_fc_do_while_or.c; the upstream needs to be fixed so this loop can be
+  // removed.
+  while ((branchInstr = CFGTraits::getLoopendBlockBranchInstr(srcBlk))
+         && CFGTraits::isUncondBranch(branchInstr)) {
+    if (DEBUGME) {
+      errs() << "Removing unconditional branch instruction";
+      branchInstr->dump();
+    }
+    branchInstr->eraseFromParent();
+  }
+} //removeUnconditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeRedundantConditionalBranch(BlockT *srcBlk) {
+  if (srcBlk->succ_size() == 2) {
+    BlockT *blk1 = *srcBlk->succ_begin();
+    BlockT *blk2 = *(++srcBlk->succ_begin());
+
+    if (blk1 == blk2) {
+      InstrT *branchInstr = CFGTraits::getNormalBlockBranchInstr(srcBlk);
+      assert(branchInstr && CFGTraits::isCondBranch(branchInstr));
+      if (DEBUGME) {
+        errs() << "Removing unneeded conditional branch instruction";
+        branchInstr->dump();
+      }
+      branchInstr->eraseFromParent();
+      SHOWNEWBLK(blk1, "Removing redundant successor");
+      srcBlk->removeSuccessor(blk1);
+    }
+  }
+} //removeRedundantConditionalBranch
+
+template<class PassT>
+void CFGStructurizer<PassT>::addDummyExitBlock(SmallVector<BlockT*,
+                                               DEFAULT_VEC_SLOTS> &retBlks) {
+  BlockT *dummyExitBlk = funcRep->CreateMachineBasicBlock();
+  funcRep->push_back(dummyExitBlk);  //insert to function
+  CFGTraits::insertInstrEnd(dummyExitBlk, AMDGPU::RETURN, passRep);
+
+  for (typename SmallVector<BlockT *, DEFAULT_VEC_SLOTS>::iterator iter =
+         retBlks.begin(),
+       iterEnd = retBlks.end(); iter != iterEnd; ++iter) {
+    BlockT *curBlk = *iter;
+    InstrT *curInstr = CFGTraits::getReturnInstr(curBlk);
+    if (curInstr) {
+      curInstr->eraseFromParent();
+    }
+    curBlk->addSuccessor(dummyExitBlk);
+    if (DEBUGME) {
+      errs() << "Add dummyExitBlock to BB" << curBlk->getNumber()
+             << " successors\n";
+    }
+  } //for
+
+  SHOWNEWBLK(dummyExitBlk, "DummyExitBlock: ");
+} //addDummyExitBlock
+
+template<class PassT>
+void CFGStructurizer<PassT>::removeSuccessor(BlockT *srcBlk) {
+  while (srcBlk->succ_size()) {
+    srcBlk->removeSuccessor(*srcBlk->succ_begin());
+  }
+}
+
+template<class PassT>
+void CFGStructurizer<PassT>::recordSccnum(BlockT *srcBlk, int sccNum) {
+  BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk];
+
+  if (srcBlkInfo == NULL) {
+    srcBlkInfo = new BlockInfo();
+  }
+
+  srcBlkInfo->sccNum = sccNum;
+}
+
+template<class PassT>
+int CFGStructurizer<PassT>::getSCCNum(BlockT *srcBlk) {
+  BlockInfo *srcBlkInfo = blockInfoMap[srcBlk];
+  return srcBlkInfo ? 
srcBlkInfo->sccNum : INVALIDSCCNUM; +} + +template<class PassT> +void CFGStructurizer<PassT>::retireBlock(BlockT *dstBlk, BlockT *srcBlk) { + if (DEBUGME) { + errs() << "Retiring BB" << srcBlk->getNumber() << "\n"; + } + + BlockInfo *&srcBlkInfo = blockInfoMap[srcBlk]; + + if (srcBlkInfo == NULL) { + srcBlkInfo = new BlockInfo(); + } + + srcBlkInfo->isRetired = true; + assert(srcBlk->succ_size() == 0 && srcBlk->pred_size() == 0 + && "can't retire block yet"); +} + +template<class PassT> +bool CFGStructurizer<PassT>::isRetiredBlock(BlockT *srcBlk) { + BlockInfo *srcBlkInfo = blockInfoMap[srcBlk]; + return (srcBlkInfo && srcBlkInfo->isRetired); +} + +template<class PassT> +bool CFGStructurizer<PassT>::isActiveLoophead(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + while (loopRep && loopRep->getHeader() == curBlk) { + LoopLandInfo *loopLand = getLoopLandInfo(loopRep); + + if(loopLand == NULL) + return true; + + BlockT *landBlk = loopLand->landBlk; + assert(landBlk); + if (!isRetiredBlock(landBlk)) { + return true; + } + + loopRep = loopRep->getParentLoop(); + } + + return false; +} //isActiveLoophead + +template<class PassT> +bool CFGStructurizer<PassT>::needMigrateBlock(BlockT *blk) { + const unsigned blockSizeThreshold = 30; + const unsigned cloneInstrThreshold = 100; + + bool multiplePreds = blk && (blk->pred_size() > 1); + + if(!multiplePreds) + return false; + + unsigned blkSize = blk->size(); + return ((blkSize > blockSizeThreshold) + && (blkSize * (blk->pred_size() - 1) > cloneInstrThreshold)); +} //needMigrateBlock + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::recordLoopLandBlock(LoopT *loopRep, BlockT *landBlk, + BlockTSmallerVector &exitBlks, + std::set<BlockT *> &exitBlkSet) { + SmallVector<BlockT *, DEFAULT_VEC_SLOTS> inpathBlks; //in exit path blocks + + for (typename BlockT::pred_iterator predIter = landBlk->pred_begin(), + predIterEnd = landBlk->pred_end(); + predIter != predIterEnd; ++predIter) { + BlockT *curBlk = *predIter; + if (loopRep->contains(curBlk) || exitBlkSet.count(curBlk)) { + inpathBlks.push_back(curBlk); + } + } //for + + //if landBlk has predecessors that are not in the given loop, + //create a new block + BlockT *newLandBlk = landBlk; + if (inpathBlks.size() != landBlk->pred_size()) { + newLandBlk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(newLandBlk); //insert to function + newLandBlk->addSuccessor(landBlk); + for (typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::iterator iter = + inpathBlks.begin(), + iterEnd = inpathBlks.end(); iter != iterEnd; ++iter) { + BlockT *curBlk = *iter; + CFGTraits::replaceInstrUseOfBlockWith(curBlk, landBlk, newLandBlk); + //srcBlk, oldBlk, newBlk + curBlk->removeSuccessor(landBlk); + curBlk->addSuccessor(newLandBlk); + } + for (size_t i = 0, tot = exitBlks.size(); i < tot; ++i) { + if (exitBlks[i] == landBlk) { + exitBlks[i] = newLandBlk; + } + } + SHOWNEWBLK(newLandBlk, "NewLandingBlock: "); + } + + setLoopLandBlock(loopRep, newLandBlk); + + return newLandBlk; +} // recordLoopbreakLand + +template<class PassT> +void CFGStructurizer<PassT>::setLoopLandBlock(LoopT *loopRep, BlockT *blk) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + assert(theEntry->landBlk == NULL); + + if (blk == NULL) { + blk = funcRep->CreateMachineBasicBlock(); + funcRep->push_back(blk); //insert to function + SHOWNEWBLK(blk, "DummyLandingBlock for loop without break: "); + } + + theEntry->landBlk = 
blk; + + if (DEBUGME) { + errs() << "setLoopLandBlock loop-header = BB" + << loopRep->getHeader()->getNumber() + << " landing-block = BB" << blk->getNumber() << "\n"; + } +} // setLoopLandBlock + +template<class PassT> +void CFGStructurizer<PassT>::addLoopBreakOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + + theEntry->breakOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakOnReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopContOnReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contOnRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContOnReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContOnReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopBreakInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->breakInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopBreakInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopBreakInitReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopContInitReg(LoopT *loopRep, RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->contInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopContInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopContInitReg + +template<class PassT> +void CFGStructurizer<PassT>::addLoopEndbranchInitReg(LoopT *loopRep, + RegiT regNum) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + if (theEntry == NULL) { + theEntry = new LoopLandInfo(); + } + theEntry->endbranchInitRegs.insert(regNum); + + if (DEBUGME) { + errs() << "addLoopEndbranchInitReg loop-header = BB" + << loopRep->getHeader()->getNumber() + << " regNum = " << regNum << "\n"; + } +} // addLoopEndbranchInitReg + +template<class PassT> +typename CFGStructurizer<PassT>::LoopLandInfo * +CFGStructurizer<PassT>::getLoopLandInfo(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry; +} // getLoopLandInfo + +template<class PassT> +typename CFGStructurizer<PassT>::BlockT * +CFGStructurizer<PassT>::getLoopLandBlock(LoopT *loopRep) { + LoopLandInfo *&theEntry = loopLandInfoMap[loopRep]; + + return theEntry ? theEntry->landBlk : NULL; +} // getLoopLandBlock + + +template<class PassT> +bool CFGStructurizer<PassT>::hasBackEdge(BlockT *curBlk) { + LoopT *loopRep = loopInfo->getLoopFor(curBlk); + if (loopRep == NULL) + return false; + + BlockT *loopHeader = loopRep->getHeader(); + + return curBlk->isSuccessor(loopHeader); + +} //hasBackEdge + +template<class PassT> +unsigned CFGStructurizer<PassT>::getLoopDepth(LoopT *loopRep) { + return loopRep ? 
loopRep->getLoopDepth() : 0;
+} //getLoopDepth
+
+template<class PassT>
+int CFGStructurizer<PassT>::countActiveBlock
+(typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterStart,
+ typename SmallVector<BlockT*, DEFAULT_VEC_SLOTS>::const_iterator iterEnd) {
+  int count = 0;
+  while (iterStart != iterEnd) {
+    if (!isRetiredBlock(*iterStart)) {
+      ++count;
+    }
+    ++iterStart;
+  }
+
+  return count;
+} //countActiveBlock
+
+// This is a workaround: findNearestCommonDominator is not available for
+// post-dominator trees; a proper fix should go into Dominators.h.
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT*
+CFGStructurizer<PassT>::findNearestCommonPostDom(BlockT *blk1, BlockT *blk2) {
+
+  if (postDomTree->dominates(blk1, blk2)) {
+    return blk1;
+  }
+  if (postDomTree->dominates(blk2, blk1)) {
+    return blk2;
+  }
+
+  DomTreeNodeT *node1 = postDomTree->getNode(blk1);
+  DomTreeNodeT *node2 = postDomTree->getNode(blk2);
+
+  // Handle newly cloned node.
+  if (node1 == NULL && blk1->succ_size() == 1) {
+    return findNearestCommonPostDom(*blk1->succ_begin(), blk2);
+  }
+  if (node2 == NULL && blk2->succ_size() == 1) {
+    return findNearestCommonPostDom(blk1, *blk2->succ_begin());
+  }
+
+  if (node1 == NULL || node2 == NULL) {
+    return NULL;
+  }
+
+  node1 = node1->getIDom();
+  while (node1) {
+    if (postDomTree->dominates(node1, node2)) {
+      return node1->getBlock();
+    }
+    node1 = node1->getIDom();
+  }
+
+  return NULL;
+}
+
+template<class PassT>
+typename CFGStructurizer<PassT>::BlockT *
+CFGStructurizer<PassT>::findNearestCommonPostDom
+(typename std::set<BlockT *> &blks) {
+  BlockT *commonDom;
+  typename std::set<BlockT *>::const_iterator iter = blks.begin();
+  typename std::set<BlockT *>::const_iterator iterEnd = blks.end();
+  for (commonDom = *iter; iter != iterEnd && commonDom != NULL; ++iter) {
+    BlockT *curBlk = *iter;
+    if (curBlk != commonDom) {
+      commonDom = findNearestCommonPostDom(curBlk, commonDom);
+    }
+  }
+
+  if (DEBUGME) {
+    errs() << "Common post dominator for exit blocks is ";
+    if (commonDom) {
+      errs() << "BB" << commonDom->getNumber() << "\n";
+    } else {
+      errs() << "NULL\n";
+    }
+  }
+
+  return commonDom;
+} //findNearestCommonPostDom
+
+} //end namespace llvm
+
+//todo: move-end
+
+
+//===----------------------------------------------------------------------===//
+//
+// CFGStructurizer for AMDGPU
+//
+//===----------------------------------------------------------------------===//
+
+
+using namespace llvmCFGStruct;
+
+namespace llvm {
+class AMDGPUCFGStructurizer : public MachineFunctionPass {
+public:
+  typedef MachineInstr InstructionType;
+  typedef MachineFunction FunctionType;
+  typedef MachineBasicBlock BlockType;
+  typedef MachineLoopInfo LoopinfoType;
+  typedef MachineDominatorTree DominatortreeType;
+  typedef MachinePostDominatorTree PostDominatortreeType;
+  typedef MachineDomTreeNode DomTreeNodeType;
+  typedef MachineLoop LoopType;
+
+protected:
+  TargetMachine &TM;
+  const TargetInstrInfo *TII;
+  const AMDGPURegisterInfo *TRI;
+
+public:
+  AMDGPUCFGStructurizer(char &pid, TargetMachine &tm);
+  const TargetInstrInfo *getTargetInstrInfo() const;
+
+private:
+
+};
+
+} //end of namespace llvm
+AMDGPUCFGStructurizer::AMDGPUCFGStructurizer(char &pid, TargetMachine &tm)
+: MachineFunctionPass(pid), TM(tm), TII(tm.getInstrInfo()),
+  TRI(static_cast<const AMDGPURegisterInfo *>(tm.getRegisterInfo())) {
+}
+
+const TargetInstrInfo *AMDGPUCFGStructurizer::getTargetInstrInfo() const {
+  return TII;
+}
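+
+// Usage sketch (illustrative only; the actual pass schedule is decided by the
+// target's pass setup, which is outside this patch): the two concrete passes
+// declared below are meant to run in this order, with preparation first, e.g.
+//
+//   PM.add(createAMDGPUCFGPreparationPass(TM));   // normalize the CFG
+//   ...                                           // other machine passes
+//   PM.add(createAMDGPUCFGStructurizerPass(TM));  // structurize branches
+//
+// Both factory functions are defined near the end of this file.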
+//===----------------------------------------------------------------------===// +// +// CFGPrepare +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGPrepare : public AMDGPUCFGStructurizer { +public: + static char ID; + +public: + AMDGPUCFGPrepare(TargetMachine &tm); + + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; + +char AMDGPUCFGPrepare::ID = 0; +} //end of namespace llvm + +AMDGPUCFGPrepare::AMDGPUCFGPrepare(TargetMachine &tm) + : AMDGPUCFGStructurizer(ID, tm ) { +} +const char *AMDGPUCFGPrepare::getPassName() const { + return "AMD IL Control Flow Graph Preparation Pass"; +} + +void AMDGPUCFGPrepare::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); +} + +//===----------------------------------------------------------------------===// +// +// CFGPerform +// +//===----------------------------------------------------------------------===// + + +using namespace llvmCFGStruct; + +namespace llvm { +class AMDGPUCFGPerform : public AMDGPUCFGStructurizer { +public: + static char ID; + +public: + AMDGPUCFGPerform(TargetMachine &tm); + virtual const char *getPassName() const; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + bool runOnMachineFunction(MachineFunction &F); + +private: + +}; + +char AMDGPUCFGPerform::ID = 0; +} //end of namespace llvm + + AMDGPUCFGPerform::AMDGPUCFGPerform(TargetMachine &tm) +: AMDGPUCFGStructurizer(ID, tm) { +} + +const char *AMDGPUCFGPerform::getPassName() const { + return "AMD IL Control Flow Graph structurizer Pass"; +} + +void AMDGPUCFGPerform::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addPreserved<MachineFunctionAnalysis>(); + AU.addRequired<MachineFunctionAnalysis>(); + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.addRequired<MachineLoopInfo>(); +} + +//===----------------------------------------------------------------------===// +// +// CFGStructTraits<AMDGPUCFGStructurizer> +// +//===----------------------------------------------------------------------===// + +namespace llvmCFGStruct { +// this class is tailor to the AMDGPU backend +template<> +struct CFGStructTraits<AMDGPUCFGStructurizer> { + typedef int RegiT; + + static int getBranchNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALNZ_f32; + case AMDGPU::SI_IF_NZ: return AMDGPU::SI_IF_NZ; + default: + assert(0 && "internal error"); + } + return -1; + } + + static int getBranchZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::IF_PREDICATE_SET; + case AMDGPU::BRANCH_COND_i32: + case AMDGPU::BRANCH_COND_f32: return AMDGPU::IF_LOGICALZ_f32; + case AMDGPU::SI_IF_Z: return AMDGPU::SI_IF_Z; + default: + assert(0 && "internal error"); + } + return -1; + } + + static int getContinueNzeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALNZ_i32; + default: + assert(0 && "internal error"); + }; + return -1; + } + + static int getContinueZeroOpcode(int oldOpcode) { + switch(oldOpcode) { + case 
AMDGPU::JUMP: return AMDGPU::CONTINUE_LOGICALZ_i32;
+    default:
+      assert(0 && "internal error");
+    }
+    return -1;
+  }
+
+  static MachineBasicBlock *getTrueBranch(MachineInstr *instr) {
+    return instr->getOperand(0).getMBB();
+  }
+
+  static void setTrueBranch(MachineInstr *instr, MachineBasicBlock *blk) {
+    instr->getOperand(0).setMBB(blk);
+  }
+
+  static MachineBasicBlock *
+  getFalseBranch(MachineBasicBlock *blk, MachineInstr *instr) {
+    assert(blk->succ_size() == 2);
+    MachineBasicBlock *trueBranch = getTrueBranch(instr);
+    MachineBasicBlock::succ_iterator iter = blk->succ_begin();
+    MachineBasicBlock::succ_iterator iterNext = iter;
+    ++iterNext;
+
+    return (*iter == trueBranch) ? *iterNext : *iter;
+  }
+
+  static bool isCondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+    case AMDGPU::JUMP:
+      return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() != 0;
+    case AMDGPU::BRANCH_COND_i32:
+    case AMDGPU::BRANCH_COND_f32:
+    case AMDGPU::SI_IF_NZ:
+    case AMDGPU::SI_IF_Z:
+      break;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static bool isUncondBranch(MachineInstr *instr) {
+    switch (instr->getOpcode()) {
+    case AMDGPU::JUMP:
+      return instr->getOperand(instr->findFirstPredOperandIdx()).getReg() == 0;
+    case AMDGPU::BRANCH:
+      return true;
+    default:
+      return false;
+    }
+    return true;
+  }
+
+  static DebugLoc getLastDebugLocInBB(MachineBasicBlock *blk) {
+    //get the DebugLoc of the last instruction in the block that has debug info
+    DebugLoc DL;
+    for (MachineBasicBlock::iterator iter = blk->begin(); iter != blk->end(); ++iter) {
+      MachineInstr *instr = &(*iter);
+      if (instr->getDebugLoc().isUnknown() == false) {
+        DL = instr->getDebugLoc();
+      }
+    }
+    return DL;
+  }
+
+  static MachineInstr *getNormalBlockBranchInstr(MachineBasicBlock *blk) {
+    MachineBasicBlock::reverse_iterator iter = blk->rbegin();
+    MachineInstr *instr = &*iter;
+    if (instr && (isCondBranch(instr) || isUncondBranch(instr))) {
+      return instr;
+    }
+    return NULL;
+  }
+
+  // The correct name for this function is getPossibleLoopendBlockBranchInstr:
+  // a BB with a backward edge can have move instructions after the branch
+  // instruction, and such move instructions "belong to" the loop backward edge.
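+  //
+  // Illustrative shape of such a block (opcodes are examples only):
+  //     ...
+  //     JUMP %bb.header     ; the backward-edge branch
+  //     MOV  ...            ; trailing moves owned by the backward edge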
+ // + static MachineInstr *getLoopendBlockBranchInstr(MachineBasicBlock *blk) { + const AMDGPUInstrInfo * TII = static_cast<const AMDGPUInstrInfo *>( + blk->getParent()->getTarget().getInstrInfo()); + + for (MachineBasicBlock::reverse_iterator iter = blk->rbegin(), + iterEnd = blk->rend(); iter != iterEnd; ++iter) { + // FIXME: Simplify + MachineInstr *instr = &*iter; + if (instr) { + if (isCondBranch(instr) || isUncondBranch(instr)) { + return instr; + } else if (!TII->isMov(instr->getOpcode())) { + break; + } + } + } + return NULL; + } + + static MachineInstr *getReturnInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::RETURN) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getContinueInstr(MachineBasicBlock *blk) { + MachineBasicBlock::reverse_iterator iter = blk->rbegin(); + if (iter != blk->rend()) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::CONTINUE) { + return instr; + } + } + return NULL; + } + + static MachineInstr *getLoopBreakInstr(MachineBasicBlock *blk) { + for (MachineBasicBlock::iterator iter = blk->begin(); (iter != blk->end()); ++iter) { + MachineInstr *instr = &(*iter); + if (instr->getOpcode() == AMDGPU::PREDICATED_BREAK) { + return instr; + } + } + return NULL; + } + + static bool isReturnBlock(MachineBasicBlock *blk) { + MachineInstr *instr = getReturnInstr(blk); + bool isReturn = (blk->succ_size() == 0); + if (instr) { + assert(isReturn); + } else if (isReturn) { + if (DEBUGME) { + errs() << "BB" << blk->getNumber() + <<" is return block without RETURN instr\n"; + } + } + + return isReturn; + } + + static MachineBasicBlock::iterator + getInstrPos(MachineBasicBlock *blk, MachineInstr *instr) { + assert(instr->getParent() == blk && "instruction doesn't belong to block"); + MachineBasicBlock::iterator iter = blk->begin(); + MachineBasicBlock::iterator iterEnd = blk->end(); + while (&(*iter) != instr && iter != iterEnd) { + ++iter; + } + + assert(iter != iterEnd); + return iter; + }//getInstrPos + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep) { + return insertInstrBefore(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrBefore + + static MachineInstr *insertInstrBefore(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = + blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL); + + MachineBasicBlock::iterator res; + if (blk->begin() != blk->end()) { + blk->insert(blk->begin(), newInstr); + } else { + blk->push_back(newInstr); + } + + SHOWNEWINSTR(newInstr); + + return newInstr; + } //insertInstrBefore + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep) { + insertInstrEnd(blk,newOpcode,passRep,DebugLoc()); + } //insertInstrEnd + + static void insertInstrEnd(MachineBasicBlock *blk, int newOpcode, + AMDGPUCFGStructurizer *passRep, DebugLoc DL) { + const TargetInstrInfo *tii = passRep->getTargetInstrInfo(); + MachineInstr *newInstr = blk->getParent() + ->CreateMachineInstr(tii->get(newOpcode), DL); + + blk->push_back(newInstr); + //assume the instruction doesn't take any reg operand ... 
+
+    SHOWNEWINSTR(newInstr);
+  } //insertInstrEnd
+
+  static MachineInstr *insertInstrBefore(MachineBasicBlock::iterator instrPos,
+                                         int newOpcode,
+                                         AMDGPUCFGStructurizer *passRep) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DebugLoc());
+
+    blk->insert(instrPos, newInstr);
+    //assume the instruction doesn't take any reg operand ...
+
+    SHOWNEWINSTR(newInstr);
+    return newInstr;
+  } //insertInstrBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock::iterator instrPos,
+                                     int newOpcode,
+                                     AMDGPUCFGStructurizer *passRep,
+                                     DebugLoc DL) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode),
+                                           DL);
+
+    blk->insert(instrPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(oldInstr->getOperand(1).getReg(),
+                                         false);
+
+    SHOWNEWINSTR(newInstr);
+    // oldInstr is erased later by the caller (oldInstr->eraseFromParent()).
+  } //insertCondBranchBefore
+
+  static void insertCondBranchBefore(MachineBasicBlock *blk,
+                                     MachineBasicBlock::iterator insertPos,
+                                     int newOpcode,
+                                     AMDGPUCFGStructurizer *passRep,
+                                     RegiT regNum,
+                                     DebugLoc DL) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DL);
+
+    //insert before
+    blk->insert(insertPos, newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchBefore
+
+  static void insertCondBranchEnd(MachineBasicBlock *blk,
+                                  int newOpcode,
+                                  AMDGPUCFGStructurizer *passRep,
+                                  RegiT regNum) {
+    const TargetInstrInfo *tii = passRep->getTargetInstrInfo();
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(newOpcode), DebugLoc());
+
+    blk->push_back(newInstr);
+    MachineInstrBuilder(newInstr).addReg(regNum, false);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertCondBranchEnd
+
+
+  static void insertAssignInstrBefore(MachineBasicBlock::iterator instrPos,
+                                      AMDGPUCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    MachineInstr *oldInstr = &(*instrPos);
+    const AMDGPUInstrInfo *tii =
+      static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+    MachineBasicBlock *blk = oldInstr->getParent();
+    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
+                                                 regVal);
+    blk->insert(instrPos, newInstr);
+
+    SHOWNEWINSTR(newInstr);
+  } //insertAssignInstrBefore
+
+  static void insertAssignInstrBefore(MachineBasicBlock *blk,
+                                      AMDGPUCFGStructurizer *passRep,
+                                      RegiT regNum, int regVal) {
+    const AMDGPUInstrInfo *tii =
+      static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+
+    MachineInstr *newInstr = tii->getMovImmInstr(blk->getParent(), regNum,
+                                                 regVal);
+    if (blk->begin() != blk->end()) {
+      blk->insert(blk->begin(), newInstr);
+    } else {
+      blk->push_back(newInstr);
+    }
+
+    SHOWNEWINSTR(newInstr);
+
+  } //insertAssignInstrBefore
+
+  static void insertCompareInstrBefore(MachineBasicBlock *blk,
+                                       MachineBasicBlock::iterator instrPos,
+                                       AMDGPUCFGStructurizer *passRep,
+                                       RegiT dstReg, RegiT src1Reg,
+                                       RegiT src2Reg) {
+    const AMDGPUInstrInfo *tii =
+      static_cast<const AMDGPUInstrInfo *>(passRep->getTargetInstrInfo());
+    MachineInstr *newInstr =
+      blk->getParent()->CreateMachineInstr(tii->get(tii->getIEQOpcode()),
                                           DebugLoc());
+
+    MachineInstrBuilder(newInstr).addReg(dstReg, RegState::Define); //set target
+    MachineInstrBuilder(newInstr).addReg(src1Reg); //set src value
+    MachineInstrBuilder(newInstr).addReg(src2Reg); //set src value
+
+    blk->insert(instrPos, newInstr);
+    SHOWNEWINSTR(newInstr);
+
+  } //insertCompareInstrBefore
+
+  static void cloneSuccessorList(MachineBasicBlock *dstBlk,
+                                 MachineBasicBlock *srcBlk) {
+    for (MachineBasicBlock::succ_iterator iter = srcBlk->succ_begin(),
+         iterEnd = srcBlk->succ_end(); iter != iterEnd; ++iter) {
+      dstBlk->addSuccessor(*iter);  // *iter's predecessor is also taken care of
+    }
+  } //cloneSuccessorList
+
+  static MachineBasicBlock *clone(MachineBasicBlock *srcBlk) {
+    MachineFunction *func = srcBlk->getParent();
+    MachineBasicBlock *newBlk = func->CreateMachineBasicBlock();
+    func->push_back(newBlk); //insert to function
+    for (MachineBasicBlock::iterator iter = srcBlk->begin(),
+         iterEnd = srcBlk->end();
+         iter != iterEnd; ++iter) {
+      MachineInstr *instr = func->CloneMachineInstr(iter);
+      newBlk->push_back(instr);
+    }
+    return newBlk;
+  }
+
+  // MachineBasicBlock::ReplaceUsesOfBlockWith doesn't serve the purpose here
+  // because the AMDGPU branch instruction is not recognized as a terminator;
+  // fix that and retire this routine.
+  static void replaceInstrUseOfBlockWith(MachineBasicBlock *srcBlk,
+                                         MachineBasicBlock *oldBlk,
+                                         MachineBasicBlock *newBlk) {
+    MachineInstr *branchInstr = getLoopendBlockBranchInstr(srcBlk);
+    if (branchInstr && isCondBranch(branchInstr) &&
+        getTrueBranch(branchInstr) == oldBlk) {
+      setTrueBranch(branchInstr, newBlk);
+    }
+  }
+
+  static void wrapup(MachineBasicBlock *entryBlk) {
+    assert((!entryBlk->getParent()->getJumpTableInfo()
+            || entryBlk->getParent()->getJumpTableInfo()->isEmpty())
+           && "found a jump table");
+
+    //collect continue right before endloop
+    SmallVector<MachineInstr *, DEFAULT_VEC_SLOTS> contInstr;
+    MachineBasicBlock::iterator pre = entryBlk->begin();
+    MachineBasicBlock::iterator iterEnd = entryBlk->end();
+    MachineBasicBlock::iterator iter = pre;
+    while (iter != iterEnd) {
+      if (pre->getOpcode() == AMDGPU::CONTINUE
+          && iter->getOpcode() == AMDGPU::ENDLOOP) {
+        contInstr.push_back(pre);
+      }
+      pre = iter;
+      ++iter;
+    } //end while
+
+    //delete continue right before endloop
+    for (unsigned i = 0; i < contInstr.size(); ++i) {
+      contInstr[i]->eraseFromParent();
+    }
+
+    // TODO: fix up the jump table so a later phase won't be confused;
+    // if (jumpTableInfo->isEmpty() == false) { we would need to clean the
+    // jump table, but there isn't such an interface yet;
alternatively, replace all the other + // blocks in the jump table with the entryBlk //} + + } //wrapup + + static MachineDominatorTree *getDominatorTree(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis<MachineDominatorTree>(); + } + + static MachinePostDominatorTree* + getPostDominatorTree(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis<MachinePostDominatorTree>(); + } + + static MachineLoopInfo *getLoopInfo(AMDGPUCFGStructurizer &pass) { + return &pass.getAnalysis<MachineLoopInfo>(); + } +}; // template class CFGStructTraits +} //end of namespace llvm + +// createAMDGPUCFGPreparationPass- Returns a pass +FunctionPass *llvm::createAMDGPUCFGPreparationPass(TargetMachine &tm + ) { + return new AMDGPUCFGPrepare(tm ); +} + +bool AMDGPUCFGPrepare::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().prepare(func, + *this, + TRI); +} + +// createAMDGPUCFGStructurizerPass- Returns a pass +FunctionPass *llvm::createAMDGPUCFGStructurizerPass(TargetMachine &tm + ) { + return new AMDGPUCFGPerform(tm ); +} + +bool AMDGPUCFGPerform::runOnMachineFunction(MachineFunction &func) { + return llvmCFGStruct::CFGStructurizer<AMDGPUCFGStructurizer>().run(func, + *this, + TRI); +} diff --git a/lib/Target/R600/AMDILDevice.cpp b/lib/Target/R600/AMDILDevice.cpp new file mode 100644 index 0000000000..eec5059de2 --- /dev/null +++ b/lib/Target/R600/AMDILDevice.cpp @@ -0,0 +1,124 @@ +//===-- AMDILDevice.cpp - Base class for AMDIL Devices --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILDevice.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; +// Default implementation for all of the classes. 
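+// (The generation-specific subclasses, such as AMDGPUEvergreenDevice and
+// AMDGPUSIDevice, refine these defaults by overriding setCaps() and the
+// various size and resource queries declared in AMDILDevice.h.)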
+AMDGPUDevice::AMDGPUDevice(AMDGPUSubtarget *ST) : mSTM(ST) { + mHWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); + mSWBits.resize(AMDGPUDeviceInfo::MaxNumberCapabilities); + setCaps(); + DeviceFlag = OCL_DEVICE_ALL; +} + +AMDGPUDevice::~AMDGPUDevice() { + mHWBits.clear(); + mSWBits.clear(); +} + +size_t AMDGPUDevice::getMaxGDSSize() const { + return 0; +} + +uint32_t +AMDGPUDevice::getDeviceFlag() const { + return DeviceFlag; +} + +size_t AMDGPUDevice::getMaxNumCBs() const { + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { + return HW_MAX_NUM_CB; + } + + return 0; +} + +size_t AMDGPUDevice::getMaxCBSize() const { + if (usesHardware(AMDGPUDeviceInfo::ConstantMem)) { + return MAX_CB_SIZE; + } + + return 0; +} + +size_t AMDGPUDevice::getMaxScratchSize() const { + return 65536; +} + +uint32_t AMDGPUDevice::getStackAlignment() const { + return 16; +} + +void AMDGPUDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::HalfOps); + mSWBits.set(AMDGPUDeviceInfo::ByteOps); + mSWBits.set(AMDGPUDeviceInfo::ShortOps); + mSWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); + if (mSTM->isOverride(AMDGPUDeviceInfo::NoInline)) { + mSWBits.set(AMDGPUDeviceInfo::NoInline); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::MacroDB)) { + mSWBits.set(AMDGPUDeviceInfo::MacroDB); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::ConstantMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::ConstantMem); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::PrivateMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::PrivateMem); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::BarrierDetect)) { + mSWBits.set(AMDGPUDeviceInfo::BarrierDetect); + } + mSWBits.set(AMDGPUDeviceInfo::ByteLDSOps); + mSWBits.set(AMDGPUDeviceInfo::LongOps); +} + +AMDGPUDeviceInfo::ExecutionMode +AMDGPUDevice::getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const { + if (mHWBits[Caps]) { + assert(!mSWBits[Caps] && "Cannot set both SW and HW caps"); + return AMDGPUDeviceInfo::Hardware; + } + + if (mSWBits[Caps]) { + assert(!mHWBits[Caps] && "Cannot set both SW and HW caps"); + return AMDGPUDeviceInfo::Software; + } + + return AMDGPUDeviceInfo::Unsupported; + +} + +bool AMDGPUDevice::isSupported(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) != AMDGPUDeviceInfo::Unsupported; +} + +bool AMDGPUDevice::usesHardware(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Hardware; +} + +bool AMDGPUDevice::usesSoftware(AMDGPUDeviceInfo::Caps Mode) const { + return getExecutionMode(Mode) == AMDGPUDeviceInfo::Software; +} + +std::string +AMDGPUDevice::getDataLayout() const { + return std::string("e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16" + "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32" + "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64" + "-v96:128:128-v128:128:128-v192:256:256-v256:256:256" + "-v512:512:512-v1024:1024:1024-v2048:2048:2048" + "-n8:16:32:64"); +} diff --git a/lib/Target/R600/AMDILDevice.h b/lib/Target/R600/AMDILDevice.h new file mode 100644 index 0000000000..b9a15609df --- /dev/null +++ b/lib/Target/R600/AMDILDevice.h @@ -0,0 +1,117 @@ +//===---- AMDILDevice.h - Define Device Data for AMDGPU -----*- C++ -*------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the subtarget data classes.
+//
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef AMDILDEVICEIMPL_H
+#define AMDILDEVICEIMPL_H
+#include "AMDIL.h"
+#include "llvm/ADT/BitVector.h"
+
+namespace llvm {
+  class AMDGPUSubtarget;
+  class MCStreamer;
+//===----------------------------------------------------------------------===//
+// Interface for data that is specific to a single device
+//===----------------------------------------------------------------------===//
+class AMDGPUDevice {
+public:
+  AMDGPUDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUDevice();
+
+  // Enum values for the various memory types.
+  enum {
+    RAW_UAV_ID = 0,
+    ARENA_UAV_ID = 1,
+    LDS_ID = 2,
+    GDS_ID = 3,
+    SCRATCH_ID = 4,
+    CONSTANT_ID = 5,
+    GLOBAL_ID = 6,
+    MAX_IDS = 7
+  } IO_TYPE_IDS;
+
+  /// \returns The max LDS size that the hardware supports.  Size is in
+  /// bytes.
+  virtual size_t getMaxLDSSize() const = 0;
+
+  /// \returns The max GDS size that the hardware supports if the GDS is
+  /// supported by the hardware.  Size is in bytes.
+  virtual size_t getMaxGDSSize() const;
+
+  /// \returns The max number of hardware constant address spaces that
+  /// are supported by this device.
+  virtual size_t getMaxNumCBs() const;
+
+  /// \returns The max number of bytes a single hardware constant buffer
+  /// can support.  Size is in bytes.
+  virtual size_t getMaxCBSize() const;
+
+  /// \returns The max number of bytes allowed by the hardware scratch
+  /// buffer.  Size is in bytes.
+  virtual size_t getMaxScratchSize() const;
+
+  /// \brief Get the flag that corresponds to the device.
+  virtual uint32_t getDeviceFlag() const;
+
+  /// \returns The number of work-items that exist in a single hardware
+  /// wavefront.
+  virtual size_t getWavefrontSize() const = 0;
+
+  /// \brief Get the generational name of this specific device.
+  virtual uint32_t getGeneration() const = 0;
+
+  /// \brief Get the stack alignment of this specific device.
+  virtual uint32_t getStackAlignment() const;
+
+  /// \brief Get the resource ID for this specific device.
+  virtual uint32_t getResourceID(uint32_t DeviceID) const = 0;
+
+  /// \brief Get the max number of UAV's for this device.
+  virtual uint32_t getMaxNumUAVs() const = 0;
+
+
+  // API exposing the more detailed capabilities of each family of
+  // cards.  If a capability is supported, then either usesHardware or
+  // usesSoftware returns true.  If usesHardware returns true, then
+  // usesSoftware must return false for the same capability.  Hardware
+  // execution means that the feature is done natively by the hardware
+  // and is not emulated by the software.  Software execution means
+  // that the feature could be done in the hardware, but there is
+  // software that emulates it, possibly using the hardware for
+  // support since the hardware does not fully comply with OpenCL
+  // specs.
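+  //
+  // For example (illustrative only; the emit* and reject helpers are
+  // hypothetical and stand in for whatever the caller does with the answer):
+  //
+  //   if (Dev->usesHardware(AMDGPUDeviceInfo::FMA))
+  //     emitNativeFMA();        // the hardware provides the feature
+  //   else if (Dev->usesSoftware(AMDGPUDeviceInfo::FMA))
+  //     emitEmulatedFMA();      // expand into a software sequence
+  //   else
+  //     reject();               // isSupported() is false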
+ + bool isSupported(AMDGPUDeviceInfo::Caps Mode) const; + bool usesHardware(AMDGPUDeviceInfo::Caps Mode) const; + bool usesSoftware(AMDGPUDeviceInfo::Caps Mode) const; + virtual std::string getDataLayout() const; + static const unsigned int MAX_LDS_SIZE_700 = 16384; + static const unsigned int MAX_LDS_SIZE_800 = 32768; + static const unsigned int WavefrontSize = 64; + static const unsigned int HalfWavefrontSize = 32; + static const unsigned int QuarterWavefrontSize = 16; +protected: + virtual void setCaps(); + llvm::BitVector mHWBits; + llvm::BitVector mSWBits; + AMDGPUSubtarget *mSTM; + uint32_t DeviceFlag; +private: + AMDGPUDeviceInfo::ExecutionMode + getExecutionMode(AMDGPUDeviceInfo::Caps Caps) const; +}; + +} // namespace llvm +#endif // AMDILDEVICEIMPL_H diff --git a/lib/Target/R600/AMDILDeviceInfo.cpp b/lib/Target/R600/AMDILDeviceInfo.cpp new file mode 100644 index 0000000000..9605fbe633 --- /dev/null +++ b/lib/Target/R600/AMDILDeviceInfo.cpp @@ -0,0 +1,94 @@ +//===-- AMDILDeviceInfo.cpp - AMDILDeviceInfo class -----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Function that creates DeviceInfo from a device name and other information. +// +//==-----------------------------------------------------------------------===// +#include "AMDILDevices.h" +#include "AMDGPUSubtarget.h" + +using namespace llvm; +namespace llvm { +namespace AMDGPUDeviceInfo { + +AMDGPUDevice* getDeviceFromName(const std::string &deviceName, + AMDGPUSubtarget *ptr, + bool is64bit, bool is64on32bit) { + if (deviceName.c_str()[2] == '7') { + switch (deviceName.c_str()[3]) { + case '1': + return new AMDGPU710Device(ptr); + case '7': + return new AMDGPU770Device(ptr); + default: + return new AMDGPU7XXDevice(ptr); + } + } else if (deviceName == "cypress") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCypressDevice(ptr); + } else if (deviceName == "juniper") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUEvergreenDevice(ptr); + } else if (deviceName == "redwood") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPURedwoodDevice(ptr); + } else if (deviceName == "cedar") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCedarDevice(ptr); + } else if (deviceName == "barts" || deviceName == "turks") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUNIDevice(ptr); + } else if (deviceName == "cayman") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUCaymanDevice(ptr); + } else if (deviceName == 
"caicos") { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPUNIDevice(ptr); + } else if (deviceName == "SI") { + return new AMDGPUSIDevice(ptr); + } else { +#if DEBUG + assert(!is64bit && "This device does not support 64bit pointers!"); + assert(!is64on32bit && "This device does not support 64bit" + " on 32bit pointers!"); +#endif + return new AMDGPU7XXDevice(ptr); + } +} +} // End namespace AMDGPUDeviceInfo +} // End namespace llvm diff --git a/lib/Target/R600/AMDILDeviceInfo.h b/lib/Target/R600/AMDILDeviceInfo.h new file mode 100644 index 0000000000..4b2c3a53c7 --- /dev/null +++ b/lib/Target/R600/AMDILDeviceInfo.h @@ -0,0 +1,88 @@ +//===-- AMDILDeviceInfo.h - Constants for describing devices --------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//==-----------------------------------------------------------------------===// +#ifndef AMDILDEVICEINFO_H +#define AMDILDEVICEINFO_H + + +#include <string> + +namespace llvm { + class AMDGPUDevice; + class AMDGPUSubtarget; + namespace AMDGPUDeviceInfo { + /// Each Capabilities can be executed using a hardware instruction, + /// emulated with a sequence of software instructions, or not + /// supported at all. + enum ExecutionMode { + Unsupported = 0, ///< Unsupported feature on the card(Default value) + /// This is the execution mode that is set if the feature is emulated in + /// software. + Software, + /// This execution mode is set if the feature exists natively in hardware + Hardware + }; + + enum Caps { + HalfOps = 0x1, ///< Half float is supported or not. + DoubleOps = 0x2, ///< Double is supported or not. + ByteOps = 0x3, ///< Byte(char) is support or not. + ShortOps = 0x4, ///< Short is supported or not. + LongOps = 0x5, ///< Long is supported or not. + Images = 0x6, ///< Images are supported or not. + ByteStores = 0x7, ///< ByteStores available(!HD4XXX). + ConstantMem = 0x8, ///< Constant/CB memory. + LocalMem = 0x9, ///< Local/LDS memory. + PrivateMem = 0xA, ///< Scratch/Private/Stack memory. + RegionMem = 0xB, ///< OCL GDS Memory Extension. + FMA = 0xC, ///< Use HW FMA or SW FMA. + ArenaSegment = 0xD, ///< Use for Arena UAV per pointer 12-1023. + MultiUAV = 0xE, ///< Use for UAV per Pointer 0-7. + Reserved0 = 0xF, ///< ReservedFlag + NoAlias = 0x10, ///< Cached loads. + Signed24BitOps = 0x11, ///< Peephole Optimization. + /// Debug mode implies that no hardware features or optimizations + /// are performned and that all memory access go through a single + /// uav(Arena on HD5XXX/HD6XXX and Raw on HD4XXX). + Debug = 0x12, + CachedMem = 0x13, ///< Cached mem is available or not. + BarrierDetect = 0x14, ///< Detect duplicate barriers. + Reserved1 = 0x15, ///< Reserved flag + ByteLDSOps = 0x16, ///< Flag to specify if byte LDS ops are available. + ArenaVectors = 0x17, ///< Flag to specify if vector loads from arena work. + TmrReg = 0x18, ///< Flag to specify if Tmr register is supported. + NoInline = 0x19, ///< Flag to specify that no inlining should occur. + MacroDB = 0x1A, ///< Flag to specify that backend handles macrodb. + HW64BitDivMod = 0x1B, ///< Flag for backend to generate 64bit div/mod. + ArenaUAV = 0x1C, ///< Flag to specify that arena uav is supported. + PrivateUAV = 0x1D, ///< Flag to specify that private memory uses uav's. 
+      /// If more capabilities are required, then
+      /// this number needs to be increased.
+      /// All capabilities must come before this
+      /// number.
+      MaxNumberCapabilities = 0x20
+    };
+    /// These have to be in order, with the older generations
+    /// having the lower number enumerations.
+    enum Generation {
+      HD4XXX = 0, ///< 7XX based devices.
+      HD5XXX, ///< Evergreen based devices.
+      HD6XXX, ///< NI/Evergreen+ based devices.
+      HD7XXX, ///< Southern Islands based devices.
+      HDTEST, ///< Experimental feature testing device.
+      HDNUMGEN
+    };
+
+
+    AMDGPUDevice*
+      getDeviceFromName(const std::string &name, AMDGPUSubtarget *ptr,
+                        bool is64bit = false, bool is64on32bit = false);
+  } // namespace AMDGPUDeviceInfo
+} // namespace llvm
+#endif // AMDILDEVICEINFO_H
diff --git a/lib/Target/R600/AMDILDevices.h b/lib/Target/R600/AMDILDevices.h
new file mode 100644
index 0000000000..636fa6d359
--- /dev/null
+++ b/lib/Target/R600/AMDILDevices.h
@@ -0,0 +1,19 @@
+//===-- AMDILDevices.h - Consolidate AMDIL Device headers -----------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#ifndef AMDIL_DEVICES_H
+#define AMDIL_DEVICES_H
+// Include all of the device specific header files
+#include "AMDIL7XXDevice.h"
+#include "AMDILDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDILSIDevice.h"
+
+#endif // AMDIL_DEVICES_H
diff --git a/lib/Target/R600/AMDILEvergreenDevice.cpp b/lib/Target/R600/AMDILEvergreenDevice.cpp
new file mode 100644
index 0000000000..c5213a0410
--- /dev/null
+++ b/lib/Target/R600/AMDILEvergreenDevice.cpp
@@ -0,0 +1,169 @@
+//===-- AMDILEvergreenDevice.cpp - Device Info for Evergreen --------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +/// \file +//==-----------------------------------------------------------------------===// +#include "AMDILEvergreenDevice.h" + +using namespace llvm; + +AMDGPUEvergreenDevice::AMDGPUEvergreenDevice(AMDGPUSubtarget *ST) +: AMDGPUDevice(ST) { + setCaps(); + std::string name = ST->getDeviceName(); + if (name == "cedar") { + DeviceFlag = OCL_DEVICE_CEDAR; + } else if (name == "redwood") { + DeviceFlag = OCL_DEVICE_REDWOOD; + } else if (name == "cypress") { + DeviceFlag = OCL_DEVICE_CYPRESS; + } else { + DeviceFlag = OCL_DEVICE_JUNIPER; + } +} + +AMDGPUEvergreenDevice::~AMDGPUEvergreenDevice() { +} + +size_t AMDGPUEvergreenDevice::getMaxLDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +size_t AMDGPUEvergreenDevice::getMaxGDSSize() const { + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return MAX_LDS_SIZE_800; + } else { + return 0; + } +} +uint32_t AMDGPUEvergreenDevice::getMaxNumUAVs() const { + return 12; +} + +uint32_t AMDGPUEvergreenDevice::getResourceID(uint32_t id) const { + switch(id) { + default: + assert(0 && "ID type passed in is unknown!"); + break; + case CONSTANT_ID: + case RAW_UAV_ID: + return GLOBAL_RETURN_RAW_UAV_ID; + case GLOBAL_ID: + case ARENA_UAV_ID: + return DEFAULT_ARENA_UAV_ID; + case LDS_ID: + if (usesHardware(AMDGPUDeviceInfo::LocalMem)) { + return DEFAULT_LDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case GDS_ID: + if (usesHardware(AMDGPUDeviceInfo::RegionMem)) { + return DEFAULT_GDS_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + case SCRATCH_ID: + if (usesHardware(AMDGPUDeviceInfo::PrivateMem)) { + return DEFAULT_SCRATCH_ID; + } else { + return DEFAULT_ARENA_UAV_ID; + } + }; + return 0; +} + +size_t AMDGPUEvergreenDevice::getWavefrontSize() const { + return AMDGPUDevice::WavefrontSize; +} + +uint32_t AMDGPUEvergreenDevice::getGeneration() const { + return AMDGPUDeviceInfo::HD5XXX; +} + +void AMDGPUEvergreenDevice::setCaps() { + mSWBits.set(AMDGPUDeviceInfo::ArenaSegment); + mHWBits.set(AMDGPUDeviceInfo::ArenaUAV); + mHWBits.set(AMDGPUDeviceInfo::HW64BitDivMod); + mSWBits.reset(AMDGPUDeviceInfo::HW64BitDivMod); + mSWBits.set(AMDGPUDeviceInfo::Signed24BitOps); + if (mSTM->isOverride(AMDGPUDeviceInfo::ByteStores)) { + mHWBits.set(AMDGPUDeviceInfo::ByteStores); + } + if (mSTM->isOverride(AMDGPUDeviceInfo::Debug)) { + mSWBits.set(AMDGPUDeviceInfo::LocalMem); + mSWBits.set(AMDGPUDeviceInfo::RegionMem); + } else { + mHWBits.set(AMDGPUDeviceInfo::LocalMem); + mHWBits.set(AMDGPUDeviceInfo::RegionMem); + } + mHWBits.set(AMDGPUDeviceInfo::Images); + if (mSTM->isOverride(AMDGPUDeviceInfo::NoAlias)) { + mHWBits.set(AMDGPUDeviceInfo::NoAlias); + } + mHWBits.set(AMDGPUDeviceInfo::CachedMem); + if (mSTM->isOverride(AMDGPUDeviceInfo::MultiUAV)) { + mHWBits.set(AMDGPUDeviceInfo::MultiUAV); + } + mHWBits.set(AMDGPUDeviceInfo::ByteLDSOps); + mSWBits.reset(AMDGPUDeviceInfo::ByteLDSOps); + mHWBits.set(AMDGPUDeviceInfo::ArenaVectors); + mHWBits.set(AMDGPUDeviceInfo::LongOps); + mSWBits.reset(AMDGPUDeviceInfo::LongOps); + mHWBits.set(AMDGPUDeviceInfo::TmrReg); +} + +AMDGPUCypressDevice::AMDGPUCypressDevice(AMDGPUSubtarget *ST) + : AMDGPUEvergreenDevice(ST) { + setCaps(); +} + +AMDGPUCypressDevice::~AMDGPUCypressDevice() { +} + +void AMDGPUCypressDevice::setCaps() { + if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) { + mHWBits.set(AMDGPUDeviceInfo::DoubleOps); + mHWBits.set(AMDGPUDeviceInfo::FMA); + } +} + + +AMDGPUCedarDevice::AMDGPUCedarDevice(AMDGPUSubtarget *ST) + : 
AMDGPUEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDGPUCedarDevice::~AMDGPUCedarDevice() {
+}
+
+void AMDGPUCedarDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::FMA);
+}
+
+size_t AMDGPUCedarDevice::getWavefrontSize() const {
+  return AMDGPUDevice::QuarterWavefrontSize;
+}
+
+AMDGPURedwoodDevice::AMDGPURedwoodDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+  setCaps();
+}
+
+AMDGPURedwoodDevice::~AMDGPURedwoodDevice() {
+}
+
+void AMDGPURedwoodDevice::setCaps() {
+  mSWBits.set(AMDGPUDeviceInfo::FMA);
+}
+
+size_t AMDGPURedwoodDevice::getWavefrontSize() const {
+  return AMDGPUDevice::HalfWavefrontSize;
+}
diff --git a/lib/Target/R600/AMDILEvergreenDevice.h b/lib/Target/R600/AMDILEvergreenDevice.h
new file mode 100644
index 0000000000..6dc2deb9ed
--- /dev/null
+++ b/lib/Target/R600/AMDILEvergreenDevice.h
@@ -0,0 +1,93 @@
+//==- AMDILEvergreenDevice.h - Define Evergreen Device for AMDIL -*- C++ -*--=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===----------------------------------------------------------------------===//
+#ifndef AMDILEVERGREENDEVICE_H
+#define AMDILEVERGREENDEVICE_H
+#include "AMDILDevice.h"
+#include "AMDGPUSubtarget.h"
+
+namespace llvm {
+  class AMDGPUSubtarget;
+//===----------------------------------------------------------------------===//
+// Evergreen generation of devices and their respective sub classes
+//===----------------------------------------------------------------------===//
+
+
+/// \brief The AMDGPUEvergreenDevice is the base device class for all of the Evergreen
+/// series of cards.
+///
+/// This class contains information required to differentiate
+/// the Evergreen device from the generic AMDGPUDevice.  This device represents
+/// the capabilities of the 'Juniper' cards, also known as the HD57XX.
+class AMDGPUEvergreenDevice : public AMDGPUDevice {
+public:
+  AMDGPUEvergreenDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUEvergreenDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual size_t getMaxGDSSize() const;
+  virtual size_t getWavefrontSize() const;
+  virtual uint32_t getGeneration() const;
+  virtual uint32_t getMaxNumUAVs() const;
+  virtual uint32_t getResourceID(uint32_t) const;
+protected:
+  virtual void setCaps();
+};
+
+/// The AMDGPUCypressDevice is similar to the AMDGPUEvergreenDevice, except it has
+/// support for double precision operations.  This device is used to represent
+/// both the Cypress and Hemlock cards, which are commercially known as HD58XX
+/// and HD59XX cards.
+class AMDGPUCypressDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUCypressDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUCypressDevice();
+private:
+  virtual void setCaps();
+};
+
+
+/// \brief The AMDGPUCedarDevice is the class that represents all of the 'Cedar' based
+/// devices.
+///
+/// This class differs from the base AMDGPUEvergreenDevice in that the
+/// device is a ~quarter of the 'Juniper'.  These are commercially known as the
+/// HD54XX and HD53XX series of cards.
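+/// (Concretely: where a 'Juniper' wavefront is 64 work-items, Cedar parts use
+/// a quarter-size wavefront of 16 work-items; see the getWavefrontSize()
+/// override below.)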
+class AMDGPUCedarDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUCedarDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPUCedarDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+};
+
+/// \brief The AMDGPURedwoodDevice is the class that represents all of the 'Redwood' based
+/// devices.
+///
+/// This class differs from the base class, in that these devices are
+/// considered about half of a 'Juniper' device.  These are commercially known as
+/// the HD55XX and HD56XX series of cards.
+class AMDGPURedwoodDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPURedwoodDevice(AMDGPUSubtarget *ST);
+  virtual ~AMDGPURedwoodDevice();
+  virtual size_t getWavefrontSize() const;
+private:
+  virtual void setCaps();
+};
+
+} // namespace llvm
+#endif // AMDILEVERGREENDEVICE_H
diff --git a/lib/Target/R600/AMDILFrameLowering.cpp b/lib/Target/R600/AMDILFrameLowering.cpp
new file mode 100644
index 0000000000..9ad495ab48
--- /dev/null
+++ b/lib/Target/R600/AMDILFrameLowering.cpp
@@ -0,0 +1,47 @@
+//===----------------------- AMDILFrameLowering.cpp -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe a layout of a stack frame on an AMDGPU target
+/// machine.
+//
+//===----------------------------------------------------------------------===//
+#include "AMDILFrameLowering.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+
+using namespace llvm;
+AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
+                                         int LAO, unsigned TransAl)
+  : TargetFrameLowering(D, StackAl, LAO, TransAl) {
+}
+
+AMDGPUFrameLowering::~AMDGPUFrameLowering() {
+}
+
+int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
+                                             int FI) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+  return MFI->getObjectOffset(FI);
+}
+
+const TargetFrameLowering::SpillSlot *
+AMDGPUFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+  NumEntries = 0;
+  return 0;
+}
+void
+AMDGPUFrameLowering::emitPrologue(MachineFunction &MF) const {
+}
+void
+AMDGPUFrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const {
+}
+bool
+AMDGPUFrameLowering::hasFP(const MachineFunction &MF) const {
+  return false;
+}
diff --git a/lib/Target/R600/AMDILFrameLowering.h b/lib/Target/R600/AMDILFrameLowering.h
new file mode 100644
index 0000000000..51337c3dd2
--- /dev/null
+++ b/lib/Target/R600/AMDILFrameLowering.h
@@ -0,0 +1,40 @@
+//===--------------------- AMDILFrameLowering.h -----------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface to describe a layout of a stack frame on an AMDIL target
+/// machine.
+//
+//===----------------------------------------------------------------------===//
+#ifndef AMDILFRAME_LOWERING_H
+#define AMDILFRAME_LOWERING_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Target/TargetFrameLowering.h"
+
+namespace llvm {
+
+/// \brief Information about the stack frame layout on the AMDGPU targets.
+/// +/// It holds the direction of the stack growth, the known stack alignment on +/// entry to each function, and the offset to the locals area. +/// See TargetFrameInfo for more comments. +class AMDGPUFrameLowering : public TargetFrameLowering { +public: + AMDGPUFrameLowering(StackDirection D, unsigned StackAl, int LAO, + unsigned TransAl = 1); + virtual ~AMDGPUFrameLowering(); + virtual int getFrameIndexOffset(const MachineFunction &MF, int FI) const; + virtual const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const; + virtual void emitPrologue(MachineFunction &MF) const; + virtual void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const; + virtual bool hasFP(const MachineFunction &MF) const; +}; +} // namespace llvm +#endif // AMDILFRAME_LOWERING_H diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp new file mode 100644 index 0000000000..d15ed393c1 --- /dev/null +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp @@ -0,0 +1,485 @@ +//===-- AMDILISelDAGToDAG.cpp - A dag to dag inst selector for AMDIL ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief Defines an instruction selector for the AMDGPU target. +// +//===----------------------------------------------------------------------===// +#include "AMDGPUInstrInfo.h" +#include "AMDGPUISelLowering.h" // For AMDGPUISD +#include "AMDGPURegisterInfo.h" +#include "AMDILDevices.h" +#include "R600InstrInfo.h" +#include "llvm/ADT/ValueMap.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAGISel.h" +#include "llvm/Support/Compiler.h" +#include <list> +#include <queue> + +using namespace llvm; + +//===----------------------------------------------------------------------===// +// Instruction Selector Implementation +//===----------------------------------------------------------------------===// + +namespace { +/// AMDGPU specific code to select AMDGPU machine instructions for +/// SelectionDAG operations. +class AMDGPUDAGToDAGISel : public SelectionDAGISel { + // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can + // make the right decision when generating code for different targets. 
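+  // (For example, Select() below checks the device generation so that the
+  // R600-style immediate folding is skipped on Southern Islands targets.)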
+ const AMDGPUSubtarget &Subtarget; +public: + AMDGPUDAGToDAGISel(TargetMachine &TM); + virtual ~AMDGPUDAGToDAGISel(); + + SDNode *Select(SDNode *N); + virtual const char *getPassName() const; + +private: + inline SDValue getSmallIPtrImm(unsigned Imm); + + // Complex pattern selectors + bool SelectADDRParam(SDValue Addr, SDValue& R1, SDValue& R2); + bool SelectADDR(SDValue N, SDValue &R1, SDValue &R2); + bool SelectADDR64(SDValue N, SDValue &R1, SDValue &R2); + + static bool checkType(const Value *ptr, unsigned int addrspace); + static const Value *getBasePointerValue(const Value *V); + + static bool isGlobalStore(const StoreSDNode *N); + static bool isPrivateStore(const StoreSDNode *N); + static bool isLocalStore(const StoreSDNode *N); + static bool isRegionStore(const StoreSDNode *N); + + static bool isCPLoad(const LoadSDNode *N); + static bool isConstantLoad(const LoadSDNode *N, int cbID); + static bool isGlobalLoad(const LoadSDNode *N); + static bool isParamLoad(const LoadSDNode *N); + static bool isPrivateLoad(const LoadSDNode *N); + static bool isLocalLoad(const LoadSDNode *N); + static bool isRegionLoad(const LoadSDNode *N); + + bool SelectADDR8BitOffset(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectADDRReg(SDValue Addr, SDValue& Base, SDValue& Offset); + bool SelectADDRVTX_READ(SDValue Addr, SDValue &Base, SDValue &Offset); + + // Include the pieces autogenerated from the target description. +#include "AMDGPUGenDAGISel.inc" +}; +} // end anonymous namespace + +/// \brief This pass converts a legalized DAG into a AMDGPU-specific +// DAG, ready for instruction scheduling. +FunctionPass *llvm::createAMDGPUISelDag(TargetMachine &TM + ) { + return new AMDGPUDAGToDAGISel(TM); +} + +AMDGPUDAGToDAGISel::AMDGPUDAGToDAGISel(TargetMachine &TM + ) + : SelectionDAGISel(TM), Subtarget(TM.getSubtarget<AMDGPUSubtarget>()) { +} + +AMDGPUDAGToDAGISel::~AMDGPUDAGToDAGISel() { +} + +SDValue AMDGPUDAGToDAGISel::getSmallIPtrImm(unsigned int Imm) { + return CurDAG->getTargetConstant(Imm, MVT::i32); +} + +bool AMDGPUDAGToDAGISel::SelectADDRParam( + SDValue Addr, SDValue& R1, SDValue& R2) { + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i32); + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i32); + } + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDR(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + return SelectADDRParam(Addr, R1, R2); +} + + +bool AMDGPUDAGToDAGISel::SelectADDR64(SDValue Addr, SDValue& R1, SDValue& R2) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress) { + return false; + } + + if (Addr.getOpcode() == ISD::FrameIndex) { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) { + R1 = CurDAG->getTargetFrameIndex(FIN->getIndex(), MVT::i64); + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + } else if (Addr.getOpcode() == ISD::ADD) { + R1 = Addr.getOperand(0); + R2 = Addr.getOperand(1); + } else { + R1 = Addr; + R2 = CurDAG->getTargetConstant(0, MVT::i64); + } + return 
true; +} + +SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) { + unsigned int Opc = N->getOpcode(); + if (N->isMachineOpcode()) { + return NULL; // Already selected. + } + switch (Opc) { + default: break; + case ISD::FrameIndex: { + if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(N)) { + unsigned int FI = FIN->getIndex(); + EVT OpVT = N->getValueType(0); + unsigned int NewOpc = AMDGPU::COPY; + SDValue TFI = CurDAG->getTargetFrameIndex(FI, MVT::i32); + return CurDAG->SelectNodeTo(N, NewOpc, OpVT, TFI); + } + break; + } + case ISD::ConstantFP: + case ISD::Constant: { + const AMDGPUSubtarget &ST = TM.getSubtarget<AMDGPUSubtarget>(); + // XXX: Custom immediate lowering not implemented yet. Instead we use + // pseudo instructions defined in SIInstructions.td + if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) { + break; + } + const R600InstrInfo *TII = static_cast<const R600InstrInfo*>(TM.getInstrInfo()); + + uint64_t ImmValue = 0; + unsigned ImmReg = AMDGPU::ALU_LITERAL_X; + + if (N->getOpcode() == ISD::ConstantFP) { + // XXX: 64-bit Immediates not supported yet + assert(N->getValueType(0) != MVT::f64); + + ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N); + APFloat Value = C->getValueAPF(); + float FloatValue = Value.convertToFloat(); + if (FloatValue == 0.0) { + ImmReg = AMDGPU::ZERO; + } else if (FloatValue == 0.5) { + ImmReg = AMDGPU::HALF; + } else if (FloatValue == 1.0) { + ImmReg = AMDGPU::ONE; + } else { + ImmValue = Value.bitcastToAPInt().getZExtValue(); + } + } else { + // XXX: 64-bit Immediates not supported yet + assert(N->getValueType(0) != MVT::i64); + + ConstantSDNode *C = dyn_cast<ConstantSDNode>(N); + if (C->getZExtValue() == 0) { + ImmReg = AMDGPU::ZERO; + } else if (C->getZExtValue() == 1) { + ImmReg = AMDGPU::ONE_INT; + } else { + ImmValue = C->getZExtValue(); + } + } + + for (SDNode::use_iterator Use = N->use_begin(), Next = llvm::next(Use); + Use != SDNode::use_end(); Use = Next) { + Next = llvm::next(Use); + std::vector<SDValue> Ops; + for (unsigned i = 0; i < Use->getNumOperands(); ++i) { + Ops.push_back(Use->getOperand(i)); + } + + if (!Use->isMachineOpcode()) { + if (ImmReg == AMDGPU::ALU_LITERAL_X) { + // We can only use literal constants (e.g. AMDGPU::ZERO, + // AMDGPU::ONE, etc) in machine opcodes. + continue; + } + } else { + if (!TII->isALUInstr(Use->getMachineOpcode())) { + continue; + } + + int ImmIdx = TII->getOperandIdx(Use->getMachineOpcode(), R600Operands::IMM); + assert(ImmIdx != -1); + + // subtract one from ImmIdx, because the DST operand is usually index + // 0 for MachineInstrs, but we have no DST in the Ops vector. + ImmIdx--; + + // Check that we aren't already using an immediate. + // XXX: It's possible for an instruction to have more than one + // immediate operand, but this is not supported yet. + if (ImmReg == AMDGPU::ALU_LITERAL_X) { + ConstantSDNode *C = dyn_cast<ConstantSDNode>(Use->getOperand(ImmIdx)); + assert(C); + + if (C->getZExtValue() != 0) { + // This instruction is already using an immediate. 
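+            // (Folding a second literal into the same instruction is not
+            // supported here, so this use is left unchanged.)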
+            continue;
+          }
+
+          // Set the immediate value
+          Ops[ImmIdx] = CurDAG->getTargetConstant(ImmValue, MVT::i32);
+        }
+      }
+      // Set the immediate register
+      Ops[Use.getOperandNo()] = CurDAG->getRegister(ImmReg, MVT::i32);
+
+      CurDAG->UpdateNodeOperands(*Use, Ops.data(), Use->getNumOperands());
+    }
+    break;
+  }
+  }
+  return SelectCode(N);
+}
+
+bool AMDGPUDAGToDAGISel::checkType(const Value *ptr, unsigned int addrspace) {
+  if (!ptr) {
+    return false;
+  }
+  Type *ptrType = ptr->getType();
+  return cast<PointerType>(ptrType)->getAddressSpace() == addrspace;
+}
+
+const Value * AMDGPUDAGToDAGISel::getBasePointerValue(const Value *V) {
+  if (!V) {
+    return NULL;
+  }
+  const Value *ret = NULL;
+  ValueMap<const Value *, bool> ValueBitMap;
+  std::queue<const Value *, std::list<const Value *> > ValueQueue;
+  ValueQueue.push(V);
+  while (!ValueQueue.empty()) {
+    V = ValueQueue.front();
+    if (ValueBitMap.find(V) == ValueBitMap.end()) {
+      ValueBitMap[V] = true;
+      if (isa<Argument>(V) && isa<PointerType>(V->getType())) {
+        ret = V;
+        break;
+      } else if (isa<GlobalVariable>(V)) {
+        ret = V;
+        break;
+      } else if (isa<Constant>(V)) {
+        const ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
+        if (CE) {
+          ValueQueue.push(CE->getOperand(0));
+        }
+      } else if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
+        ret = AI;
+        break;
+      } else if (const Instruction *I = dyn_cast<Instruction>(V)) {
+        uint32_t numOps = I->getNumOperands();
+        for (uint32_t x = 0; x < numOps; ++x) {
+          ValueQueue.push(I->getOperand(x));
+        }
+      } else {
+        assert(!"Found a Value that we didn't know how to handle!");
+      }
+    }
+    ValueQueue.pop();
+  }
+  return ret;
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateStore(const StoreSDNode *N) {
+  return (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
+          && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS));
+}
+
+bool AMDGPUDAGToDAGISel::isLocalStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionStore(const StoreSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isConstantLoad(const LoadSDNode *N, int cbID) {
+  if (checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)) {
+    return true;
+  }
+  // Check the MachineMemOperand before dereferencing it, and compute the
+  // base pointer only once.
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (!MMO || !MMO->getValue()) {
+    return false;
+  }
+  const Value *V = MMO->getValue();
+  const Value *BV = getBasePointerValue(V);
+  if (isa<GlobalValue>(V) || (BV && isa<GlobalValue>(BV))) {
+    return checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS);
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isGlobalLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isParamLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isLocalLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isRegionLoad(const LoadSDNode *N) {
+  return checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS);
+}
+
+bool AMDGPUDAGToDAGISel::isCPLoad(const LoadSDNode *N) {
+  MachineMemOperand *MMO = N->getMemOperand();
+  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+    if (MMO) {
+      const Value *V = MMO->getValue();
+      const PseudoSourceValue *PSV = dyn_cast<PseudoSourceValue>(V);
+      if (PSV && PSV == PseudoSourceValue::getConstantPool()) {
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool AMDGPUDAGToDAGISel::isPrivateLoad(const LoadSDNode *N) {
+  if (checkType(N->getSrcValue(), AMDGPUAS::PRIVATE_ADDRESS)) {
+    // Check to make sure we are not a constant pool load or a constant load
+    // that is marked as a private load.
+    if (isCPLoad(N) || isConstantLoad(N, -1)) {
+      return false;
+    }
+  }
+  if (!checkType(N->getSrcValue(), AMDGPUAS::LOCAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::GLOBAL_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::REGION_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::CONSTANT_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_D_ADDRESS)
+      && !checkType(N->getSrcValue(), AMDGPUAS::PARAM_I_ADDRESS)) {
+    return true;
+  }
+  return false;
+}
+
+const char *AMDGPUDAGToDAGISel::getPassName() const {
+  return "AMDGPU DAG->DAG Pattern Instruction Selection";
+}
+
+#ifdef DEBUGTMP
+#undef INT64_C
+#endif
+#undef DEBUGTMP
+
+//===----------------------- AMDGPU Functions -----------------------------===//
+
+bool AMDGPUDAGToDAGISel::SelectADDR8BitOffset(SDValue Addr, SDValue& Base,
+                                              SDValue& Offset) {
+  if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+      Addr.getOpcode() == ISD::TargetGlobalAddress) {
+    return false;
+  }
+
+  if (Addr.getOpcode() == ISD::ADD) {
+    bool Match = false;
+
+    // Find the base pointer and the offset.
+    for (unsigned i = 0; i < Addr.getNumOperands(); i++) {
+      SDValue Arg = Addr.getOperand(i);
+      ConstantSDNode *OffsetNode = dyn_cast<ConstantSDNode>(Arg);
+      // This arg isn't a constant, so it must be the base pointer.
+      if (!OffsetNode) {
+        Base = Addr.getOperand(i);
+        continue;
+      }
+      // Check if the constant argument fits in 8 bits.  The offset is in
+      // bytes, so we need to convert it to dwords.
+      if (isUInt<8>(OffsetNode->getZExtValue() >> 2)) {
+        Match = true;
+        Offset = CurDAG->getTargetConstant(OffsetNode->getZExtValue() >> 2,
+                                           MVT::i32);
+      }
+    }
+    return Match;
+  }
+
+  // Default case, no offset.
+  Base = Addr;
+  Offset = CurDAG->getTargetConstant(0, MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectADDRVTX_READ(SDValue Addr, SDValue &Base,
+                                            SDValue &Offset) {
+  ConstantSDNode *IMMOffset;
+
+  if (Addr.getOpcode() == ISD::ADD
+      && (IMMOffset = dyn_cast<ConstantSDNode>(Addr.getOperand(1)))
+      && isInt<16>(IMMOffset->getZExtValue())) {
+
+    Base = Addr.getOperand(0);
+    Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32);
+    return true;
+  // If the pointer address is constant, we can move it to the offset field.
+ } else if ((IMMOffset = dyn_cast<ConstantSDNode>(Addr)) + && isInt<16>(IMMOffset->getZExtValue())) { + Base = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), + CurDAG->getEntryNode().getDebugLoc(), + AMDGPU::ZERO, MVT::i32); + Offset = CurDAG->getTargetConstant(IMMOffset->getZExtValue(), MVT::i32); + return true; + } + + // Default case, no offset + Base = Addr; + Offset = CurDAG->getTargetConstant(0, MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectADDRReg(SDValue Addr, SDValue& Base, + SDValue& Offset) { + if (Addr.getOpcode() == ISD::TargetExternalSymbol || + Addr.getOpcode() == ISD::TargetGlobalAddress || + Addr.getOpcode() != ISD::ADD) { + return false; + } + + Base = Addr.getOperand(0); + Offset = Addr.getOperand(1); + + return true; +} diff --git a/lib/Target/R600/AMDILISelLowering.cpp b/lib/Target/R600/AMDILISelLowering.cpp new file mode 100644 index 0000000000..8bfd30c6e3 --- /dev/null +++ b/lib/Target/R600/AMDILISelLowering.cpp @@ -0,0 +1,651 @@ +//===-- AMDILISelLowering.cpp - AMDIL DAG Lowering Implementation ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//==-----------------------------------------------------------------------===// +// +/// \file +/// \brief TargetLowering functions borrowed from AMDIL. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUISelLowering.h" +#include "AMDGPURegisterInfo.h" +#include "AMDILDevices.h" +#include "AMDILIntrinsicInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/CallingConv.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/PseudoSourceValue.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Intrinsics.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetOptions.h" + +using namespace llvm; +//===----------------------------------------------------------------------===// +// Calling Convention Implementation +//===----------------------------------------------------------------------===// +#include "AMDGPUGenCallingConv.inc" + +//===----------------------------------------------------------------------===// +// TargetLowering Implementation Help Functions End +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// TargetLowering Class Implementation Begins +//===----------------------------------------------------------------------===// +void AMDGPUTargetLowering::InitAMDILLowering() { + int types[] = { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::f32, + (int)MVT::f64, + (int)MVT::i64, + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + (int)MVT::v2f64, + (int)MVT::v2i64 + }; + + int IntTypes[] = { + (int)MVT::i8, + (int)MVT::i16, + (int)MVT::i32, + (int)MVT::i64 + }; + + int FloatTypes[] = { + (int)MVT::f32, + (int)MVT::f64 + }; + + int VectorTypes[] = { + (int)MVT::v2i8, + (int)MVT::v4i8, + (int)MVT::v2i16, + (int)MVT::v4i16, + (int)MVT::v4f32, + (int)MVT::v4i32, + (int)MVT::v2f32, + (int)MVT::v2i32, + 
+    (int)MVT::v2f64,
+    (int)MVT::v2i64
+  };
+  size_t NumTypes = sizeof(types) / sizeof(*types);
+  size_t NumFloatTypes = sizeof(FloatTypes) / sizeof(*FloatTypes);
+  size_t NumIntTypes = sizeof(IntTypes) / sizeof(*IntTypes);
+  size_t NumVectorTypes = sizeof(VectorTypes) / sizeof(*VectorTypes);
+
+  const AMDGPUSubtarget &STM = getTargetMachine().getSubtarget<AMDGPUSubtarget>();
+  // These are the current register classes that are supported.
+
+  for (unsigned int x = 0; x < NumTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)types[x];
+
+    // FIXME: SIGN_EXTEND_INREG is not meaningful for floating point types.
+    // We cannot sextinreg; expand to shifts.
+    setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Custom);
+    setOperationAction(ISD::SUBE, VT, Expand);
+    setOperationAction(ISD::SUBC, VT, Expand);
+    setOperationAction(ISD::ADDE, VT, Expand);
+    setOperationAction(ISD::ADDC, VT, Expand);
+    setOperationAction(ISD::BRCOND, VT, Custom);
+    setOperationAction(ISD::BR_JT, VT, Expand);
+    setOperationAction(ISD::BRIND, VT, Expand);
+    // TODO: Implement custom UREM/SREM routines.
+    setOperationAction(ISD::SREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+    if (VT != MVT::i64 && VT != MVT::v2i64) {
+      setOperationAction(ISD::SDIV, VT, Custom);
+    }
+  }
+  for (unsigned int x = 0; x < NumFloatTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)FloatTypes[x];
+
+    // IL does not have these operations for floating point types.
+    setOperationAction(ISD::FP_ROUND_INREG, VT, Expand);
+    setOperationAction(ISD::SETOLT, VT, Expand);
+    setOperationAction(ISD::SETOGE, VT, Expand);
+    setOperationAction(ISD::SETOGT, VT, Expand);
+    setOperationAction(ISD::SETOLE, VT, Expand);
+    setOperationAction(ISD::SETULT, VT, Expand);
+    setOperationAction(ISD::SETUGE, VT, Expand);
+    setOperationAction(ISD::SETUGT, VT, Expand);
+    setOperationAction(ISD::SETULE, VT, Expand);
+  }
+
+  for (unsigned int x = 0; x < NumIntTypes; ++x) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)IntTypes[x];
+
+    // The GPU does not have a divrem instruction for signed or unsigned
+    // operands.
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+
+    // The GPU does not have [S|U]MUL_LOHI as single instructions.
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    setOperationAction(ISD::UMUL_LOHI, VT, Expand);
+
+    // The GPU doesn't have rotl, rotr, or byteswap instructions.
+    setOperationAction(ISD::ROTR, VT, Expand);
+    setOperationAction(ISD::BSWAP, VT, Expand);
+
+    // The GPU doesn't have any bit-counting operators.
+    setOperationAction(ISD::CTPOP, VT, Expand);
+    setOperationAction(ISD::CTTZ, VT, Expand);
+    setOperationAction(ISD::CTLZ, VT, Expand);
+  }
+
+  for (unsigned int ii = 0; ii < NumVectorTypes; ++ii) {
+    MVT::SimpleValueType VT = (MVT::SimpleValueType)VectorTypes[ii];
+
+    setOperationAction(ISD::VECTOR_SHUFFLE, VT, Expand);
+    setOperationAction(ISD::SDIVREM, VT, Expand);
+    setOperationAction(ISD::SMUL_LOHI, VT, Expand);
+    // setOperationAction(ISD::VSETCC, VT, Expand);
+    setOperationAction(ISD::SELECT_CC, VT, Expand);
+  }
+  if (STM.device()->isSupported(AMDGPUDeviceInfo::LongOps)) {
+    setOperationAction(ISD::MULHU, MVT::i64, Expand);
+    setOperationAction(ISD::MULHU, MVT::v2i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::i64, Expand);
+    setOperationAction(ISD::MULHS, MVT::v2i64, Expand);
+    setOperationAction(ISD::ADD, MVT::v2i64, Expand);
+    setOperationAction(ISD::SREM, MVT::v2i64, Expand);
+    setOperationAction(ISD::Constant, MVT::i64, Legal);
+    setOperationAction(ISD::SDIV, MVT::v2i64, Expand);
+    setOperationAction(ISD::TRUNCATE, MVT::v2i64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Expand);
+  }
+  if (STM.device()->isSupported(AMDGPUDeviceInfo::DoubleOps)) {
+    // We support loading/storing v2f64, but no operations on the type.
+    setOperationAction(ISD::FADD, MVT::v2f64, Expand);
+    setOperationAction(ISD::FSUB, MVT::v2f64, Expand);
+    setOperationAction(ISD::FMUL, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_ROUND_INREG, MVT::v2f64, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ConstantFP, MVT::f64, Legal);
+    // We want to expand vector conversions into their scalar
+    // counterparts.
+    setOperationAction(ISD::TRUNCATE, MVT::v2f64, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::ANY_EXTEND, MVT::v2f64, Expand);
+    setOperationAction(ISD::FABS, MVT::f64, Expand);
+    setOperationAction(ISD::FABS, MVT::v2f64, Expand);
+  }
+  // TODO: Fix the UDIV24 algorithm so it works for these types; that
+  // requires vector comparisons.
+  setOperationAction(ISD::UDIV, MVT::v2i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i8, Expand);
+  setOperationAction(ISD::UDIV, MVT::v2i16, Expand);
+  setOperationAction(ISD::UDIV, MVT::v4i16, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Custom);
+  setOperationAction(ISD::SUBC, MVT::Other, Expand);
+  setOperationAction(ISD::ADDE, MVT::Other, Expand);
+  setOperationAction(ISD::ADDC, MVT::Other, Expand);
+  setOperationAction(ISD::BRCOND, MVT::Other, Custom);
+  setOperationAction(ISD::BR_JT, MVT::Other, Expand);
+  setOperationAction(ISD::BRIND, MVT::Other, Expand);
+  setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Expand);
+
+  // Use the default implementation.
+  setOperationAction(ISD::ConstantFP, MVT::f32, Legal);
+  setOperationAction(ISD::Constant, MVT::i32, Legal);
+
+  setSchedulingPreference(Sched::RegPressure);
+  setPow2DivIsCheap(false);
+  setSelectIsExpensive(true);
+  setJumpIsExpensive(true);
+
+  maxStoresPerMemcpy  = 4096;
+  maxStoresPerMemmove = 4096;
+  maxStoresPerMemset  = 4096;
+}
+
+bool
+AMDGPUTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
+                                         const CallInst &I, unsigned Intrinsic) const {
+  return false;
+}
+
+// The backend supports 32- and 64-bit floating point immediates.
+bool
+AMDGPUTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const {
+  return VT.getScalarType().getSimpleVT().SimpleTy == MVT::f32
+      || VT.getScalarType().getSimpleVT().SimpleTy == MVT::f64;
+}
+
+bool
+AMDGPUTargetLowering::ShouldShrinkFPConstant(EVT VT) const {
+  return VT.getScalarType().getSimpleVT().SimpleTy != MVT::f32
+      && VT.getScalarType().getSimpleVT().SimpleTy != MVT::f64;
+}
+
+// Determine which of the bits of Op are known to be either zero or one and
+// return them in KnownZero and KnownOne.  Op is expected to be a
+// target-specific node.  Used by the DAG combiner.
+
+void
+AMDGPUTargetLowering::computeMaskedBitsForTargetNode(
+    const SDValue Op,
+    APInt &KnownZero,
+    APInt &KnownOne,
+    const SelectionDAG &DAG,
+    unsigned Depth) const {
+  APInt KnownZero2;
+  APInt KnownOne2;
+  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+  switch (Op.getOpcode()) {
+  default: break;
+  case ISD::SELECT_CC:
+    DAG.ComputeMaskedBits(Op.getOperand(1), KnownZero, KnownOne, Depth + 1);
+    DAG.ComputeMaskedBits(Op.getOperand(0), KnownZero2, KnownOne2, Depth + 1);
+    assert((KnownZero & KnownOne) == 0
+           && "Bits known to be one AND zero?");
+    assert((KnownZero2 & KnownOne2) == 0
+           && "Bits known to be one AND zero?");
+    // Only known if known in both the LHS and RHS.
+    KnownOne &= KnownOne2;
+    KnownZero &= KnownZero2;
+    break;
+  }
+}
+
+//===----------------------------------------------------------------------===//
+// Other Lowering Hooks
+//===----------------------------------------------------------------------===//
+
+SDValue
+AMDGPUTargetLowering::LowerSDIV(SDValue Op, SelectionDAG &DAG) const {
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSDIV64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSDIV32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16
+             || OVT.getScalarType() == MVT::i8) {
+    DST = LowerSDIV24(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM(SDValue Op, SelectionDAG &DAG) const {
+  EVT OVT = Op.getValueType();
+  SDValue DST;
+  if (OVT.getScalarType() == MVT::i64) {
+    DST = LowerSREM64(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i32) {
+    DST = LowerSREM32(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i16) {
+    DST = LowerSREM16(Op, DAG);
+  } else if (OVT.getScalarType() == MVT::i8) {
+    DST = LowerSREM8(Op, DAG);
+  } else {
+    DST = SDValue(Op.getNode(), 0);
+  }
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const {
+  SDValue Data = Op.getOperand(0);
+  VTSDNode *BaseType = cast<VTSDNode>(Op.getOperand(1));
+  DebugLoc DL = Op.getDebugLoc();
+  EVT DVT = Data.getValueType();
+  EVT BVT = BaseType->getVT();
+  unsigned baseBits = BVT.getScalarType().getSizeInBits();
+  unsigned srcBits = DVT.isSimple() ? DVT.getScalarType().getSizeInBits() : 1;
+  unsigned shiftBits = srcBits - baseBits;
+  if (srcBits < 32) {
+    // If the op is less than 32 bits, then it needs to extend to 32 bits
+    // so it can properly keep the upper bits valid.
+    EVT IVT = genIntType(32, DVT.isVector() ? DVT.getVectorNumElements() : 1);
+    Data = DAG.getNode(ISD::ZERO_EXTEND, DL, IVT, Data);
+    shiftBits = 32 - baseBits;
+    DVT = IVT;
+  }
+  SDValue Shift = DAG.getConstant(shiftBits, DVT);
+  // Shift left by 'Shift' bits.
+  Data = DAG.getNode(ISD::SHL, DL, DVT, Data, Shift);
+  // Signed shift right by 'Shift' bits.
+  Data = DAG.getNode(ISD::SRA, DL, DVT, Data, Shift);
+  if (srcBits < 32) {
+    // Once the sign extension is done, the op needs to be converted to
+    // its original type.
+    Data = DAG.getSExtOrTrunc(Data, DL, Op.getOperand(0).getValueType());
+  }
+  return Data;
+}
+
+EVT
+AMDGPUTargetLowering::genIntType(uint32_t size, uint32_t numEle) const {
+  int iSize = (size * numEle);
6 : 5)); + if (!vEle) { + vEle = 1; + } + if (size == 64) { + if (vEle == 1) { + return EVT(MVT::i64); + } else { + return EVT(MVT::getVectorVT(MVT::i64, vEle)); + } + } else { + if (vEle == 1) { + return EVT(MVT::i32); + } else { + return EVT(MVT::getVectorVT(MVT::i32, vEle)); + } + } +} + +SDValue +AMDGPUTargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue Cond = Op.getOperand(1); + SDValue Jump = Op.getOperand(2); + SDValue Result; + Result = DAG.getNode( + AMDGPUISD::BRANCH_COND, + Op.getDebugLoc(), + Op.getValueType(), + Chain, Jump, Cond); + return Result; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV24(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + MVT INTTY; + MVT FLTTY; + if (!OVT.isVector()) { + INTTY = MVT::i32; + FLTTY = MVT::f32; + } else if (OVT.getVectorNumElements() == 2) { + INTTY = MVT::v2i32; + FLTTY = MVT::v2f32; + } else if (OVT.getVectorNumElements() == 4) { + INTTY = MVT::v4i32; + FLTTY = MVT::v4f32; + } + unsigned bitsize = OVT.getScalarType().getSizeInBits(); + // char|short jq = ia ^ ib; + SDValue jq = DAG.getNode(ISD::XOR, DL, OVT, LHS, RHS); + + // jq = jq >> (bitsize - 2) + jq = DAG.getNode(ISD::SRA, DL, OVT, jq, DAG.getConstant(bitsize - 2, OVT)); + + // jq = jq | 0x1 + jq = DAG.getNode(ISD::OR, DL, OVT, jq, DAG.getConstant(1, OVT)); + + // jq = (int)jq + jq = DAG.getSExtOrTrunc(jq, DL, INTTY); + + // int ia = (int)LHS; + SDValue ia = DAG.getSExtOrTrunc(LHS, DL, INTTY); + + // int ib, (int)RHS; + SDValue ib = DAG.getSExtOrTrunc(RHS, DL, INTTY); + + // float fa = (float)ia; + SDValue fa = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ia); + + // float fb = (float)ib; + SDValue fb = DAG.getNode(ISD::SINT_TO_FP, DL, FLTTY, ib); + + // float fq = native_divide(fa, fb); + SDValue fq = DAG.getNode(AMDGPUISD::DIV_INF, DL, FLTTY, fa, fb); + + // fq = trunc(fq); + fq = DAG.getNode(ISD::FTRUNC, DL, FLTTY, fq); + + // float fqneg = -fq; + SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FLTTY, fq); + + // float fr = mad(fqneg, fb, fa); + SDValue fr = DAG.getNode(AMDGPUISD::MAD, DL, FLTTY, fqneg, fb, fa); + + // int iq = (int)fq; + SDValue iq = DAG.getNode(ISD::FP_TO_SINT, DL, INTTY, fq); + + // fr = fabs(fr); + fr = DAG.getNode(ISD::FABS, DL, FLTTY, fr); + + // fb = fabs(fb); + fb = DAG.getNode(ISD::FABS, DL, FLTTY, fb); + + // int cv = fr >= fb; + SDValue cv; + if (INTTY == MVT::i32) { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } else { + cv = DAG.getSetCC(DL, INTTY, fr, fb, ISD::SETOGE); + } + // jq = (cv ? jq : 0); + jq = DAG.getNode(ISD::SELECT, DL, OVT, cv, jq, + DAG.getConstant(0, OVT)); + // dst = iq + jq; + iq = DAG.getSExtOrTrunc(iq, DL, OVT); + iq = DAG.getNode(ISD::ADD, DL, OVT, iq, jq); + return iq; +} + +SDValue +AMDGPUTargetLowering::LowerSDIV32(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT OVT = Op.getValueType(); + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + // The LowerSDIV32 function generates equivalent to the following IL. 
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r0, r0, r1
+  // ixor r10, r10, r11
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSelectCC(DL,
+      r0, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSelectCC(DL,
+      r1, DAG.getConstant(0, OVT),
+      DAG.getConstant(-1, MVT::i32),
+      DAG.getConstant(0, MVT::i32),
+      ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r0, r0, r1
+  r0 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // ixor r10, r10, r11
+  r10 = DAG.getNode(ISD::XOR, DL, OVT, r10, r11);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSDIV64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue(Op.getNode(), 0);
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM8(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i8) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i8) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM16(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  MVT INTTY = MVT::i32;
+  if (OVT == MVT::v2i16) {
+    INTTY = MVT::v2i32;
+  } else if (OVT == MVT::v4i16) {
+    INTTY = MVT::v4i32;
+  }
+  SDValue LHS = DAG.getSExtOrTrunc(Op.getOperand(0), DL, INTTY);
+  SDValue RHS = DAG.getSExtOrTrunc(Op.getOperand(1), DL, INTTY);
+  LHS = DAG.getNode(ISD::SREM, DL, INTTY, LHS, RHS);
+  LHS = DAG.getSExtOrTrunc(LHS, DL, OVT);
+  return LHS;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM32(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT OVT = Op.getValueType();
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  // The LowerSREM32 function generates code equivalent to the following IL:
+  // mov r0, LHS
+  // mov r1, RHS
+  // ilt r10, r0, 0
+  // ilt r11, r1, 0
+  // iadd r0, r0, r10
+  // iadd r1, r1, r11
+  // ixor r0, r0, r10
+  // ixor r1, r1, r11
+  // udiv r20, r0, r1
+  // umul r20, r20, r1
+  // sub r0, r0, r20
+  // iadd r0, r0, r10
+  // ixor DST, r0, r10
+
+  // mov r0, LHS
+  SDValue r0 = LHS;
+
+  // mov r1, RHS
+  SDValue r1 = RHS;
+
+  // ilt r10, r0, 0
+  SDValue r10 = DAG.getSetCC(DL, OVT, r0, DAG.getConstant(0, OVT), ISD::SETLT);
+
+  // ilt r11, r1, 0
+  SDValue r11 = DAG.getSetCC(DL, OVT, r1, DAG.getConstant(0, OVT), ISD::SETLT);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // iadd r1, r1, r11
+  r1 = DAG.getNode(ISD::ADD, DL, OVT, r1, r11);
+
+  // ixor r0, r0, r10
+  r0 = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+
+  // ixor r1, r1, r11
+  r1 = DAG.getNode(ISD::XOR, DL, OVT, r1, r11);
+
+  // udiv r20, r0, r1
+  SDValue r20 = DAG.getNode(ISD::UDIV, DL, OVT, r0, r1);
+
+  // umul r20, r20, r1
+  r20 = DAG.getNode(AMDGPUISD::UMUL, DL, OVT, r20, r1);
+
+  // sub r0, r0, r20
+  r0 = DAG.getNode(ISD::SUB, DL, OVT, r0, r20);
+
+  // iadd r0, r0, r10
+  r0 = DAG.getNode(ISD::ADD, DL, OVT, r0, r10);
+
+  // ixor DST, r0, r10
+  SDValue DST = DAG.getNode(ISD::XOR, DL, OVT, r0, r10);
+  return DST;
+}
+
+SDValue
+AMDGPUTargetLowering::LowerSREM64(SDValue Op, SelectionDAG &DAG) const {
+  return SDValue(Op.getNode(), 0);
+}
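The xor/add sign fix-up that LowerSDIV32 and LowerSREM32 express as DAG nodes can be checked in isolation. Below is a minimal, standalone C++ sketch of the same identity; it is not part of this patch and assumes the usual arithmetic right shift for signed values:

#include <cassert>
#include <cstdint>

// Mirrors the IL sequence above: build all-ones/zero sign masks, divide the
// absolute values unsigned, then restore the quotient's sign with the same
// iadd/ixor trick.
static int32_t SignedDivViaUnsigned(int32_t a, int32_t b) {
  int32_t SignA = a >> 31;                          // ilt r10, r0, 0
  int32_t SignB = b >> 31;                          // ilt r11, r1, 0
  uint32_t AbsA = (uint32_t)((a + SignA) ^ SignA);  // iadd r0, r0, r10; ixor r0, r0, r10
  uint32_t AbsB = (uint32_t)((b + SignB) ^ SignB);  // iadd r1, r1, r11; ixor r1, r1, r11
  uint32_t Q = AbsA / AbsB;                         // udiv r0, r0, r1
  int32_t SignQ = SignA ^ SignB;                    // ixor r10, r10, r11
  return ((int32_t)Q + SignQ) ^ SignQ;              // iadd r0, r0, r10; ixor DST, r0, r10
}

int main() {
  assert(SignedDivViaUnsigned(-7, 2) == -3);
  assert(SignedDivViaUnsigned(7, -2) == -3);
  assert(SignedDivViaUnsigned(-7, -2) == 3);
  return 0;
}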
diff --git a/lib/Target/R600/AMDILInstrInfo.td b/lib/Target/R600/AMDILInstrInfo.td
new file mode 100644
index 0000000000..ac6745148e
--- /dev/null
+++ b/lib/Target/R600/AMDILInstrInfo.td
@@ -0,0 +1,273 @@
+//===------------ AMDILInstrInfo.td - AMDIL Target ------*-tablegen-*------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file describes the AMDIL instructions in TableGen format.
+//
+//===----------------------------------------------------------------------===//
+// AMDIL Instruction Predicate Definitions
+// Predicate that is set to true if the hardware supports double precision
+// divide.
+def HasHWDDiv : Predicate<"Subtarget.device()"
+                          "->getGeneration() > AMDGPUDeviceInfo::HD4XXX && "
+                          "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
+
+// Predicate that is set to true if the hardware supports double precision
+// operations, but not double precision divide, in hardware.
+def HasSWDDiv : Predicate<"Subtarget.device()"
+                          "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
+                          "Subtarget.device()->usesHardware(AMDGPUDeviceInfo::DoubleOps)">;
+
+// Predicate that is set to true if the hardware supports 24-bit signed
+// math ops.  Otherwise a software expansion to 32-bit math ops is used.
+def HasHWSign24Bit : Predicate<"Subtarget.device()"
+                               "->getGeneration() > AMDGPUDeviceInfo::HD5XXX">;
+
+// Predicates that are set to true if 64-bit operations are supported in
+// hardware or in software, respectively.
+def HasHW64Bit : Predicate<"Subtarget.device()"
+                           "->usesHardware(AMDGPUDeviceInfo::LongOps)">;
+def HasSW64Bit : Predicate<"Subtarget.device()"
+                           "->usesSoftware(AMDGPUDeviceInfo::LongOps)">;
+
+// Predicate that is set to true if the timer register is supported.
+def HasTmrRegister : Predicate<"Subtarget.device()"
+                               "->isSupported(AMDGPUDeviceInfo::TmrReg)">;
+// Predicate that is true on Evergreen-series devices and newer.
+def HasDeviceIDInst : Predicate<"Subtarget.device()"
+                                "->getGeneration() >= AMDGPUDeviceInfo::HD5XXX">;
+
+// Predicate that is true if the device has a region address space.
+def hasRegionAS : Predicate<"Subtarget.device()"
+                            "->usesHardware(AMDGPUDeviceInfo::RegionMem)">;
+
+// Predicate that is true if the device does not support the region address
+// space.
+def noRegionAS : Predicate<"!Subtarget.device()"
+                           "->isSupported(AMDGPUDeviceInfo::RegionMem)">;
+
+// Predicates that are set to true if 64-bit multiply is supported in the IL
+// or must be done in software, respectively.
+def HasHW64Mul : Predicate<"Subtarget.calVersion()"
+                           ">= CAL_VERSION_SC_139"
+                           "&& Subtarget.device()"
+                           "->getGeneration() >="
+                           "AMDGPUDeviceInfo::HD5XXX">;
+def HasSW64Mul : Predicate<"Subtarget.calVersion()"
+                           "< CAL_VERSION_SC_139">;
+// Predicates that are set to true if 64-bit divide/modulo is supported in
+// the IL or must be done in software, respectively.
+def HasHW64DivMod : Predicate<"Subtarget.device()"
+                              "->usesHardware(AMDGPUDeviceInfo::HW64BitDivMod)">;
+def HasSW64DivMod : Predicate<"Subtarget.device()"
+                              "->usesSoftware(AMDGPUDeviceInfo::HW64BitDivMod)">;
+
+// Predicate that is set to true if 64-bit pointers are used.
+def Has64BitPtr : Predicate<"Subtarget.is64bit()">;
+def Has32BitPtr : Predicate<"!Subtarget.is64bit()">;
+//===--------------------------------------------------------------------===//
+// Custom Operands
+//===--------------------------------------------------------------------===//
+def brtarget : Operand<OtherVT>;
+
+//===--------------------------------------------------------------------===//
+// Custom Selection DAG Type Profiles
+//===--------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Generic Profile Types
+//===----------------------------------------------------------------------===//
+
+def SDTIL_GenBinaryOp : SDTypeProfile<1, 2, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>
+    ]>;
+def SDTIL_GenTernaryOp : SDTypeProfile<1, 3, [
+    SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<2, 3>
+    ]>;
+def SDTIL_GenVecBuild : SDTypeProfile<1, 1, [
+    SDTCisEltOfVec<1, 0>
+    ]>;
+
+//===----------------------------------------------------------------------===//
+// Flow Control Profile Types
+//===----------------------------------------------------------------------===//
+// Profile for a branch node: the first operand is the target basic block and
+// the second is the branch condition.
+def SDTIL_BRCond : SDTypeProfile<0, 2, [
+    SDTCisVT<0, OtherVT>
+    ]>;
+
+//===--------------------------------------------------------------------===//
+// Custom Selection DAG Nodes
+//===--------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+// Flow Control DAG Nodes
+//===----------------------------------------------------------------------===//
+def IL_brcond : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChain]>;
+
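+// The Predicate strings above are spliced verbatim into the generated
+// instruction selector, where they are evaluated as C++ conditions over the
+// `Subtarget` reference in scope.  A pattern guarded with, e.g.,
+// `let Predicates = [HasHW64Bit] in ...` is only considered when
+// `Subtarget.device()->usesHardware(AMDGPUDeviceInfo::LongOps)` holds for
+// the current subtarget.  For illustration only (EXAMPLE_I64 is a
+// hypothetical instruction, not one defined by this patch):
+//
+//   let Predicates = [HasHW64Bit] in
+//   def EXAMPLE_I64 : ILFormat<(outs), (ins GPRI32:$src),
+//       "; selected only when LongOps is available in hardware", []>;
+//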
+//===----------------------------------------------------------------------===// +// Call/Return DAG Nodes +//===----------------------------------------------------------------------===// +def IL_retflag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone, + [SDNPHasChain, SDNPOptInGlue]>; + +//===--------------------------------------------------------------------===// +// Instructions +//===--------------------------------------------------------------------===// +// Floating point math functions +def IL_div_inf : SDNode<"AMDGPUISD::DIV_INF", SDTIL_GenBinaryOp>; +def IL_mad : SDNode<"AMDGPUISD::MAD", SDTIL_GenTernaryOp>; + +//===----------------------------------------------------------------------===// +// Integer functions +//===----------------------------------------------------------------------===// +def IL_umul : SDNode<"AMDGPUISD::UMUL" , SDTIntBinOp, + [SDNPCommutative, SDNPAssociative]>; + +//===--------------------------------------------------------------------===// +// Custom Pattern DAG Nodes +//===--------------------------------------------------------------------===// +def global_store : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isGlobalStore(dyn_cast<StoreSDNode>(N)); +}]>; + +//===----------------------------------------------------------------------===// +// Load pattern fragments +//===----------------------------------------------------------------------===// +// Global address space loads +def global_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isGlobalLoad(dyn_cast<LoadSDNode>(N)); +}]>; +// Constant address space loads +def constant_load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isConstantLoad(dyn_cast<LoadSDNode>(N), -1); +}]>; + +//===----------------------------------------------------------------------===// +// Complex addressing mode patterns +//===----------------------------------------------------------------------===// +def ADDR : ComplexPattern<i32, 2, "SelectADDR", [], []>; +def ADDRF : ComplexPattern<i32, 2, "SelectADDR", [frameindex], []>; +def ADDR64 : ComplexPattern<i64, 2, "SelectADDR64", [], []>; +def ADDR64F : ComplexPattern<i64, 2, "SelectADDR64", [frameindex], []>; + +//===----------------------------------------------------------------------===// +// Instruction format classes +//===----------------------------------------------------------------------===// +class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern> +: Instruction { + + let Namespace = "AMDGPU"; + dag OutOperandList = outs; + dag InOperandList = ins; + let Pattern = pattern; + let AsmString = !strconcat(asmstr, "\n"); + let isPseudo = 1; + let Itinerary = NullALU; + bit hasIEEEFlag = 0; + bit hasZeroOpFlag = 0; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +//===--------------------------------------------------------------------===// +// Multiclass Instruction formats +//===--------------------------------------------------------------------===// +// Multiclass that handles branch instructions +multiclass BranchConditional<SDNode Op> { + def _i32 : ILFormat<(outs), + (ins brtarget:$target, GPRI32:$src0), + "; i32 Pseudo branch instruction", + [(Op bb:$target, GPRI32:$src0)]>; + def _f32 : ILFormat<(outs), + (ins brtarget:$target, GPRF32:$src0), + "; f32 Pseudo branch instruction", + [(Op bb:$target, GPRF32:$src0)]>; +} + +// Only scalar types should generate flow control +multiclass BranchInstr<string name> { + def _i32 : ILFormat<(outs), (ins GPRI32:$src), + !strconcat(name, " 
$src"), []>;
+  def _f32 : ILFormat<(outs), (ins GPRF32:$src),
+      !strconcat(name, " $src"), []>;
+}
+// Only scalar types should generate flow control
+multiclass BranchInstr2<string name> {
+  def _i32 : ILFormat<(outs), (ins GPRI32:$src0, GPRI32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+  def _f32 : ILFormat<(outs), (ins GPRF32:$src0, GPRF32:$src1),
+      !strconcat(name, " $src0, $src1"), []>;
+}
+
+//===--------------------------------------------------------------------===//
+// Intrinsics support
+//===--------------------------------------------------------------------===//
+include "AMDILIntrinsics.td"
+
+//===--------------------------------------------------------------------===//
+// Instructions support
+//===--------------------------------------------------------------------===//
+//===---------------------------------------------------------------------===//
+// Custom inserter for branches and returns; this will eventually become a
+// separate pass.
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, usesCustomInserter = 1, isBranch = 1, isBarrier = 1 in {
+  def BRANCH : ILFormat<(outs), (ins brtarget:$target),
+      "; Pseudo unconditional branch instruction",
+      [(br bb:$target)]>;
+  defm BRANCH_COND : BranchConditional<IL_brcond>;
+}
+
+//===---------------------------------------------------------------------===//
+// Flow and Program control Instructions
+//===---------------------------------------------------------------------===//
+let isTerminator=1 in {
+  def SWITCH : ILFormat< (outs), (ins GPRI32:$src),
+      !strconcat("SWITCH", " $src"), []>;
+  def CASE : ILFormat< (outs), (ins GPRI32:$src),
+      !strconcat("CASE", " $src"), []>;
+  def BREAK : ILFormat< (outs), (ins),
+      "BREAK", []>;
+  def CONTINUE : ILFormat< (outs), (ins),
+      "CONTINUE", []>;
+  def DEFAULT : ILFormat< (outs), (ins),
+      "DEFAULT", []>;
+  def ELSE : ILFormat< (outs), (ins),
+      "ELSE", []>;
+  def ENDSWITCH : ILFormat< (outs), (ins),
+      "ENDSWITCH", []>;
+  def ENDMAIN : ILFormat< (outs), (ins),
+      "ENDMAIN", []>;
+  def END : ILFormat< (outs), (ins),
+      "END", []>;
+  def ENDFUNC : ILFormat< (outs), (ins),
+      "ENDFUNC", []>;
+  def ENDIF : ILFormat< (outs), (ins),
+      "ENDIF", []>;
+  def WHILELOOP : ILFormat< (outs), (ins),
+      "WHILE", []>;
+  def ENDLOOP : ILFormat< (outs), (ins),
+      "ENDLOOP", []>;
+  def FUNC : ILFormat< (outs), (ins),
+      "FUNC", []>;
+  def RETDYN : ILFormat< (outs), (ins),
+      "RET_DYN", []>;
+  // These opcodes have a custom swizzle pattern encoded in the Swizzle
+  // Encoder.
+  defm IF_LOGICALNZ : BranchInstr<"IF_LOGICALNZ">;
+  defm IF_LOGICALZ : BranchInstr<"IF_LOGICALZ">;
+  defm BREAK_LOGICALNZ : BranchInstr<"BREAK_LOGICALNZ">;
+  defm BREAK_LOGICALZ : BranchInstr<"BREAK_LOGICALZ">;
+  defm CONTINUE_LOGICALNZ : BranchInstr<"CONTINUE_LOGICALNZ">;
+  defm CONTINUE_LOGICALZ : BranchInstr<"CONTINUE_LOGICALZ">;
+  defm IFC : BranchInstr2<"IFC">;
+  defm BREAKC : BranchInstr2<"BREAKC">;
+  defm CONTINUEC : BranchInstr2<"CONTINUEC">;
+}
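The BRANCH and BRANCH_COND definitions above set usesCustomInserter, so after instruction selection they are handed to the target's EmitInstrWithCustomInserter hook. A schematic sketch of that hook's shape follows; the signature is the standard LLVM one of this era, but the case body, and the assumption that AMDGPUTargetLowering overrides the hook for these opcodes, are illustrative only:

MachineBasicBlock *
AMDGPUTargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
                                                  MachineBasicBlock *BB) const {
  switch (MI->getOpcode()) {
  default:
    llvm_unreachable("Unexpected instruction for custom inserter");
  case AMDGPU::BRANCH:
  case AMDGPU::BRANCH_COND_i32:
  case AMDGPU::BRANCH_COND_f32:
    // Expand the pseudo branch into real control flow here, splitting the
    // block and wiring up successors as needed.
    break;
  }
  return BB;
}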
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.cpp b/lib/Target/R600/AMDILIntrinsicInfo.cpp
new file mode 100644
index 0000000000..70db4e6da2
--- /dev/null
+++ b/lib/Target/R600/AMDILIntrinsicInfo.cpp
@@ -0,0 +1,79 @@
+//===- AMDILIntrinsicInfo.cpp - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Implementation of the IntrinsicInfo class.
+//
+//===-----------------------------------------------------------------------===//
+
+#include "AMDILIntrinsicInfo.h"
+#include "AMDIL.h"
+#include "AMDGPUSubtarget.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/Module.h"
+
+using namespace llvm;
+
+#define GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_LLVM_INTRINSIC_FOR_GCC_BUILTIN
+
+AMDGPUIntrinsicInfo::AMDGPUIntrinsicInfo(TargetMachine *tm)
+  : TargetIntrinsicInfo() {
+}
+
+std::string
+AMDGPUIntrinsicInfo::getName(unsigned int IntrID, Type **Tys,
+                             unsigned int numTys) const {
+  static const char* const names[] = {
+#define GET_INTRINSIC_NAME_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_NAME_TABLE
+  };
+
+  if (IntrID < Intrinsic::num_intrinsics) {
+    // Not an AMDGPU intrinsic; return an empty name rather than constructing
+    // a std::string from a null pointer.
+    return "";
+  }
+  assert(IntrID < AMDGPUIntrinsic::num_AMDGPU_intrinsics
+         && "Invalid intrinsic ID");
+
+  std::string Result(names[IntrID - Intrinsic::num_intrinsics]);
+  return Result;
+}
+
+unsigned int
+AMDGPUIntrinsicInfo::lookupName(const char *Name, unsigned int Len) const {
+#define GET_FUNCTION_RECOGNIZER
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_FUNCTION_RECOGNIZER
+  AMDGPUIntrinsic::ID IntrinsicID =
+      (AMDGPUIntrinsic::ID)getIntrinsicForGCCBuiltin("AMDGPU", Name);
+
+  if (IntrinsicID != (AMDGPUIntrinsic::ID)Intrinsic::not_intrinsic) {
+    return IntrinsicID;
+  }
+  return 0;
+}
+
+bool
+AMDGPUIntrinsicInfo::isOverloaded(unsigned id) const {
+  // Overload Table
+#define GET_INTRINSIC_OVERLOAD_TABLE
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_OVERLOAD_TABLE
+}
+
+Function*
+AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
+                                    Type **Tys,
+                                    unsigned numTys) const {
+  llvm_unreachable("Not implemented");
+}
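As a usage sketch for the table-driven lookup above: lookupName returns 0 for names it does not recognize, so callers can probe GCC-style builtin names directly. The helper below is illustrative and not part of the patch:

#include "llvm/Target/TargetIntrinsicInfo.h"
#include <cstring>

// Returns the target intrinsic ID for a builtin name such as "__amdil_mad",
// or 0 when the name is not an AMDGPU builtin.
static unsigned LookupAMDILBuiltin(const llvm::TargetIntrinsicInfo &Info,
                                   const char *Name) {
  return Info.lookupName(Name, std::strlen(Name));
}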
diff --git a/lib/Target/R600/AMDILIntrinsicInfo.h b/lib/Target/R600/AMDILIntrinsicInfo.h
new file mode 100644
index 0000000000..83f4933611
--- /dev/null
+++ b/lib/Target/R600/AMDILIntrinsicInfo.h
@@ -0,0 +1,49 @@
+//===- AMDILIntrinsicInfo.h - AMDGPU Intrinsic Information ------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the AMDGPU Implementation of the Intrinsic Info class.
+//
+//===-----------------------------------------------------------------------===//
+#ifndef AMDIL_INTRINSICS_H
+#define AMDIL_INTRINSICS_H
+
+#include "llvm/Intrinsics.h"
+#include "llvm/Target/TargetIntrinsicInfo.h"
+
+namespace llvm {
+class TargetMachine;
+
+namespace AMDGPUIntrinsic {
+enum ID {
+  last_non_AMDGPU_intrinsic = Intrinsic::num_intrinsics - 1,
+#define GET_INTRINSIC_ENUM_VALUES
+#include "AMDGPUGenIntrinsics.inc"
+#undef GET_INTRINSIC_ENUM_VALUES
+  , num_AMDGPU_intrinsics
+};
+
+} // end namespace AMDGPUIntrinsic
+
+class AMDGPUIntrinsicInfo : public TargetIntrinsicInfo {
+public:
+  AMDGPUIntrinsicInfo(TargetMachine *tm);
+  std::string getName(unsigned int IntrID, Type **Tys = 0,
+                      unsigned int numTys = 0) const;
+  unsigned int lookupName(const char *Name, unsigned int Len) const;
+  bool isOverloaded(unsigned int IID) const;
+  Function *getDeclaration(Module *M, unsigned int ID,
+                           Type **Tys = 0,
+                           unsigned int numTys = 0) const;
+};
+
+} // end namespace llvm
+
+#endif // AMDIL_INTRINSICS_H
+
diff --git a/lib/Target/R600/AMDILIntrinsics.td b/lib/Target/R600/AMDILIntrinsics.td
new file mode 100644
index 0000000000..3f9e20f0c8
--- /dev/null
+++ b/lib/Target/R600/AMDILIntrinsics.td
@@ -0,0 +1,242 @@
+//===- AMDILIntrinsics.td - Defines AMDIL Intrinsics -*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// This file defines all of the AMDIL-specific intrinsics.
+//
+//===---------------------------------------------------------------===//
+//===--------------------------------------------------------------------===//
+// Intrinsic classes
+// Generic class definitions for target-specific intrinsics, used in place
+// of SDNode patterns.
+//===--------------------------------------------------------------------===// +let TargetPrefix = "AMDIL", isTarget = 1 in { + class VoidIntLong : + Intrinsic<[llvm_i64_ty], [], []>; + class VoidIntInt : + Intrinsic<[llvm_i32_ty], [], []>; + class VoidIntBool : + Intrinsic<[llvm_i32_ty], [], []>; + class UnaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class UnaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; + class ConvertIntFTOI : + Intrinsic<[llvm_anyint_ty], [llvm_anyfloat_ty], [IntrNoMem]>; + class ConvertIntITOF : + Intrinsic<[llvm_anyfloat_ty], [llvm_anyint_ty], [IntrNoMem]>; + class UnaryIntNoRetInt : + Intrinsic<[], [llvm_anyint_ty], []>; + class UnaryIntNoRetFloat : + Intrinsic<[], [llvm_anyfloat_ty], []>; + class BinaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class BinaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class BinaryIntNoRetInt : + Intrinsic<[], [llvm_anyint_ty, LLVMMatchType<0>], []>; + class BinaryIntNoRetFloat : + Intrinsic<[], [llvm_anyfloat_ty, LLVMMatchType<0>], []>; + class TernaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class TernaryIntFloat : + Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class QuaternaryIntInt : + Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>, + LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; + class UnaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class BinaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class TernaryAtomicInt : + Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty]>; + class UnaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class BinaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; + class TernaryAtomicIntNoRet : + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadWriteArgMem]>; +} + +let TargetPrefix = "AMDIL", isTarget = 1 in { + def int_AMDIL_abs : GCCBuiltin<"__amdil_abs">, UnaryIntInt; + + def int_AMDIL_bit_extract_i32 : GCCBuiltin<"__amdil_ibit_extract">, + TernaryIntInt; + def int_AMDIL_bit_extract_u32 : GCCBuiltin<"__amdil_ubit_extract">, + TernaryIntInt; + def int_AMDIL_bit_reverse_u32 : GCCBuiltin<"__amdil_ubit_reverse">, + UnaryIntInt; + def int_AMDIL_bit_count_i32 : GCCBuiltin<"__amdil_count_bits">, + UnaryIntInt; + def int_AMDIL_bit_find_first_lo : GCCBuiltin<"__amdil_ffb_lo">, + UnaryIntInt; + def int_AMDIL_bit_find_first_hi : GCCBuiltin<"__amdil_ffb_hi">, + UnaryIntInt; + def int_AMDIL_bit_find_first_sgn : GCCBuiltin<"__amdil_ffb_signed">, + UnaryIntInt; + def int_AMDIL_media_bitalign : GCCBuiltin<"__amdil_bitalign">, + TernaryIntInt; + def int_AMDIL_media_bytealign : GCCBuiltin<"__amdil_bytealign">, + TernaryIntInt; + def int_AMDIL_bit_insert_u32 : GCCBuiltin<"__amdil_ubit_insert">, + QuaternaryIntInt; + def int_AMDIL_bfi : GCCBuiltin<"__amdil_bfi">, + TernaryIntInt; + def int_AMDIL_bfm : GCCBuiltin<"__amdil_bfm">, + BinaryIntInt; + def int_AMDIL_mad_i32 : GCCBuiltin<"__amdil_imad">, + TernaryIntInt; + def int_AMDIL_mad_u32 : GCCBuiltin<"__amdil_umad">, + TernaryIntInt; + def int_AMDIL_mad : GCCBuiltin<"__amdil_mad">, + TernaryIntFloat; + def 
int_AMDIL_mulhi_i32 : GCCBuiltin<"__amdil_imul_high">, + BinaryIntInt; + def int_AMDIL_mulhi_u32 : GCCBuiltin<"__amdil_umul_high">, + BinaryIntInt; + def int_AMDIL_mul24_i32 : GCCBuiltin<"__amdil_imul24">, + BinaryIntInt; + def int_AMDIL_mul24_u32 : GCCBuiltin<"__amdil_umul24">, + BinaryIntInt; + def int_AMDIL_mulhi24_i32 : GCCBuiltin<"__amdil_imul24_high">, + BinaryIntInt; + def int_AMDIL_mulhi24_u32 : GCCBuiltin<"__amdil_umul24_high">, + BinaryIntInt; + def int_AMDIL_mad24_i32 : GCCBuiltin<"__amdil_imad24">, + TernaryIntInt; + def int_AMDIL_mad24_u32 : GCCBuiltin<"__amdil_umad24">, + TernaryIntInt; + def int_AMDIL_carry_i32 : GCCBuiltin<"__amdil_carry">, + BinaryIntInt; + def int_AMDIL_borrow_i32 : GCCBuiltin<"__amdil_borrow">, + BinaryIntInt; + def int_AMDIL_min_i32 : GCCBuiltin<"__amdil_imin">, + BinaryIntInt; + def int_AMDIL_min_u32 : GCCBuiltin<"__amdil_umin">, + BinaryIntInt; + def int_AMDIL_min : GCCBuiltin<"__amdil_min">, + BinaryIntFloat; + def int_AMDIL_max_i32 : GCCBuiltin<"__amdil_imax">, + BinaryIntInt; + def int_AMDIL_max_u32 : GCCBuiltin<"__amdil_umax">, + BinaryIntInt; + def int_AMDIL_max : GCCBuiltin<"__amdil_max">, + BinaryIntFloat; + def int_AMDIL_media_lerp_u4 : GCCBuiltin<"__amdil_u4lerp">, + TernaryIntInt; + def int_AMDIL_media_sad : GCCBuiltin<"__amdil_sad">, + TernaryIntInt; + def int_AMDIL_media_sad_hi : GCCBuiltin<"__amdil_sadhi">, + TernaryIntInt; + def int_AMDIL_fraction : GCCBuiltin<"__amdil_fraction">, + UnaryIntFloat; + def int_AMDIL_clamp : GCCBuiltin<"__amdil_clamp">, + TernaryIntFloat; + def int_AMDIL_pireduce : GCCBuiltin<"__amdil_pireduce">, + UnaryIntFloat; + def int_AMDIL_round_nearest : GCCBuiltin<"__amdil_round_nearest">, + UnaryIntFloat; + def int_AMDIL_round_neginf : GCCBuiltin<"__amdil_round_neginf">, + UnaryIntFloat; + def int_AMDIL_round_zero : GCCBuiltin<"__amdil_round_zero">, + UnaryIntFloat; + def int_AMDIL_acos : GCCBuiltin<"__amdil_acos">, + UnaryIntFloat; + def int_AMDIL_atan : GCCBuiltin<"__amdil_atan">, + UnaryIntFloat; + def int_AMDIL_asin : GCCBuiltin<"__amdil_asin">, + UnaryIntFloat; + def int_AMDIL_cos : GCCBuiltin<"__amdil_cos">, + UnaryIntFloat; + def int_AMDIL_cos_vec : GCCBuiltin<"__amdil_cos_vec">, + UnaryIntFloat; + def int_AMDIL_tan : GCCBuiltin<"__amdil_tan">, + UnaryIntFloat; + def int_AMDIL_sin : GCCBuiltin<"__amdil_sin">, + UnaryIntFloat; + def int_AMDIL_sin_vec : GCCBuiltin<"__amdil_sin_vec">, + UnaryIntFloat; + def int_AMDIL_pow : GCCBuiltin<"__amdil_pow">, BinaryIntFloat; + def int_AMDIL_div : GCCBuiltin<"__amdil_div">, BinaryIntFloat; + def int_AMDIL_udiv : GCCBuiltin<"__amdil_udiv">, BinaryIntInt; + def int_AMDIL_sqrt: GCCBuiltin<"__amdil_sqrt">, + UnaryIntFloat; + def int_AMDIL_sqrt_vec: GCCBuiltin<"__amdil_sqrt_vec">, + UnaryIntFloat; + def int_AMDIL_exp : GCCBuiltin<"__amdil_exp">, + UnaryIntFloat; + def int_AMDIL_exp_vec : GCCBuiltin<"__amdil_exp_vec">, + UnaryIntFloat; + def int_AMDIL_exn : GCCBuiltin<"__amdil_exn">, + UnaryIntFloat; + def int_AMDIL_log_vec : GCCBuiltin<"__amdil_log_vec">, + UnaryIntFloat; + def int_AMDIL_ln : GCCBuiltin<"__amdil_ln">, + UnaryIntFloat; + def int_AMDIL_sign: GCCBuiltin<"__amdil_sign">, + UnaryIntFloat; + def int_AMDIL_fma: GCCBuiltin<"__amdil_fma">, + TernaryIntFloat; + def int_AMDIL_rsq : GCCBuiltin<"__amdil_rsq">, + UnaryIntFloat; + def int_AMDIL_rsq_vec : GCCBuiltin<"__amdil_rsq_vec">, + UnaryIntFloat; + def int_AMDIL_length : GCCBuiltin<"__amdil_length">, + UnaryIntFloat; + def int_AMDIL_lerp : GCCBuiltin<"__amdil_lerp">, + TernaryIntFloat; + def int_AMDIL_media_sad4 : 
GCCBuiltin<"__amdil_sad4">, + Intrinsic<[llvm_i32_ty], [llvm_v4i32_ty, + llvm_v4i32_ty, llvm_i32_ty], []>; + + def int_AMDIL_frexp_f64 : GCCBuiltin<"__amdil_frexp">, + Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; + def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, + Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; + def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, + Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; + def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, + ConvertIntITOF; + def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_rpi : GCCBuiltin<"__amdil_float_to_int_rpi">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_i32_flr : GCCBuiltin<"__amdil_float_to_int_flr">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_near : GCCBuiltin<"__amdil_float_to_half_near">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_neg_inf : GCCBuiltin<"__amdil_float_to_half_neg_inf">, + ConvertIntFTOI; + def int_AMDIL_convert_f32_f16_plus_inf : GCCBuiltin<"__amdil_float_to_half_plus_inf">, + ConvertIntFTOI; + def int_AMDIL_media_convert_f2v4u8 : GCCBuiltin<"__amdil_f_2_u4">, + Intrinsic<[llvm_i32_ty], [llvm_v4f32_ty], []>; + def int_AMDIL_media_unpack_byte_0 : GCCBuiltin<"__amdil_unpack_0">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_1 : GCCBuiltin<"__amdil_unpack_1">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_2 : GCCBuiltin<"__amdil_unpack_2">, + ConvertIntITOF; + def int_AMDIL_media_unpack_byte_3 : GCCBuiltin<"__amdil_unpack_3">, + ConvertIntITOF; + def int_AMDIL_dp2_add : GCCBuiltin<"__amdil_dp2_add">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty, llvm_float_ty], []>; + def int_AMDIL_dp2 : GCCBuiltin<"__amdil_dp2">, + Intrinsic<[llvm_float_ty], [llvm_v2f32_ty, + llvm_v2f32_ty], []>; + def int_AMDIL_dp3 : GCCBuiltin<"__amdil_dp3">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; + def int_AMDIL_dp4 : GCCBuiltin<"__amdil_dp4">, + Intrinsic<[llvm_float_ty], [llvm_v4f32_ty, + llvm_v4f32_ty], []>; +} diff --git a/lib/Target/R600/AMDILNIDevice.cpp b/lib/Target/R600/AMDILNIDevice.cpp new file mode 100644 index 0000000000..b82da5908a --- /dev/null +++ b/lib/Target/R600/AMDILNIDevice.cpp @@ -0,0 +1,65 @@ +//===-- AMDILNIDevice.cpp - Device Info for Northern Islands devices ------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDILNIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+
+AMDGPUNIDevice::AMDGPUNIDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+  std::string name = ST->getDeviceName();
+  if (name == "caicos") {
+    DeviceFlag = OCL_DEVICE_CAICOS;
+  } else if (name == "turks") {
+    DeviceFlag = OCL_DEVICE_TURKS;
+  } else if (name == "cayman") {
+    DeviceFlag = OCL_DEVICE_CAYMAN;
+  } else {
+    DeviceFlag = OCL_DEVICE_BARTS;
+  }
+}
+
+AMDGPUNIDevice::~AMDGPUNIDevice() {
+}
+
+size_t
+AMDGPUNIDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDGPUNIDevice::getGeneration() const {
+  return AMDGPUDeviceInfo::HD6XXX;
+}
+
+AMDGPUCaymanDevice::AMDGPUCaymanDevice(AMDGPUSubtarget *ST)
+  : AMDGPUNIDevice(ST) {
+  setCaps();
+}
+
+AMDGPUCaymanDevice::~AMDGPUCaymanDevice() {
+}
+
+void
+AMDGPUCaymanDevice::setCaps() {
+  if (mSTM->isOverride(AMDGPUDeviceInfo::DoubleOps)) {
+    mHWBits.set(AMDGPUDeviceInfo::DoubleOps);
+    mHWBits.set(AMDGPUDeviceInfo::FMA);
+  }
+  mHWBits.set(AMDGPUDeviceInfo::Signed24BitOps);
+  mSWBits.reset(AMDGPUDeviceInfo::Signed24BitOps);
+  mSWBits.set(AMDGPUDeviceInfo::ArenaSegment);
+}
+
diff --git a/lib/Target/R600/AMDILNIDevice.h b/lib/Target/R600/AMDILNIDevice.h
new file mode 100644
index 0000000000..bc7df37b26
--- /dev/null
+++ b/lib/Target/R600/AMDILNIDevice.h
@@ -0,0 +1,57 @@
+//===------- AMDILNIDevice.h - Define NI Device for AMDIL -*- C++ -*------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file defines the interface that each generation needs to implement
+/// in order to correctly answer queries on the capabilities of the specific
+/// hardware.
+//===---------------------------------------------------------------------===//
+#ifndef AMDILNIDEVICE_H
+#define AMDILNIDEVICE_H
+#include "AMDILEvergreenDevice.h"
+#include "AMDGPUSubtarget.h"
+
+namespace llvm {
+
+class AMDGPUSubtarget;
+//===---------------------------------------------------------------------===//
+// NI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+/// \brief The AMDGPUNIDevice is the base class for all Northern Islands
+/// series of cards.
+///
+/// It is very similar to the AMDGPUEvergreenDevice, with the major exception
+/// being differences in wavefront size and hardware capabilities.  All NI
+/// devices use 64-wide wavefronts and add support for signed 24-bit integer
+/// operations.
+class AMDGPUNIDevice : public AMDGPUEvergreenDevice {
+public:
+  AMDGPUNIDevice(AMDGPUSubtarget*);
+  virtual ~AMDGPUNIDevice();
+  virtual size_t getMaxLDSSize() const;
+  virtual uint32_t getGeneration() const;
+};
+
+/// Just as the AMDGPUCypressDevice is the double-capable version of the
+/// AMDGPUEvergreenDevice, the AMDGPUCaymanDevice is the double-capable
+/// version of the AMDGPUNIDevice.  The other major difference is that the
+/// Cayman device has 4-wide ALUs, whereas the rest of the NI family has
+/// 5-wide ALUs.
+class AMDGPUCaymanDevice: public AMDGPUNIDevice {
+public:
+  AMDGPUCaymanDevice(AMDGPUSubtarget*);
+  virtual ~AMDGPUCaymanDevice();
+private:
+  virtual void setCaps();
+};
+
+static const unsigned int MAX_LDS_SIZE_900 = AMDGPUDevice::MAX_LDS_SIZE_800;
+} // namespace llvm
+#endif // AMDILNIDEVICE_H
diff --git a/lib/Target/R600/AMDILPeepholeOptimizer.cpp b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
new file mode 100644
index 0000000000..4a748b8e9c
--- /dev/null
+++ b/lib/Target/R600/AMDILPeepholeOptimizer.cpp
@@ -0,0 +1,1215 @@
+//===-- AMDILPeepholeOptimizer.cpp - AMDGPU Peephole optimizations ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "PeepholeOpt"
+#ifdef DEBUG
+#define DEBUGME (DebugFlag && isCurrentDebugType(DEBUG_TYPE))
+#else
+#define DEBUGME 0
+#endif
+
+#include "AMDILDevices.h"
+#include "AMDGPUInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Constants.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionAnalysis.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+
+#include <sstream>
+
+#if 0
+STATISTIC(PointerAssignments, "Number of dynamic pointer "
+    "assignments discovered");
+STATISTIC(PointerSubtract, "Number of pointer subtractions discovered");
+#endif
+
+using namespace llvm;
+// The Peephole optimization pass is used to do simple last-minute optimizations
+// that are required for correct code or to remove redundant functions.
+namespace {
+
+class OpaqueType;
+
+class LLVM_LIBRARY_VISIBILITY AMDGPUPeepholeOpt : public FunctionPass {
+public:
+  TargetMachine &TM;
+  static char ID;
+  AMDGPUPeepholeOpt(TargetMachine &tm);
+  ~AMDGPUPeepholeOpt();
+  const char *getPassName() const;
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  bool doFinalization(Module &M);
+  void getAnalysisUsage(AnalysisUsage &AU) const;
+protected:
+private:
+  // Function to initiate all of the instruction level optimizations.
+  bool instLevelOptimizations(BasicBlock::iterator *inst);
+  // Quick check to see if we need to dump all of the pointers into the
+  // arena. If this is correct, then we set all pointers to exist in the arena.
+  // This is a workaround for aliasing of pointers in a struct/union.
+  bool dumpAllIntoArena(Function &F);
+  // Because I don't want to invalidate any pointers while in the
+  // safeNestedForEach function, I push atomic conversions to a vector and
+  // handle them later. This function does the conversions if required.
+  void doAtomicConversionIfNeeded(Function &F);
+  // Because __amdil_is_constant cannot be properly evaluated if
+  // optimizations are disabled, the calls are placed in a vector
+  // and evaluated after the __amdil_image* functions are evaluated,
+  // which should allow the __amdil_is_constant function to be
+  // evaluated correctly.
+  void doIsConstCallConversionIfNeeded();
+  bool mChanged;
+  bool mDebug;
+  bool mConvertAtomics;
+  CodeGenOpt::Level optLevel;
+  // Run a series of tests to see if we can optimize a CALL instruction.
+  bool optimizeCallInst(BasicBlock::iterator *bbb);
+  // A peephole optimization to optimize bit extract sequences.
+  bool optimizeBitExtract(Instruction *inst);
+  // A peephole optimization to optimize bit insert sequences.
+  bool optimizeBitInsert(Instruction *inst);
+  bool setupBitInsert(Instruction *base,
+                      Instruction *&src,
+                      Constant *&mask,
+                      Constant *&shift);
+  // Expand the bit field insert instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFI(CallInst *CI);
+  // Expand the bit field mask instruction on versions of OpenCL that
+  // don't support it.
+  bool expandBFM(CallInst *CI);
+  // On 7XX and 8XX devices, we do not have 24-bit signed operations, so in
+  // this case we need to expand them. These functions check for 24-bit
+  // functions and then expand.
+  bool isSigned24BitOps(CallInst *CI);
+  void expandSigned24BitOps(CallInst *CI);
+  // One optimization that can occur is that if the required workgroup size is
+  // specified then the result of get_local_size is known at compile time and
+  // can be returned accordingly.
+  bool isRWGLocalOpt(CallInst *CI);
+  // On Northern Islands cards, the division is slightly less accurate than on
+  // previous generations, so we need to utilize a more accurate division. On
+  // all other cards we can translate the accurate divide to a normal divide.
+  bool convertAccurateDivide(CallInst *CI);
+  void expandAccurateDivide(CallInst *CI);
+  // If the alignment is set incorrectly, it can produce really inefficient
+  // code. This checks for this scenario and fixes it if possible.
+  bool correctMisalignedMemOp(Instruction *inst);
+
+  // If we are in no opt mode, then we need to make sure that
+  // local samplers are properly propagated as constant propagation
+  // doesn't occur and we need to know the value of kernel defined
+  // samplers at compile time.
+  bool propagateSamplerInst(CallInst *CI);
+
+  // Helper functions
+
+  // Group of functions that recursively calculate the size of a structure
+  // based on its sub-types.
+  size_t getTypeSize(Type * const T, bool dereferencePtr = false);
+  size_t getTypeSize(StructType * const ST, bool dereferencePtr = false);
+  size_t getTypeSize(IntegerType * const IT, bool dereferencePtr = false);
+  size_t getTypeSize(FunctionType * const FT, bool dereferencePtr = false);
+  size_t getTypeSize(ArrayType * const AT, bool dereferencePtr = false);
+  size_t getTypeSize(VectorType * const VT, bool dereferencePtr = false);
+  size_t getTypeSize(PointerType * const PT, bool dereferencePtr = false);
+  size_t getTypeSize(OpaqueType * const OT, bool dereferencePtr = false);
+
+  LLVMContext *mCTX;
+  Function *mF;
+  const AMDGPUSubtarget *mSTM;
+  SmallVector< std::pair<CallInst *, Function *>, 16> atomicFuncs;
+  SmallVector<CallInst *, 16> isConstVec;
+}; // class AMDGPUPeepholeOpt
+  char AMDGPUPeepholeOpt::ID = 0;
+
+// A template function that has two levels of looping before calling the
+// function with a pointer to the current iterator.
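+// For example (as invoked in runOnFunction below):
+//   safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(),
+//       std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations),
+//                    this));
+// The functor returns true when it has already advanced (or erased) the
+// iterator itself, so the inner loop only increments on a false return.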
+template<class InputIterator, class SecondIterator, class Function> +Function safeNestedForEach(InputIterator First, InputIterator Last, + SecondIterator S, Function F) { + for ( ; First != Last; ++First) { + SecondIterator sf, sl; + for (sf = First->begin(), sl = First->end(); + sf != sl; ) { + if (!F(&sf)) { + ++sf; + } + } + } + return F; +} + +} // anonymous namespace + +namespace llvm { + FunctionPass * + createAMDGPUPeepholeOpt(TargetMachine &tm) { + return new AMDGPUPeepholeOpt(tm); + } +} // llvm namespace + +AMDGPUPeepholeOpt::AMDGPUPeepholeOpt(TargetMachine &tm) + : FunctionPass(ID), TM(tm) { + mDebug = DEBUGME; + optLevel = TM.getOptLevel(); + +} + +AMDGPUPeepholeOpt::~AMDGPUPeepholeOpt() { +} + +const char * +AMDGPUPeepholeOpt::getPassName() const { + return "AMDGPU PeepHole Optimization Pass"; +} + +bool +containsPointerType(Type *Ty) { + if (!Ty) { + return false; + } + switch(Ty->getTypeID()) { + default: + return false; + case Type::StructTyID: { + const StructType *ST = dyn_cast<StructType>(Ty); + for (StructType::element_iterator stb = ST->element_begin(), + ste = ST->element_end(); stb != ste; ++stb) { + if (!containsPointerType(*stb)) { + continue; + } + return true; + } + break; + } + case Type::VectorTyID: + case Type::ArrayTyID: + return containsPointerType(dyn_cast<SequentialType>(Ty)->getElementType()); + case Type::PointerTyID: + return true; + }; + return false; +} + +bool +AMDGPUPeepholeOpt::dumpAllIntoArena(Function &F) { + bool dumpAll = false; + for (Function::const_arg_iterator cab = F.arg_begin(), + cae = F.arg_end(); cab != cae; ++cab) { + const Argument *arg = cab; + const PointerType *PT = dyn_cast<PointerType>(arg->getType()); + if (!PT) { + continue; + } + Type *DereferencedType = PT->getElementType(); + if (!dyn_cast<StructType>(DereferencedType) + ) { + continue; + } + if (!containsPointerType(DereferencedType)) { + continue; + } + // FIXME: Because a pointer inside of a struct/union may be aliased to + // another pointer we need to take the conservative approach and place all + // pointers into the arena until more advanced detection is implemented. + dumpAll = true; + } + return dumpAll; +} +void +AMDGPUPeepholeOpt::doIsConstCallConversionIfNeeded() { + if (isConstVec.empty()) { + return; + } + for (unsigned x = 0, y = isConstVec.size(); x < y; ++x) { + CallInst *CI = isConstVec[x]; + Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? ConstantInt::get(aType, 1) + : ConstantInt::get(aType, 0); + CI->replaceAllUsesWith(Val); + CI->eraseFromParent(); + } + isConstVec.clear(); +} +void +AMDGPUPeepholeOpt::doAtomicConversionIfNeeded(Function &F) { + // Don't do anything if we don't have any atomic operations. 
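+  // (Each atomicFuncs entry pairs a qualifying call with its replacement
+  // callee; e.g., a call to __atom_add whose result is unused was paired with
+  // an __atom_add_noret variant in optimizeCallInst. The name here is only an
+  // illustrative example; the actual rewrite simply appends "_noret".)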
+ if (atomicFuncs.empty()) { + return; + } + // Change the function name for the atomic if it is required + uint32_t size = atomicFuncs.size(); + for (uint32_t x = 0; x < size; ++x) { + atomicFuncs[x].first->setOperand( + atomicFuncs[x].first->getNumOperands()-1, + atomicFuncs[x].second); + + } + mChanged = true; + if (mConvertAtomics) { + return; + } +} + +bool +AMDGPUPeepholeOpt::runOnFunction(Function &MF) { + mChanged = false; + mF = &MF; + mSTM = &TM.getSubtarget<AMDGPUSubtarget>(); + if (mDebug) { + MF.dump(); + } + mCTX = &MF.getType()->getContext(); + mConvertAtomics = true; + safeNestedForEach(MF.begin(), MF.end(), MF.begin()->begin(), + std::bind1st(std::mem_fun(&AMDGPUPeepholeOpt::instLevelOptimizations), + this)); + + doAtomicConversionIfNeeded(MF); + doIsConstCallConversionIfNeeded(); + + if (mDebug) { + MF.dump(); + } + return mChanged; +} + +bool +AMDGPUPeepholeOpt::optimizeCallInst(BasicBlock::iterator *bbb) { + Instruction *inst = (*bbb); + CallInst *CI = dyn_cast<CallInst>(inst); + if (!CI) { + return false; + } + if (isSigned24BitOps(CI)) { + expandSigned24BitOps(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (propagateSamplerInst(CI)) { + return false; + } + if (expandBFI(CI) || expandBFM(CI)) { + ++(*bbb); + CI->eraseFromParent(); + return true; + } + if (convertAccurateDivide(CI)) { + expandAccurateDivide(CI); + ++(*bbb); + CI->eraseFromParent(); + return true; + } + + StringRef calleeName = CI->getOperand(CI->getNumOperands()-1)->getName(); + if (calleeName.startswith("__amdil_is_constant")) { + // If we do not have optimizations, then this + // cannot be properly evaluated, so we add the + // call instruction to a vector and process + // them at the end of processing after the + // samplers have been correctly handled. + if (optLevel == CodeGenOpt::None) { + isConstVec.push_back(CI); + return false; + } else { + Constant *CV = dyn_cast<Constant>(CI->getOperand(0)); + Type *aType = Type::getInt32Ty(*mCTX); + Value *Val = (CV != NULL) ? 
ConstantInt::get(aType, 1)
+                        : ConstantInt::get(aType, 0);
+      CI->replaceAllUsesWith(Val);
+      ++(*bbb);
+      CI->eraseFromParent();
+      return true;
+    }
+  }
+
+  if (calleeName.equals("__amdil_is_asic_id_i32")) {
+    ConstantInt *CV = dyn_cast<ConstantInt>(CI->getOperand(0));
+    Type *aType = Type::getInt32Ty(*mCTX);
+    Value *Val = CV;
+    if (Val) {
+      Val = ConstantInt::get(aType,
+          mSTM->device()->getDeviceFlag() & CV->getZExtValue());
+    } else {
+      Val = ConstantInt::get(aType, 0);
+    }
+    CI->replaceAllUsesWith(Val);
+    ++(*bbb);
+    CI->eraseFromParent();
+    return true;
+  }
+  Function *F = dyn_cast<Function>(CI->getOperand(CI->getNumOperands()-1));
+  if (!F) {
+    return false;
+  }
+  if (F->getName().startswith("__atom") && !CI->getNumUses()
+      && F->getName().find("_xchg") == StringRef::npos) {
+    std::string buffer(F->getName().str() + "_noret");
+    F = dyn_cast<Function>(
+          F->getParent()->getOrInsertFunction(buffer, F->getFunctionType()));
+    atomicFuncs.push_back(std::make_pair<CallInst*, Function*>(CI, F));
+  }
+
+  if (!mSTM->device()->isSupported(AMDGPUDeviceInfo::ArenaSegment)
+      && !mSTM->device()->isSupported(AMDGPUDeviceInfo::MultiUAV)) {
+    return false;
+  }
+  if (!mConvertAtomics) {
+    return false;
+  }
+  StringRef name = F->getName();
+  if (name.startswith("__atom") && name.find("_g") != StringRef::npos) {
+    mConvertAtomics = false;
+  }
+  return false;
+}
+
+bool
+AMDGPUPeepholeOpt::setupBitInsert(Instruction *base,
+                                  Instruction *&src,
+                                  Constant *&mask,
+                                  Constant *&shift) {
+  if (!base) {
+    if (mDebug) {
+      dbgs() << "Null pointer passed into function.\n";
+    }
+    return false;
+  }
+  bool andOp = false;
+  if (base->getOpcode() == Instruction::Shl) {
+    shift = dyn_cast<Constant>(base->getOperand(1));
+  } else if (base->getOpcode() == Instruction::And) {
+    mask = dyn_cast<Constant>(base->getOperand(1));
+    andOp = true;
+  } else {
+    if (mDebug) {
+      dbgs() << "Failed setup with no Shl or And instruction on base opcode!\n";
+    }
+    // If the base is neither a Shl nor an And, we don't fit any of the
+    // patterns above.
+    return false;
+  }
+  src = dyn_cast<Instruction>(base->getOperand(0));
+  if (!src) {
+    if (mDebug) {
+      dbgs() << "Failed setup since the base operand is not an instruction!\n";
+    }
+    return false;
+  }
+  // If we find an 'and' operation, then we don't need to
+  // find the next operation as we already know the
+  // bits that are valid at this point.
+  if (andOp) {
+    return true;
+  }
+  if (src->getOpcode() == Instruction::Shl && !shift) {
+    shift = dyn_cast<Constant>(src->getOperand(1));
+    src = dyn_cast<Instruction>(src->getOperand(0));
+  } else if (src->getOpcode() == Instruction::And && !mask) {
+    mask = dyn_cast<Constant>(src->getOperand(1));
+  }
+  if (!mask && !shift) {
+    if (mDebug) {
+      dbgs() << "Failed setup since both mask and shift are NULL!\n";
+    }
+    // Did not find a constant mask or a shift.
+    return false;
+  }
+  return true;
+}
+bool
+AMDGPUPeepholeOpt::optimizeBitInsert(Instruction *inst) {
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::Or) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do an optimization on a sequence of ops that in the end equals a
+  // single ISA instruction.
+  // The base pattern for this optimization is - ((A & B) << C) | ((D & E) << F)
+  // Some simplified versions of this pattern are as follows:
+  // (A & B) | (D & E) when B & E == 0 && C == 0 && F == 0
+  // ((A & B) << C) | (D & E) when B ^ E == 0 && (1 << C) >= E
+  // (A & B) | ((D & E) << F) when B ^ E == 0 && (1 << F) >= B
+  // (A & B) | (D << F) when (1 << F) >= B
+  // (A << C) | (D & E) when (1 << C) >= E
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
+    // The HD4XXX hardware doesn't support the ubit_insert instruction.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+  int numEle = 1;
+  // This optimization only works on 32-bit integers.
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  // TODO: Handle vectors.
+  if (isVector) {
+    if (mDebug) {
+      dbgs() << "!!! Vectors are not supported yet!\n";
+    }
+    return false;
+  }
+  Instruction *LHSSrc = NULL, *RHSSrc = NULL;
+  Constant *LHSMask = NULL, *RHSMask = NULL;
+  Constant *LHSShift = NULL, *RHSShift = NULL;
+  Instruction *LHS = dyn_cast<Instruction>(inst->getOperand(0));
+  Instruction *RHS = dyn_cast<Instruction>(inst->getOperand(1));
+  if (!setupBitInsert(LHS, LHSSrc, LHSMask, LHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (LHS) { LHS->dump(); }
+      if (LHSSrc) { LHSSrc->dump(); }
+      if (LHSMask) { LHSMask->dump(); }
+      if (LHSShift) { LHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (!setupBitInsert(RHS, RHSSrc, RHSMask, RHSShift)) {
+    if (mDebug) {
+      dbgs() << "Found an OR Operation that failed setup!\n";
+      inst->dump();
+      if (RHS) { RHS->dump(); }
+      if (RHSSrc) { RHSSrc->dump(); }
+      if (RHSMask) { RHSMask->dump(); }
+      if (RHSShift) { RHSShift->dump(); }
+    }
+    // There was an issue with the setup for BitInsert.
+    return false;
+  }
+  if (mDebug) {
+    dbgs() << "Found an OR operation that can possibly be optimized to ubit insert!\n";
+    dbgs() << "Op: "; inst->dump();
+    dbgs() << "LHS: "; if (LHS) { LHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Src: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Mask: "; if (LHSMask) { LHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "LHS Shift: "; if (LHSShift) { LHSShift->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS: "; if (RHS) { RHS->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Src: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Mask: "; if (RHSMask) { RHSMask->dump(); } else { dbgs() << "(None)\n"; }
+    dbgs() << "RHS Shift: "; if (RHSShift) { RHSShift->dump(); } else { dbgs() << "(None)\n"; }
+  }
+  Constant *offset = NULL;
+  Constant *width = NULL;
+  uint32_t lhsMaskVal = 0, rhsMaskVal = 0;
+  uint32_t lhsShiftVal = 0, rhsShiftVal = 0;
+  uint32_t lhsMaskWidth = 0, rhsMaskWidth = 0;
+  uint32_t lhsMaskOffset = 0, rhsMaskOffset = 0;
+  lhsMaskVal = (LHSMask
+      ? dyn_cast<ConstantInt>(LHSMask)->getZExtValue() : 0);
+  rhsMaskVal = (RHSMask
+      ? dyn_cast<ConstantInt>(RHSMask)->getZExtValue() : 0);
+  lhsShiftVal = (LHSShift
+      ?
dyn_cast<ConstantInt>(LHSShift)->getZExtValue() : 0); + rhsShiftVal = (RHSShift + ? dyn_cast<ConstantInt>(RHSShift)->getZExtValue() : 0); + lhsMaskWidth = lhsMaskVal ? CountPopulation_32(lhsMaskVal) : 32 - lhsShiftVal; + rhsMaskWidth = rhsMaskVal ? CountPopulation_32(rhsMaskVal) : 32 - rhsShiftVal; + lhsMaskOffset = lhsMaskVal ? CountTrailingZeros_32(lhsMaskVal) : lhsShiftVal; + rhsMaskOffset = rhsMaskVal ? CountTrailingZeros_32(rhsMaskVal) : rhsShiftVal; + // TODO: Handle the case of A & B | D & ~B(i.e. inverted masks). + if ((lhsMaskVal || rhsMaskVal) && !(lhsMaskVal ^ rhsMaskVal)) { + return false; + } + if (lhsMaskOffset >= (rhsMaskWidth + rhsMaskOffset)) { + offset = ConstantInt::get(aType, lhsMaskOffset, false); + width = ConstantInt::get(aType, lhsMaskWidth, false); + RHSSrc = RHS; + if (!isMask_32(lhsMaskVal) && !isShiftedMask_32(lhsMaskVal)) { + return false; + } + if (!LHSShift) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", LHS); + } else if (lhsShiftVal != lhsMaskOffset) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", LHS); + } + if (mDebug) { + dbgs() << "Optimizing LHS!\n"; + } + } else if (rhsMaskOffset >= (lhsMaskWidth + lhsMaskOffset)) { + offset = ConstantInt::get(aType, rhsMaskOffset, false); + width = ConstantInt::get(aType, rhsMaskWidth, false); + LHSSrc = RHSSrc; + RHSSrc = LHS; + if (!isMask_32(rhsMaskVal) && !isShiftedMask_32(rhsMaskVal)) { + return false; + } + if (!RHSShift) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", RHS); + } else if (rhsShiftVal != rhsMaskOffset) { + LHSSrc = BinaryOperator::Create(Instruction::LShr, LHSSrc, offset, + "MaskShr", RHS); + } + if (mDebug) { + dbgs() << "Optimizing RHS!\n"; + } + } else { + if (mDebug) { + dbgs() << "Failed constraint 3!\n"; + } + return false; + } + if (mDebug) { + dbgs() << "Width: "; if (width) { width->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "Offset: "; if (offset) { offset->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "LHSSrc: "; if (LHSSrc) { LHSSrc->dump(); } else { dbgs() << "(0)\n"; } + dbgs() << "RHSSrc: "; if (RHSSrc) { RHSSrc->dump(); } else { dbgs() << "(0)\n"; } + } + if (!offset || !width) { + if (mDebug) { + dbgs() << "Either width or offset are NULL, failed detection!\n"; + } + return false; + } + // Lets create the function signature. 
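+  // For illustration (a sketch; the values are not taken from any particular
+  // test): with LHS = (x & 0xFF00) and RHS = (y & 0xFF), the widths/offsets
+  // computed above are width = 8 and offset = 8, LHSSrc becomes (x >> 8), and
+  // the call built below is
+  //   __amdil_ubit_insert_u32(8 /*width*/, 8 /*offset*/, x >> 8, y & 0xFF)
+  // which computes (x & 0xFF00) | (y & 0xFF) in a single instruction.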
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "__amdil_ubit_insert";
+  if (isVector) { name += "_v" + itostr(numEle) + "u32"; } else { name += "_u32"; }
+  Function *Func =
+    dyn_cast<Function>(inst->getParent()->getParent()->getParent()->
+                       getOrInsertFunction(llvm::StringRef(name), funcType));
+  Value *Operands[4] = {
+    width,
+    offset,
+    LHSSrc,
+    RHSSrc
+  };
+  CallInst *CI = CallInst::Create(Func, Operands, "BitInsertOpt");
+  if (mDebug) {
+    dbgs() << "Old Inst: ";
+    inst->dump();
+    dbgs() << "New Inst: ";
+    CI->dump();
+    dbgs() << "\n\n";
+  }
+  CI->insertBefore(inst);
+  inst->replaceAllUsesWith(CI);
+  return true;
+}
+
+bool
+AMDGPUPeepholeOpt::optimizeBitExtract(Instruction *inst) {
+  if (!inst) {
+    return false;
+  }
+  if (!inst->isBinaryOp()) {
+    return false;
+  }
+  if (inst->getOpcode() != Instruction::And) {
+    return false;
+  }
+  if (optLevel == CodeGenOpt::None) {
+    return false;
+  }
+  // We want to do some simple optimizations on Shift right/And patterns. The
+  // basic optimization is to turn (A >> B) & C where A is a 32-bit type, B is
+  // a value smaller than 32 and C is a mask. If C is a constant value, then
+  // the following transformation can occur. For signed integers, it turns into
+  // the function call dst = __amdil_ibit_extract(log2(C), B, A). For unsigned
+  // integers, it turns into the function call dst =
+  // __amdil_ubit_extract(log2(C), B, A). The function __amdil_[u|i]bit_extract
+  // can be found in Section 7.9 of the ATI IL spec of the stream SDK for
+  // Evergreen hardware.
+  if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD4XXX) {
+    // This does not work on HD4XXX hardware.
+    return false;
+  }
+  Type *aType = inst->getType();
+  bool isVector = aType->isVectorTy();
+
+  // XXX Support vector types
+  if (isVector) {
+    return false;
+  }
+  int numEle = 1;
+  // This only works on 32-bit integers.
+  if (aType->getScalarType()
+      != Type::getInt32Ty(inst->getContext())) {
+    return false;
+  }
+  if (isVector) {
+    const VectorType *VT = dyn_cast<VectorType>(aType);
+    numEle = VT->getNumElements();
+    // We currently cannot support more than 4 elements in an intrinsic and we
+    // cannot support Vec3 types.
+    if (numEle > 4 || numEle == 3) {
+      return false;
+    }
+  }
+  BinaryOperator *ShiftInst = dyn_cast<BinaryOperator>(inst->getOperand(0));
+  // If the first operand is not a shift instruction, then we can return as it
+  // doesn't match this pattern.
+  if (!ShiftInst || !ShiftInst->isShift()) {
+    return false;
+  }
+  // If it is a shift left, then we don't match this pattern.
+  if (ShiftInst->getOpcode() == Instruction::Shl) {
+    return false;
+  }
+  bool isSigned = ShiftInst->isArithmeticShift();
+  Constant *AndMask = dyn_cast<Constant>(inst->getOperand(1));
+  Constant *ShrVal = dyn_cast<Constant>(ShiftInst->getOperand(1));
+  // Lets make sure that the shift value and the and mask are constant integers.
+  if (!AndMask || !ShrVal) {
+    return false;
+  }
+  Constant *newMaskConst;
+  Constant *shiftValConst;
+  if (isVector) {
+    // Handle the vector case
+    std::vector<Constant *> maskVals;
+    std::vector<Constant *> shiftVals;
+    ConstantVector *AndMaskVec = dyn_cast<ConstantVector>(AndMask);
+    ConstantVector *ShrValVec = dyn_cast<ConstantVector>(ShrVal);
+    Type *scalarType = AndMaskVec->getType()->getScalarType();
+    assert(AndMaskVec->getNumOperands() ==
+           ShrValVec->getNumOperands() && "cannot have a "
+           "combination where the number of elements to a "
+           "shift and an and are different!");
+    for (size_t x = 0, y = AndMaskVec->getNumOperands(); x < y; ++x) {
+      ConstantInt *AndCI = dyn_cast<ConstantInt>(AndMaskVec->getOperand(x));
+      ConstantInt *ShiftIC = dyn_cast<ConstantInt>(ShrValVec->getOperand(x));
+      if (!AndCI || !ShiftIC) {
+        return false;
+      }
+      uint32_t maskVal = (uint32_t)AndCI->getZExtValue();
+      if (!isMask_32(maskVal)) {
+        return false;
+      }
+      maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+      uint32_t shiftVal = (uint32_t)ShiftIC->getZExtValue();
+      // If the mask or shift val is greater than the bitcount, then break out.
+      if (maskVal >= 32 || shiftVal >= 32) {
+        return false;
+      }
+      // If the mask val is greater than the number of original bits left
+      // then this optimization is invalid.
+      if (maskVal > (32 - shiftVal)) {
+        return false;
+      }
+      maskVals.push_back(ConstantInt::get(scalarType, maskVal, isSigned));
+      shiftVals.push_back(ConstantInt::get(scalarType, shiftVal, isSigned));
+    }
+    newMaskConst = ConstantVector::get(maskVals);
+    shiftValConst = ConstantVector::get(shiftVals);
+  } else {
+    // Handle the scalar case
+    uint32_t maskVal = (uint32_t)dyn_cast<ConstantInt>(AndMask)->getZExtValue();
+    // This must be a mask value where all lower bits are set to 1 and then any
+    // bit higher is set to 0.
+    if (!isMask_32(maskVal)) {
+      return false;
+    }
+    maskVal = (uint32_t)CountTrailingOnes_32(maskVal);
+    // Count the number of bits set in the mask; this is the width of the
+    // resulting bit set that is extracted from the source value.
+    uint32_t shiftVal = (uint32_t)dyn_cast<ConstantInt>(ShrVal)->getZExtValue();
+    // If the mask or shift val is greater than the bitcount, then break out.
+    if (maskVal >= 32 || shiftVal >= 32) {
+      return false;
+    }
+    // If the mask val is greater than the number of original bits left then
+    // this optimization is invalid.
+    if (maskVal > (32 - shiftVal)) {
+      return false;
+    }
+    newMaskConst = ConstantInt::get(aType, maskVal, isSigned);
+    shiftValConst = ConstantInt::get(aType, shiftVal, isSigned);
+  }
+  // Lets create the function signature.
+  std::vector<Type *> callTypes;
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  callTypes.push_back(aType);
+  FunctionType *funcType = FunctionType::get(aType, callTypes, false);
+  std::string name = "llvm.AMDGPU.bit.extract.u32";
+  if (isVector) {
+    name += ".v" + itostr(numEle) + "i32";
+  } else {
+    name += ".";
+  }
+  // Lets create the function.
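+  // For illustration (a sketch): for the scalar pattern (x >> 3) & 0x1F,
+  // maskVal becomes 5 (the field width) and shiftVal is 3 (the field offset),
+  // so the call created below is equivalent to
+  //   llvm.AMDGPU.bit.extract.u32(x, 3, 5)
+  // i.e. extract the 5 bits of x starting at bit 3.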
+ Function *Func = + dyn_cast<Function>(inst->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[3] = { + ShiftInst->getOperand(0), + shiftValConst, + newMaskConst + }; + // Lets create the Call with the operands + CallInst *CI = CallInst::Create(Func, Operands, "ByteExtractOpt"); + CI->setDoesNotAccessMemory(); + CI->insertBefore(inst); + inst->replaceAllUsesWith(CI); + return true; +} + +bool +AMDGPUPeepholeOpt::expandBFI(CallInst *CI) { + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + if (!LHS->getName().startswith("__amdil_bfi")) { + return false; + } + Type* type = CI->getOperand(0)->getType(); + Constant *negOneConst = NULL; + if (type->isVectorTy()) { + std::vector<Constant *> negOneVals; + negOneConst = ConstantInt::get(CI->getContext(), + APInt(32, StringRef("-1"), 10)); + for (size_t x = 0, + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { + negOneVals.push_back(negOneConst); + } + negOneConst = ConstantVector::get(negOneVals); + } else { + negOneConst = ConstantInt::get(CI->getContext(), + APInt(32, StringRef("-1"), 10)); + } + // __amdil_bfi => (A & B) | (~A & C) + BinaryOperator *lhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(0), + CI->getOperand(1), "bfi_and", CI); + BinaryOperator *rhs = + BinaryOperator::Create(Instruction::Xor, CI->getOperand(0), negOneConst, + "bfi_not", CI); + rhs = BinaryOperator::Create(Instruction::And, rhs, CI->getOperand(2), + "bfi_and", CI); + lhs = BinaryOperator::Create(Instruction::Or, lhs, rhs, "bfi_or", CI); + CI->replaceAllUsesWith(lhs); + return true; +} + +bool +AMDGPUPeepholeOpt::expandBFM(CallInst *CI) { + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + if (!LHS->getName().startswith("__amdil_bfm")) { + return false; + } + // __amdil_bfm => ((1 << (src0 & 0x1F)) - 1) << (src1 & 0x1f) + Constant *newMaskConst = NULL; + Constant *newShiftConst = NULL; + Type* type = CI->getOperand(0)->getType(); + if (type->isVectorTy()) { + std::vector<Constant*> newMaskVals, newShiftVals; + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); + for (size_t x = 0, + y = dyn_cast<VectorType>(type)->getNumElements(); x < y; ++x) { + newMaskVals.push_back(newMaskConst); + newShiftVals.push_back(newShiftConst); + } + newMaskConst = ConstantVector::get(newMaskVals); + newShiftConst = ConstantVector::get(newShiftVals); + } else { + newMaskConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 0x1F); + newShiftConst = ConstantInt::get(Type::getInt32Ty(*mCTX), 1); + } + BinaryOperator *lhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(0), + newMaskConst, "bfm_mask", CI); + lhs = BinaryOperator::Create(Instruction::Shl, newShiftConst, + lhs, "bfm_shl", CI); + lhs = BinaryOperator::Create(Instruction::Sub, lhs, + newShiftConst, "bfm_sub", CI); + BinaryOperator *rhs = + BinaryOperator::Create(Instruction::And, CI->getOperand(1), + newMaskConst, "bfm_mask", CI); + lhs = BinaryOperator::Create(Instruction::Shl, lhs, rhs, "bfm_shl", CI); + CI->replaceAllUsesWith(lhs); + return true; +} + +bool +AMDGPUPeepholeOpt::instLevelOptimizations(BasicBlock::iterator *bbb) { + Instruction *inst = (*bbb); + if (optimizeCallInst(bbb)) { + return true; + } + if (optimizeBitExtract(inst)) { + return false; + } + if (optimizeBitInsert(inst)) { + return false; + } + if (correctMisalignedMemOp(inst)) { + return false; + } + 
return false; +} +bool +AMDGPUPeepholeOpt::correctMisalignedMemOp(Instruction *inst) { + LoadInst *linst = dyn_cast<LoadInst>(inst); + StoreInst *sinst = dyn_cast<StoreInst>(inst); + unsigned alignment; + Type* Ty = inst->getType(); + if (linst) { + alignment = linst->getAlignment(); + Ty = inst->getType(); + } else if (sinst) { + alignment = sinst->getAlignment(); + Ty = sinst->getValueOperand()->getType(); + } else { + return false; + } + unsigned size = getTypeSize(Ty); + if (size == alignment || size < alignment) { + return false; + } + if (!Ty->isStructTy()) { + return false; + } + if (alignment < 4) { + if (linst) { + linst->setAlignment(0); + return true; + } else if (sinst) { + sinst->setAlignment(0); + return true; + } + } + return false; +} +bool +AMDGPUPeepholeOpt::isSigned24BitOps(CallInst *CI) { + if (!CI) { + return false; + } + Value *LHS = CI->getOperand(CI->getNumOperands() - 1); + std::string namePrefix = LHS->getName().substr(0, 14); + if (namePrefix != "__amdil_imad24" && namePrefix != "__amdil_imul24" + && namePrefix != "__amdil__imul24_high") { + return false; + } + if (mSTM->device()->usesHardware(AMDGPUDeviceInfo::Signed24BitOps)) { + return false; + } + return true; +} + +void +AMDGPUPeepholeOpt::expandSigned24BitOps(CallInst *CI) { + assert(isSigned24BitOps(CI) && "Must be a " + "signed 24 bit operation to call this function!"); + Value *LHS = CI->getOperand(CI->getNumOperands()-1); + // On 7XX and 8XX we do not have signed 24bit, so we need to + // expand it to the following: + // imul24 turns into 32bit imul + // imad24 turns into 32bit imad + // imul24_high turns into 32bit imulhigh + if (LHS->getName().substr(0, 14) == "__amdil_imad24") { + Type *aType = CI->getOperand(0)->getType(); + bool isVector = aType->isVectorTy(); + int numEle = isVector ? dyn_cast<VectorType>(aType)->getNumElements() : 1; + std::vector<Type*> callTypes; + callTypes.push_back(CI->getOperand(0)->getType()); + callTypes.push_back(CI->getOperand(1)->getType()); + callTypes.push_back(CI->getOperand(2)->getType()); + FunctionType *funcType = + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); + std::string name = "__amdil_imad"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + Function *Func = dyn_cast<Function>( + CI->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[3] = { + CI->getOperand(0), + CI->getOperand(1), + CI->getOperand(2) + }; + CallInst *nCI = CallInst::Create(Func, Operands, "imad24"); + nCI->insertBefore(CI); + CI->replaceAllUsesWith(nCI); + } else if (LHS->getName().substr(0, 14) == "__amdil_imul24") { + BinaryOperator *mulOp = + BinaryOperator::Create(Instruction::Mul, CI->getOperand(0), + CI->getOperand(1), "imul24", CI); + CI->replaceAllUsesWith(mulOp); + } else if (LHS->getName().substr(0, 19) == "__amdil_imul24_high") { + Type *aType = CI->getOperand(0)->getType(); + + bool isVector = aType->isVectorTy(); + int numEle = isVector ? 
dyn_cast<VectorType>(aType)->getNumElements() : 1; + std::vector<Type*> callTypes; + callTypes.push_back(CI->getOperand(0)->getType()); + callTypes.push_back(CI->getOperand(1)->getType()); + FunctionType *funcType = + FunctionType::get(CI->getOperand(0)->getType(), callTypes, false); + std::string name = "__amdil_imul_high"; + if (isVector) { + name += "_v" + itostr(numEle) + "i32"; + } else { + name += "_i32"; + } + Function *Func = dyn_cast<Function>( + CI->getParent()->getParent()->getParent()-> + getOrInsertFunction(llvm::StringRef(name), funcType)); + Value *Operands[2] = { + CI->getOperand(0), + CI->getOperand(1) + }; + CallInst *nCI = CallInst::Create(Func, Operands, "imul24_high"); + nCI->insertBefore(CI); + CI->replaceAllUsesWith(nCI); + } +} + +bool +AMDGPUPeepholeOpt::isRWGLocalOpt(CallInst *CI) { + return (CI != NULL + && CI->getOperand(CI->getNumOperands() - 1)->getName() + == "__amdil_get_local_size_int"); +} + +bool +AMDGPUPeepholeOpt::convertAccurateDivide(CallInst *CI) { + if (!CI) { + return false; + } + if (mSTM->device()->getGeneration() == AMDGPUDeviceInfo::HD6XXX + && (mSTM->getDeviceName() == "cayman")) { + return false; + } + return CI->getOperand(CI->getNumOperands() - 1)->getName().substr(0, 20) + == "__amdil_improved_div"; +} + +void +AMDGPUPeepholeOpt::expandAccurateDivide(CallInst *CI) { + assert(convertAccurateDivide(CI) + && "expanding accurate divide can only happen if it is expandable!"); + BinaryOperator *divOp = + BinaryOperator::Create(Instruction::FDiv, CI->getOperand(0), + CI->getOperand(1), "fdiv32", CI); + CI->replaceAllUsesWith(divOp); +} + +bool +AMDGPUPeepholeOpt::propagateSamplerInst(CallInst *CI) { + if (optLevel != CodeGenOpt::None) { + return false; + } + + if (!CI) { + return false; + } + + unsigned funcNameIdx = 0; + funcNameIdx = CI->getNumOperands() - 1; + StringRef calleeName = CI->getOperand(funcNameIdx)->getName(); + if (calleeName != "__amdil_image2d_read_norm" + && calleeName != "__amdil_image2d_read_unnorm" + && calleeName != "__amdil_image3d_read_norm" + && calleeName != "__amdil_image3d_read_unnorm") { + return false; + } + + unsigned samplerIdx = 2; + samplerIdx = 1; + Value *sampler = CI->getOperand(samplerIdx); + LoadInst *lInst = dyn_cast<LoadInst>(sampler); + if (!lInst) { + return false; + } + + if (lInst->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) { + return false; + } + + GlobalVariable *gv = dyn_cast<GlobalVariable>(lInst->getPointerOperand()); + // If we are loading from what is not a global value, then we + // fail and return. + if (!gv) { + return false; + } + + // If we don't have an initializer or we have an initializer and + // the initializer is not a 32bit integer, we fail. + if (!gv->hasInitializer() + || !gv->getInitializer()->getType()->isIntegerTy(32)) { + return false; + } + + // Now that we have the global variable initializer, lets replace + // all uses of the load instruction with the samplerVal and + // reparse the __amdil_is_constant() function. 
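+  // For example (a sketch with a made-up value): if the kernel defines a
+  // sampler as a private global with i32 initializer 17, every load of that
+  // sampler is replaced below by the constant 17, so image read calls such as
+  // __amdil_image2d_read_norm see a compile-time constant sampler operand.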
+  Constant *samplerVal = gv->getInitializer();
+  lInst->replaceAllUsesWith(samplerVal);
+  return true;
+}
+
+bool
+AMDGPUPeepholeOpt::doInitialization(Module &M) {
+  return false;
+}
+
+bool
+AMDGPUPeepholeOpt::doFinalization(Module &M) {
+  return false;
+}
+
+void
+AMDGPUPeepholeOpt::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addRequired<MachineFunctionAnalysis>();
+  FunctionPass::getAnalysisUsage(AU);
+  AU.setPreservesAll();
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(Type * const T, bool dereferencePtr) {
+  size_t size = 0;
+  if (!T) {
+    return size;
+  }
+  switch (T->getTypeID()) {
+  case Type::X86_FP80TyID:
+  case Type::FP128TyID:
+  case Type::PPC_FP128TyID:
+  case Type::LabelTyID:
+    assert(0 && "These types are not supported by this backend");
+  default:
+  case Type::FloatTyID:
+  case Type::DoubleTyID:
+    size = T->getPrimitiveSizeInBits() >> 3;
+    break;
+  case Type::PointerTyID:
+    size = getTypeSize(dyn_cast<PointerType>(T), dereferencePtr);
+    break;
+  case Type::IntegerTyID:
+    size = getTypeSize(dyn_cast<IntegerType>(T), dereferencePtr);
+    break;
+  case Type::StructTyID:
+    size = getTypeSize(dyn_cast<StructType>(T), dereferencePtr);
+    break;
+  case Type::ArrayTyID:
+    size = getTypeSize(dyn_cast<ArrayType>(T), dereferencePtr);
+    break;
+  case Type::FunctionTyID:
+    size = getTypeSize(dyn_cast<FunctionType>(T), dereferencePtr);
+    break;
+  case Type::VectorTyID:
+    size = getTypeSize(dyn_cast<VectorType>(T), dereferencePtr);
+    break;
+  };
+  return size;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(StructType * const ST,
+                                      bool dereferencePtr) {
+  size_t size = 0;
+  if (!ST) {
+    return size;
+  }
+  Type *curType;
+  StructType::element_iterator eib;
+  StructType::element_iterator eie;
+  for (eib = ST->element_begin(), eie = ST->element_end(); eib != eie; ++eib) {
+    curType = *eib;
+    size += getTypeSize(curType, dereferencePtr);
+  }
+  return size;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(IntegerType * const IT,
+                                      bool dereferencePtr) {
+  return IT ? (IT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(FunctionType * const FT,
+                                      bool dereferencePtr) {
+  assert(0 && "Should not be able to calculate the size of a function type");
+  return 0;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(ArrayType * const AT,
+                                      bool dereferencePtr) {
+  return (size_t)(AT ? (getTypeSize(AT->getElementType(),
+                                    dereferencePtr) * AT->getNumElements())
+                     : 0);
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(VectorType * const VT,
+                                      bool dereferencePtr) {
+  return VT ?
(VT->getBitWidth() >> 3) : 0;
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(PointerType * const PT,
+                                      bool dereferencePtr) {
+  if (!PT) {
+    return 0;
+  }
+  Type *CT = PT->getElementType();
+  if (CT->getTypeID() == Type::StructTyID &&
+      PT->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+    return getTypeSize(dyn_cast<StructType>(CT));
+  } else if (dereferencePtr) {
+    size_t size = 0;
+    for (size_t x = 0, y = PT->getNumContainedTypes(); x < y; ++x) {
+      size += getTypeSize(PT->getContainedType(x), dereferencePtr);
+    }
+    return size;
+  } else {
+    return 4;
+  }
+}
+
+size_t AMDGPUPeepholeOpt::getTypeSize(OpaqueType * const OT,
+                                      bool dereferencePtr) {
+  //assert(0 && "Should not be able to calculate the size of an opaque type");
+  return 4;
+}
diff --git a/lib/Target/R600/AMDILRegisterInfo.td b/lib/Target/R600/AMDILRegisterInfo.td
new file mode 100644
index 0000000000..b9d033432e
--- /dev/null
+++ b/lib/Target/R600/AMDILRegisterInfo.td
@@ -0,0 +1,107 @@
+//===- AMDILRegisterInfo.td - AMDIL Register defs ----------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+// Declarations that describe the AMDIL register file
+//
+//===----------------------------------------------------------------------===//
+
+class AMDILReg<bits<16> num, string n> : Register<n> {
+  field bits<16> Value;
+  let Value = num;
+  let Namespace = "AMDGPU";
+}
+
+// We will start with 20 registers for each class before expanding to more.
+// Since the swizzle is added based on the register class, we can leave it
+// off here and just specify different registers for different register classes
+def R1 : AMDILReg<1, "r1">, DwarfRegNum<[1]>;
+def R2 : AMDILReg<2, "r2">, DwarfRegNum<[2]>;
+def R3 : AMDILReg<3, "r3">, DwarfRegNum<[3]>;
+def R4 : AMDILReg<4, "r4">, DwarfRegNum<[4]>;
+def R5 : AMDILReg<5, "r5">, DwarfRegNum<[5]>;
+def R6 : AMDILReg<6, "r6">, DwarfRegNum<[6]>;
+def R7 : AMDILReg<7, "r7">, DwarfRegNum<[7]>;
+def R8 : AMDILReg<8, "r8">, DwarfRegNum<[8]>;
+def R9 : AMDILReg<9, "r9">, DwarfRegNum<[9]>;
+def R10 : AMDILReg<10, "r10">, DwarfRegNum<[10]>;
+def R11 : AMDILReg<11, "r11">, DwarfRegNum<[11]>;
+def R12 : AMDILReg<12, "r12">, DwarfRegNum<[12]>;
+def R13 : AMDILReg<13, "r13">, DwarfRegNum<[13]>;
+def R14 : AMDILReg<14, "r14">, DwarfRegNum<[14]>;
+def R15 : AMDILReg<15, "r15">, DwarfRegNum<[15]>;
+def R16 : AMDILReg<16, "r16">, DwarfRegNum<[16]>;
+def R17 : AMDILReg<17, "r17">, DwarfRegNum<[17]>;
+def R18 : AMDILReg<18, "r18">, DwarfRegNum<[18]>;
+def R19 : AMDILReg<19, "r19">, DwarfRegNum<[19]>;
+def R20 : AMDILReg<20, "r20">, DwarfRegNum<[20]>;
+
+// All registers between 1000 and 1024 are reserved and cannot be used
+// unless noted in this section
+// r1021-r1025 are used to dynamically calculate the local/group/thread/region/region_local IDs
+// r1020 is used to hold the frame index for local arrays
+// r1019 is used to hold the dynamic stack allocation pointer
+// r1018 is used as a temporary register for handwritten code
+// r1017 is used as a temporary register for handwritten code
+// r1016 is used as a temporary register for load/store code
+// r1015 is used as a temporary register for data segment offset
+// r1014 is used as a temporary register for store code
+// r1013 is used as the section data pointer register
+// r1012-r1010 and r1001-r1008 are used for temporary I/O registers
+// r1009 is used as the frame pointer register +// r999 is used as the mem register. +// r998 is used as the return address register. +//def R1025 : AMDILReg<1025, "r1025">, DwarfRegNum<[1025]>; +//def R1024 : AMDILReg<1024, "r1024">, DwarfRegNum<[1024]>; +//def R1023 : AMDILReg<1023, "r1023">, DwarfRegNum<[1023]>; +//def R1022 : AMDILReg<1022, "r1022">, DwarfRegNum<[1022]>; +//def R1021 : AMDILReg<1021, "r1021">, DwarfRegNum<[1021]>; +//def R1020 : AMDILReg<1020, "r1020">, DwarfRegNum<[1020]>; +def SP : AMDILReg<1019, "r1019">, DwarfRegNum<[1019]>; +def T1 : AMDILReg<1018, "r1018">, DwarfRegNum<[1018]>; +def T2 : AMDILReg<1017, "r1017">, DwarfRegNum<[1017]>; +def T3 : AMDILReg<1016, "r1016">, DwarfRegNum<[1016]>; +def T4 : AMDILReg<1015, "r1015">, DwarfRegNum<[1015]>; +def T5 : AMDILReg<1014, "r1014">, DwarfRegNum<[1014]>; +def SDP : AMDILReg<1013, "r1013">, DwarfRegNum<[1013]>; +def R1012: AMDILReg<1012, "r1012">, DwarfRegNum<[1012]>; +def R1011: AMDILReg<1011, "r1011">, DwarfRegNum<[1011]>; +def R1010: AMDILReg<1010, "r1010">, DwarfRegNum<[1010]>; +def DFP : AMDILReg<1009, "r1009">, DwarfRegNum<[1009]>; +def R1008: AMDILReg<1008, "r1008">, DwarfRegNum<[1008]>; +def R1007: AMDILReg<1007, "r1007">, DwarfRegNum<[1007]>; +def R1006: AMDILReg<1006, "r1006">, DwarfRegNum<[1006]>; +def R1005: AMDILReg<1005, "r1005">, DwarfRegNum<[1005]>; +def R1004: AMDILReg<1004, "r1004">, DwarfRegNum<[1004]>; +def R1003: AMDILReg<1003, "r1003">, DwarfRegNum<[1003]>; +def R1002: AMDILReg<1002, "r1002">, DwarfRegNum<[1002]>; +def R1001: AMDILReg<1001, "r1001">, DwarfRegNum<[1001]>; +def MEM : AMDILReg<999, "mem">, DwarfRegNum<[999]>; +def RA : AMDILReg<998, "r998">, DwarfRegNum<[998]>; +def FP : AMDILReg<997, "r997">, DwarfRegNum<[997]>; +def GPRI16 : RegisterClass<"AMDGPU", [i16], 16, + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { + let AltOrders = [(add (sequence "R%u", 1, 20))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRI32 : RegisterClass<"AMDGPU", [i32], 32, + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { + let AltOrders = [(add (sequence "R%u", 1, 20))]; + let AltOrderSelect = [{ + return 1; + }]; + } +def GPRF32 : RegisterClass<"AMDGPU", [f32], 32, + (add (sequence "R%u", 1, 20), RA, SP, T1, T2, T3, T4, T5, SDP, R1010, R1011, R1001, R1002, R1003, R1004, R1005, R1006, R1007, R1008, MEM, R1012)> { + let AltOrders = [(add (sequence "R%u", 1, 20))]; + let AltOrderSelect = [{ + return 1; + }]; + } diff --git a/lib/Target/R600/AMDILSIDevice.cpp b/lib/Target/R600/AMDILSIDevice.cpp new file mode 100644 index 0000000000..7c2710f1b2 --- /dev/null +++ b/lib/Target/R600/AMDILSIDevice.cpp @@ -0,0 +1,45 @@ +//===-- AMDILSIDevice.cpp - Device Info for Southern Islands GPUs ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+/// \file
+//==-----------------------------------------------------------------------===//
+#include "AMDILSIDevice.h"
+#include "AMDILEvergreenDevice.h"
+#include "AMDILNIDevice.h"
+#include "AMDGPUSubtarget.h"
+
+using namespace llvm;
+
+AMDGPUSIDevice::AMDGPUSIDevice(AMDGPUSubtarget *ST)
+  : AMDGPUEvergreenDevice(ST) {
+}
+AMDGPUSIDevice::~AMDGPUSIDevice() {
+}
+
+size_t
+AMDGPUSIDevice::getMaxLDSSize() const {
+  if (usesHardware(AMDGPUDeviceInfo::LocalMem)) {
+    return MAX_LDS_SIZE_900;
+  } else {
+    return 0;
+  }
+}
+
+uint32_t
+AMDGPUSIDevice::getGeneration() const {
+  return AMDGPUDeviceInfo::HD7XXX;
+}
+
+std::string
+AMDGPUSIDevice::getDataLayout() const {
+  return std::string("e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16"
+      "-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:32:32"
+      "-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64"
+      "-v96:128:128-v128:128:128-v192:256:256-v256:256:256"
+      "-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+      "-n8:16:32:64");
}
diff --git a/lib/Target/R600/AMDILSIDevice.h b/lib/Target/R600/AMDILSIDevice.h
new file mode 100644
index 0000000000..5b2cb25022
--- /dev/null
+++ b/lib/Target/R600/AMDILSIDevice.h
@@ -0,0 +1,39 @@
+//===------- AMDILSIDevice.h - Define SI Device for AMDIL -*- C++ -*------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//==-----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface for the subtarget data classes.
+///
+/// This file will define the interface that each generation needs to
+/// implement in order to correctly answer queries on the capabilities of the
+/// specific hardware.
+//===---------------------------------------------------------------------===//
+#ifndef AMDILSIDEVICE_H
+#define AMDILSIDEVICE_H
+#include "AMDILEvergreenDevice.h"
+
+namespace llvm {
+class AMDGPUSubtarget;
+//===---------------------------------------------------------------------===//
+// SI generation of devices and their respective sub classes
+//===---------------------------------------------------------------------===//
+
+/// \brief The AMDGPUSIDevice is the base class for all Southern Islands series
+/// of cards.
+class AMDGPUSIDevice : public AMDGPUEvergreenDevice { +public: + AMDGPUSIDevice(AMDGPUSubtarget*); + virtual ~AMDGPUSIDevice(); + virtual size_t getMaxLDSSize() const; + virtual uint32_t getGeneration() const; + virtual std::string getDataLayout() const; +}; + +} // namespace llvm +#endif // AMDILSIDEVICE_H diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt new file mode 100644 index 0000000000..757b733522 --- /dev/null +++ b/lib/Target/R600/CMakeLists.txt @@ -0,0 +1,54 @@ +set(LLVM_TARGET_DEFINITIONS AMDGPU.td) + +tablegen(LLVM AMDGPUGenRegisterInfo.inc -gen-register-info) +tablegen(LLVM AMDGPUGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM AMDGPUGenDAGISel.inc -gen-dag-isel) +tablegen(LLVM AMDGPUGenCallingConv.inc -gen-callingconv) +tablegen(LLVM AMDGPUGenSubtargetInfo.inc -gen-subtarget) +tablegen(LLVM AMDGPUGenIntrinsics.inc -gen-tgt-intrinsic) +tablegen(LLVM AMDGPUGenMCCodeEmitter.inc -gen-emitter -mc-emitter) +tablegen(LLVM AMDGPUGenDFAPacketizer.inc -gen-dfa-packetizer) +tablegen(LLVM AMDGPUGenAsmWriter.inc -gen-asm-writer) +add_public_tablegen_target(AMDGPUCommonTableGen) + +add_llvm_target(R600CodeGen + AMDIL7XXDevice.cpp + AMDILCFGStructurizer.cpp + AMDILDevice.cpp + AMDILDeviceInfo.cpp + AMDILEvergreenDevice.cpp + AMDILFrameLowering.cpp + AMDILIntrinsicInfo.cpp + AMDILISelDAGToDAG.cpp + AMDILISelLowering.cpp + AMDILNIDevice.cpp + AMDILPeepholeOptimizer.cpp + AMDILSIDevice.cpp + AMDGPUAsmPrinter.cpp + AMDGPUMCInstLower.cpp + AMDGPUSubtarget.cpp + AMDGPUTargetMachine.cpp + AMDGPUISelLowering.cpp + AMDGPUConvertToISA.cpp + AMDGPUInstrInfo.cpp + AMDGPURegisterInfo.cpp + R600ExpandSpecialInstrs.cpp + R600InstrInfo.cpp + R600ISelLowering.cpp + R600MachineFunctionInfo.cpp + R600RegisterInfo.cpp + SIAssignInterpRegs.cpp + SIInstrInfo.cpp + SIISelLowering.cpp + SILowerLiteralConstants.cpp + SILowerControlFlow.cpp + SIMachineFunctionInfo.cpp + SIRegisterInfo.cpp + SIFixSGPRLiveness.cpp + ) + +add_dependencies(LLVMR600CodeGen intrinsics_gen) + +add_subdirectory(InstPrinter) +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp new file mode 100644 index 0000000000..e6c550b5ac --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.cpp @@ -0,0 +1,132 @@ +//===-- AMDGPUInstPrinter.cpp - AMDGPU MC Inst -> ASM ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUInstPrinter.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "llvm/MC/MCInst.h" + +using namespace llvm; + +void AMDGPUInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, + StringRef Annot) { + printInstruction(MI, OS); + + printAnnotation(OS, Annot); +} + +void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + switch (Op.getReg()) { + // This is the default predicate state, so we don't need to print it. 
+ case AMDGPU::PRED_SEL_OFF: break; + default: O << getRegisterName(Op.getReg()); break; + } + } else if (Op.isImm()) { + O << Op.getImm(); + } else if (Op.isFPImm()) { + O << Op.getFPImm(); + } else { + assert(!"unknown operand type in printOperand"); + } +} + +void AMDGPUInstPrinter::printMemOperand(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printOperand(MI, OpNo, O); + O << ", "; + printOperand(MI, OpNo + 1, O); +} + +void AMDGPUInstPrinter::printIfSet(const MCInst *MI, unsigned OpNo, + raw_ostream &O, StringRef Asm) { + const MCOperand &Op = MI->getOperand(OpNo); + assert(Op.isImm()); + if (Op.getImm() == 1) { + O << Asm; + } +} + +void AMDGPUInstPrinter::printAbs(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "|"); +} + +void AMDGPUInstPrinter::printClamp(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "_SAT"); +} + +void AMDGPUInstPrinter::printLiteral(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + union Literal { + float f; + int32_t i; + } L; + + L.i = MI->getOperand(OpNo).getImm(); + O << L.i << "(" << L.f << ")"; +} + +void AMDGPUInstPrinter::printLast(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, " *"); +} + +void AMDGPUInstPrinter::printNeg(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "-"); +} + +void AMDGPUInstPrinter::printOMOD(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + switch (MI->getOperand(OpNo).getImm()) { + default: break; + case 1: + O << " * 2.0"; + break; + case 2: + O << " * 4.0"; + break; + case 3: + O << " / 2.0"; + break; + } +} + +void AMDGPUInstPrinter::printRel(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() != 0) { + O << " + " << Op.getImm(); + } +} + +void AMDGPUInstPrinter::printUpdateExecMask(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "ExecMask,"); +} + +void AMDGPUInstPrinter::printUpdatePred(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + printIfSet(MI, OpNo, O, "Pred,"); +} + +void AMDGPUInstPrinter::printWrite(const MCInst *MI, unsigned OpNo, + raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.getImm() == 0) { + O << " (MASKED)"; + } +} + +#include "AMDGPUGenAsmWriter.inc" diff --git a/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h new file mode 100644 index 0000000000..96e0e46f8a --- /dev/null +++ b/lib/Target/R600/InstPrinter/AMDGPUInstPrinter.h @@ -0,0 +1,52 @@ +//===-- AMDGPUInstPrinter.h - AMDGPU MC Inst -> ASM interface ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPUINSTPRINTER_H
+#define AMDGPUINSTPRINTER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class AMDGPUInstPrinter : public MCInstPrinter {
+public:
+  AMDGPUInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII,
+                    const MCRegisterInfo &MRI)
+    : MCInstPrinter(MAI, MII, MRI) {}
+
+  // Autogenerated by tblgen.
+  void printInstruction(const MCInst *MI, raw_ostream &O);
+  static const char *getRegisterName(unsigned RegNo);
+
+  virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot);
+
+private:
+  void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printMemOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printIfSet(const MCInst *MI, unsigned OpNo, raw_ostream &O, StringRef Asm);
+  void printAbs(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printClamp(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLiteral(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printLast(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printNeg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printOMOD(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUpdateExecMask(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printUpdatePred(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+  void printWrite(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+};
+
+} // End namespace llvm
+
+#endif // AMDGPUINSTPRINTER_H
diff --git a/lib/Target/R600/InstPrinter/CMakeLists.txt b/lib/Target/R600/InstPrinter/CMakeLists.txt
new file mode 100644
index 0000000000..069c55ba94
--- /dev/null
+++ b/lib/Target/R600/InstPrinter/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMR600AsmPrinter
+  AMDGPUInstPrinter.cpp
+  )
+
+add_dependencies(LLVMR600AsmPrinter AMDGPUCommonTableGen)
diff --git a/lib/Target/R600/InstPrinter/LLVMBuild.txt b/lib/Target/R600/InstPrinter/LLVMBuild.txt
new file mode 100644
index 0000000000..ec0be89f10
--- /dev/null
+++ b/lib/Target/R600/InstPrinter/LLVMBuild.txt
@@ -0,0 +1,24 @@
+;===- ./lib/Target/R600/InstPrinter/LLVMBuild.txt -----------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[component_0]
+type = Library
+name = R600AsmPrinter
+parent = R600
+required_libraries = MC Support
+add_to_library_groups = R600
+
diff --git a/lib/Target/R600/InstPrinter/Makefile b/lib/Target/R600/InstPrinter/Makefile
new file mode 100644
index 0000000000..a794cc1124
--- /dev/null
+++ b/lib/Target/R600/InstPrinter/Makefile
@@ -0,0 +1,15 @@
+#===- lib/Target/R600/InstPrinter/Makefile -----------------*- Makefile -*-===##
+#
+# The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License. See LICENSE.TXT for details.
+#
+##===----------------------------------------------------------------------===##
+LEVEL = ../../../..
+LIBRARYNAME = LLVMR600AsmPrinter
+
+# Hack: we need to include the 'main' R600 target directory to grab private headers
+CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/..
+
+include $(LEVEL)/Makefile.common
diff --git a/lib/Target/R600/LLVMBuild.txt b/lib/Target/R600/LLVMBuild.txt
new file mode 100644
index 0000000000..f2a7554e52
--- /dev/null
+++ b/lib/Target/R600/LLVMBuild.txt
@@ -0,0 +1,32 @@
+;===- ./lib/Target/R600/LLVMBuild.txt --------------------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
+;
+; For more information on the LLVMBuild system, please see:
+;
+;   http://llvm.org/docs/LLVMBuild.html
+;
+;===------------------------------------------------------------------------===;
+
+[common]
+subdirectories = InstPrinter MCTargetDesc TargetInfo
+
+[component_0]
+type = TargetGroup
+name = R600
+parent = Target
+has_asmprinter = 1
+
+[component_1]
+type = Library
+name = R600CodeGen
+parent = R600
+required_libraries = AsmPrinter CodeGen Core SelectionDAG Support Target MC R600AsmPrinter R600Desc R600Info
+add_to_library_groups = R600
diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
new file mode 100644
index 0000000000..8f41ebbdc5
--- /dev/null
+++ b/lib/Target/R600/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -0,0 +1,90 @@
+//===-- AMDGPUAsmBackend.cpp - AMDGPU Assembler Backend -------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+namespace {
+
+class AMDGPUMCObjectWriter : public MCObjectWriter {
+public:
+  AMDGPUMCObjectWriter(raw_ostream &OS) : MCObjectWriter(OS, true) { }
+  virtual void ExecutePostLayoutBinding(MCAssembler &Asm,
+                                        const MCAsmLayout &Layout) {
+    // XXX: Implement if necessary.
+ } + virtual void RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, + MCValue Target, uint64_t &FixedValue) { + assert(!"Not implemented"); + } + + virtual void WriteObject(MCAssembler &Asm, const MCAsmLayout &Layout); + +}; + +class AMDGPUAsmBackend : public MCAsmBackend { +public: + AMDGPUAsmBackend(const Target &T) + : MCAsmBackend() {} + + virtual AMDGPUMCObjectWriter *createObjectWriter(raw_ostream &OS) const; + virtual unsigned getNumFixupKinds() const { return 0; }; + virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, + uint64_t Value) const; + virtual bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, + const MCInstFragment *DF, + const MCAsmLayout &Layout) const { + return false; + } + virtual void relaxInstruction(const MCInst &Inst, MCInst &Res) const { + assert(!"Not implemented"); + } + virtual bool mayNeedRelaxation(const MCInst &Inst) const { return false; } + virtual bool writeNopData(uint64_t Count, MCObjectWriter *OW) const { + return true; + } +}; + +} //End anonymous namespace + +void AMDGPUMCObjectWriter::WriteObject(MCAssembler &Asm, + const MCAsmLayout &Layout) { + for (MCAssembler::iterator I = Asm.begin(), E = Asm.end(); I != E; ++I) { + Asm.writeSectionData(I, Layout); + } +} + +MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, StringRef TT, + StringRef CPU) { + return new AMDGPUAsmBackend(T); +} + +AMDGPUMCObjectWriter * AMDGPUAsmBackend::createObjectWriter( + raw_ostream &OS) const { + return new AMDGPUMCObjectWriter(OS); +} + +void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data, + unsigned DataSize, uint64_t Value) const { + + uint16_t *Dst = (uint16_t*)(Data + Fixup.getOffset()); + assert(Fixup.getKind() == FK_PCRel_4); + *Dst = (Value - 4) / 4; +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp new file mode 100644 index 0000000000..4d3d3e7945 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -0,0 +1,85 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.cpp - Assembly Info ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +/// \file +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCAsmInfo.h" + +using namespace llvm; +AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Target &T, StringRef &TT) : MCAsmInfo() { + HasSingleParameterDotFile = false; + WeakDefDirective = 0; + //===------------------------------------------------------------------===// + HasSubsectionsViaSymbols = true; + HasMachoZeroFillDirective = false; + HasMachoTBSSDirective = false; + HasStaticCtorDtorReferenceInStaticMode = false; + LinkerRequiresNonEmptyDwarfLines = true; + MaxInstLength = 16; + PCSymbol = "$"; + SeparatorString = "\n"; + CommentColumn = 40; + CommentString = ";"; + LabelSuffix = ":"; + GlobalPrefix = "@"; + PrivateGlobalPrefix = ";."; + LinkerPrivateGlobalPrefix = "!"; + InlineAsmStart = ";#ASMSTART"; + InlineAsmEnd = ";#ASMEND"; + AssemblerDialect = 0; + AllowQuotesInName = false; + AllowNameToStartWithDigit = false; + AllowPeriodsInName = false; + + //===--- Data Emission Directives -------------------------------------===// + ZeroDirective = ".zero"; + AsciiDirective = ".ascii\t"; + AscizDirective = ".asciz\t"; + Data8bitsDirective = ".byte\t"; + Data16bitsDirective = ".short\t"; + Data32bitsDirective = ".long\t"; + Data64bitsDirective = ".quad\t"; + GPRel32Directive = 0; + SunStyleELFSectionSwitchSyntax = true; + UsesELFSectionDirectiveForBSS = true; + HasMicrosoftFastStdCallMangling = false; + + //===--- Alignment Information ----------------------------------------===// + AlignDirective = ".align\t"; + AlignmentIsInBytes = true; + TextAlignFillValue = 0; + + //===--- Global Variable Emission Directives --------------------------===// + GlobalDirective = ".global"; + ExternDirective = ".extern"; + HasSetDirective = false; + HasAggressiveSymbolFolding = true; + COMMDirectiveAlignmentIsInBytes = false; + HasDotTypeDotSizeDirective = false; + HasNoDeadStrip = true; + HasSymbolResolver = false; + WeakRefDirective = ".weakref\t"; + LinkOnceDirective = 0; + //===--- Dwarf Emission Directives -----------------------------------===// + HasLEB128 = true; + SupportsDebugInformation = true; + ExceptionsType = ExceptionHandling::None; + DwarfUsesInlineInfoSection = false; + DwarfSectionOffsetDirective = ".offset"; + +} + +const char* +AMDGPUMCAsmInfo::getDataASDirective(unsigned int Size, unsigned int AS) const { + return 0; +} + +const MCSection* +AMDGPUMCAsmInfo::getNonexecutableStackSection(MCContext &CTX) const { + return 0; +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h new file mode 100644 index 0000000000..3ad0fa6824 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCAsmInfo.h @@ -0,0 +1,30 @@ +//===-- MCTargetDesc/AMDGPUMCAsmInfo.h - AMDGPU MCAsm Interface ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUMCASMINFO_H +#define AMDGPUMCASMINFO_H + +#include "llvm/MC/MCAsmInfo.h" +namespace llvm { + +class Target; +class StringRef; + +class AMDGPUMCAsmInfo : public MCAsmInfo { +public: + explicit AMDGPUMCAsmInfo(const Target &T, StringRef &TT); + const char* getDataASDirective(unsigned int Size, unsigned int AS) const; + const MCSection* getNonexecutableStackSection(MCContext &CTX) const; +}; +} // namespace llvm +#endif // AMDGPUMCASMINFO_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h new file mode 100644 index 0000000000..9d0d6cf6fd --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCCodeEmitter.h @@ -0,0 +1,60 @@ +//===-- AMDGPUCodeEmitter.h - AMDGPU Code Emitter interface -----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief CodeEmitter interface for R600 and SI codegen. +// +//===----------------------------------------------------------------------===// + +#ifndef AMDGPUCODEEMITTER_H +#define AMDGPUCODEEMITTER_H + +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/Support/raw_ostream.h" + +namespace llvm { + +class MCInst; +class MCOperand; + +class AMDGPUMCCodeEmitter : public MCCodeEmitter { +public: + + uint64_t getBinaryCodeForInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups) const; + + virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const { + return 0; + } + + virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups) const { + return 0; + } + virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups) const { + return 0; + } + virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const { + return Value; + } + virtual uint64_t i32LiteralEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups) const { + return 0; + } + virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo, + SmallVectorImpl<MCFixup> &Fixups) const { + return 0; + } +}; + +} // End namespace llvm + +#endif // AMDGPUCODEEMITTER_H diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp new file mode 100644 index 0000000000..6a62856e10 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.cpp @@ -0,0 +1,113 @@ +//===-- AMDGPUMCTargetDesc.cpp - AMDGPU Target Descriptions ---------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief This file provides AMDGPU specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMCTargetDesc.h" +#include "AMDGPUMCAsmInfo.h" +#include "InstPrinter/AMDGPUInstPrinter.h" +#include "llvm/MC/MachineLocation.h" +#include "llvm/MC/MCCodeGenInfo.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/TargetRegistry.h" + +#define GET_INSTRINFO_MC_DESC +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_MC_DESC +#include "AMDGPUGenSubtargetInfo.inc" + +#define GET_REGINFO_MC_DESC +#include "AMDGPUGenRegisterInfo.inc" + +using namespace llvm; + +static MCInstrInfo *createAMDGPUMCInstrInfo() { + MCInstrInfo *X = new MCInstrInfo(); + InitAMDGPUMCInstrInfo(X); + return X; +} + +static MCRegisterInfo *createAMDGPUMCRegisterInfo(StringRef TT) { + MCRegisterInfo *X = new MCRegisterInfo(); + InitAMDGPUMCRegisterInfo(X, 0); + return X; +} + +static MCSubtargetInfo *createAMDGPUMCSubtargetInfo(StringRef TT, StringRef CPU, + StringRef FS) { + MCSubtargetInfo * X = new MCSubtargetInfo(); + InitAMDGPUMCSubtargetInfo(X, TT, CPU, FS); + return X; +} + +static MCCodeGenInfo *createAMDGPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, + CodeGenOpt::Level OL) { + MCCodeGenInfo *X = new MCCodeGenInfo(); + X->InitMCCodeGenInfo(RM, CM, OL); + return X; +} + +static MCInstPrinter *createAMDGPUMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + return new AMDGPUInstPrinter(MAI, MII, MRI); +} + +static MCCodeEmitter *createAMDGPUMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + if (STI.getFeatureBits() & AMDGPU::Feature64BitPtr) { + return createSIMCCodeEmitter(MCII, MRI, STI, Ctx); + } else { + return createR600MCCodeEmitter(MCII, MRI, STI, Ctx); + } +} + +static MCStreamer *createMCStreamer(const Target &T, StringRef TT, + MCContext &Ctx, MCAsmBackend &MAB, + raw_ostream &_OS, + MCCodeEmitter *_Emitter, + bool RelaxAll, + bool NoExecStack) { + return createPureStreamer(Ctx, MAB, _OS, _Emitter); +} + +extern "C" void LLVMInitializeR600TargetMC() { + + RegisterMCAsmInfo<AMDGPUMCAsmInfo> Y(TheAMDGPUTarget); + + TargetRegistry::RegisterMCCodeGenInfo(TheAMDGPUTarget, createAMDGPUMCCodeGenInfo); + + TargetRegistry::RegisterMCInstrInfo(TheAMDGPUTarget, createAMDGPUMCInstrInfo); + + TargetRegistry::RegisterMCRegInfo(TheAMDGPUTarget, createAMDGPUMCRegisterInfo); + + TargetRegistry::RegisterMCSubtargetInfo(TheAMDGPUTarget, createAMDGPUMCSubtargetInfo); + + TargetRegistry::RegisterMCInstPrinter(TheAMDGPUTarget, createAMDGPUMCInstPrinter); + + TargetRegistry::RegisterMCCodeEmitter(TheAMDGPUTarget, createAMDGPUMCCodeEmitter); + + TargetRegistry::RegisterMCAsmBackend(TheAMDGPUTarget, createAMDGPUAsmBackend); + + TargetRegistry::RegisterMCObjectStreamer(TheAMDGPUTarget, createMCStreamer); +} diff --git a/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h new file mode 100644 index 0000000000..363a4af3f3 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -0,0 +1,55 @@ +//===-- AMDGPUMCTargetDesc.h - AMDGPU Target Descriptions -----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// 
License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Provides AMDGPU specific target descriptions. +// +//===----------------------------------------------------------------------===// +// + +#ifndef AMDGPUMCTARGETDESC_H +#define AMDGPUMCTARGETDESC_H + +#include "llvm/ADT/StringRef.h" + +namespace llvm { +class MCAsmBackend; +class MCCodeEmitter; +class MCContext; +class MCInstrInfo; +class MCRegisterInfo; +class MCSubtargetInfo; +class Target; + +extern Target TheAMDGPUTarget; + +MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCCodeEmitter *createSIMCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx); + +MCAsmBackend *createAMDGPUAsmBackend(const Target &T, StringRef TT, + StringRef CPU); +} // End llvm namespace + +#define GET_REGINFO_ENUM +#include "AMDGPUGenRegisterInfo.inc" + +#define GET_INSTRINFO_ENUM +#include "AMDGPUGenInstrInfo.inc" + +#define GET_SUBTARGETINFO_ENUM +#include "AMDGPUGenSubtargetInfo.inc" + +#endif // AMDGPUMCTARGETDESC_H diff --git a/lib/Target/R600/MCTargetDesc/CMakeLists.txt b/lib/Target/R600/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000000..37e714c2e7 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,10 @@ + +add_llvm_library(LLVMR600Desc + AMDGPUAsmBackend.cpp + AMDGPUMCTargetDesc.cpp + AMDGPUMCAsmInfo.cpp + R600MCCodeEmitter.cpp + SIMCCodeEmitter.cpp + ) + +add_dependencies(LLVMR600Desc AMDGPUCommonTableGen) diff --git a/lib/Target/R600/MCTargetDesc/LLVMBuild.txt b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 0000000000..b1beab0bb3 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/R600/MCTargetDesc/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = R600Desc +parent = R600 +required_libraries = R600AsmPrinter R600Info MC +add_to_library_groups = R600 diff --git a/lib/Target/R600/MCTargetDesc/Makefile b/lib/Target/R600/MCTargetDesc/Makefile new file mode 100644 index 0000000000..8894a7607f --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/AMDGPU/TargetDesc/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Desc + +# Hack: we need to include 'main' target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. 
+ +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp new file mode 100644 index 0000000000..dc91924c73 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp @@ -0,0 +1,575 @@ +//===- R600MCCodeEmitter.cpp - Code Emitter for R600->Cayman GPU families -===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// +/// This code emitter outputs bytecode that is understood by the r600g driver +/// in the Mesa [1] project. The bytecode is very similar to the hardware's ISA, +/// but it still needs to be run through a finalizer in order to be executed +/// by the GPU. +/// +/// [1] http://www.mesa3d.org/ +// +//===----------------------------------------------------------------------===// + +#include "R600Defines.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "MCTargetDesc/AMDGPUMCCodeEmitter.h" +#include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/raw_ostream.h" + +#include <stdio.h> + +#define SRC_BYTE_COUNT 11 +#define DST_BYTE_COUNT 5 + +using namespace llvm; + +namespace { + +class R600MCCodeEmitter : public AMDGPUMCCodeEmitter { + R600MCCodeEmitter(const R600MCCodeEmitter &); // DO NOT IMPLEMENT + void operator=(const R600MCCodeEmitter &); // DO NOT IMPLEMENT + const MCInstrInfo &MCII; + const MCRegisterInfo &MRI; + const MCSubtargetInfo &STI; + MCContext &Ctx; + +public: + + R600MCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri, + const MCSubtargetInfo &sti, MCContext &ctx) + : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { } + + /// \brief Encode the instruction and write it to the OS. + virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const; + + /// \returns the encoding for an MCOperand. 
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixups) const; +private: + + void EmitALUInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const; + void EmitSrc(const MCInst &MI, unsigned OpIdx, raw_ostream &OS) const; + void EmitSrcISA(const MCInst &MI, unsigned OpIdx, uint64_t &Value, + raw_ostream &OS) const; + void EmitDst(const MCInst &MI, raw_ostream &OS) const; + void EmitTexInstr(const MCInst &MI, SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const; + void EmitFCInstr(const MCInst &MI, raw_ostream &OS) const; + + void EmitNullBytes(unsigned int byteCount, raw_ostream &OS) const; + + void EmitByte(unsigned int byte, raw_ostream &OS) const; + + void EmitTwoBytes(uint32_t bytes, raw_ostream &OS) const; + + void Emit(uint32_t value, raw_ostream &OS) const; + void Emit(uint64_t value, raw_ostream &OS) const; + + unsigned getHWRegChan(unsigned reg) const; + unsigned getHWReg(unsigned regNo) const; + + bool isFCOp(unsigned opcode) const; + bool isTexOp(unsigned opcode) const; + bool isFlagSet(const MCInst &MI, unsigned Operand, unsigned Flag) const; + +}; + +} // End anonymous namespace + +enum RegElement { + ELEMENT_X = 0, + ELEMENT_Y, + ELEMENT_Z, + ELEMENT_W +}; + +enum InstrTypes { + INSTR_ALU = 0, + INSTR_TEX, + INSTR_FC, + INSTR_NATIVE, + INSTR_VTX, + INSTR_EXPORT +}; + +enum FCInstr { + FC_IF_PREDICATE = 0, + FC_ELSE, + FC_ENDIF, + FC_BGNLOOP, + FC_ENDLOOP, + FC_BREAK_PREDICATE, + FC_CONTINUE +}; + +enum TextureTypes { + TEXTURE_1D = 1, + TEXTURE_2D, + TEXTURE_3D, + TEXTURE_CUBE, + TEXTURE_RECT, + TEXTURE_SHADOW1D, + TEXTURE_SHADOW2D, + TEXTURE_SHADOWRECT, + TEXTURE_1D_ARRAY, + TEXTURE_2D_ARRAY, + TEXTURE_SHADOW1D_ARRAY, + TEXTURE_SHADOW2D_ARRAY +}; + +MCCodeEmitter *llvm::createR600MCCodeEmitter(const MCInstrInfo &MCII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI, + MCContext &Ctx) { + return new R600MCCodeEmitter(MCII, MRI, STI, Ctx); +} + +void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS, + SmallVectorImpl<MCFixup> &Fixups) const { + if (isTexOp(MI.getOpcode())) { + EmitTexInstr(MI, Fixups, OS); + } else if (isFCOp(MI.getOpcode())){ + EmitFCInstr(MI, OS); + } else if (MI.getOpcode() == AMDGPU::RETURN || + MI.getOpcode() == AMDGPU::BUNDLE || + MI.getOpcode() == AMDGPU::KILL) { + return; + } else { + switch(MI.getOpcode()) { + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + uint64_t inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_NATIVE, OS); + Emit(inst, OS); + break; + } + case AMDGPU::CONSTANT_LOAD_eg: + case AMDGPU::VTX_READ_PARAM_8_eg: + case AMDGPU::VTX_READ_PARAM_16_eg: + case AMDGPU::VTX_READ_PARAM_32_eg: + case AMDGPU::VTX_READ_GLOBAL_8_eg: + case AMDGPU::VTX_READ_GLOBAL_32_eg: + case AMDGPU::VTX_READ_GLOBAL_128_eg: { + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + uint32_t InstWord2 = MI.getOperand(2).getImm(); // Offset + + EmitByte(INSTR_VTX, OS); + Emit(InstWord01, OS); + Emit(InstWord2, OS); + break; + } + case AMDGPU::EG_ExportSwz: + case AMDGPU::R600_ExportSwz: + case AMDGPU::EG_ExportBuf: + case AMDGPU::R600_ExportBuf: { + uint64_t Inst = getBinaryCodeForInstr(MI, Fixups); + EmitByte(INSTR_EXPORT, OS); + Emit(Inst, OS); + break; + } + + default: + EmitALUInstr(MI, Fixups, OS); + break; + } + } +} + +void R600MCCodeEmitter::EmitALUInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const { + const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); + 
unsigned NumOperands = MI.getNumOperands(); + + // Emit instruction type + EmitByte(INSTR_ALU, OS); + + uint64_t InstWord01 = getBinaryCodeForInstr(MI, Fixups); + + //older alu have different encoding for instructions with one or two src + //parameters. + if ((STI.getFeatureBits() & AMDGPU::FeatureR600ALUInst) && + !(MCDesc.TSFlags & R600_InstFlag::OP3)) { + uint64_t ISAOpCode = InstWord01 & (0x3FFULL << 39); + InstWord01 &= ~(0x3FFULL << 39); + InstWord01 |= ISAOpCode << 1; + } + + unsigned SrcIdx = 0; + for (unsigned int OpIdx = 1; OpIdx < NumOperands; ++OpIdx) { + if (MI.getOperand(OpIdx).isImm() || MI.getOperand(OpIdx).isFPImm() || + OpIdx == (unsigned)MCDesc.findFirstPredOperandIdx()) { + continue; + } + EmitSrcISA(MI, OpIdx, InstWord01, OS); + SrcIdx++; + } + + // Emit zeros for unused sources + for ( ; SrcIdx < 3; SrcIdx++) { + EmitNullBytes(SRC_BYTE_COUNT - 6, OS); + } + + Emit(InstWord01, OS); + return; +} + +void R600MCCodeEmitter::EmitSrc(const MCInst &MI, unsigned OpIdx, + raw_ostream &OS) const { + const MCOperand &MO = MI.getOperand(OpIdx); + union { + float f; + uint32_t i; + } Value; + Value.i = 0; + // Emit the source select (2 bytes). For GPRs, this is the register index. + // For other potential instruction operands, (e.g. constant registers) the + // value of the source select is defined in the r600isa docs. + if (MO.isReg()) { + unsigned reg = MO.getReg(); + EmitTwoBytes(getHWReg(reg), OS); + if (reg == AMDGPU::ALU_LITERAL_X) { + unsigned ImmOpIndex = MI.getNumOperands() - 1; + MCOperand ImmOp = MI.getOperand(ImmOpIndex); + if (ImmOp.isFPImm()) { + Value.f = ImmOp.getFPImm(); + } else { + assert(ImmOp.isImm()); + Value.i = ImmOp.getImm(); + } + } + } else { + // XXX: Handle other operand types. + EmitTwoBytes(0, OS); + } + + // Emit the source channel (1 byte) + if (MO.isReg()) { + EmitByte(getHWRegChan(MO.getReg()), OS); + } else { + EmitByte(0, OS); + } + + // XXX: Emit isNegated (1 byte) + if ((!(isFlagSet(MI, OpIdx, MO_FLAG_ABS))) + && (isFlagSet(MI, OpIdx, MO_FLAG_NEG) || + (MO.isReg() && + (MO.getReg() == AMDGPU::NEG_ONE || MO.getReg() == AMDGPU::NEG_HALF)))){ + EmitByte(1, OS); + } else { + EmitByte(0, OS); + } + + // Emit isAbsolute (1 byte) + if (isFlagSet(MI, OpIdx, MO_FLAG_ABS)) { + EmitByte(1, OS); + } else { + EmitByte(0, OS); + } + + // XXX: Emit relative addressing mode (1 byte) + EmitByte(0, OS); + + // Emit kc_bank, This will be adjusted later by r600_asm + EmitByte(0, OS); + + // Emit the literal value, if applicable (4 bytes). + Emit(Value.i, OS); + +} + +void R600MCCodeEmitter::EmitSrcISA(const MCInst &MI, unsigned OpIdx, + uint64_t &Value, raw_ostream &OS) const { + const MCOperand &MO = MI.getOperand(OpIdx); + union { + float f; + uint32_t i; + } InlineConstant; + InlineConstant.i = 0; + // Emit the source select (2 bytes). For GPRs, this is the register index. + // For other potential instruction operands, (e.g. constant registers) the + // value of the source select is defined in the r600isa docs. + if (MO.isReg()) { + unsigned Reg = MO.getReg(); + if (AMDGPUMCRegisterClasses[AMDGPU::R600_CReg32RegClassID].contains(Reg)) { + EmitByte(1, OS); + } else { + EmitByte(0, OS); + } + + if (Reg == AMDGPU::ALU_LITERAL_X) { + unsigned ImmOpIndex = MI.getNumOperands() - 1; + MCOperand ImmOp = MI.getOperand(ImmOpIndex); + if (ImmOp.isFPImm()) { + InlineConstant.f = ImmOp.getFPImm(); + } else { + assert(ImmOp.isImm()); + InlineConstant.i = ImmOp.getImm(); + } + } + } + + // Emit the literal value, if applicable (4 bytes). 
+ Emit(InlineConstant.i, OS); +} + +void R600MCCodeEmitter::EmitTexInstr(const MCInst &MI, + SmallVectorImpl<MCFixup> &Fixups, + raw_ostream &OS) const { + + unsigned Opcode = MI.getOpcode(); + bool hasOffsets = (Opcode == AMDGPU::TEX_LD); + unsigned OpOffset = hasOffsets ? 3 : 0; + int64_t Resource = MI.getOperand(OpOffset + 2).getImm(); + int64_t Sampler = MI.getOperand(OpOffset + 3).getImm(); + int64_t TextureType = MI.getOperand(OpOffset + 4).getImm(); + unsigned srcSelect[4] = {0, 1, 2, 3}; + + // Emit instruction type + EmitByte(1, OS); + + // Emit instruction + EmitByte(getBinaryCodeForInstr(MI, Fixups), OS); + + // Emit resource id + EmitByte(Resource, OS); + + // Emit source register + EmitByte(getHWReg(MI.getOperand(1).getReg()), OS); + + // XXX: Emit src isRelativeAddress + EmitByte(0, OS); + + // Emit destination register + EmitByte(getHWReg(MI.getOperand(0).getReg()), OS); + + // XXX: Emit dst isRealtiveAddress + EmitByte(0, OS); + + // XXX: Emit dst select + EmitByte(0, OS); // X + EmitByte(1, OS); // Y + EmitByte(2, OS); // Z + EmitByte(3, OS); // W + + // XXX: Emit lod bias + EmitByte(0, OS); + + // XXX: Emit coord types + unsigned coordType[4] = {1, 1, 1, 1}; + + if (TextureType == TEXTURE_RECT + || TextureType == TEXTURE_SHADOWRECT) { + coordType[ELEMENT_X] = 0; + coordType[ELEMENT_Y] = 0; + } + + if (TextureType == TEXTURE_1D_ARRAY + || TextureType == TEXTURE_SHADOW1D_ARRAY) { + if (Opcode == AMDGPU::TEX_SAMPLE_C_L || Opcode == AMDGPU::TEX_SAMPLE_C_LB) { + coordType[ELEMENT_Y] = 0; + } else { + coordType[ELEMENT_Z] = 0; + srcSelect[ELEMENT_Z] = ELEMENT_Y; + } + } else if (TextureType == TEXTURE_2D_ARRAY + || TextureType == TEXTURE_SHADOW2D_ARRAY) { + coordType[ELEMENT_Z] = 0; + } + + for (unsigned i = 0; i < 4; i++) { + EmitByte(coordType[i], OS); + } + + // XXX: Emit offsets + if (hasOffsets) + for (unsigned i = 2; i < 5; i++) + EmitByte(MI.getOperand(i).getImm()<<1, OS); + else + EmitNullBytes(3, OS); + + // Emit sampler id + EmitByte(Sampler, OS); + + // XXX:Emit source select + if ((TextureType == TEXTURE_SHADOW1D + || TextureType == TEXTURE_SHADOW2D + || TextureType == TEXTURE_SHADOWRECT + || TextureType == TEXTURE_SHADOW1D_ARRAY) + && Opcode != AMDGPU::TEX_SAMPLE_C_L + && Opcode != AMDGPU::TEX_SAMPLE_C_LB) { + srcSelect[ELEMENT_W] = ELEMENT_Z; + } + + for (unsigned i = 0; i < 4; i++) { + EmitByte(srcSelect[i], OS); + } +} + +void R600MCCodeEmitter::EmitFCInstr(const MCInst &MI, raw_ostream &OS) const { + + // Emit instruction type + EmitByte(INSTR_FC, OS); + + // Emit SRC + unsigned NumOperands = MI.getNumOperands(); + if (NumOperands > 0) { + assert(NumOperands == 1); + EmitSrc(MI, 0, OS); + } else { + EmitNullBytes(SRC_BYTE_COUNT, OS); + } + + // Emit FC Instruction + enum FCInstr instr; + switch (MI.getOpcode()) { + case AMDGPU::PREDICATED_BREAK: + instr = FC_BREAK_PREDICATE; + break; + case AMDGPU::CONTINUE: + instr = FC_CONTINUE; + break; + case AMDGPU::IF_PREDICATE_SET: + instr = FC_IF_PREDICATE; + break; + case AMDGPU::ELSE: + instr = FC_ELSE; + break; + case AMDGPU::ENDIF: + instr = FC_ENDIF; + break; + case AMDGPU::ENDLOOP: + instr = FC_ENDLOOP; + break; + case AMDGPU::WHILELOOP: + instr = FC_BGNLOOP; + break; + default: + abort(); + break; + } + EmitByte(instr, OS); +} + +void R600MCCodeEmitter::EmitNullBytes(unsigned int ByteCount, + raw_ostream &OS) const { + + for (unsigned int i = 0; i < ByteCount; i++) { + EmitByte(0, OS); + } +} + +void R600MCCodeEmitter::EmitByte(unsigned int Byte, raw_ostream &OS) const { + OS.write((uint8_t) Byte & 0xff); +} + 
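// A minimal reader-side sketch for the EmitByte/Emit helpers above, for
// illustration only: the emitter writes every multi-byte value least
// significant byte first, so the r600g byte stream is little-endian no matter
// the host byte order. readLE is a hypothetical name, not part of this patch.

#include <cstdint>

// Reassembles a NumBytes-wide little-endian value, mirroring
// R600MCCodeEmitter::Emit(), which writes the low byte first.
static uint64_t readLE(const uint8_t *Buf, unsigned NumBytes) {
  uint64_t Value = 0;
  for (unsigned i = 0; i < NumBytes; ++i)
    Value |= (uint64_t)Buf[i] << (8 * i);
  return Value;
}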
+void R600MCCodeEmitter::EmitTwoBytes(unsigned int Bytes, + raw_ostream &OS) const { + OS.write((uint8_t) (Bytes & 0xff)); + OS.write((uint8_t) ((Bytes >> 8) & 0xff)); +} + +void R600MCCodeEmitter::Emit(uint32_t Value, raw_ostream &OS) const { + for (unsigned i = 0; i < 4; i++) { + OS.write((uint8_t) ((Value >> (8 * i)) & 0xff)); + } +} + +void R600MCCodeEmitter::Emit(uint64_t Value, raw_ostream &OS) const { + for (unsigned i = 0; i < 8; i++) { + EmitByte((Value >> (8 * i)) & 0xff, OS); + } +} + +unsigned R600MCCodeEmitter::getHWRegChan(unsigned reg) const { + return MRI.getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +unsigned R600MCCodeEmitter::getHWReg(unsigned RegNo) const { + return MRI.getEncodingValue(RegNo) & HW_REG_MASK; +} + +uint64_t R600MCCodeEmitter::getMachineOpValue(const MCInst &MI, + const MCOperand &MO, + SmallVectorImpl<MCFixup> &Fixup) const { + if (MO.isReg()) { + if (HAS_NATIVE_OPERANDS(MCII.get(MI.getOpcode()).TSFlags)) { + return MRI.getEncodingValue(MO.getReg()); + } else { + return getHWReg(MO.getReg()); + } + } else if (MO.isImm()) { + return MO.getImm(); + } else { + assert(0); + return 0; + } +} + +//===----------------------------------------------------------------------===// +// Encoding helper functions +//===----------------------------------------------------------------------===// + +bool R600MCCodeEmitter::isFCOp(unsigned opcode) const { + switch(opcode) { + default: return false; + case AMDGPU::PREDICATED_BREAK: + case AMDGPU::CONTINUE: + case AMDGPU::IF_PREDICATE_SET: + case AMDGPU::ELSE: + case AMDGPU::ENDIF: + case AMDGPU::ENDLOOP: + case AMDGPU::WHILELOOP: + return true; + } +} + +bool R600MCCodeEmitter::isTexOp(unsigned opcode) const { + switch(opcode) { + default: return false; + case AMDGPU::TEX_LD: + case AMDGPU::TEX_GET_TEXTURE_RESINFO: + case AMDGPU::TEX_SAMPLE: + case AMDGPU::TEX_SAMPLE_C: + case AMDGPU::TEX_SAMPLE_L: + case AMDGPU::TEX_SAMPLE_C_L: + case AMDGPU::TEX_SAMPLE_LB: + case AMDGPU::TEX_SAMPLE_C_LB: + case AMDGPU::TEX_SAMPLE_G: + case AMDGPU::TEX_SAMPLE_C_G: + case AMDGPU::TEX_GET_GRADIENTS_H: + case AMDGPU::TEX_GET_GRADIENTS_V: + case AMDGPU::TEX_SET_GRADIENTS_H: + case AMDGPU::TEX_SET_GRADIENTS_V: + return true; + } +} + +bool R600MCCodeEmitter::isFlagSet(const MCInst &MI, unsigned Operand, + unsigned Flag) const { + const MCInstrDesc &MCDesc = MCII.get(MI.getOpcode()); + unsigned FlagIndex = GET_FLAG_OPERAND_IDX(MCDesc.TSFlags); + if (FlagIndex == 0) { + return false; + } + assert(MI.getOperand(FlagIndex).isImm()); + return !!((MI.getOperand(FlagIndex).getImm() >> + (NUM_MO_FLAGS * Operand)) & Flag); +} + +#include "AMDGPUGenMCCodeEmitter.inc" diff --git a/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp new file mode 100644 index 0000000000..c47dc995c7 --- /dev/null +++ b/lib/Target/R600/MCTargetDesc/SIMCCodeEmitter.cpp @@ -0,0 +1,298 @@ +//===-- SIMCCodeEmitter.cpp - SI Code Emitter -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief The SI code emitter produces machine code that can be executed +/// directly on the GPU device. 
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define VGPR_BIT(src_idx) (1ULL << (9 * src_idx - 1))
+#define SI_INSTR_FLAGS_ENCODING_MASK 0xf
+
+// These must be kept in sync with SIInstructions.td and also the
+// InstrEncodingInfo array in SIInstrInfo.cpp.
+//
+// NOTE: This enum is only used to identify the encoding type within LLVM;
+// the actual encoding type that is part of the instruction format is different.
+namespace SIInstrEncodingType {
+ enum Encoding {
+ EXP = 0,
+ LDS = 1,
+ MIMG = 2,
+ MTBUF = 3,
+ MUBUF = 4,
+ SMRD = 5,
+ SOP1 = 6,
+ SOP2 = 7,
+ SOPC = 8,
+ SOPK = 9,
+ SOPP = 10,
+ VINTRP = 11,
+ VOP1 = 12,
+ VOP2 = 13,
+ VOP3 = 14,
+ VOPC = 15
+ };
+}
+
+using namespace llvm;
+
+namespace {
+class SIMCCodeEmitter : public AMDGPUMCCodeEmitter {
+ SIMCCodeEmitter(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
+ void operator=(const SIMCCodeEmitter &); // DO NOT IMPLEMENT
+ const MCInstrInfo &MCII;
+ const MCRegisterInfo &MRI;
+ const MCSubtargetInfo &STI;
+ MCContext &Ctx;
+
+public:
+ SIMCCodeEmitter(const MCInstrInfo &mcii, const MCRegisterInfo &mri,
+ const MCSubtargetInfo &sti, MCContext &ctx)
+ : MCII(mcii), MRI(mri), STI(sti), Ctx(ctx) { }
+
+ ~SIMCCodeEmitter() { }
+
+ /// \brief Encode the instruction and write it to the OS.
+ virtual void EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+ /// \returns the encoding for an MCOperand.
+ virtual uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups) const;
+
+public:
+
+ /// \brief Encode a sequence of registers with the correct alignment.
+ unsigned GPRAlign(const MCInst &MI, unsigned OpNo, unsigned shift) const;
+
+ /// \brief Encoding for when 2 consecutive registers are used.
+ virtual unsigned GPR2AlignEncode(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixup) const;
+
+ /// \brief Encoding for when 4 consecutive registers are used.
+ virtual unsigned GPR4AlignEncode(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixup) const;
+
+ /// \brief Encoding for SMRD indexed loads.
+ virtual uint32_t SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixup) const;
+
+ /// \brief Post-encoder method for VOP instructions.
+ virtual uint64_t VOPPostEncode(const MCInst &MI, uint64_t Value) const;
+
+private:
+
+ /// \returns the SIInstrEncodingType for this instruction.
+ unsigned getEncodingType(const MCInst &MI) const;
+
+ /// \brief Get the size in bytes of this instruction's encoding.
+ unsigned getEncodingBytes(const MCInst &MI) const;
+
+ /// \returns the hardware encoding for a register.
+ unsigned getRegBinaryCode(unsigned reg) const;
+
+ /// \brief Generated function that returns the hardware encoding for
+ /// a register.
+ unsigned getHWRegNum(unsigned reg) const;
+
+};
+
+} // End anonymous namespace
+
+MCCodeEmitter *llvm::createSIMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new SIMCCodeEmitter(MCII, MRI, STI, Ctx);
+}
+
+void SIMCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ uint64_t Encoding = getBinaryCodeForInstr(MI, Fixups);
+ unsigned bytes = getEncodingBytes(MI);
+ for (unsigned i = 0; i < bytes; i++) {
+ OS.write((uint8_t) ((Encoding >> (8 * i)) & 0xff));
+ }
+}
+
+uint64_t SIMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+ const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups) const {
+ if (MO.isReg()) {
+ return getRegBinaryCode(MO.getReg());
+ } else if (MO.isImm()) {
+ return MO.getImm();
+ } else if (MO.isFPImm()) {
+ // XXX: Not all instructions can use inline literals
+ // XXX: We should make sure this is a 32-bit constant
+ union {
+ float F;
+ uint32_t I;
+ } Imm;
+ Imm.F = MO.getFPImm();
+ return Imm.I;
+ } else if (MO.isExpr()) {
+ const MCExpr *Expr = MO.getExpr();
+ MCFixupKind Kind = MCFixupKind(FK_PCRel_4);
+ Fixups.push_back(MCFixup::Create(0, Expr, Kind, MI.getLoc()));
+ return 0;
+ } else {
+ llvm_unreachable("Encoding of this operand type is not supported yet.");
+ }
+ return 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom Operand Encodings
+//===----------------------------------------------------------------------===//
+
+unsigned SIMCCodeEmitter::GPRAlign(const MCInst &MI, unsigned OpNo,
+ unsigned shift) const {
+ unsigned regCode = getRegBinaryCode(MI.getOperand(OpNo).getReg());
+ return regCode >> shift;
+}
+
+unsigned SIMCCodeEmitter::GPR2AlignEncode(const MCInst &MI,
+ unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixup) const {
+ return GPRAlign(MI, OpNo, 1);
+}
+
+unsigned SIMCCodeEmitter::GPR4AlignEncode(const MCInst &MI,
+ unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixup) const {
+ return GPRAlign(MI, OpNo, 2);
+}
+
+#define SMRD_OFFSET_MASK 0xff
+#define SMRD_IMM_SHIFT 8
+#define SMRD_SBASE_MASK 0x3f
+#define SMRD_SBASE_SHIFT 9
+/// This function is responsible for encoding the offset
+/// and the base ptr for SMRD instructions; it should return a bit string in
+/// this format:
+///
+/// OFFSET = bits{7-0}
+/// IMM = bits{8}
+/// SBASE = bits{14-9}
+///
+uint32_t SIMCCodeEmitter::SMRDmemriEncode(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixup) const {
+ uint32_t Encoding;
+
+ const MCOperand &OffsetOp = MI.getOperand(OpNo + 1);
+
+ //XXX: Use this function for SMRD loads with register offsets
+ assert(OffsetOp.isImm());
+
+ Encoding =
+ (getMachineOpValue(MI, OffsetOp, Fixup) & SMRD_OFFSET_MASK)
+ | (1 << SMRD_IMM_SHIFT) //XXX If the Offset is a register we shouldn't set this bit
+ | ((GPR2AlignEncode(MI, OpNo, Fixup) & SMRD_SBASE_MASK) << SMRD_SBASE_SHIFT)
+ ;
+
+ return Encoding;
+}
+
+//===----------------------------------------------------------------------===//
+// Post Encoder Callbacks
+//===----------------------------------------------------------------------===//
+
+uint64_t SIMCCodeEmitter::VOPPostEncode(const MCInst &MI, uint64_t Value) const {
+ unsigned encodingType =
getEncodingType(MI); + unsigned numSrcOps; + unsigned vgprBitOffset; + + if (encodingType == SIInstrEncodingType::VOP3) { + numSrcOps = 3; + vgprBitOffset = 32; + } else { + numSrcOps = 1; + vgprBitOffset = 0; + } + + // Add one to skip over the destination reg operand. + for (unsigned opIdx = 1; opIdx < numSrcOps + 1; opIdx++) { + const MCOperand &MO = MI.getOperand(opIdx); + if (MO.isReg()) { + unsigned reg = MI.getOperand(opIdx).getReg(); + if (AMDGPUMCRegisterClasses[AMDGPU::VReg_32RegClassID].contains(reg) || + AMDGPUMCRegisterClasses[AMDGPU::VReg_64RegClassID].contains(reg)) { + Value |= (VGPR_BIT(opIdx)) << vgprBitOffset; + } + } else if (MO.isFPImm()) { + union { + float f; + uint32_t i; + } Imm; + // XXX: Not all instructions can use inline literals + // XXX: We should make sure this is a 32-bit constant + Imm.f = MO.getFPImm(); + Value |= ((uint64_t)Imm.i) << 32; + } + } + return Value; +} + +//===----------------------------------------------------------------------===// +// Encoding helper functions +//===----------------------------------------------------------------------===// + +unsigned SIMCCodeEmitter::getEncodingType(const MCInst &MI) const { + return MCII.get(MI.getOpcode()).TSFlags & SI_INSTR_FLAGS_ENCODING_MASK; +} + +unsigned SIMCCodeEmitter::getEncodingBytes(const MCInst &MI) const { + + // These instructions aren't real instructions with an encoding type, so + // we need to manually specify their size. + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_LOAD_LITERAL_I32: + case AMDGPU::SI_LOAD_LITERAL_F32: + return 4; + } + + unsigned encoding_type = getEncodingType(MI); + switch (encoding_type) { + case SIInstrEncodingType::EXP: + case SIInstrEncodingType::LDS: + case SIInstrEncodingType::MUBUF: + case SIInstrEncodingType::MTBUF: + case SIInstrEncodingType::MIMG: + case SIInstrEncodingType::VOP3: + return 8; + default: + return 4; + } +} + + +unsigned SIMCCodeEmitter::getRegBinaryCode(unsigned reg) const { + switch (reg) { + case AMDGPU::M0: return 124; + case AMDGPU::SREG_LIT_0: return 128; + case AMDGPU::SI_LITERAL_CONSTANT: return 255; + default: return MRI.getEncodingValue(reg); + } +} + diff --git a/lib/Target/R600/Makefile b/lib/Target/R600/Makefile new file mode 100644 index 0000000000..1b3ebbe8c8 --- /dev/null +++ b/lib/Target/R600/Makefile @@ -0,0 +1,23 @@ +##===- lib/Target/R600/Makefile ---------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMR600CodeGen +TARGET = AMDGPU + +# Make sure that tblgen is run, first thing. +BUILT_SOURCES = AMDGPUGenRegisterInfo.inc AMDGPUGenInstrInfo.inc \ + AMDGPUGenDAGISel.inc AMDGPUGenSubtargetInfo.inc \ + AMDGPUGenMCCodeEmitter.inc AMDGPUGenCallingConv.inc \ + AMDGPUGenIntrinsics.inc AMDGPUGenDFAPacketizer.inc \ + AMDGPUGenAsmWriter.inc + +DIRS = InstPrinter TargetInfo MCTargetDesc + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td new file mode 100644 index 0000000000..3dc1ecda77 --- /dev/null +++ b/lib/Target/R600/Processors.td @@ -0,0 +1,29 @@ +//===-- Processors.td - TODO: Add brief description -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU processors supported.
+//
+//===----------------------------------------------------------------------===//
+
+class Proc<string Name, ProcessorItineraries itin, list<SubtargetFeature> Features>
+: Processor<Name, itin, Features>;
+def : Proc<"r600", R600_EG_Itin, [FeatureR600ALUInst]>;
+def : Proc<"rv710", R600_EG_Itin, []>;
+def : Proc<"rv730", R600_EG_Itin, []>;
+def : Proc<"rv770", R600_EG_Itin, [FeatureFP64]>;
+def : Proc<"cedar", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"redwood", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"juniper", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cypress", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"barts", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"turks", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"caicos", R600_EG_Itin, [FeatureByteAddress, FeatureImages]>;
+def : Proc<"cayman", R600_EG_Itin, [FeatureByteAddress, FeatureImages, FeatureFP64]>;
+def : Proc<"SI", SI_Itin, [Feature64BitPtr]>;
+ diff --git a/lib/Target/R600/R600Defines.h b/lib/Target/R600/R600Defines.h new file mode 100644 index 0000000000..7dea8e44ea --- /dev/null +++ b/lib/Target/R600/R600Defines.h @@ -0,0 +1,79 @@ +//===-- R600Defines.h - R600 Helper Macros ----------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+/// \file
+//===----------------------------------------------------------------------===//
+
+#ifndef R600DEFINES_H_
+#define R600DEFINES_H_
+
+#include "llvm/MC/MCRegisterInfo.h"
+
+// Operand Flags
+#define MO_FLAG_CLAMP (1 << 0)
+#define MO_FLAG_NEG (1 << 1)
+#define MO_FLAG_ABS (1 << 2)
+#define MO_FLAG_MASK (1 << 3)
+#define MO_FLAG_PUSH (1 << 4)
+#define MO_FLAG_NOT_LAST (1 << 5)
+#define MO_FLAG_LAST (1 << 6)
+#define NUM_MO_FLAGS 7
+
+/// \brief Helper for getting the operand index for the instruction flags
+/// operand.
+#define GET_FLAG_OPERAND_IDX(Flags) (((Flags) >> 7) & 0x3)
+
+namespace R600_InstFlag {
+ enum TIF {
+ TRANS_ONLY = (1 << 0),
+ TEX = (1 << 1),
+ REDUCTION = (1 << 2),
+ FC = (1 << 3),
+ TRIG = (1 << 4),
+ OP3 = (1 << 5),
+ VECTOR = (1 << 6),
+ //FlagOperand bits 7, 8
+ NATIVE_OPERANDS = (1 << 9),
+ OP1 = (1 << 10),
+ OP2 = (1 << 11)
+ };
+}
+
+#define HAS_NATIVE_OPERANDS(Flags) ((Flags) & R600_InstFlag::NATIVE_OPERANDS)
+
+/// \brief Defines for extracting register information from register encoding
+#define HW_REG_MASK 0x1ff
+#define HW_CHAN_SHIFT 9
+
+namespace R600Operands {
+ enum Ops {
+ DST,
+ UPDATE_EXEC_MASK,
+ UPDATE_PREDICATE,
+ WRITE,
+ OMOD,
+ DST_REL,
+ CLAMP,
+ SRC0,
+ SRC0_NEG,
+ SRC0_REL,
+ SRC0_ABS,
+ SRC1,
+ SRC1_NEG,
+ SRC1_REL,
+ SRC1_ABS,
+ SRC2,
+ SRC2_NEG,
+ SRC2_REL,
+ LAST,
+ PRED_SEL,
+ IMM,
+ COUNT
+ };
+}
+
+#endif // R600DEFINES_H_ diff --git a/lib/Target/R600/R600ExpandSpecialInstrs.cpp b/lib/Target/R600/R600ExpandSpecialInstrs.cpp new file mode 100644 index 0000000000..b6e62b7cef --- /dev/null +++ b/lib/Target/R600/R600ExpandSpecialInstrs.cpp @@ -0,0 +1,334 @@ +//===-- R600ExpandSpecialInstrs.cpp - Expand special instructions ---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// Vector, Reduction, and Cube instructions need to fill the entire instruction +/// group to work correctly. This pass expands these individual instructions +/// into several instructions that will completely fill the instruction group. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600RegisterInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" + +using namespace llvm; + +namespace { + +class R600ExpandSpecialInstrsPass : public MachineFunctionPass { + +private: + static char ID; + const R600InstrInfo *TII; + + bool ExpandInputPerspective(MachineInstr& MI); + bool ExpandInputConstant(MachineInstr& MI); + +public: + R600ExpandSpecialInstrsPass(TargetMachine &tm) : MachineFunctionPass(ID), + TII (static_cast<const R600InstrInfo *>(tm.getInstrInfo())) { } + + virtual bool runOnMachineFunction(MachineFunction &MF); + + const char *getPassName() const { + return "R600 Expand special instructions pass"; + } +}; + +} // End anonymous namespace + +char R600ExpandSpecialInstrsPass::ID = 0; + +FunctionPass *llvm::createR600ExpandSpecialInstrsPass(TargetMachine &TM) { + return new R600ExpandSpecialInstrsPass(TM); +} + +bool R600ExpandSpecialInstrsPass::ExpandInputPerspective(MachineInstr &MI) { + const R600RegisterInfo &TRI = TII->getRegisterInfo(); + if (MI.getOpcode() != AMDGPU::input_perspective) + return false; + + MachineBasicBlock::iterator I = &MI; + unsigned DstReg = MI.getOperand(0).getReg(); + R600MachineFunctionInfo *MFI = MI.getParent()->getParent() + ->getInfo<R600MachineFunctionInfo>(); + unsigned IJIndexBase; + + // In Evergreen ISA doc section 8.3.2 : + // We need to interpolate XY and ZW in two different instruction groups. + // An INTERP_* must occupy all 4 slots of an instruction group. 
+ // Output of INTERP_XY is written in X,Y slots
+ // Output of INTERP_ZW is written in Z,W slots
+ //
+ // Thus interpolation requires the following sequences:
+ //
+ // AnyGPR.x = INTERP_ZW; (Write Masked Out)
+ // AnyGPR.y = INTERP_ZW; (Write Masked Out)
+ // DstGPR.z = INTERP_ZW;
+ // DstGPR.w = INTERP_ZW; (End of first IG)
+ // DstGPR.x = INTERP_XY;
+ // DstGPR.y = INTERP_XY;
+ // AnyGPR.z = INTERP_XY; (Write Masked Out)
+ // AnyGPR.w = INTERP_XY; (Write Masked Out) (End of second IG)
+ //
+ switch (MI.getOperand(1).getImm()) {
+ case 0:
+ IJIndexBase = MFI->GetIJPerspectiveIndex();
+ break;
+ case 1:
+ IJIndexBase = MFI->GetIJLinearIndex();
+ break;
+ default:
+ assert(0 && "Unknown ij index");
+ }
+
+ for (unsigned i = 0; i < 8; i++) {
+ unsigned IJIndex = AMDGPU::R600_TReg32RegClass.getRegister(
+ 2 * IJIndexBase + ((i + 1) % 2));
+ unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+ MI.getOperand(2).getImm());
+
+ unsigned Sel = AMDGPU::sel_x;
+ switch (i % 4) {
+ case 0:Sel = AMDGPU::sel_x;break;
+ case 1:Sel = AMDGPU::sel_y;break;
+ case 2:Sel = AMDGPU::sel_z;break;
+ case 3:Sel = AMDGPU::sel_w;break;
+ default:break;
+ }
+
+ unsigned Res = TRI.getSubReg(DstReg, Sel);
+
+ unsigned Opcode = (i < 4) ? AMDGPU::INTERP_ZW : AMDGPU::INTERP_XY;
+
+ MachineBasicBlock &MBB = *(MI.getParent());
+ MachineInstr *NewMI =
+ TII->buildDefaultInstruction(MBB, I, Opcode, Res, IJIndex, ReadReg);
+
+ if (!(i > 1 && i < 6)) {
+ TII->addFlag(NewMI, 0, MO_FLAG_MASK);
+ }
+
+ if (i % 4 != 3)
+ TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
+ }
+
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool R600ExpandSpecialInstrsPass::ExpandInputConstant(MachineInstr &MI) {
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+ if (MI.getOpcode() != AMDGPU::input_constant)
+ return false;
+
+ MachineBasicBlock::iterator I = &MI;
+ unsigned DstReg = MI.getOperand(0).getReg();
+
+ for (unsigned i = 0; i < 4; i++) {
+ unsigned ReadReg = AMDGPU::R600_ArrayBaseRegClass.getRegister(
+ MI.getOperand(1).getImm());
+
+ unsigned Sel = AMDGPU::sel_x;
+ switch (i % 4) {
+ case 0:Sel = AMDGPU::sel_x;break;
+ case 1:Sel = AMDGPU::sel_y;break;
+ case 2:Sel = AMDGPU::sel_z;break;
+ case 3:Sel = AMDGPU::sel_w;break;
+ default:break;
+ }
+
+ unsigned Res = TRI.getSubReg(DstReg, Sel);
+
+ MachineBasicBlock &MBB = *(MI.getParent());
+ MachineInstr *NewMI = TII->buildDefaultInstruction(
+ MBB, I, AMDGPU::INTERP_LOAD_P0, Res, ReadReg);
+
+ if (i % 4 != 3)
+ TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST);
+ }
+
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
+
+ const R600RegisterInfo &TRI = TII->getRegisterInfo();
+
+ for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+ BB != BB_E; ++BB) {
+ MachineBasicBlock &MBB = *BB;
+ MachineBasicBlock::iterator I = MBB.begin();
+ while (I != MBB.end()) {
+ MachineInstr &MI = *I;
+ I = llvm::next(I);
+
+ switch (MI.getOpcode()) {
+ default: break;
+ // Expand PRED_X to one of the PRED_SET instructions.
+ case AMDGPU::PRED_X: {
+ uint64_t Flags = MI.getOperand(3).getImm();
+ // The native opcode used by PRED_X is stored as an immediate in the
+ // third operand.
+ MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + MI.getOperand(2).getImm(), // opcode + MI.getOperand(0).getReg(), // dst + MI.getOperand(1).getReg(), // src0 + AMDGPU::ZERO); // src1 + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + if (Flags & MO_FLAG_PUSH) { + TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); + } else { + TII->setImmOperand(PredSet, R600Operands::UPDATE_PREDICATE, 1); + } + MI.eraseFromParent(); + continue; + } + case AMDGPU::BREAK: + MachineInstr *PredSet = TII->buildDefaultInstruction(MBB, I, + AMDGPU::PRED_SETE_INT, + AMDGPU::PREDICATE_BIT, + AMDGPU::ZERO, + AMDGPU::ZERO); + TII->addFlag(PredSet, 0, MO_FLAG_MASK); + TII->setImmOperand(PredSet, R600Operands::UPDATE_EXEC_MASK, 1); + + BuildMI(MBB, I, MBB.findDebugLoc(I), + TII->get(AMDGPU::PREDICATED_BREAK)) + .addReg(AMDGPU::PREDICATE_BIT); + MI.eraseFromParent(); + continue; + } + + if (ExpandInputPerspective(MI)) + continue; + if (ExpandInputConstant(MI)) + continue; + + bool IsReduction = TII->isReductionOp(MI.getOpcode()); + bool IsVector = TII->isVector(MI); + bool IsCube = TII->isCubeOp(MI.getOpcode()); + if (!IsReduction && !IsVector && !IsCube) { + continue; + } + + // Expand the instruction + // + // Reduction instructions: + // T0_X = DP4 T1_XYZW, T2_XYZW + // becomes: + // TO_X = DP4 T1_X, T2_X + // TO_Y (write masked) = DP4 T1_Y, T2_Y + // TO_Z (write masked) = DP4 T1_Z, T2_Z + // TO_W (write masked) = DP4 T1_W, T2_W + // + // Vector instructions: + // T0_X = MULLO_INT T1_X, T2_X + // becomes: + // T0_X = MULLO_INT T1_X, T2_X + // T0_Y (write masked) = MULLO_INT T1_X, T2_X + // T0_Z (write masked) = MULLO_INT T1_X, T2_X + // T0_W (write masked) = MULLO_INT T1_X, T2_X + // + // Cube instructions: + // T0_XYZW = CUBE T1_XYZW + // becomes: + // TO_X = CUBE T1_Z, T1_Y + // T0_Y = CUBE T1_Z, T1_X + // T0_Z = CUBE T1_X, T1_Z + // T0_W = CUBE T1_Y, T1_Z + for (unsigned Chan = 0; Chan < 4; Chan++) { + unsigned DstReg = MI.getOperand( + TII->getOperandIdx(MI, R600Operands::DST)).getReg(); + unsigned Src0 = MI.getOperand( + TII->getOperandIdx(MI, R600Operands::SRC0)).getReg(); + unsigned Src1 = 0; + + // Determine the correct source registers + if (!IsCube) { + int Src1Idx = TII->getOperandIdx(MI, R600Operands::SRC1); + if (Src1Idx != -1) { + Src1 = MI.getOperand(Src1Idx).getReg(); + } + } + if (IsReduction) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + Src0 = TRI.getSubReg(Src0, SubRegIndex); + Src1 = TRI.getSubReg(Src1, SubRegIndex); + } else if (IsCube) { + static const int CubeSrcSwz[] = {2, 2, 0, 1}; + unsigned SubRegIndex0 = TRI.getSubRegFromChannel(CubeSrcSwz[Chan]); + unsigned SubRegIndex1 = TRI.getSubRegFromChannel(CubeSrcSwz[3 - Chan]); + Src1 = TRI.getSubReg(Src0, SubRegIndex1); + Src0 = TRI.getSubReg(Src0, SubRegIndex0); + } + + // Determine the correct destination registers; + bool Mask = false; + bool NotLast = true; + if (IsCube) { + unsigned SubRegIndex = TRI.getSubRegFromChannel(Chan); + DstReg = TRI.getSubReg(DstReg, SubRegIndex); + } else { + // Mask the write if the original instruction does not write to + // the current Channel. 
+ Mask = (Chan != TRI.getHWRegChan(DstReg)); + unsigned DstBase = TRI.getEncodingValue(DstReg) & HW_REG_MASK; + DstReg = AMDGPU::R600_TReg32RegClass.getRegister((DstBase * 4) + Chan); + } + + // Set the IsLast bit + NotLast = (Chan != 3 ); + + // Add the new instruction + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + case AMDGPU::CUBE_r600_pseudo: + Opcode = AMDGPU::CUBE_r600_real; + break; + case AMDGPU::CUBE_eg_pseudo: + Opcode = AMDGPU::CUBE_eg_real; + break; + case AMDGPU::DOT4_r600_pseudo: + Opcode = AMDGPU::DOT4_r600_real; + break; + case AMDGPU::DOT4_eg_pseudo: + Opcode = AMDGPU::DOT4_eg_real; + break; + default: + break; + } + + MachineInstr *NewMI = + TII->buildDefaultInstruction(MBB, I, Opcode, DstReg, Src0, Src1); + + if (Chan != 0) + NewMI->bundleWithPred(); + if (Mask) { + TII->addFlag(NewMI, 0, MO_FLAG_MASK); + } + if (NotLast) { + TII->addFlag(NewMI, 0, MO_FLAG_NOT_LAST); + } + } + MI.eraseFromParent(); + } + } + return false; +} diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp new file mode 100644 index 0000000000..28d5470b8c --- /dev/null +++ b/lib/Target/R600/R600ISelLowering.cpp @@ -0,0 +1,905 @@ +//===-- R600ISelLowering.cpp - R600 DAG Lowering Implementation -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for R600 +// +//===----------------------------------------------------------------------===// + +#include "R600ISelLowering.h" +#include "R600Defines.h" +#include "R600InstrInfo.h" +#include "R600MachineFunctionInfo.h" +#include "llvm/Argument.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" + +using namespace llvm; + +R600TargetLowering::R600TargetLowering(TargetMachine &TM) : + AMDGPUTargetLowering(TM), + TII(static_cast<const R600InstrInfo*>(TM.getInstrInfo())) { + setOperationAction(ISD::MUL, MVT::i64, Expand); + addRegisterClass(MVT::v4f32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::R600_Reg32RegClass); + addRegisterClass(MVT::v4i32, &AMDGPU::R600_Reg128RegClass); + addRegisterClass(MVT::i32, &AMDGPU::R600_Reg32RegClass); + computeRegisterProperties(); + + setOperationAction(ISD::FADD, MVT::v4f32, Expand); + setOperationAction(ISD::FMUL, MVT::v4f32, Expand); + setOperationAction(ISD::FDIV, MVT::v4f32, Expand); + setOperationAction(ISD::FSUB, MVT::v4f32, Expand); + + setOperationAction(ISD::ADD, MVT::v4i32, Expand); + setOperationAction(ISD::AND, MVT::v4i32, Expand); + setOperationAction(ISD::UDIV, MVT::v4i32, Expand); + setOperationAction(ISD::UREM, MVT::v4i32, Expand); + setOperationAction(ISD::SETCC, MVT::v4i32, Expand); + + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + setOperationAction(ISD::BR_CC, MVT::f32, Custom); + + setOperationAction(ISD::FSUB, MVT::f32, Expand); + + setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i1, Custom); + setOperationAction(ISD::FPOW, MVT::f32, Custom); + + setOperationAction(ISD::ROTL, MVT::i32, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + 
setOperationAction(ISD::SETCC, MVT::i32, Custom); + setOperationAction(ISD::SETCC, MVT::f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::i1, Custom); + + setOperationAction(ISD::SELECT, MVT::i32, Custom); + setOperationAction(ISD::SELECT, MVT::f32, Custom); + + setOperationAction(ISD::STORE, MVT::i32, Custom); + setOperationAction(ISD::STORE, MVT::v4i32, Custom); + + setTargetDAGCombine(ISD::FP_ROUND); + + setSchedulingPreference(Sched::VLIW); +} + +MachineBasicBlock * R600TargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + MachineFunction * MF = BB->getParent(); + MachineRegisterInfo &MRI = MF->getRegInfo(); + MachineBasicBlock::iterator I = *MI; + + switch (MI->getOpcode()) { + default: return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::SHADER_TYPE: break; + case AMDGPU::CLAMP_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_CLAMP); + break; + } + + case AMDGPU::FABS_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_ABS); + break; + } + + case AMDGPU::FNEG_R600: { + MachineInstr *NewMI = TII->buildDefaultInstruction(*BB, I, + AMDGPU::MOV, + MI->getOperand(0).getReg(), + MI->getOperand(1).getReg()); + TII->addFlag(NewMI, 0, MO_FLAG_NEG); + break; + } + + case AMDGPU::R600_LOAD_CONST: { + int64_t RegIndex = MI->getOperand(1).getImm(); + unsigned ConstantReg = AMDGPU::R600_CReg32RegClass.getRegister(RegIndex); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::COPY)) + .addOperand(MI->getOperand(0)) + .addReg(ConstantReg); + break; + } + + case AMDGPU::MASK_WRITE: { + unsigned maskedRegister = MI->getOperand(0).getReg(); + assert(TargetRegisterInfo::isVirtualRegister(maskedRegister)); + MachineInstr * defInstr = MRI.getVRegDef(maskedRegister); + TII->addFlag(defInstr, 0, MO_FLAG_MASK); + break; + } + + case AMDGPU::MOV_IMM_F32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getFPImm()->getValueAPF() + .bitcastToAPInt().getZExtValue()); + break; + case AMDGPU::MOV_IMM_I32: + TII->buildMovImm(*BB, I, MI->getOperand(0).getReg(), + MI->getOperand(1).getImm()); + break; + + + case AMDGPU::RAT_WRITE_CACHELESS_32_eg: + case AMDGPU::RAT_WRITE_CACHELESS_128_eg: { + unsigned EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 
1 : 0; + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode())) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addImm(EOP); // Set End of program bit + break; + } + + case AMDGPU::RESERVE_REG: { + R600MachineFunctionInfo * MFI = MF->getInfo<R600MachineFunctionInfo>(); + int64_t ReservedIndex = MI->getOperand(0).getImm(); + unsigned ReservedReg = + AMDGPU::R600_TReg32RegClass.getRegister(ReservedIndex); + MFI->ReservedRegs.push_back(ReservedReg); + unsigned SuperReg = + AMDGPU::R600_Reg128RegClass.getRegister(ReservedIndex / 4); + MFI->ReservedRegs.push_back(SuperReg); + break; + } + + case AMDGPU::TXD: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::TXD_SHADOW: { + unsigned T0 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + unsigned T1 = MRI.createVirtualRegister(&AMDGPU::R600_Reg128RegClass); + + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_H), T0) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SET_GRADIENTS_V), T1) + .addOperand(MI->getOperand(2)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::TEX_SAMPLE_C_G)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(4)) + .addOperand(MI->getOperand(5)) + .addOperand(MI->getOperand(6)) + .addReg(T0, RegState::Implicit) + .addReg(T1, RegState::Implicit); + break; + } + + case AMDGPU::BRANCH: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)) + .addReg(0); + break; + + case AMDGPU::BRANCH_COND_f32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::BRANCH_COND_i32: { + MachineInstr *NewMI = + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::PRED_X), + AMDGPU::PREDICATE_BIT) + .addOperand(MI->getOperand(1)) + .addImm(OPCODE_IS_NOT_ZERO_INT) + .addImm(0); // Flags + TII->addFlag(NewMI, 0, MO_FLAG_PUSH); + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::JUMP)) + .addOperand(MI->getOperand(0)) + .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill); + break; + } + + case AMDGPU::input_perspective: { + R600MachineFunctionInfo 
*MFI = MF->getInfo<R600MachineFunctionInfo>();
+
+    // XXX Be more fine-grained about register reservation
+    for (unsigned i = 0; i < 4; i++) {
+      unsigned ReservedReg = AMDGPU::R600_TReg32RegClass.getRegister(i);
+      MFI->ReservedRegs.push_back(ReservedReg);
+    }
+
+    switch (MI->getOperand(1).getImm()) {
+    case 0: // Perspective
+      MFI->HasPerspectiveInterpolation = true;
+      break;
+    case 1: // Linear
+      MFI->HasLinearInterpolation = true;
+      break;
+    default:
+      assert(0 && "Unknown ij index");
+    }
+
+    return BB;
+  }
+
+  case AMDGPU::EG_ExportSwz:
+  case AMDGPU::R600_ExportSwz: {
+    bool EOP = (llvm::next(I)->getOpcode() == AMDGPU::RETURN) ? 1 : 0;
+    if (!EOP)
+      return BB;
+    unsigned CfInst = (MI->getOpcode() == AMDGPU::EG_ExportSwz) ? 84 : 40;
+    BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(MI->getOpcode()))
+            .addOperand(MI->getOperand(0))
+            .addOperand(MI->getOperand(1))
+            .addOperand(MI->getOperand(2))
+            .addOperand(MI->getOperand(3))
+            .addOperand(MI->getOperand(4))
+            .addOperand(MI->getOperand(5))
+            .addOperand(MI->getOperand(6))
+            .addImm(CfInst)
+            .addImm(1);
+    break;
+  }
+  }
+
+  MI->eraseFromParent();
+  return BB;
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Lowering Operations
+//===----------------------------------------------------------------------===//
+
+using namespace llvm::Intrinsic;
+using namespace llvm::AMDGPUIntrinsic;
+
+/// Fold a scalar export into the vector EXPORT node for its slot: the first
+/// scalar exported to a slot creates a new EXPORT node with a single-channel
+/// write mask, and later scalars are inserted into that node's vector operand
+/// while widening its mask.
+static SDValue
+InsertScalarToRegisterExport(SelectionDAG &DAG, DebugLoc DL, SDNode **ExportMap,
+    unsigned Slot, unsigned Channel, unsigned Inst, unsigned Type,
+    SDValue Scalar, SDValue Chain) {
+  if (!ExportMap[Slot]) {
+    SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
+        DL, MVT::v4f32,
+        DAG.getUNDEF(MVT::v4f32),
+        Scalar,
+        DAG.getConstant(Channel, MVT::i32));
+
+    unsigned Mask = 1 << Channel;
+
+    const SDValue Ops[] = {Chain, Vector, DAG.getConstant(Inst, MVT::i32),
+        DAG.getConstant(Type, MVT::i32), DAG.getConstant(Slot, MVT::i32),
+        DAG.getConstant(Mask, MVT::i32)};
+
+    SDValue Res = DAG.getNode(
+        AMDGPUISD::EXPORT,
+        DL,
+        MVT::Other,
+        Ops, 6);
+    ExportMap[Slot] = Res.getNode();
+    return Res;
+  }
+
+  SDNode *ExportInstruction = ExportMap[Slot];
+  SDValue PreviousVector = ExportInstruction->getOperand(1);
+  SDValue Vector = DAG.getNode(ISD::INSERT_VECTOR_ELT,
+      DL, MVT::v4f32,
+      PreviousVector,
+      Scalar,
+      DAG.getConstant(Channel, MVT::i32));
+
+  unsigned Mask = dyn_cast<ConstantSDNode>(ExportInstruction->getOperand(5))
+      ->getZExtValue();
+  Mask |= (1 << Channel);
+
+  const SDValue Ops[] = {ExportInstruction->getOperand(0), Vector,
+      DAG.getConstant(Inst, MVT::i32),
+      DAG.getConstant(Type, MVT::i32),
+      DAG.getConstant(Slot, MVT::i32),
+      DAG.getConstant(Mask, MVT::i32)};
+
+  DAG.UpdateNodeOperands(ExportInstruction,
+      Ops, 6);
+
+  return Chain;
+}
+
+SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+  switch (Op.getOpcode()) {
+  default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+  case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+  case ISD::ROTL: return LowerROTL(Op, DAG);
+  case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
+  case ISD::SELECT: return LowerSELECT(Op, DAG);
+  case ISD::SETCC: return LowerSETCC(Op, DAG);
+  case ISD::STORE: return LowerSTORE(Op, DAG);
+  case ISD::FPOW: return LowerFPOW(Op, DAG);
+  case ISD::INTRINSIC_VOID: {
+    SDValue Chain = Op.getOperand(0);
+    unsigned IntrinsicID =
+        cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    switch (IntrinsicID) {
+    case AMDGPUIntrinsic::AMDGPU_store_output: {
+      MachineFunction &MF = DAG.getMachineFunction();
+      MachineRegisterInfo &MRI = MF.getRegInfo();
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
+      if (!MRI.isLiveOut(Reg)) {
+        MRI.addLiveOut(Reg);
+      }
+      return DAG.getCopyToReg(Chain, Op.getDebugLoc(), Reg, Op.getOperand(2));
+    }
+    case AMDGPUIntrinsic::R600_store_pixel_color: {
+      MachineFunction &MF = DAG.getMachineFunction();
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+
+      SDNode **OutputsMap = MFI->Outputs;
+      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
+          RegIndex / 4, RegIndex % 4, 0, 0, Op.getOperand(2),
+          Chain);
+    }
+    case AMDGPUIntrinsic::R600_store_stream_output: {
+      MachineFunction &MF = DAG.getMachineFunction();
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(3))->getZExtValue();
+      int64_t BufIndex = cast<ConstantSDNode>(Op.getOperand(4))->getZExtValue();
+
+      SDNode **OutputsMap = MFI->StreamOutputs[BufIndex];
+      unsigned Inst;
+      switch (BufIndex) {
+      // STREAM3
+      case 3:
+        Inst = 4;
+        break;
+      // STREAM2
+      case 2:
+        Inst = 3;
+        break;
+      // STREAM1
+      case 1:
+        Inst = 2;
+        break;
+      // STREAM0
+      case 0:
+        Inst = 1;
+        break;
+      default:
+        llvm_unreachable("Wrong buffer id for stream outputs!");
+      }
+
+      return InsertScalarToRegisterExport(DAG, Op.getDebugLoc(), OutputsMap,
+          RegIndex / 4, RegIndex % 4, Inst, 0, Op.getOperand(2),
+          Chain);
+    }
+    // default for switch(IntrinsicID)
+    default: break;
+    }
+    // break out of case ISD::INTRINSIC_VOID in switch(Op.getOpcode())
+    break;
+  }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IntrinsicID =
+        cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+    EVT VT = Op.getValueType();
+    DebugLoc DL = Op.getDebugLoc();
+    switch (IntrinsicID) {
+    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    case AMDGPUIntrinsic::R600_load_input: {
+      int64_t RegIndex = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      unsigned Reg = AMDGPU::R600_TReg32RegClass.getRegister(RegIndex);
+      return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, Reg, VT);
+    }
+    case AMDGPUIntrinsic::R600_load_input_perspective: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      if (slot < 0)
+        return DAG.getUNDEF(MVT::f32);
+      SDValue FullVector = DAG.getNode(
+          AMDGPUISD::INTERP,
+          DL, MVT::v4f32,
+          DAG.getConstant(0, MVT::i32), DAG.getConstant(slot / 4, MVT::i32));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
+    }
+    case AMDGPUIntrinsic::R600_load_input_linear: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      if (slot < 0)
+        return DAG.getUNDEF(MVT::f32);
+      SDValue FullVector = DAG.getNode(
+          AMDGPUISD::INTERP,
+          DL, MVT::v4f32,
+          DAG.getConstant(1, MVT::i32), DAG.getConstant(slot / 4, MVT::i32));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, VT, FullVector, DAG.getConstant(slot % 4, MVT::i32));
+    }
+    case AMDGPUIntrinsic::R600_load_input_constant: {
+      int slot = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+      if (slot < 0)
+        return DAG.getUNDEF(MVT::f32);
+      SDValue FullVector = DAG.getNode(
+          AMDGPUISD::INTERP_P0,
+          DL, MVT::v4f32,
+          DAG.getConstant(slot / 4, MVT::i32));
+      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT,
+          DL, VT, FullVector,
DAG.getConstant(slot % 4, MVT::i32)); + } + + case r600_read_ngroups_x: + return LowerImplicitParameter(DAG, VT, DL, 0); + case r600_read_ngroups_y: + return LowerImplicitParameter(DAG, VT, DL, 1); + case r600_read_ngroups_z: + return LowerImplicitParameter(DAG, VT, DL, 2); + case r600_read_global_size_x: + return LowerImplicitParameter(DAG, VT, DL, 3); + case r600_read_global_size_y: + return LowerImplicitParameter(DAG, VT, DL, 4); + case r600_read_global_size_z: + return LowerImplicitParameter(DAG, VT, DL, 5); + case r600_read_local_size_x: + return LowerImplicitParameter(DAG, VT, DL, 6); + case r600_read_local_size_y: + return LowerImplicitParameter(DAG, VT, DL, 7); + case r600_read_local_size_z: + return LowerImplicitParameter(DAG, VT, DL, 8); + + case r600_read_tgid_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_X, VT); + case r600_read_tgid_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Y, VT); + case r600_read_tgid_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T1_Z, VT); + case r600_read_tidig_x: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_X, VT); + case r600_read_tidig_y: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Y, VT); + case r600_read_tidig_z: + return CreateLiveInRegister(DAG, &AMDGPU::R600_TReg32RegClass, + AMDGPU::T0_Z, VT); + } + // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode()) + break; + } + } // end switch(Op.getOpcode()) + return SDValue(); +} + +void R600TargetLowering::ReplaceNodeResults(SDNode *N, + SmallVectorImpl<SDValue> &Results, + SelectionDAG &DAG) const { + switch (N->getOpcode()) { + default: return; + case ISD::FP_TO_UINT: Results.push_back(LowerFPTOUINT(N->getOperand(0), DAG)); + } +} + +SDValue R600TargetLowering::LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const { + return DAG.getNode( + ISD::SETCC, + Op.getDebugLoc(), + MVT::i1, + Op, DAG.getConstantFP(0.0f, MVT::f32), + DAG.getCondCode(ISD::SETNE) + ); +} + +SDValue R600TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue CC = Op.getOperand(1); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue JumpT = Op.getOperand(4); + SDValue CmpValue; + SDValue Result; + + if (LHS.getValueType() == MVT::i32) { + CmpValue = DAG.getNode( + ISD::SELECT_CC, + Op.getDebugLoc(), + MVT::i32, + LHS, RHS, + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + CC); + } else if (LHS.getValueType() == MVT::f32) { + CmpValue = DAG.getNode( + ISD::SELECT_CC, + Op.getDebugLoc(), + MVT::f32, + LHS, RHS, + DAG.getConstantFP(1.0f, MVT::f32), + DAG.getConstantFP(0.0f, MVT::f32), + CC); + } else { + assert(0 && "Not valid type for br_cc"); + } + Result = DAG.getNode( + AMDGPUISD::BRANCH_COND, + CmpValue.getDebugLoc(), + MVT::Other, Chain, + JumpT, CmpValue); + return Result; +} + +SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + DebugLoc DL, + unsigned DwordOffset) const { + unsigned ByteOffset = DwordOffset * 4; + PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()), + AMDGPUAS::PARAM_I_ADDRESS); + + // We shouldn't be using an offset wider than 16-bits for implicit parameters. 
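+  // Worked example: r600_read_global_size_x is lowered above with
+  // DwordOffset 3, so its value is loaded from byte offset 12 of the
+  // PARAM_I_ADDRESS space.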
+  assert(isInt<16>(ByteOffset));
+
+  return DAG.getLoad(VT, DL, DAG.getEntryNode(),
+                     DAG.getConstant(ByteOffset, MVT::i32), // PTR
+                     MachinePointerInfo(ConstantPointerNull::get(PtrType)),
+                     false, false, false, 0);
+}
+
+SDValue R600TargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  return DAG.getNode(AMDGPUISD::BITALIGN, DL, VT,
+                     Op.getOperand(0),
+                     Op.getOperand(0),
+                     DAG.getNode(ISD::SUB, DL, VT,
+                                 DAG.getConstant(32, MVT::i32),
+                                 Op.getOperand(1)));
+}
+
+bool R600TargetLowering::isZero(SDValue Op) const {
+  if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+    return Cst->isNullValue();
+  } else if (ConstantFPSDNode *CstFP = dyn_cast<ConstantFPSDNode>(Op)) {
+    return CstFP->isZero();
+  } else {
+    return false;
+  }
+}
+
+SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
+  DebugLoc DL = Op.getDebugLoc();
+  EVT VT = Op.getValueType();
+
+  SDValue LHS = Op.getOperand(0);
+  SDValue RHS = Op.getOperand(1);
+  SDValue True = Op.getOperand(2);
+  SDValue False = Op.getOperand(3);
+  SDValue CC = Op.getOperand(4);
+  SDValue Temp;
+
+  // LHS and RHS are guaranteed to be the same value type
+  EVT CompareVT = LHS.getValueType();
+
+  // Check if we can lower this to a native operation.
+
+  // Try to lower to a CND* instruction:
+  // CND* instructions require RHS to be zero.  Some SELECT_CC nodes that
+  // can be lowered to CND* instructions can also be lowered to SET*
+  // instructions.  CND* instructions are cheaper, because they don't
+  // require additional instructions to convert their result to the correct
+  // value type, so this check should be first.
+  if (isZero(LHS) || isZero(RHS)) {
+    SDValue Cond = (isZero(LHS) ? RHS : LHS);
+    SDValue Zero = (isZero(LHS) ? LHS : RHS);
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    if (CompareVT != VT) {
+      // Bitcast True / False to the correct types.  This will end up being
+      // a nop, but it allows us to define only a single pattern in the
+      // .TD files for each CND* instruction rather than having to have
+      // one pattern for integer True/False and one for fp True/False.
+      True = DAG.getNode(ISD::BITCAST, DL, CompareVT, True);
+      False = DAG.getNode(ISD::BITCAST, DL, CompareVT, False);
+    }
+    if (isZero(LHS)) {
+      CCOpcode = ISD::getSetCCSwappedOperands(CCOpcode);
+    }
+
+    switch (CCOpcode) {
+    case ISD::SETONE:
+    case ISD::SETUNE:
+    case ISD::SETNE:
+    case ISD::SETULE:
+    case ISD::SETULT:
+    case ISD::SETOLE:
+    case ISD::SETOLT:
+    case ISD::SETLE:
+    case ISD::SETLT:
+      CCOpcode = ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32);
+      Temp = True;
+      True = False;
+      False = Temp;
+      break;
+    default:
+      break;
+    }
+    SDValue SelectNode = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+                                     Cond, Zero,
+                                     True, False,
+                                     DAG.getCondCode(CCOpcode));
+    return DAG.getNode(ISD::BITCAST, DL, VT, SelectNode);
+  }
+
+  // Try to lower to a SET* instruction:
+  // We need all the operands of SELECT_CC to have the same value type, so if
+  // necessary we need to change True and False to be the same type as LHS and
+  // RHS, and then convert the result of the select_cc back to the correct type.
+
+  // Move hardware True/False values to the correct operand.
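+  // For example (illustrative values):
+  //   (i32 select_cc a, b, 0, -1, setlt)
+  // is rewritten by the swap below into
+  //   (i32 select_cc a, b, -1, 0, setge)
+  // i.e. True/False swapped and the condition code inverted.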
+  if (isHWTrueValue(False) && isHWFalseValue(True)) {
+    ISD::CondCode CCOpcode = cast<CondCodeSDNode>(CC)->get();
+    std::swap(False, True);
+    CC = DAG.getCondCode(ISD::getSetCCInverse(CCOpcode, CompareVT == MVT::i32));
+  }
+
+  if (isHWTrueValue(True) && isHWFalseValue(False)) {
+    if (CompareVT != VT) {
+      if (VT == MVT::f32 && CompareVT == MVT::i32) {
+        SDValue Boolean = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+                                      LHS, RHS,
+                                      DAG.getConstant(-1, MVT::i32),
+                                      DAG.getConstant(0, MVT::i32),
+                                      CC);
+        // Convert integer values of true (-1) and false (0) to fp values of
+        // true (1.0f) and false (0.0f).
+        SDValue LSB = DAG.getNode(ISD::AND, DL, MVT::i32, Boolean,
+                                  DAG.getConstant(1, MVT::i32));
+        return DAG.getNode(ISD::UINT_TO_FP, DL, VT, LSB);
+      } else if (VT == MVT::i32 && CompareVT == MVT::f32) {
+        SDValue BoolAsFlt = DAG.getNode(ISD::SELECT_CC, DL, CompareVT,
+                                        LHS, RHS,
+                                        DAG.getConstantFP(1.0f, MVT::f32),
+                                        DAG.getConstantFP(0.0f, MVT::f32),
+                                        CC);
+        // Convert fp values of true (1.0f) and false (0.0f) to integer values
+        // of true (-1) and false (0).
+        SDValue Neg = DAG.getNode(ISD::FNEG, DL, MVT::f32, BoolAsFlt);
+        return DAG.getNode(ISD::FP_TO_SINT, DL, VT, Neg);
+      } else {
+        // I don't think there will be any other type pairings.
+        assert(!"Unhandled operand type pairings in SELECT_CC");
+      }
+    } else {
+      // This SELECT_CC is already legal.
+      return DAG.getNode(ISD::SELECT_CC, DL, VT, LHS, RHS, True, False, CC);
+    }
+  }
+
+  // Possible Min/Max pattern
+  SDValue MinMax = LowerMinMax(Op, DAG);
+  if (MinMax.getNode()) {
+    return MinMax;
+  }
+
+  // If we make it this far, it means we have no native instructions to handle
+  // this SELECT_CC, so we must lower it.
+  SDValue HWTrue, HWFalse;
+
+  if (CompareVT == MVT::f32) {
+    HWTrue = DAG.getConstantFP(1.0f, CompareVT);
+    HWFalse = DAG.getConstantFP(0.0f, CompareVT);
+  } else if (CompareVT == MVT::i32) {
+    HWTrue = DAG.getConstant(-1, CompareVT);
+    HWFalse = DAG.getConstant(0, CompareVT);
+  } else {
+    assert(!"Unhandled value type in LowerSELECT_CC");
+  }
+
+  // Lower this unsupported SELECT_CC into a combination of two supported
+  // SELECT_CC operations.
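+  // For example, assuming an i32 compare that selects arbitrary f32 values:
+  //   (f32 select_cc i32:a, i32:b, f32:t, f32:f, cc)
+  // becomes
+  //   i32 cond = (select_cc a, b, -1, 0, cc)       // a SET* instruction
+  //   f32 res  = (select_cc cond, 0, t, f, setne)  // a CND* instruction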
+ SDValue Cond = DAG.getNode(ISD::SELECT_CC, DL, CompareVT, LHS, RHS, HWTrue, HWFalse, CC); + + return DAG.getNode(ISD::SELECT_CC, DL, VT, + Cond, HWFalse, + True, False, + DAG.getCondCode(ISD::SETNE)); +} + +SDValue R600TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { + return DAG.getNode(ISD::SELECT_CC, + Op.getDebugLoc(), + Op.getValueType(), + Op.getOperand(0), + DAG.getConstant(0, MVT::i32), + Op.getOperand(1), + Op.getOperand(2), + DAG.getCondCode(ISD::SETNE)); +} + +SDValue R600TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { + SDValue Cond; + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue CC = Op.getOperand(2); + DebugLoc DL = Op.getDebugLoc(); + assert(Op.getValueType() == MVT::i32); + if (LHS.getValueType() == MVT::i32) { + Cond = DAG.getNode( + ISD::SELECT_CC, + Op.getDebugLoc(), + MVT::i32, + LHS, RHS, + DAG.getConstant(-1, MVT::i32), + DAG.getConstant(0, MVT::i32), + CC); + } else if (LHS.getValueType() == MVT::f32) { + Cond = DAG.getNode( + ISD::SELECT_CC, + Op.getDebugLoc(), + MVT::f32, + LHS, RHS, + DAG.getConstantFP(1.0f, MVT::f32), + DAG.getConstantFP(0.0f, MVT::f32), + CC); + Cond = DAG.getNode( + ISD::FP_TO_SINT, + DL, + MVT::i32, + Cond); + } else { + assert(0 && "Not valid type for set_cc"); + } + Cond = DAG.getNode( + ISD::AND, + DL, + MVT::i32, + DAG.getConstant(1, MVT::i32), + Cond); + return Cond; +} + +SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + StoreSDNode *StoreNode = cast<StoreSDNode>(Op); + SDValue Chain = Op.getOperand(0); + SDValue Value = Op.getOperand(1); + SDValue Ptr = Op.getOperand(2); + + if (StoreNode->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && + Ptr->getOpcode() != AMDGPUISD::DWORDADDR) { + // Convert pointer from byte address to dword address. + Ptr = DAG.getNode(AMDGPUISD::DWORDADDR, DL, Ptr.getValueType(), + DAG.getNode(ISD::SRL, DL, Ptr.getValueType(), + Ptr, DAG.getConstant(2, MVT::i32))); + + if (StoreNode->isTruncatingStore() || StoreNode->isIndexed()) { + assert(!"Truncated and indexed stores not supported yet"); + } else { + Chain = DAG.getStore(Chain, DL, Value, Ptr, StoreNode->getMemOperand()); + } + return Chain; + } + return SDValue(); +} + + +SDValue R600TargetLowering::LowerFPOW(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue LogBase = DAG.getNode(ISD::FLOG2, DL, VT, Op.getOperand(0)); + SDValue MulLogBase = DAG.getNode(ISD::FMUL, DL, VT, Op.getOperand(1), LogBase); + return DAG.getNode(ISD::FEXP2, DL, VT, MulLogBase); +} + +/// XXX Only kernel functions are supported, so we can assume for now that +/// every function is a kernel function, but in the future we should use +/// separate calling conventions for kernel and non-kernel functions. +SDValue R600TargetLowering::LowerFormalArguments( + SDValue Chain, + CallingConv::ID CallConv, + bool isVarArg, + const SmallVectorImpl<ISD::InputArg> &Ins, + DebugLoc DL, SelectionDAG &DAG, + SmallVectorImpl<SDValue> &InVals) const { + unsigned ParamOffsetBytes = 36; + Function::const_arg_iterator FuncArg = + DAG.getMachineFunction().getFunction()->arg_begin(); + for (unsigned i = 0, e = Ins.size(); i < e; ++i, ++FuncArg) { + EVT VT = Ins[i].VT; + Type *ArgType = FuncArg->getType(); + unsigned ArgSizeInBits = ArgType->isPointerTy() ? 
+                                  32 : ArgType->getPrimitiveSizeInBits();
+    unsigned ArgBytes = ArgSizeInBits >> 3;
+    EVT ArgVT;
+    if (ArgSizeInBits < VT.getSizeInBits()) {
+      assert(!ArgType->isFloatTy() &&
+             "Extending floating point arguments not supported yet");
+      ArgVT = MVT::getIntegerVT(ArgSizeInBits);
+    } else {
+      ArgVT = VT;
+    }
+    PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
+                                          AMDGPUAS::PARAM_I_ADDRESS);
+    SDValue Arg = DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, DAG.getRoot(),
+                                 DAG.getConstant(ParamOffsetBytes, MVT::i32),
+                                 MachinePointerInfo(new Argument(PtrTy)),
+                                 ArgVT, false, false, ArgBytes);
+    InVals.push_back(Arg);
+    ParamOffsetBytes += ArgBytes;
+  }
+  return Chain;
+}
+
+EVT R600TargetLowering::getSetCCResultType(EVT VT) const {
+  if (!VT.isVector()) return MVT::i32;
+  return VT.changeVectorElementTypeToInteger();
+}
+
+//===----------------------------------------------------------------------===//
+// Custom DAG Optimizations
+//===----------------------------------------------------------------------===//
+
+SDValue R600TargetLowering::PerformDAGCombine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+
+  switch (N->getOpcode()) {
+  // (f32 fp_round (f64 uint_to_fp a)) -> (f32 uint_to_fp a)
+  case ISD::FP_ROUND: {
+      SDValue Arg = N->getOperand(0);
+      if (Arg.getOpcode() == ISD::UINT_TO_FP && Arg.getValueType() == MVT::f64) {
+        return DAG.getNode(ISD::UINT_TO_FP, N->getDebugLoc(), N->getValueType(0),
+                           Arg.getOperand(0));
+      }
+      break;
+    }
+  }
+  return SDValue();
+}
diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
new file mode 100644
index 0000000000..2b954dab55
--- /dev/null
+++ b/lib/Target/R600/R600ISelLowering.h
@@ -0,0 +1,72 @@
+//===-- R600ISelLowering.h - R600 DAG Lowering Interface -*- C++ -*--------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief R600 DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600ISELLOWERING_H
+#define R600ISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+
+namespace llvm {
+
+class R600InstrInfo;
+
+class R600TargetLowering : public AMDGPUTargetLowering {
+public:
+  R600TargetLowering(TargetMachine &TM);
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr *MI,
+      MachineBasicBlock * BB) const;
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  void ReplaceNodeResults(SDNode * N,
+                          SmallVectorImpl<SDValue> &Results,
+                          SelectionDAG &DAG) const;
+  virtual SDValue LowerFormalArguments(
+      SDValue Chain,
+      CallingConv::ID CallConv,
+      bool isVarArg,
+      const SmallVectorImpl<ISD::InputArg> &Ins,
+      DebugLoc DL, SelectionDAG &DAG,
+      SmallVectorImpl<SDValue> &InVals) const;
+  virtual EVT getSetCCResultType(EVT VT) const;
+private:
+  const R600InstrInfo * TII;
+
+  /// Each OpenCL kernel has nine implicit parameters that are stored in the
+  /// first nine dwords of a Vertex Buffer.  These implicit parameters are
+  /// lowered to load instructions which retrieve the values from the Vertex
+  /// Buffer.
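+  ///
+  /// As wired up in LowerOperation, the dword offsets are: 0-2 for
+  /// ngroups.{x,y,z}, 3-5 for global_size.{x,y,z}, and 6-8 for
+  /// local_size.{x,y,z}.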
+ SDValue LowerImplicitParameter(SelectionDAG &DAG, EVT VT, + DebugLoc DL, unsigned DwordOffset) const; + + void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB, + MachineRegisterInfo & MRI, unsigned dword_offset) const; + + SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const; + + /// \brief Lower ROTL opcode to BITALIGN + SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFPOW(SDValue Op, SelectionDAG &DAG) const; + + bool isZero(SDValue Op) const; +}; + +} // End namespace llvm; + +#endif // R600ISELLOWERING_H diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp new file mode 100644 index 0000000000..79bb97584e --- /dev/null +++ b/lib/Target/R600/R600InstrInfo.cpp @@ -0,0 +1,664 @@ +//===-- R600InstrInfo.cpp - R600 Instruction Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + +#include "R600InstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "AMDGPUSubtarget.h" +#include "R600Defines.h" +#include "R600RegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +#define GET_INSTRINFO_CTOR +#include "AMDGPUGenDFAPacketizer.inc" + +using namespace llvm; + +R600InstrInfo::R600InstrInfo(AMDGPUTargetMachine &tm) + : AMDGPUInstrInfo(tm), + RI(tm, *this) + { } + +const R600RegisterInfo &R600InstrInfo::getRegisterInfo() const { + return RI; +} + +bool R600InstrInfo::isTrig(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::TRIG; +} + +bool R600InstrInfo::isVector(const MachineInstr &MI) const { + return get(MI.getOpcode()).TSFlags & R600_InstFlag::VECTOR; +} + +void +R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + if (AMDGPU::R600_Reg128RegClass.contains(DestReg) + && AMDGPU::R600_Reg128RegClass.contains(SrcReg)) { + for (unsigned I = 0; I < 4; I++) { + unsigned SubRegIndex = RI.getSubRegFromChannel(I); + buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + RI.getSubReg(DestReg, SubRegIndex), + RI.getSubReg(SrcReg, SubRegIndex)) + .addReg(DestReg, + RegState::Define | RegState::Implicit); + } + } else { + + // We can't copy vec4 registers + assert(!AMDGPU::R600_Reg128RegClass.contains(DestReg) + && !AMDGPU::R600_Reg128RegClass.contains(SrcReg)); + + MachineInstr *NewMI = buildDefaultInstruction(MBB, MI, AMDGPU::MOV, + DestReg, SrcReg); + NewMI->getOperand(getOperandIdx(*NewMI, R600Operands::SRC0)) + .setIsKill(KillSrc); + } +} + +MachineInstr * R600InstrInfo::getMovImmInstr(MachineFunction *MF, + unsigned DstReg, int64_t Imm) const { + MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::MOV), DebugLoc()); + MachineInstrBuilder(MI).addReg(DstReg, RegState::Define); + MachineInstrBuilder(MI).addReg(AMDGPU::ALU_LITERAL_X); + MachineInstrBuilder(MI).addImm(Imm); + MachineInstrBuilder(MI).addReg(0); 
// PREDICATE_BIT + + return MI; +} + +unsigned R600InstrInfo::getIEQOpcode() const { + return AMDGPU::SETE_INT; +} + +bool R600InstrInfo::isMov(unsigned Opcode) const { + + + switch(Opcode) { + default: return false; + case AMDGPU::MOV: + case AMDGPU::MOV_IMM_F32: + case AMDGPU::MOV_IMM_I32: + return true; + } +} + +// Some instructions act as place holders to emulate operations that the GPU +// hardware does automatically. This function can be used to check if +// an opcode falls into this category. +bool R600InstrInfo::isPlaceHolderOpcode(unsigned Opcode) const { + switch (Opcode) { + default: return false; + case AMDGPU::RETURN: + case AMDGPU::RESERVE_REG: + return true; + } +} + +bool R600InstrInfo::isReductionOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::DOT4_r600_pseudo: + case AMDGPU::DOT4_eg_pseudo: + return true; + } +} + +bool R600InstrInfo::isCubeOp(unsigned Opcode) const { + switch(Opcode) { + default: return false; + case AMDGPU::CUBE_r600_pseudo: + case AMDGPU::CUBE_r600_real: + case AMDGPU::CUBE_eg_pseudo: + case AMDGPU::CUBE_eg_real: + return true; + } +} + +bool R600InstrInfo::isALUInstr(unsigned Opcode) const { + unsigned TargetFlags = get(Opcode).TSFlags; + + return ((TargetFlags & R600_InstFlag::OP1) | + (TargetFlags & R600_InstFlag::OP2) | + (TargetFlags & R600_InstFlag::OP3)); +} + +DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM, + const ScheduleDAG *DAG) const { + const InstrItineraryData *II = TM->getInstrItineraryData(); + return TM->getSubtarget<AMDGPUSubtarget>().createDFAPacketizer(II); +} + +static bool +isPredicateSetter(unsigned Opcode) { + switch (Opcode) { + case AMDGPU::PRED_X: + return true; + default: + return false; + } +} + +static MachineInstr * +findFirstPredicateSetterFrom(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I) { + while (I != MBB.begin()) { + --I; + MachineInstr *MI = I; + if (isPredicateSetter(MI->getOpcode())) + return MI; + } + + return NULL; +} + +bool +R600InstrInfo::AnalyzeBranch(MachineBasicBlock &MBB, + MachineBasicBlock *&TBB, + MachineBasicBlock *&FBB, + SmallVectorImpl<MachineOperand> &Cond, + bool AllowModify) const { + // Most of the following comes from the ARM implementation of AnalyzeBranch + + // If the block has no terminators, it just falls into the block after it. + MachineBasicBlock::iterator I = MBB.end(); + if (I == MBB.begin()) + return false; + --I; + while (I->isDebugValue()) { + if (I == MBB.begin()) + return false; + --I; + } + if (static_cast<MachineInstr *>(I)->getOpcode() != AMDGPU::JUMP) { + return false; + } + + // Get the last instruction in the block. + MachineInstr *LastInst = I; + + // If there is only one terminator instruction, process it. + unsigned LastOpc = LastInst->getOpcode(); + if (I == MBB.begin() || + static_cast<MachineInstr *>(--I)->getOpcode() != AMDGPU::JUMP) { + if (LastOpc == AMDGPU::JUMP) { + if(!isPredicated(LastInst)) { + TBB = LastInst->getOperand(0).getMBB(); + return false; + } else { + MachineInstr *predSet = I; + while (!isPredicateSetter(predSet->getOpcode())) { + predSet = --I; + } + TBB = LastInst->getOperand(0).getMBB(); + Cond.push_back(predSet->getOperand(1)); + Cond.push_back(predSet->getOperand(2)); + Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false)); + return false; + } + } + return true; // Can't handle indirect branch. + } + + // Get the instruction before it if it is a terminator. 
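+  // The only two-terminator shape handled below is a predicated JUMP
+  // followed by an unpredicated one, e.g. (illustrative):
+  //   JUMP <TBB>, %PREDICATE_BIT   ; conditional branch
+  //   JUMP <FBB>                   ; unconditional branch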
+  MachineInstr *SecondLastInst = I;
+  unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+  // If the block ends with a B and a Bcc, handle it.
+  if (SecondLastOpc == AMDGPU::JUMP &&
+      isPredicated(SecondLastInst) &&
+      LastOpc == AMDGPU::JUMP &&
+      !isPredicated(LastInst)) {
+    MachineInstr *predSet = --I;
+    while (!isPredicateSetter(predSet->getOpcode())) {
+      predSet = --I;
+    }
+    TBB = SecondLastInst->getOperand(0).getMBB();
+    FBB = LastInst->getOperand(0).getMBB();
+    Cond.push_back(predSet->getOperand(1));
+    Cond.push_back(predSet->getOperand(2));
+    Cond.push_back(MachineOperand::CreateReg(AMDGPU::PRED_SEL_ONE, false));
+    return false;
+  }
+
+  // Otherwise, can't handle this.
+  return true;
+}
+
+int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
+  const MachineInstr *MI = op.getParent();
+
+  switch (MI->getDesc().OpInfo->RegClass) {
+  default: // FIXME: fallthrough??
+  case AMDGPU::GPRI32RegClassID: return AMDGPU::BRANCH_COND_i32;
+  case AMDGPU::GPRF32RegClassID: return AMDGPU::BRANCH_COND_f32;
+  }
+}
+
+unsigned
+R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
+                            MachineBasicBlock *TBB,
+                            MachineBasicBlock *FBB,
+                            const SmallVectorImpl<MachineOperand> &Cond,
+                            DebugLoc DL) const {
+  assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+
+  if (FBB == 0) {
+    if (Cond.empty()) {
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(TBB).addReg(0);
+      return 1;
+    } else {
+      MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+      assert(PredSet && "No previous predicate!");
+      addFlag(PredSet, 0, MO_FLAG_PUSH);
+      PredSet->getOperand(2).setImm(Cond[1].getImm());
+
+      BuildMI(&MBB, DL, get(AMDGPU::JUMP))
+             .addMBB(TBB)
+             .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+      return 1;
+    }
+  } else {
+    MachineInstr *PredSet = findFirstPredicateSetterFrom(MBB, MBB.end());
+    assert(PredSet && "No previous predicate!");
+    addFlag(PredSet, 0, MO_FLAG_PUSH);
+    PredSet->getOperand(2).setImm(Cond[1].getImm());
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP))
+           .addMBB(TBB)
+           .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
+    BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB).addReg(0);
+    return 2;
+  }
+}
+
+unsigned
+R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
+
+  // Note: we leave PRED* instructions there.
+  // They may be needed when predicating instructions.
+
+  MachineBasicBlock::iterator I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 0;
+  }
+  --I;
+  switch (I->getOpcode()) {
+  default:
+    return 0;
+  case AMDGPU::JUMP:
+    if (isPredicated(I)) {
+      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
+      clearFlag(predSet, 0, MO_FLAG_PUSH);
+    }
+    I->eraseFromParent();
+    break;
+  }
+  I = MBB.end();
+
+  if (I == MBB.begin()) {
+    return 1;
+  }
+  --I;
+  switch (I->getOpcode()) {
+  // FIXME: only one case??
+ default: + return 1; + case AMDGPU::JUMP: + if (isPredicated(I)) { + MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I); + clearFlag(predSet, 0, MO_FLAG_PUSH); + } + I->eraseFromParent(); + break; + } + return 2; +} + +bool +R600InstrInfo::isPredicated(const MachineInstr *MI) const { + int idx = MI->findFirstPredOperandIdx(); + if (idx < 0) + return false; + + unsigned Reg = MI->getOperand(idx).getReg(); + switch (Reg) { + default: return false; + case AMDGPU::PRED_SEL_ONE: + case AMDGPU::PRED_SEL_ZERO: + case AMDGPU::PREDICATE_BIT: + return true; + } +} + +bool +R600InstrInfo::isPredicable(MachineInstr *MI) const { + // XXX: KILL* instructions can be predicated, but they must be the last + // instruction in a clause, so this means any instructions after them cannot + // be predicated. Until we have proper support for instruction clauses in the + // backend, we will mark KILL* instructions as unpredicable. + + if (MI->getOpcode() == AMDGPU::KILLGT) { + return false; + } else { + return AMDGPUInstrInfo::isPredicable(MI); + } +} + + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + unsigned ExtraPredCycles, + const BranchProbability &Probability) const{ + return true; +} + +bool +R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB, + unsigned NumTCycles, + unsigned ExtraTCycles, + MachineBasicBlock &FMBB, + unsigned NumFCycles, + unsigned ExtraFCycles, + const BranchProbability &Probability) const { + return true; +} + +bool +R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB, + unsigned NumCyles, + const BranchProbability &Probability) + const { + return true; +} + +bool +R600InstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB, + MachineBasicBlock &FMBB) const { + return false; +} + + +bool +R600InstrInfo::ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const { + MachineOperand &MO = Cond[1]; + switch (MO.getImm()) { + case OPCODE_IS_ZERO_INT: + MO.setImm(OPCODE_IS_NOT_ZERO_INT); + break; + case OPCODE_IS_NOT_ZERO_INT: + MO.setImm(OPCODE_IS_ZERO_INT); + break; + case OPCODE_IS_ZERO: + MO.setImm(OPCODE_IS_NOT_ZERO); + break; + case OPCODE_IS_NOT_ZERO: + MO.setImm(OPCODE_IS_ZERO); + break; + default: + return true; + } + + MachineOperand &MO2 = Cond[2]; + switch (MO2.getReg()) { + case AMDGPU::PRED_SEL_ZERO: + MO2.setReg(AMDGPU::PRED_SEL_ONE); + break; + case AMDGPU::PRED_SEL_ONE: + MO2.setReg(AMDGPU::PRED_SEL_ZERO); + break; + default: + return true; + } + return false; +} + +bool +R600InstrInfo::DefinesPredicate(MachineInstr *MI, + std::vector<MachineOperand> &Pred) const { + return isPredicateSetter(MI->getOpcode()); +} + + +bool +R600InstrInfo::SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1, + const SmallVectorImpl<MachineOperand> &Pred2) const { + return false; +} + + +bool +R600InstrInfo::PredicateInstruction(MachineInstr *MI, + const SmallVectorImpl<MachineOperand> &Pred) const { + int PIdx = MI->findFirstPredOperandIdx(); + + if (PIdx != -1) { + MachineOperand &PMO = MI->getOperand(PIdx); + PMO.setReg(Pred[2].getReg()); + MachineInstrBuilder(MI).addReg(AMDGPU::PREDICATE_BIT, RegState::Implicit); + return true; + } + + return false; +} + +unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData, + const MachineInstr *MI, + unsigned *PredCost) const { + if (PredCost) + *PredCost = 2; + return 2; +} + +MachineInstrBuilder R600InstrInfo::buildDefaultInstruction(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + unsigned Opcode, + unsigned DstReg, + 
unsigned Src0Reg,
+                                           unsigned Src1Reg) const {
+  MachineInstrBuilder MIB = BuildMI(MBB, I, MBB.findDebugLoc(I), get(Opcode),
+                                    DstReg);        // $dst
+
+  if (Src1Reg) {
+    MIB.addImm(0)     // $update_exec_mask
+       .addImm(0);    // $update_predicate
+  }
+  MIB.addImm(1)        // $write
+     .addImm(0)        // $omod
+     .addImm(0)        // $dst_rel
+     .addImm(0)        // $dst_clamp
+     .addReg(Src0Reg)  // $src0
+     .addImm(0)        // $src0_neg
+     .addImm(0)        // $src0_rel
+     .addImm(0);       // $src0_abs
+
+  if (Src1Reg) {
+    MIB.addReg(Src1Reg) // $src1
+       .addImm(0)       // $src1_neg
+       .addImm(0)       // $src1_rel
+       .addImm(0);      // $src1_abs
+  }
+
+  // XXX: The r600g finalizer expects this to be 1, once we've moved the
+  // scheduling to the backend, we can change the default to 0.
+  MIB.addImm(1)                    // $last
+     .addReg(AMDGPU::PRED_SEL_OFF) // $pred_sel
+     .addImm(0);                   // $literal
+
+  return MIB;
+}
+
+MachineInstr *R600InstrInfo::buildMovImm(MachineBasicBlock &BB,
+                                         MachineBasicBlock::iterator I,
+                                         unsigned DstReg,
+                                         uint64_t Imm) const {
+  MachineInstr *MovImm = buildDefaultInstruction(BB, I, AMDGPU::MOV, DstReg,
+                                                 AMDGPU::ALU_LITERAL_X);
+  setImmOperand(MovImm, R600Operands::IMM, Imm);
+  return MovImm;
+}
+
+int R600InstrInfo::getOperandIdx(const MachineInstr &MI,
+                                 R600Operands::Ops Op) const {
+  return getOperandIdx(MI.getOpcode(), Op);
+}
+
+int R600InstrInfo::getOperandIdx(unsigned Opcode,
+                                 R600Operands::Ops Op) const {
+  // Each row maps the R600Operands enum values (the columns, in the order
+  // listed below) to machine operand indices, with -1 for operands the
+  // instruction class does not have:
+  //   DST, UPDATE_EXEC_MASK, UPDATE_PRED, WRITE, OMOD, DST_REL, CLAMP,
+  //   SRC0, SRC0_NEG, SRC0_REL, SRC0_ABS,
+  //   SRC1, SRC1_NEG, SRC1_REL, SRC1_ABS,
+  //   SRC2, SRC2_NEG, SRC2_REL,
+  //   LAST, PRED_SEL, IMM
+  const static int OpTable[3][R600Operands::COUNT] = {
+    // OP1 (single-source instructions)
+    {0,-1,-1, 1, 2, 3, 4, 5, 6, 7, 8,-1,-1,-1,-1,-1,-1,-1, 9,10,11},
+    // OP2 (two-source instructions)
+    {0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,-1,-1,-1,15,16,17},
+    // OP3 (three-source instructions; no exec-mask/write/omod/abs operands)
+    {0,-1,-1,-1,-1, 1, 2, 3, 4, 5,-1, 6, 7, 8,-1, 9,10,11,12,13,14}
+  };
+  unsigned TargetFlags = get(Opcode).TSFlags;
+  unsigned OpTableIdx;
+
+  if (!HAS_NATIVE_OPERANDS(TargetFlags)) {
+    switch (Op) {
+    case R600Operands::DST: return 0;
+    case R600Operands::SRC0: return 1;
+    case R600Operands::SRC1: return 2;
+    case R600Operands::SRC2: return 3;
+    default:
+      assert(!"Unknown operand type for instruction");
+      return -1;
+    }
+  }
+
+  if (TargetFlags & R600_InstFlag::OP1) {
+    OpTableIdx = 0;
+  } else if (TargetFlags & R600_InstFlag::OP2) {
+    OpTableIdx = 1;
+  } else {
+    assert((TargetFlags & R600_InstFlag::OP3) && "OP1, OP2, or OP3 not defined "
+           "for this instruction");
+    OpTableIdx = 2;
+  }
+
+  return OpTable[OpTableIdx][Op];
+}
+
+void R600InstrInfo::setImmOperand(MachineInstr *MI, R600Operands::Ops Op,
+                                  int64_t Imm) const {
+  int Idx = getOperandIdx(*MI, Op);
+  assert(Idx != -1 && "Operand not supported for this instruction.");
+  assert(MI->getOperand(Idx).isImm());
+  MI->getOperand(Idx).setImm(Imm);
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction flag getters/setters
+//===----------------------------------------------------------------------===//
+
+bool R600InstrInfo::hasFlagOperand(const MachineInstr &MI) const {
+  return GET_FLAG_OPERAND_IDX(get(MI.getOpcode()).TSFlags) != 0;
+}
+
+MachineOperand &R600InstrInfo::getFlagOp(MachineInstr *MI, unsigned SrcIdx,
+                                         unsigned Flag) const {
+  unsigned TargetFlags = get(MI->getOpcode()).TSFlags;
+  int FlagIndex = 0;
+  if (Flag != 0) {
+    // If we pass something other than the default value of Flag to this
+    // function, it means we want to set a flag on an instruction
+    // that uses native encoding.
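+    // For instance, MO_FLAG_CLAMP resolves to the $clamp operand, and
+    // MO_FLAG_NEG with SrcIdx == 1 resolves to $src1_neg (see the switch
+    // below).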
+ assert(HAS_NATIVE_OPERANDS(TargetFlags)); + bool IsOP3 = (TargetFlags & R600_InstFlag::OP3) == R600_InstFlag::OP3; + switch (Flag) { + case MO_FLAG_CLAMP: + FlagIndex = getOperandIdx(*MI, R600Operands::CLAMP); + break; + case MO_FLAG_MASK: + FlagIndex = getOperandIdx(*MI, R600Operands::WRITE); + break; + case MO_FLAG_NOT_LAST: + case MO_FLAG_LAST: + FlagIndex = getOperandIdx(*MI, R600Operands::LAST); + break; + case MO_FLAG_NEG: + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_NEG); break; + case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_NEG); break; + case 2: FlagIndex = getOperandIdx(*MI, R600Operands::SRC2_NEG); break; + } + break; + + case MO_FLAG_ABS: + assert(!IsOP3 && "Cannot set absolute value modifier for OP3 " + "instructions."); + (void)IsOP3; + switch (SrcIdx) { + case 0: FlagIndex = getOperandIdx(*MI, R600Operands::SRC0_ABS); break; + case 1: FlagIndex = getOperandIdx(*MI, R600Operands::SRC1_ABS); break; + } + break; + + default: + FlagIndex = -1; + break; + } + assert(FlagIndex != -1 && "Flag not supported for this instruction"); + } else { + FlagIndex = GET_FLAG_OPERAND_IDX(TargetFlags); + assert(FlagIndex != 0 && + "Instruction flags not supported for this instruction"); + } + + MachineOperand &FlagOp = MI->getOperand(FlagIndex); + assert(FlagOp.isImm()); + return FlagOp; +} + +void R600InstrInfo::addFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (Flag == 0) { + return; + } + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + if (Flag == MO_FLAG_NOT_LAST) { + clearFlag(MI, Operand, MO_FLAG_LAST); + } else if (Flag == MO_FLAG_MASK) { + clearFlag(MI, Operand, Flag); + } else { + FlagOp.setImm(1); + } + } else { + MachineOperand &FlagOp = getFlagOp(MI, Operand); + FlagOp.setImm(FlagOp.getImm() | (Flag << (NUM_MO_FLAGS * Operand))); + } +} + +void R600InstrInfo::clearFlag(MachineInstr *MI, unsigned Operand, + unsigned Flag) const { + unsigned TargetFlags = get(MI->getOpcode()).TSFlags; + if (HAS_NATIVE_OPERANDS(TargetFlags)) { + MachineOperand &FlagOp = getFlagOp(MI, Operand, Flag); + FlagOp.setImm(0); + } else { + MachineOperand &FlagOp = getFlagOp(MI); + unsigned InstFlags = FlagOp.getImm(); + InstFlags &= ~(Flag << (NUM_MO_FLAGS * Operand)); + FlagOp.setImm(InstFlags); + } +} diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h new file mode 100644 index 0000000000..6bb0ca92e4 --- /dev/null +++ b/lib/Target/R600/R600InstrInfo.h @@ -0,0 +1,169 @@ +//===-- R600InstrInfo.h - R600 Instruction Info Interface -------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600InstrInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600INSTRUCTIONINFO_H_
+#define R600INSTRUCTIONINFO_H_
+
+#include "AMDIL.h"
+#include "AMDGPUInstrInfo.h"
+#include "R600Defines.h"
+#include "R600RegisterInfo.h"
+
+#include <map>
+
+namespace llvm {
+
+  class AMDGPUTargetMachine;
+  class DFAPacketizer;
+  class ScheduleDAG;
+  class MachineFunction;
+  class MachineInstr;
+  class MachineInstrBuilder;
+
+  class R600InstrInfo : public AMDGPUInstrInfo {
+  private:
+  const R600RegisterInfo RI;
+
+  int getBranchInstr(const MachineOperand &op) const;
+
+  public:
+  explicit R600InstrInfo(AMDGPUTargetMachine &tm);
+
+  const R600RegisterInfo &getRegisterInfo() const;
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  bool isTrig(const MachineInstr &MI) const;
+  bool isPlaceHolderOpcode(unsigned opcode) const;
+  bool isReductionOp(unsigned opcode) const;
+  bool isCubeOp(unsigned opcode) const;
+
+  /// \returns true if this \p Opcode represents an ALU instruction.
+  bool isALUInstr(unsigned Opcode) const;
+
+  /// \brief Vector instructions are instructions that must fill all
+  /// instruction slots within an instruction group.
+  bool isVector(const MachineInstr &MI) const;
+
+  virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg,
+                                        int64_t Imm) const;
+
+  virtual unsigned getIEQOpcode() const;
+  virtual bool isMov(unsigned Opcode) const;
+
+  DFAPacketizer *CreateTargetScheduleState(const TargetMachine *TM,
+                                           const ScheduleDAG *DAG) const;
+
+  bool ReverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const;
+
+  bool AnalyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+                     MachineBasicBlock *&FBB,
+                     SmallVectorImpl<MachineOperand> &Cond,
+                     bool AllowModify) const;
+
+  unsigned InsertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+                        MachineBasicBlock *FBB,
+                        const SmallVectorImpl<MachineOperand> &Cond,
+                        DebugLoc DL) const;
+
+  unsigned RemoveBranch(MachineBasicBlock &MBB) const;
+
+  bool isPredicated(const MachineInstr *MI) const;
+
+  bool isPredicable(MachineInstr *MI) const;
+
+  bool
+   isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                             const BranchProbability &Probability) const;
+
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+                           unsigned ExtraPredCycles,
+                           const BranchProbability &Probability) const;
+
+  bool
+   isProfitableToIfCvt(MachineBasicBlock &TMBB,
+                       unsigned NumTCycles, unsigned ExtraTCycles,
+                       MachineBasicBlock &FMBB,
+                       unsigned NumFCycles, unsigned ExtraFCycles,
+                       const BranchProbability &Probability) const;
+
+  bool DefinesPredicate(MachineInstr *MI,
+                        std::vector<MachineOperand> &Pred) const;
+
+  bool SubsumesPredicate(const SmallVectorImpl<MachineOperand> &Pred1,
+                         const SmallVectorImpl<MachineOperand> &Pred2) const;
+
+  bool isProfitableToUnpredicate(MachineBasicBlock &TMBB,
+                                 MachineBasicBlock &FMBB) const;
+
+  bool PredicateInstruction(MachineInstr *MI,
+                            const SmallVectorImpl<MachineOperand> &Pred) const;
+
+  unsigned int getInstrLatency(const InstrItineraryData *ItinData,
+                               const MachineInstr *MI,
+                               unsigned *PredCost = 0) const;
+
+  virtual int getInstrLatency(const InstrItineraryData *ItinData,
+                              SDNode *Node) const { return 1; }
+
+  /// You can use this function to avoid manually specifying each instruction
+  /// modifier operand when building a new instruction.
+  ///
+  /// \returns a MachineInstr with all the instruction modifiers initialized
+  /// to their default values.
+  MachineInstrBuilder buildDefaultInstruction(MachineBasicBlock &MBB,
+                                              MachineBasicBlock::iterator I,
+                                              unsigned Opcode,
+                                              unsigned DstReg,
+                                              unsigned Src0Reg,
+                                              unsigned Src1Reg = 0) const;
+
+  MachineInstr *buildMovImm(MachineBasicBlock &BB,
+                            MachineBasicBlock::iterator I,
+                            unsigned DstReg,
+                            uint64_t Imm) const;
+
+  /// \brief Get the index of Op in the MachineInstr.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(const MachineInstr &MI, R600Operands::Ops Op) const;
+
+  /// \brief Get the index of \p Op for the given Opcode.
+  ///
+  /// \returns -1 if the Instruction does not contain the specified \p Op.
+  int getOperandIdx(unsigned Opcode, R600Operands::Ops Op) const;
+
+  /// \brief Helper function for setting an immediate operand value.
+  void setImmOperand(MachineInstr *MI, R600Operands::Ops Op, int64_t Imm) const;
+
+  /// \returns true if this instruction has an operand for storing target flags.
+  bool hasFlagOperand(const MachineInstr &MI) const;
+
+  /// \brief Add one of the MO_FLAG* flags to the specified \p Operand.
+  void addFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+
+  /// \brief Determine if the specified \p Flag is set on this \p Operand.
+  bool isFlagSet(const MachineInstr &MI, unsigned Operand, unsigned Flag) const;
+
+  /// \param SrcIdx The register source to set the flag on (e.g. src0, src1, src2)
+  /// \param Flag The flag being set.
+  ///
+  /// \returns the operand containing the flags for this instruction.
+  MachineOperand &getFlagOp(MachineInstr *MI, unsigned SrcIdx = 0,
+                            unsigned Flag = 0) const;
+
+  /// \brief Clear the specified flag on the instruction.
+  void clearFlag(MachineInstr *MI, unsigned Operand, unsigned Flag) const;
+};
+
+} // End llvm namespace
+
+#endif // R600INSTRUCTIONINFO_H_
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
new file mode 100644
index 0000000000..105822066c
--- /dev/null
+++ b/lib/Target/R600/R600Instructions.td
@@ -0,0 +1,1659 @@
+//===-- R600Instructions.td - R600 Instruction defs -------*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// R600 Tablegen instruction definitions +// +//===----------------------------------------------------------------------===// + +include "R600Intrinsics.td" + +class InstR600 <bits<11> inst, dag outs, dag ins, string asm, list<dag> pattern, + InstrItinClass itin> + : AMDGPUInst <outs, ins, asm, pattern> { + + field bits<64> Inst; + bit Trig = 0; + bit Op3 = 0; + bit isVector = 0; + bits<2> FlagOperandIdx = 0; + bit Op1 = 0; + bit Op2 = 0; + bit HasNativeOperands = 0; + + bits<11> op_code = inst; + //let Inst = inst; + let Namespace = "AMDGPU"; + let OutOperandList = outs; + let InOperandList = ins; + let AsmString = asm; + let Pattern = pattern; + let Itinerary = itin; + + let TSFlags{4} = Trig; + let TSFlags{5} = Op3; + + // Vector instructions are instructions that must fill all slots in an + // instruction group + let TSFlags{6} = isVector; + let TSFlags{8-7} = FlagOperandIdx; + let TSFlags{9} = HasNativeOperands; + let TSFlags{10} = Op1; + let TSFlags{11} = Op2; +} + +class InstR600ISA <dag outs, dag ins, string asm, list<dag> pattern> : + AMDGPUInst <outs, ins, asm, pattern> { + field bits<64> Inst; + + let Namespace = "AMDGPU"; +} + +def MEMxi : Operand<iPTR> { + let MIOperandInfo = (ops R600_TReg32_X:$ptr, i32imm:$index); + let PrintMethod = "printMemOperand"; +} + +def MEMrr : Operand<iPTR> { + let MIOperandInfo = (ops R600_Reg32:$ptr, R600_Reg32:$index); +} + +// Operands for non-registers + +class InstFlag<string PM = "printOperand", int Default = 0> + : OperandWithDefaultOps <i32, (ops (i32 Default))> { + let PrintMethod = PM; +} + +def LITERAL : InstFlag<"printLiteral">; + +def WRITE : InstFlag <"printWrite", 1>; +def OMOD : InstFlag <"printOMOD">; +def REL : InstFlag <"printRel">; +def CLAMP : InstFlag <"printClamp">; +def NEG : InstFlag <"printNeg">; +def ABS : InstFlag <"printAbs">; +def UEM : InstFlag <"printUpdateExecMask">; +def UP : InstFlag <"printUpdatePred">; + +// XXX: The r600g finalizer in Mesa expects last to be one in most cases. +// Once we start using the packetizer in this backend we should have this +// default to 0. 
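+//
+// More generally, an InstFlag operand takes its declared default when a
+// pattern does not supply it explicitly: WRITE above and LAST below both
+// default to 1, so an instruction built from a bare pattern comes out
+// write-enabled and marked last in its instruction group.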
+def LAST : InstFlag<"printLast", 1>; + +def ADDRParam : ComplexPattern<i32, 2, "SelectADDRParam", [], []>; +def ADDRDWord : ComplexPattern<i32, 1, "SelectADDRDWord", [], []>; +def ADDRVTX_READ : ComplexPattern<i32, 2, "SelectADDRVTX_READ", [], []>; + +class R600ALU_Word0 { + field bits<32> Word0; + + bits<11> src0; + bits<1> src0_neg; + bits<1> src0_rel; + bits<11> src1; + bits<1> src1_rel; + bits<1> src1_neg; + bits<3> index_mode = 0; + bits<2> pred_sel; + bits<1> last; + + bits<9> src0_sel = src0{8-0}; + bits<2> src0_chan = src0{10-9}; + bits<9> src1_sel = src1{8-0}; + bits<2> src1_chan = src1{10-9}; + + let Word0{8-0} = src0_sel; + let Word0{9} = src0_rel; + let Word0{11-10} = src0_chan; + let Word0{12} = src0_neg; + let Word0{21-13} = src1_sel; + let Word0{22} = src1_rel; + let Word0{24-23} = src1_chan; + let Word0{25} = src1_neg; + let Word0{28-26} = index_mode; + let Word0{30-29} = pred_sel; + let Word0{31} = last; +} + +class R600ALU_Word1 { + field bits<32> Word1; + + bits<11> dst; + bits<3> bank_swizzle = 0; + bits<1> dst_rel; + bits<1> clamp; + + bits<7> dst_sel = dst{6-0}; + bits<2> dst_chan = dst{10-9}; + + let Word1{20-18} = bank_swizzle; + let Word1{27-21} = dst_sel; + let Word1{28} = dst_rel; + let Word1{30-29} = dst_chan; + let Word1{31} = clamp; +} + +class R600ALU_Word1_OP2 <bits<11> alu_inst> : R600ALU_Word1{ + + bits<1> src0_abs; + bits<1> src1_abs; + bits<1> update_exec_mask; + bits<1> update_pred; + bits<1> write; + bits<2> omod; + + let Word1{0} = src0_abs; + let Word1{1} = src1_abs; + let Word1{2} = update_exec_mask; + let Word1{3} = update_pred; + let Word1{4} = write; + let Word1{6-5} = omod; + let Word1{17-7} = alu_inst; +} + +class R600ALU_Word1_OP3 <bits<5> alu_inst> : R600ALU_Word1{ + + bits<11> src2; + bits<1> src2_rel; + bits<1> src2_neg; + + bits<9> src2_sel = src2{8-0}; + bits<2> src2_chan = src2{10-9}; + + let Word1{8-0} = src2_sel; + let Word1{9} = src2_rel; + let Word1{11-10} = src2_chan; + let Word1{12} = src2_neg; + let Word1{17-13} = alu_inst; +} + +/* +XXX: R600 subtarget uses a slightly different encoding than the other +subtargets. We currently handle this in R600MCCodeEmitter, but we may +want to use these instruction classes in the future. + +class R600ALU_Word1_OP2_r600 : R600ALU_Word1_OP2 { + + bits<1> fog_merge; + bits<10> alu_inst; + + let Inst{37} = fog_merge; + let Inst{39-38} = omod; + let Inst{49-40} = alu_inst; +} + +class R600ALU_Word1_OP2_r700 : R600ALU_Word1_OP2 { + + bits<11> alu_inst; + + let Inst{38-37} = omod; + let Inst{49-39} = alu_inst; +} +*/ + +def R600_Pred : PredicateOperand<i32, (ops R600_Predicate), + (ops PRED_SEL_OFF)>; + + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { + +// Class for instructions with only one source register. +// If you add new ins to this instruction, make sure they are listed before +// $literal, because the backend currently assumes that the last operand is +// a literal. Also be sure to update the enum R600Op1OperandIndex::ROI in +// R600Defines.h, R600InstrInfo::buildDefaultInstruction(), +// and R600InstrInfo::getOperandIdx(). 
+class R600_1OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <0,
+              (outs R600_Reg32:$dst),
+              (ins WRITE:$write, OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs,
+                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+              !strconcat(opName,
+                   "$clamp $dst$write$dst_rel$omod, "
+                   "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                   "$literal $pred_sel$last"),
+              pattern,
+              itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let src1 = 0;
+  let src1_rel = 0;
+  let src1_neg = 0;
+  let src1_abs = 0;
+  let update_exec_mask = 0;
+  let update_pred = 0;
+  let HasNativeOperands = 1;
+  let Op1 = 1;
+  let DisableEncoding = "$literal";
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_1OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                       InstrItinClass itin = AnyALU> :
+    R600_1OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0))]
+>;
+
+// If you add or change the operands for R600_2OP instructions, you must
+// also update the R600Op2OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and R600InstrInfo::getOperandIdx().
+class R600_2OP <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <inst,
+              (outs R600_Reg32:$dst),
+              (ins UEM:$update_exec_mask, UP:$update_pred, WRITE:$write,
+                   OMOD:$omod, REL:$dst_rel, CLAMP:$clamp,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel, ABS:$src0_abs,
+                   R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel, ABS:$src1_abs,
+                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+              !strconcat(opName,
+                   "$clamp $update_exec_mask$update_pred$dst$write$dst_rel$omod, "
+                   "$src0_neg$src0_abs$src0$src0_abs$src0_rel, "
+                   "$src1_neg$src1_abs$src1$src1_abs$src1_rel, "
+                   "$literal $pred_sel$last"),
+              pattern,
+              itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP2 <inst> {
+
+  let HasNativeOperands = 1;
+  let Op2 = 1;
+  let DisableEncoding = "$literal";
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_2OP_Helper <bits<11> inst, string opName, SDPatternOperator node,
+                       InstrItinClass itin = AnyALU> :
+    R600_2OP <inst, opName,
+              [(set R600_Reg32:$dst, (node R600_Reg32:$src0,
+                                           R600_Reg32:$src1))]
+>;
+
+// If you add or change the operands for R600_3OP instructions, you must
+// also update the R600Op3OperandIndex::ROI enum in R600Defines.h,
+// R600InstrInfo::buildDefaultInstruction(), and
+// R600InstrInfo::getOperandIdx().
+class R600_3OP <bits<5> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <0,
+              (outs R600_Reg32:$dst),
+              (ins REL:$dst_rel, CLAMP:$clamp,
+                   R600_Reg32:$src0, NEG:$src0_neg, REL:$src0_rel,
+                   R600_Reg32:$src1, NEG:$src1_neg, REL:$src1_rel,
+                   R600_Reg32:$src2, NEG:$src2_neg, REL:$src2_rel,
+                   LAST:$last, R600_Pred:$pred_sel, LITERAL:$literal),
+              !strconcat(opName, "$clamp $dst$dst_rel, "
+                                 "$src0_neg$src0$src0_rel, "
+                                 "$src1_neg$src1$src1_rel, "
+                                 "$src2_neg$src2$src2_rel, "
+                                 "$literal $pred_sel$last"),
+              pattern,
+              itin>,
+    R600ALU_Word0,
+    R600ALU_Word1_OP3<inst> {
+
+  let HasNativeOperands = 1;
+  let DisableEncoding = "$literal";
+  let Op3 = 1;
+
+  let Inst{31-0}  = Word0;
+  let Inst{63-32} = Word1;
+}
+
+class R600_REDUCTION <bits<11> inst, dag ins, string asm, list<dag> pattern,
+                      InstrItinClass itin = VecALU> :
+    InstR600 <inst,
+              (outs R600_Reg32:$dst),
+              ins,
+              asm,
+              pattern,
+              itin>;
+
+class R600_TEX <bits<11> inst, string opName, list<dag> pattern,
+                InstrItinClass itin = AnyALU> :
+    InstR600 <inst,
+              (outs R600_Reg128:$dst),
+              (ins R600_Reg128:$src0, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+              !strconcat(opName, "$dst, $src0, $resourceId, $samplerId, $textureTarget"),
+              pattern,
+              itin> {
+    let Inst {10-0} = inst;
+  }
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0
+
+def TEX_SHADOW : PatLeaf<
+  (imm),
+  [{uint32_t TType = (uint32_t)N->getZExtValue();
+    return (TType >= 6 && TType <= 8) || TType == 11 || TType == 12;
+  }]
+>;
+
+class EG_CF_RAT <bits <8> cf_inst, bits <6> rat_inst, bits<4> rat_id, dag outs,
+                 dag ins, string asm, list<dag> pattern> :
+    InstR600ISA <outs, ins, asm, pattern> {
+  bits<7> RW_GPR;
+  bits<7> INDEX_GPR;
+
+  bits<2> RIM;
+  bits<2> TYPE;
+  bits<1> RW_REL;
+  bits<2> ELEM_SIZE;
+
+  bits<12> ARRAY_SIZE;
+  bits<4>  COMP_MASK;
+  bits<4>  BURST_COUNT;
+  bits<1>  VPM;
+  bits<1>  eop;
+  bits<1>  MARK;
+  bits<1>  BARRIER;
+
+  // CF_ALLOC_EXPORT_WORD0_RAT
+  let Inst{3-0}   = rat_id;
+  let Inst{9-4}   = rat_inst;
+  let Inst{10}    = 0; // Reserved
+  let Inst{12-11} = RIM;
+  let Inst{14-13} = TYPE;
+  let Inst{21-15} = RW_GPR;
+  let Inst{22}    = RW_REL;
+  let Inst{29-23} = INDEX_GPR;
+  let Inst{31-30} = ELEM_SIZE;
+
+  // CF_ALLOC_EXPORT_WORD1_BUF
+  let Inst{43-32} = ARRAY_SIZE;
+  let Inst{47-44} = COMP_MASK;
+  let Inst{51-48} = BURST_COUNT;
+  let Inst{52}    = VPM;
+  let Inst{53}    = eop;
+  let Inst{61-54} = cf_inst;
+  let Inst{62}    = MARK;
+  let Inst{63}    = BARRIER;
+}
+
+class LoadParamFrag <PatFrag load_type> : PatFrag <
+  (ops node:$ptr), (load_type node:$ptr),
+  [{ return isParamLoad(dyn_cast<LoadSDNode>(N)); }]
+>;
+
+def load_param : LoadParamFrag<load>;
+def load_param_zexti8 : LoadParamFrag<zextloadi8>;
+def load_param_zexti16 : LoadParamFrag<zextloadi16>;
+
+def isR600 : Predicate<"Subtarget.device()"
+                       "->getGeneration() == AMDGPUDeviceInfo::HD4XXX">;
+def isR700 : Predicate<"Subtarget.device()"
+                       "->getGeneration() == AMDGPUDeviceInfo::HD4XXX &&"
+                       "Subtarget.device()->getDeviceFlag()"
+                       ">= OCL_DEVICE_RV710">;
+def isEG : Predicate<
+  "Subtarget.device()->getGeneration() >= AMDGPUDeviceInfo::HD5XXX && "
+  "Subtarget.device()->getGeneration() < AMDGPUDeviceInfo::HD7XXX && "
+  "Subtarget.device()->getDeviceFlag() != OCL_DEVICE_CAYMAN">;
+
+def isCayman : Predicate<"Subtarget.device()"
+                         "->getDeviceFlag() == OCL_DEVICE_CAYMAN">;
+def isEGorCayman : Predicate<"Subtarget.device()"
+                             "->getGeneration() == AMDGPUDeviceInfo::HD5XXX"
+                             "|| Subtarget.device()->getGeneration() =="
+                             "AMDGPUDeviceInfo::HD6XXX">;
+
+def isR600toCayman : Predicate<
+  "Subtarget.device()->getGeneration() <= AMDGPUDeviceInfo::HD6XXX">;
+
+//===----------------------------------------------------------------------===//
+// Interpolation Instructions
+//===----------------------------------------------------------------------===//
+
+def INTERP: SDNode<"AMDGPUISD::INTERP",
+  SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisInt<1>, SDTCisInt<2>]>
+  >;
+
+def INTERP_P0: SDNode<"AMDGPUISD::INTERP_P0",
+  SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisInt<1>]>
+  >;
+
+let usesCustomInserter = 1 in {
+def input_perspective : AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins i32imm:$src0, i32imm:$src1),
+  "input_perspective $src0 $src1 : dst",
+  [(set R600_Reg128:$dst, (INTERP (i32 imm:$src0), (i32 imm:$src1)))]>;
+} // End usesCustomInserter = 1
+
+def input_constant : AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins i32imm:$src),
+  "input_constant $src : dst",
+  [(set R600_Reg128:$dst, (INTERP_P0 (i32 imm:$src)))]>;
+
+
+
+def INTERP_XY : R600_2OP <0xD6, "INTERP_XY", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_ZW : R600_2OP <0xD7, "INTERP_ZW", []> {
+  let bank_swizzle = 5;
+}
+
+def INTERP_LOAD_P0 : R600_1OP <0xE0, "INTERP_LOAD_P0", []>;
+
+//===----------------------------------------------------------------------===//
+// Export Instructions
+//===----------------------------------------------------------------------===//
+
+def ExportType : SDTypeProfile<0, 5, [SDTCisFP<0>, SDTCisInt<1>]>;
+
+def EXPORT: SDNode<"AMDGPUISD::EXPORT", ExportType,
+  [SDNPHasChain, SDNPSideEffect]>;
+
+class ExportWord0 {
+  field bits<32> Word0;
+
+  bits<13> arraybase;
+  bits<2> type;
+  bits<7> gpr;
+  bits<2> elem_size;
+
+  let Word0{12-0} = arraybase;
+  let Word0{14-13} = type;
+  let Word0{21-15} = gpr;
+  let Word0{22} = 0; // RW_REL
+  let Word0{29-23} = 0; // INDEX_GPR
+  let Word0{31-30} = elem_size;
+}
+
+class ExportSwzWord1 {
+  field bits<32> Word1;
+
+  bits<3> sw_x;
+  bits<3> sw_y;
+  bits<3> sw_z;
+  bits<3> sw_w;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{2-0} = sw_x;
+  let Word1{5-3} = sw_y;
+  let Word1{8-6} = sw_z;
+  let Word1{11-9} = sw_w;
+}
+
+class ExportBufWord1 {
+  field bits<32> Word1;
+
+  bits<12> arraySize;
+  bits<4> compMask;
+  bits<1> eop;
+  bits<8> inst;
+
+  let Word1{11-0} = arraySize;
+  let Word1{15-12} = compMask;
+}
+
+multiclass ExportPattern<Instruction ExportInst, bits<8> cf_inst> {
+  def : Pat<(int_R600_store_pixel_depth R600_Reg32:$reg),
+    (ExportInst
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x),
+        0, 61, 0, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_pixel_stencil R600_Reg32:$reg),
+    (ExportInst
+        (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), R600_Reg32:$reg, sel_x),
+        0, 61, 7, 0, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(int_R600_store_pixel_dummy),
+    (ExportInst
+        (v4f32 (IMPLICIT_DEF)), 0, 0, 7, 7, 7, 7, cf_inst, 0)
+  >;
+
+  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 0),
+    (i32 imm:$type), (i32 imm:$arraybase), (i32 imm)),
+    (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
+        0, 1, 2, 3, cf_inst, 0)
+  >;
+}
+
+multiclass SteamOutputExportPattern<Instruction ExportInst,
+    bits<8> buf0inst, bits<8> buf1inst, bits<8> buf2inst, bits<8> buf3inst> {
+// Stream0
+  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 1),
+      (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)),
+      (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase,
+      4095, imm:$mask, buf0inst, 0)>;
+// Stream1
+  def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 2),
+      (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)),
+
(ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, + 4095, imm:$mask, buf1inst, 0)>; +// Stream2 + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 3), + (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, + 4095, imm:$mask, buf2inst, 0)>; +// Stream3 + def : Pat<(EXPORT (v4f32 R600_Reg128:$src), (i32 4), + (i32 imm:$type), (i32 imm:$arraybase), (i32 imm:$mask)), + (ExportInst R600_Reg128:$src, imm:$type, imm:$arraybase, + 4095, imm:$mask, buf3inst, 0)>; +} + +let isTerminator = 1, usesCustomInserter = 1 in { + +class ExportSwzInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$sw_x, i32imm:$sw_y, i32imm:$sw_z, i32imm:$sw_w, i32imm:$inst, + i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportSwzWord1 { + let elem_size = 3; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +} // End isTerminator = 1, usesCustomInserter = 1 + +class ExportBufInst : InstR600ISA<( + outs), + (ins R600_Reg128:$gpr, i32imm:$type, i32imm:$arraybase, + i32imm:$arraySize, i32imm:$compMask, i32imm:$inst, i32imm:$eop), + !strconcat("EXPORT", " $gpr"), + []>, ExportWord0, ExportBufWord1 { + let elem_size = 0; + let Inst{31-0} = Word0; + let Inst{63-32} = Word1; +} + +let Predicates = [isR600toCayman] in { + +//===----------------------------------------------------------------------===// +// Common Instructions R600, R700, Evergreen, Cayman +//===----------------------------------------------------------------------===// + +def ADD : R600_2OP_Helper <0x0, "ADD", fadd>; +// Non-IEEE MUL: 0 * anything = 0 +def MUL : R600_2OP_Helper <0x1, "MUL NON-IEEE", int_AMDGPU_mul>; +def MUL_IEEE : R600_2OP_Helper <0x2, "MUL_IEEE", fmul>; +def MAX : R600_2OP_Helper <0x3, "MAX", AMDGPUfmax>; +def MIN : R600_2OP_Helper <0x4, "MIN", AMDGPUfmin>; + +// For the SET* instructions there is a naming conflict in TargetSelectionDAG.td, +// so some of the instruction names don't match the asm string. +// XXX: Use the defs in TargetSelectionDAG.td instead of intrinsics. 
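+// As a worked example of the selectcc form used below: the first SETE
+// pattern reads as dst = (src0 == src1) ? 1.0f : 0.0f, i.e. the hardware
+// compare writes the canonical float 1.0/0.0 result directly, so no extra
+// select is needed. SGT and SGE are the renamed defs; their asm strings
+// keep the hardware names SETGT and SETGE.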
+def SETE : R600_2OP <
+  0x08, "SETE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+             COND_EQ))]
+>;
+
+def SGT : R600_2OP <
+  0x09, "SETGT",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+             COND_GT))]
+>;
+
+def SGE : R600_2OP <
+  0xA, "SETGE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+             COND_GE))]
+>;
+
+def SNE : R600_2OP <
+  0xB, "SETNE",
+  [(set R600_Reg32:$dst,
+   (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO,
+             COND_NE))]
+>;
+
+def FRACT : R600_1OP_Helper <0x10, "FRACT", AMDGPUfract>;
+def TRUNC : R600_1OP_Helper <0x11, "TRUNC", int_AMDGPU_trunc>;
+def CEIL : R600_1OP_Helper <0x12, "CEIL", fceil>;
+def RNDNE : R600_1OP_Helper <0x13, "RNDNE", frint>;
+def FLOOR : R600_1OP_Helper <0x14, "FLOOR", ffloor>;
+
+def MOV : R600_1OP <0x19, "MOV", []>;
+
+let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1 in {
+
+class MOV_IMM <ValueType vt, Operand immType> : AMDGPUInst <
+  (outs R600_Reg32:$dst),
+  (ins immType:$imm),
+  "",
+  []
+>;
+
+} // end let isPseudo = 1, isCodeGenOnly = 1, usesCustomInserter = 1
+
+def MOV_IMM_I32 : MOV_IMM<i32, i32imm>;
+def : Pat <
+  (imm:$val),
+  (MOV_IMM_I32 imm:$val)
+>;
+
+def MOV_IMM_F32 : MOV_IMM<f32, f32imm>;
+def : Pat <
+  (fpimm:$val),
+  (MOV_IMM_F32 fpimm:$val)
+>;
+
+def PRED_SETE : R600_2OP <0x20, "PRED_SETE", []>;
+def PRED_SETGT : R600_2OP <0x21, "PRED_SETGT", []>;
+def PRED_SETGE : R600_2OP <0x22, "PRED_SETGE", []>;
+def PRED_SETNE : R600_2OP <0x23, "PRED_SETNE", []>;
+
+let hasSideEffects = 1 in {
+
+def KILLGT : R600_2OP <0x2D, "KILLGT", []>;
+
+} // end hasSideEffects
+
+def AND_INT : R600_2OP_Helper <0x30, "AND_INT", and>;
+def OR_INT : R600_2OP_Helper <0x31, "OR_INT", or>;
+def XOR_INT : R600_2OP_Helper <0x32, "XOR_INT", xor>;
+def NOT_INT : R600_1OP_Helper <0x33, "NOT_INT", not>;
+def ADD_INT : R600_2OP_Helper <0x34, "ADD_INT", add>;
+def SUB_INT : R600_2OP_Helper <0x35, "SUB_INT", sub>;
+def MAX_INT : R600_2OP_Helper <0x36, "MAX_INT", AMDGPUsmax>;
+def MIN_INT : R600_2OP_Helper <0x37, "MIN_INT", AMDGPUsmin>;
+def MAX_UINT : R600_2OP_Helper <0x38, "MAX_UINT", AMDGPUumax>;
+def MIN_UINT : R600_2OP_Helper <0x39, "MIN_UINT", AMDGPUumin>;
+
+def SETE_INT : R600_2OP <
+  0x3A, "SETE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETEQ))]
+>;
+
+def SETGT_INT : R600_2OP <
+  0x3B, "SETGT_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGT))]
+>;
+
+def SETGE_INT : R600_2OP <
+  0x3C, "SETGE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETGE))]
+>;
+
+def SETNE_INT : R600_2OP <
+  0x3D, "SETNE_INT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETNE))]
+>;
+
+def SETGT_UINT : R600_2OP <
+  0x3E, "SETGT_UINT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGT))]
+>;
+
+def SETGE_UINT : R600_2OP <
+  0x3F, "SETGE_UINT",
+  [(set (i32 R600_Reg32:$dst),
+   (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETUGE))]
+>;
+
+def PRED_SETE_INT : R600_2OP <0x42, "PRED_SETE_INT", []>;
+def PRED_SETGT_INT : R600_2OP <0x43, "PRED_SETGT_INT", []>;
+def PRED_SETGE_INT : R600_2OP <0x44, "PRED_SETGE_INT", []>;
+def PRED_SETNE_INT : R600_2OP <0x45, "PRED_SETNE_INT", []>;
+
+def CNDE_INT : R600_3OP <
+  0x1C, "CNDE_INT",
+  [(set (i32
R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), 0, + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), + COND_EQ))] +>; + +def CNDGE_INT : R600_3OP < + 0x1E, "CNDGE_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), 0, + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), + COND_GE))] +>; + +def CNDGT_INT : R600_3OP < + 0x1D, "CNDGT_INT", + [(set (i32 R600_Reg32:$dst), + (selectcc (i32 R600_Reg32:$src0), 0, + (i32 R600_Reg32:$src1), (i32 R600_Reg32:$src2), + COND_GT))] +>; + +//===----------------------------------------------------------------------===// +// Texture instructions +//===----------------------------------------------------------------------===// + +def TEX_LD : R600_TEX < + 0x03, "TEX_LD", + [(set R600_Reg128:$dst, (int_AMDGPU_txf R600_Reg128:$src0, imm:$src1, imm:$src2, imm:$src3, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +> { +let AsmString = "TEX_LD $dst, $src0, $src1, $src2, $src3, $resourceId, $samplerId, $textureTarget"; +let InOperandList = (ins R600_Reg128:$src0, i32imm:$src1, i32imm:$src2, i32imm:$src3, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget); +} + +def TEX_GET_TEXTURE_RESINFO : R600_TEX < + 0x04, "TEX_GET_TEXTURE_RESINFO", + [(set R600_Reg128:$dst, (int_AMDGPU_txq R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_GET_GRADIENTS_H : R600_TEX < + 0x07, "TEX_GET_GRADIENTS_H", + [(set R600_Reg128:$dst, (int_AMDGPU_ddx R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_GET_GRADIENTS_V : R600_TEX < + 0x08, "TEX_GET_GRADIENTS_V", + [(set R600_Reg128:$dst, (int_AMDGPU_ddy R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SET_GRADIENTS_H : R600_TEX < + 0x0B, "TEX_SET_GRADIENTS_H", + [] +>; + +def TEX_SET_GRADIENTS_V : R600_TEX < + 0x0C, "TEX_SET_GRADIENTS_V", + [] +>; + +def TEX_SAMPLE : R600_TEX < + 0x10, "TEX_SAMPLE", + [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SAMPLE_C : R600_TEX < + 0x18, "TEX_SAMPLE_C", + [(set R600_Reg128:$dst, (int_AMDGPU_tex R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +def TEX_SAMPLE_L : R600_TEX < + 0x11, "TEX_SAMPLE_L", + [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SAMPLE_C_L : R600_TEX < + 0x19, "TEX_SAMPLE_C_L", + [(set R600_Reg128:$dst, (int_AMDGPU_txl R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +def TEX_SAMPLE_LB : R600_TEX < + 0x12, "TEX_SAMPLE_LB", + [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0,imm:$resourceId, imm:$samplerId, imm:$textureTarget))] +>; + +def TEX_SAMPLE_C_LB : R600_TEX < + 0x1A, "TEX_SAMPLE_C_LB", + [(set R600_Reg128:$dst, (int_AMDGPU_txb R600_Reg128:$src0, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))] +>; + +def TEX_SAMPLE_G : R600_TEX < + 0x14, "TEX_SAMPLE_G", + [] +>; + +def TEX_SAMPLE_C_G : R600_TEX < + 0x1C, "TEX_SAMPLE_C_G", + [] +>; + +//===----------------------------------------------------------------------===// +// Helper classes for common instructions +//===----------------------------------------------------------------------===// + +class MUL_LIT_Common <bits<5> inst> : R600_3OP < + inst, "MUL_LIT", + [] +>; + +class MULADD_Common <bits<5> inst> : R600_3OP < + inst, "MULADD", + [(set (f32 R600_Reg32:$dst), + (IL_mad R600_Reg32:$src0, R600_Reg32:$src1, 
R600_Reg32:$src2))] +>; + +class CNDE_Common <bits<5> inst> : R600_3OP < + inst, "CNDE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), FP_ZERO, + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), + COND_EQ))] +>; + +class CNDGT_Common <bits<5> inst> : R600_3OP < + inst, "CNDGT", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), FP_ZERO, + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), + COND_GT))] +>; + +class CNDGE_Common <bits<5> inst> : R600_3OP < + inst, "CNDGE", + [(set R600_Reg32:$dst, + (selectcc (f32 R600_Reg32:$src0), FP_ZERO, + (f32 R600_Reg32:$src1), (f32 R600_Reg32:$src2), + COND_GE))] +>; + +multiclass DOT4_Common <bits<11> inst> { + + def _pseudo : R600_REDUCTION <inst, + (ins R600_Reg128:$src0, R600_Reg128:$src1), + "DOT4 $dst $src0, $src1", + [(set R600_Reg32:$dst, (int_AMDGPU_dp4 R600_Reg128:$src0, R600_Reg128:$src1))] + >; + + def _real : R600_2OP <inst, "DOT4", []>; +} + +let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { +multiclass CUBE_Common <bits<11> inst> { + + def _pseudo : InstR600 < + inst, + (outs R600_Reg128:$dst), + (ins R600_Reg128:$src), + "CUBE $dst $src", + [(set R600_Reg128:$dst, (int_AMDGPU_cube R600_Reg128:$src))], + VecALU + > { + let isPseudo = 1; + } + + def _real : R600_2OP <inst, "CUBE", []>; +} +} // End mayLoad = 0, mayStore = 0, hasSideEffects = 0 + +class EXP_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "EXP_IEEE", fexp2 +>; + +class FLT_TO_INT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_INT", fp_to_sint +>; + +class INT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "INT_TO_FLT", sint_to_fp +>; + +class FLT_TO_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "FLT_TO_UINT", fp_to_uint +>; + +class UINT_TO_FLT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "UINT_TO_FLT", uint_to_fp +>; + +class LOG_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "LOG_CLAMPED", [] +>; + +class LOG_IEEE_Common <bits<11> inst> : R600_1OP_Helper < + inst, "LOG_IEEE", flog2 +>; + +class LSHL_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHL", shl>; +class LSHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "LSHR", srl>; +class ASHR_Common <bits<11> inst> : R600_2OP_Helper <inst, "ASHR", sra>; +class MULHI_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI_INT", mulhs +>; +class MULHI_UINT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULHI", mulhu +>; +class MULLO_INT_Common <bits<11> inst> : R600_2OP_Helper < + inst, "MULLO_INT", mul +>; +class MULLO_UINT_Common <bits<11> inst> : R600_2OP <inst, "MULLO_UINT", []>; + +class RECIP_CLAMPED_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_CLAMPED", [] +>; + +class RECIP_IEEE_Common <bits<11> inst> : R600_1OP < + inst, "RECIP_IEEE", [(set R600_Reg32:$dst, (fdiv FP_ONE, R600_Reg32:$src0))] +>; + +class RECIP_UINT_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIP_UINT", AMDGPUurecip +>; + +class RECIPSQRT_CLAMPED_Common <bits<11> inst> : R600_1OP_Helper < + inst, "RECIPSQRT_CLAMPED", int_AMDGPU_rsq +>; + +class RECIPSQRT_IEEE_Common <bits<11> inst> : R600_1OP < + inst, "RECIPSQRT_IEEE", [] +>; + +class SIN_Common <bits<11> inst> : R600_1OP < + inst, "SIN", []>{ + let Trig = 1; +} + +class COS_Common <bits<11> inst> : R600_1OP < + inst, "COS", []> { + let Trig = 1; +} + +//===----------------------------------------------------------------------===// +// Helper patterns for complex intrinsics +//===----------------------------------------------------------------------===// + +multiclass DIV_Common <InstR600 
recip_ieee> {
+def : Pat<
+  (int_AMDGPU_div R600_Reg32:$src0, R600_Reg32:$src1),
+  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+>;
+
+def : Pat<
+  (fdiv R600_Reg32:$src0, R600_Reg32:$src1),
+  (MUL R600_Reg32:$src0, (recip_ieee R600_Reg32:$src1))
+>;
+}
+
+class TGSI_LIT_Z_Common <InstR600 mul_lit, InstR600 log_clamped, InstR600 exp_ieee> : Pat <
+  (int_TGSI_lit_z R600_Reg32:$src_x, R600_Reg32:$src_y, R600_Reg32:$src_w),
+  (exp_ieee (mul_lit (log_clamped (MAX R600_Reg32:$src_y, (f32 ZERO))), R600_Reg32:$src_w, R600_Reg32:$src_x))
+>;
+
+//===----------------------------------------------------------------------===//
+// R600 / R700 Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [isR600] in {
+
+  def MUL_LIT_r600 : MUL_LIT_Common<0x0C>;
+  def MULADD_r600 : MULADD_Common<0x10>;
+  def CNDE_r600 : CNDE_Common<0x18>;
+  def CNDGT_r600 : CNDGT_Common<0x19>;
+  def CNDGE_r600 : CNDGE_Common<0x1A>;
+  defm DOT4_r600 : DOT4_Common<0x50>;
+  defm CUBE_r600 : CUBE_Common<0x52>;
+  def EXP_IEEE_r600 : EXP_IEEE_Common<0x61>;
+  def LOG_CLAMPED_r600 : LOG_CLAMPED_Common<0x62>;
+  def LOG_IEEE_r600 : LOG_IEEE_Common<0x63>;
+  def RECIP_CLAMPED_r600 : RECIP_CLAMPED_Common<0x64>;
+  def RECIP_IEEE_r600 : RECIP_IEEE_Common<0x66>;
+  def RECIPSQRT_CLAMPED_r600 : RECIPSQRT_CLAMPED_Common<0x67>;
+  def RECIPSQRT_IEEE_r600 : RECIPSQRT_IEEE_Common<0x69>;
+  def FLT_TO_INT_r600 : FLT_TO_INT_Common<0x6b>;
+  def INT_TO_FLT_r600 : INT_TO_FLT_Common<0x6c>;
+  def FLT_TO_UINT_r600 : FLT_TO_UINT_Common<0x79>;
+  def UINT_TO_FLT_r600 : UINT_TO_FLT_Common<0x6d>;
+  def SIN_r600 : SIN_Common<0x6E>;
+  def COS_r600 : COS_Common<0x6F>;
+  def ASHR_r600 : ASHR_Common<0x70>;
+  def LSHR_r600 : LSHR_Common<0x71>;
+  def LSHL_r600 : LSHL_Common<0x72>;
+  def MULLO_INT_r600 : MULLO_INT_Common<0x73>;
+  def MULHI_INT_r600 : MULHI_INT_Common<0x74>;
+  def MULLO_UINT_r600 : MULLO_UINT_Common<0x75>;
+  def MULHI_UINT_r600 : MULHI_UINT_Common<0x76>;
+  def RECIP_UINT_r600 : RECIP_UINT_Common <0x78>;
+
+  defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
+  def TGSI_LIT_Z_r600 : TGSI_LIT_Z_Common<MUL_LIT_r600, LOG_CLAMPED_r600, EXP_IEEE_r600>;
+
+  def : Pat<(fsqrt R600_Reg32:$src),
+    (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_r600 R600_Reg32:$src))>;
+
+  def R600_ExportSwz : ExportSwzInst {
+    let Word1{20-17} = 1; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : ExportPattern<R600_ExportSwz, 39>;
+
+  def R600_ExportBuf : ExportBufInst {
+    let Word1{20-17} = 1; // BURST_COUNT
+    let Word1{21} = eop;
+    let Word1{22} = 1; // VALID_PIXEL_MODE
+    let Word1{30-23} = inst;
+    let Word1{31} = 1; // BARRIER
+  }
+  defm : SteamOutputExportPattern<R600_ExportBuf, 0x20, 0x21, 0x22, 0x23>;
+}
+
+// Helper pattern for normalizing inputs to trigonometric instructions for
+// R700+ cards.
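+//
+// A worked sketch of the lowering these patterns produce: the hardware SIN
+// and COS units take a revolution-based argument, so a radian input x is
+// first scaled by 1/(2*pi). fcos(x), for example, becomes roughly (register
+// names purely illustrative):
+//   MOV  T1.x, literal.x   ; CONST.TWO_PI_INV, presumably 0x3E22F983 ~ 0.15915494
+//   MUL  T1.x, T1.x, T0.x  ; x * (1/(2*pi))
+//   COS  T0.x, T1.x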
+class COS_PAT <InstR600 trig> : Pat< + (fcos R600_Reg32:$src), + (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) +>; + +class SIN_PAT <InstR600 trig> : Pat< + (fsin R600_Reg32:$src), + (trig (MUL (MOV_IMM_I32 CONST.TWO_PI_INV), R600_Reg32:$src)) +>; + +//===----------------------------------------------------------------------===// +// R700 Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isR700] in { + def SIN_r700 : SIN_Common<0x6E>; + def COS_r700 : COS_Common<0x6F>; + + // R700 normalizes inputs to SIN/COS the same as EG + def : SIN_PAT <SIN_r700>; + def : COS_PAT <COS_r700>; +} + +//===----------------------------------------------------------------------===// +// Evergreen Only instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEG] in { + +def RECIP_IEEE_eg : RECIP_IEEE_Common<0x86>; +defm DIV_eg : DIV_Common<RECIP_IEEE_eg>; + +def MULLO_INT_eg : MULLO_INT_Common<0x8F>; +def MULHI_INT_eg : MULHI_INT_Common<0x90>; +def MULLO_UINT_eg : MULLO_UINT_Common<0x91>; +def MULHI_UINT_eg : MULHI_UINT_Common<0x92>; +def RECIP_UINT_eg : RECIP_UINT_Common<0x94>; +def RECIPSQRT_CLAMPED_eg : RECIPSQRT_CLAMPED_Common<0x87>; +def EXP_IEEE_eg : EXP_IEEE_Common<0x81>; +def LOG_IEEE_eg : LOG_IEEE_Common<0x83>; +def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; +def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; +def SIN_eg : SIN_Common<0x8D>; +def COS_eg : COS_Common<0x8E>; + +def : SIN_PAT <SIN_eg>; +def : COS_PAT <COS_eg>; +def : Pat<(fsqrt R600_Reg32:$src), + (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_eg R600_Reg32:$src))>; +} // End Predicates = [isEG] + +//===----------------------------------------------------------------------===// +// Evergreen / Cayman Instructions +//===----------------------------------------------------------------------===// + +let Predicates = [isEGorCayman] in { + + // BFE_UINT - bit_extract, an optimization for mask and shift + // Src0 = Input + // Src1 = Offset + // Src2 = Width + // + // bit_extract = (Input << (32 - Offset - Width)) >> (32 - Width) + // + // Example Usage: + // (Offset, Width) + // + // (0, 8) = (Input << 24) >> 24 = (Input & 0xff) >> 0 + // (8, 8) = (Input << 16) >> 24 = (Input & 0xffff) >> 8 + // (16,8) = (Input << 8) >> 24 = (Input & 0xffffff) >> 16 + // (24,8) = (Input << 0) >> 24 = (Input & 0xffffffff) >> 24 + def BFE_UINT_eg : R600_3OP <0x4, "BFE_UINT", + [(set R600_Reg32:$dst, (int_AMDIL_bit_extract_u32 R600_Reg32:$src0, + R600_Reg32:$src1, + R600_Reg32:$src2))], + VecALU + >; + + def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", + [(set R600_Reg32:$dst, (AMDGPUbitalign R600_Reg32:$src0, R600_Reg32:$src1, + R600_Reg32:$src2))], + VecALU + >; + + def MULADD_eg : MULADD_Common<0x14>; + def ASHR_eg : ASHR_Common<0x15>; + def LSHR_eg : LSHR_Common<0x16>; + def LSHL_eg : LSHL_Common<0x17>; + def CNDE_eg : CNDE_Common<0x19>; + def CNDGT_eg : CNDGT_Common<0x1A>; + def CNDGE_eg : CNDGE_Common<0x1B>; + def MUL_LIT_eg : MUL_LIT_Common<0x1F>; + def LOG_CLAMPED_eg : LOG_CLAMPED_Common<0x82>; + defm DOT4_eg : DOT4_Common<0xBE>; + defm CUBE_eg : CUBE_Common<0xC0>; + + def TGSI_LIT_Z_eg : TGSI_LIT_Z_Common<MUL_LIT_eg, LOG_CLAMPED_eg, EXP_IEEE_eg>; + + def FLT_TO_INT_eg : FLT_TO_INT_Common<0x50> { + let Pattern = []; + } + + def INT_TO_FLT_eg : INT_TO_FLT_Common<0x9B>; + + def FLT_TO_UINT_eg : FLT_TO_UINT_Common<0x9A> { + let Pattern = []; + } + + def UINT_TO_FLT_eg : UINT_TO_FLT_Common<0x9C>; + + // 
TRUNC is used for the FLT_TO_INT instructions to work around a + // perceived problem where the rounding modes are applied differently + // depending on the instruction and the slot they are in. + // See: + // https://bugs.freedesktop.org/show_bug.cgi?id=50232 + // Mesa commit: a1a0974401c467cb86ef818f22df67c21774a38c + // + // XXX: Lowering SELECT_CC will sometimes generate fp_to_[su]int nodes, + // which do not need to be truncated since the fp values are 0.0f or 1.0f. + // We should look into handling these cases separately. + def : Pat<(fp_to_sint R600_Reg32:$src0), + (FLT_TO_INT_eg (TRUNC R600_Reg32:$src0))>; + + def : Pat<(fp_to_uint R600_Reg32:$src0), + (FLT_TO_UINT_eg (TRUNC R600_Reg32:$src0))>; + + def EG_ExportSwz : ExportSwzInst { + let Word1{19-16} = 1; // BURST_COUNT + let Word1{20} = 1; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER + } + defm : ExportPattern<EG_ExportSwz, 83>; + + def EG_ExportBuf : ExportBufInst { + let Word1{19-16} = 1; // BURST_COUNT + let Word1{20} = 1; // VALID_PIXEL_MODE + let Word1{21} = eop; + let Word1{29-22} = inst; + let Word1{30} = 0; // MARK + let Word1{31} = 1; // BARRIER + } + defm : SteamOutputExportPattern<EG_ExportBuf, 0x40, 0x41, 0x42, 0x43>; + +//===----------------------------------------------------------------------===// +// Memory read/write instructions +//===----------------------------------------------------------------------===// +let usesCustomInserter = 1 in { + +class RAT_WRITE_CACHELESS_eg <dag ins, bits<4> comp_mask, string name, + list<dag> pattern> + : EG_CF_RAT <0x57, 0x2, 0, (outs), ins, + !strconcat(name, " $rw_gpr, $index_gpr, $eop"), pattern> { + let RIM = 0; + // XXX: Have a separate instruction for non-indexed writes. + let TYPE = 1; + let RW_REL = 0; + let ELEM_SIZE = 0; + + let ARRAY_SIZE = 0; + let COMP_MASK = comp_mask; + let BURST_COUNT = 0; + let VPM = 0; + let MARK = 0; + let BARRIER = 1; +} + +} // End usesCustomInserter = 1 + +// 32-bit store +def RAT_WRITE_CACHELESS_32_eg : RAT_WRITE_CACHELESS_eg < + (ins R600_TReg32_X:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + 0x1, "RAT_WRITE_CACHELESS_32_eg", + [(global_store (i32 R600_TReg32_X:$rw_gpr), R600_TReg32_X:$index_gpr)] +>; + +//128-bit store +def RAT_WRITE_CACHELESS_128_eg : RAT_WRITE_CACHELESS_eg < + (ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr, InstFlag:$eop), + 0xf, "RAT_WRITE_CACHELESS_128", + [(global_store (v4i32 R600_Reg128:$rw_gpr), R600_TReg32_X:$index_gpr)] +>; + +class VTX_READ_eg <string name, bits<8> buffer_id, dag outs, list<dag> pattern> + : InstR600ISA <outs, (ins MEMxi:$ptr), name#" $dst, $ptr", pattern> { + + // Operands + bits<7> DST_GPR; + bits<7> SRC_GPR; + + // Static fields + bits<5> VC_INST = 0; + bits<2> FETCH_TYPE = 2; + bits<1> FETCH_WHOLE_QUAD = 0; + bits<8> BUFFER_ID = buffer_id; + bits<1> SRC_REL = 0; + // XXX: We can infer this field based on the SRC_GPR. This would allow us + // to store vertex addresses in any channel, not just X. + bits<2> SRC_SEL_X = 0; + bits<6> MEGA_FETCH_COUNT; + bits<1> DST_REL = 0; + bits<3> DST_SEL_X; + bits<3> DST_SEL_Y; + bits<3> DST_SEL_Z; + bits<3> DST_SEL_W; + // The docs say that if this bit is set, then DATA_FORMAT, NUM_FORMAT_ALL, + // FORMAT_COMP_ALL, SRF_MODE_ALL, and ENDIAN_SWAP fields will be ignored, + // however, based on my testing if USE_CONST_FIELDS is set, then all + // these fields need to be set to 0. 
+ bits<1> USE_CONST_FIELDS = 0; + bits<6> DATA_FORMAT; + bits<2> NUM_FORMAT_ALL = 1; + bits<1> FORMAT_COMP_ALL = 0; + bits<1> SRF_MODE_ALL = 0; + + // LLVM can only encode 64-bit instructions, so these fields are manually + // encoded in R600CodeEmitter + // + // bits<16> OFFSET; + // bits<2> ENDIAN_SWAP = 0; + // bits<1> CONST_BUF_NO_STRIDE = 0; + // bits<1> MEGA_FETCH = 0; + // bits<1> ALT_CONST = 0; + // bits<2> BUFFER_INDEX_MODE = 0; + + // VTX_WORD0 + let Inst{4-0} = VC_INST; + let Inst{6-5} = FETCH_TYPE; + let Inst{7} = FETCH_WHOLE_QUAD; + let Inst{15-8} = BUFFER_ID; + let Inst{22-16} = SRC_GPR; + let Inst{23} = SRC_REL; + let Inst{25-24} = SRC_SEL_X; + let Inst{31-26} = MEGA_FETCH_COUNT; + + // VTX_WORD1_GPR + let Inst{38-32} = DST_GPR; + let Inst{39} = DST_REL; + let Inst{40} = 0; // Reserved + let Inst{43-41} = DST_SEL_X; + let Inst{46-44} = DST_SEL_Y; + let Inst{49-47} = DST_SEL_Z; + let Inst{52-50} = DST_SEL_W; + let Inst{53} = USE_CONST_FIELDS; + let Inst{59-54} = DATA_FORMAT; + let Inst{61-60} = NUM_FORMAT_ALL; + let Inst{62} = FORMAT_COMP_ALL; + let Inst{63} = SRF_MODE_ALL; + + // VTX_WORD2 (LLVM can only encode 64-bit instructions, so WORD2 encoding + // is done in R600CodeEmitter + // + // Inst{79-64} = OFFSET; + // Inst{81-80} = ENDIAN_SWAP; + // Inst{82} = CONST_BUF_NO_STRIDE; + // Inst{83} = MEGA_FETCH; + // Inst{84} = ALT_CONST; + // Inst{86-85} = BUFFER_INDEX_MODE; + // Inst{95-86} = 0; Reserved + + // VTX_WORD3 (Padding) + // + // Inst{127-96} = 0; +} + +class VTX_READ_8_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_8", buffer_id, (outs R600_TReg32_X:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 1; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 1; // FMT_8 +} + +class VTX_READ_16_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_16", buffer_id, (outs R600_TReg32_X:$dst), + pattern> { + let MEGA_FETCH_COUNT = 2; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 5; // FMT_16 + +} + +class VTX_READ_32_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_32", buffer_id, (outs R600_TReg32_X:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 4; + let DST_SEL_X = 0; + let DST_SEL_Y = 7; // Masked + let DST_SEL_Z = 7; // Masked + let DST_SEL_W = 7; // Masked + let DATA_FORMAT = 0xD; // COLOR_32 + + // This is not really necessary, but there were some GPU hangs that appeared + // to be caused by ALU instructions in the next instruction group that wrote + // to the $ptr registers of the VTX_READ. + // e.g. + // %T3_X<def> = VTX_READ_PARAM_32_eg %T2_X<kill>, 24 + // %T2_X<def> = MOV %ZERO + //Adding this constraint prevents this from happening. + let Constraints = "$ptr.ptr = $dst"; +} + +class VTX_READ_128_eg <bits<8> buffer_id, list<dag> pattern> + : VTX_READ_eg <"VTX_READ_128", buffer_id, (outs R600_Reg128:$dst), + pattern> { + + let MEGA_FETCH_COUNT = 16; + let DST_SEL_X = 0; + let DST_SEL_Y = 1; + let DST_SEL_Z = 2; + let DST_SEL_W = 3; + let DATA_FORMAT = 0x22; // COLOR_32_32_32_32 + + // XXX: Need to force VTX_READ_128 instructions to write to the same register + // that holds its buffer address to avoid potential hangs. We can't use + // the same constraint as VTX_READ_32_eg, because the $ptr.ptr and $dst + // registers are different sizes. 
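+  // For reference, the VTX_READ_32_eg form of the workaround ties the
+  // operands so the register allocator keeps them in one register, i.e.:
+  //   let Constraints = "$ptr.ptr = $dst";
+  // A 128-bit analogue would need a tie between registers of different
+  // classes, which the tied-operand syntax does not express here.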
+}
+
+//===----------------------------------------------------------------------===//
+// VTX Read from parameter memory space
+//===----------------------------------------------------------------------===//
+
+def VTX_READ_PARAM_8_eg : VTX_READ_8_eg <0,
+  [(set (i32 R600_TReg32_X:$dst), (load_param_zexti8 ADDRVTX_READ:$ptr))]
+>;
+
+def VTX_READ_PARAM_16_eg : VTX_READ_16_eg <0,
+  [(set (i32 R600_TReg32_X:$dst), (load_param_zexti16 ADDRVTX_READ:$ptr))]
+>;
+
+def VTX_READ_PARAM_32_eg : VTX_READ_32_eg <0,
+  [(set (i32 R600_TReg32_X:$dst), (load_param ADDRVTX_READ:$ptr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// VTX Read from global memory space
+//===----------------------------------------------------------------------===//
+
+// 8-bit reads
+def VTX_READ_GLOBAL_8_eg : VTX_READ_8_eg <1,
+  [(set (i32 R600_TReg32_X:$dst), (zextloadi8_global ADDRVTX_READ:$ptr))]
+>;
+
+// 32-bit reads
+def VTX_READ_GLOBAL_32_eg : VTX_READ_32_eg <1,
+  [(set (i32 R600_TReg32_X:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+// 128-bit reads
+def VTX_READ_GLOBAL_128_eg : VTX_READ_128_eg <1,
+  [(set (v4i32 R600_Reg128:$dst), (global_load ADDRVTX_READ:$ptr))]
+>;
+
+//===----------------------------------------------------------------------===//
+// Constant Loads
+// XXX: We are currently storing all constants in the global address space.
+//===----------------------------------------------------------------------===//
+
+def CONSTANT_LOAD_eg : VTX_READ_32_eg <1,
+  [(set (i32 R600_TReg32_X:$dst), (constant_load ADDRVTX_READ:$ptr))]
+>;
+
+}
+
+let Predicates = [isCayman] in {
+
+let isVector = 1 in {
+
+def RECIP_IEEE_cm : RECIP_IEEE_Common<0x86>;
+
+def MULLO_INT_cm : MULLO_INT_Common<0x8F>;
+def MULHI_INT_cm : MULHI_INT_Common<0x90>;
+def MULLO_UINT_cm : MULLO_UINT_Common<0x91>;
+def MULHI_UINT_cm : MULHI_UINT_Common<0x92>;
+def RECIPSQRT_CLAMPED_cm : RECIPSQRT_CLAMPED_Common<0x87>;
+def EXP_IEEE_cm : EXP_IEEE_Common<0x81>;
+def LOG_IEEE_cm : LOG_IEEE_Common<0x83>;
+def RECIP_CLAMPED_cm : RECIP_CLAMPED_Common<0x84>;
+def RECIPSQRT_IEEE_cm : RECIPSQRT_IEEE_Common<0x89>;
+def SIN_cm : SIN_Common<0x8D>;
+def COS_cm : COS_Common<0x8E>;
+} // End isVector = 1
+
+def : SIN_PAT <SIN_cm>;
+def : COS_PAT <COS_cm>;
+
+defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
+
+// RECIP_UINT emulation for Cayman
+def : Pat <
+  (AMDGPUurecip R600_Reg32:$src0),
+  (FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg R600_Reg32:$src0)),
+                            (MOV_IMM_I32 0x4f800000)))
+>;
+
+
+def : Pat<(fsqrt R600_Reg32:$src),
+  (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm R600_Reg32:$src))>;
+
+} // End isCayman
+
+//===----------------------------------------------------------------------===//
+// Branch Instructions
+//===----------------------------------------------------------------------===//
+
+
+def IF_PREDICATE_SET : ILFormat<(outs), (ins GPRI32:$src),
+  "IF_PREDICATE_SET $src", []>;
+
+def PREDICATED_BREAK : ILFormat<(outs), (ins GPRI32:$src),
+  "PREDICATED_BREAK $src", []>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo instructions
+//===----------------------------------------------------------------------===//
+
+let isPseudo = 1 in {
+
+def PRED_X : InstR600 <
+  0, (outs R600_Predicate_Bit:$dst),
+  (ins R600_Reg32:$src0, i32imm:$src1, i32imm:$flags),
+  "", [], NullALU> {
+  let FlagOperandIdx = 3;
+}
+
+let isTerminator = 1, isBranch = 1, isBarrier = 1 in {
+
+def JUMP : InstR600 <0x10,
+          (outs),
+          (ins brtarget:$target, R600_Pred:$p),
+          "JUMP $target ($p)",
+          [], AnyALU
+  >;
+
+} // End isTerminator = 1, isBranch = 1, isBarrier = 1
+
+let usesCustomInserter = 1 in {
+
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in {
+
+def MASK_WRITE : AMDGPUShaderInst <
+    (outs),
+    (ins R600_Reg32:$src),
+    "MASK_WRITE $src",
+    []
+>;
+
+} // End mayLoad = 0, mayStore = 0, hasSideEffects = 1
+
+def R600_LOAD_CONST : AMDGPUShaderInst <
+  (outs R600_Reg32:$dst),
+  (ins i32imm:$src0),
+  "R600_LOAD_CONST $dst, $src0",
+  [(set R600_Reg32:$dst, (int_AMDGPU_load_const imm:$src0))]
+>;
+
+def RESERVE_REG : AMDGPUShaderInst <
+  (outs),
+  (ins i32imm:$src),
+  "RESERVE_REG $src",
+  [(int_AMDGPU_reserve_reg imm:$src)]
+>;
+
+def TXD: AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, imm:$textureTarget))]
+>;
+
+def TXD_SHADOW: AMDGPUShaderInst <
+  (outs R600_Reg128:$dst),
+  (ins R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, i32imm:$resourceId, i32imm:$samplerId, i32imm:$textureTarget),
+  "TXD_SHADOW $dst, $src0, $src1, $src2, $resourceId, $samplerId, $textureTarget",
+  [(set R600_Reg128:$dst, (int_AMDGPU_txd R600_Reg128:$src0, R600_Reg128:$src1, R600_Reg128:$src2, imm:$resourceId, imm:$samplerId, TEX_SHADOW:$textureTarget))]
+>;
+
+} // End usesCustomInserter = 1
+} // End isPseudo = 1
+
+def CLAMP_R600 : CLAMP <R600_Reg32>;
+def FABS_R600 : FABS<R600_Reg32>;
+def FNEG_R600 : FNEG<R600_Reg32>;
+
+//===---------------------------------------------------------------------===//
+// Return instruction
+//===---------------------------------------------------------------------===//
+let isTerminator = 1, isReturn = 1, isBarrier = 1, hasCtrlDep = 1 in {
+  def RETURN : ILFormat<(outs), (ins variable_ops),
+      "RETURN", [(IL_retflag)]>;
+}
+
+//===----------------------------------------------------------------------===//
+// ISel Patterns
+//===----------------------------------------------------------------------===//
+
+// CNDGE_INT extra pattern
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), -1, (i32 R600_Reg32:$src1),
+    (i32 R600_Reg32:$src2), COND_GT),
+  (CNDGE_INT R600_Reg32:$src0, R600_Reg32:$src1, R600_Reg32:$src2)
+>;
+
+// KIL Patterns
+def KILP : Pat <
+  (int_AMDGPU_kilp),
+  (MASK_WRITE (KILLGT (f32 ONE), (f32 ZERO)))
+>;
+
+def KIL : Pat <
+  (int_AMDGPU_kill R600_Reg32:$src0),
+  (MASK_WRITE (KILLGT (f32 ZERO), (f32 R600_Reg32:$src0)))
+>;
+
+// SGT Reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LT),
+  (SGT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SGE Reverse args
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, COND_LE),
+  (SGE R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGT_INT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLT),
+  (SETGT_INT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGE_INT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETLE),
+  (SETGE_INT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGT_UINT reverse args
+def : Pat <
+  (selectcc (i32 R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULT),
+  (SETGT_UINT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// SETGE_UINT reverse args
+def : Pat <
+  (selectcc (i32
R600_Reg32:$src0), R600_Reg32:$src1, -1, 0, SETULE),
+  (SETGE_UINT R600_Reg32:$src1, R600_Reg32:$src0)
+>;
+
+// The next two patterns are special cases for handling 'true if ordered' and
+// 'true if unordered' conditionals. The assumption here is that the behavior of
+// SETE and SNE conforms to the Direct3D 10 rules for floating point values
+// described here:
+// http://msdn.microsoft.com/en-us/library/windows/desktop/cc308050.aspx#alpha_32_bit
+// We assume that SETE returns false when one of the operands is NaN and
+// SNE returns true when one of the operands is NaN.
+
+//SETE - 'true if ordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETO),
+  (SETE R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+//SNE - 'true if unordered'
+def : Pat <
+  (selectcc (f32 R600_Reg32:$src0), R600_Reg32:$src1, FP_ONE, FP_ZERO, SETUO),
+  (SNE R600_Reg32:$src0, R600_Reg32:$src1)
+>;
+
+def : Extract_Element <f32, v4f32, R600_Reg128, 0, sel_x>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 1, sel_y>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 2, sel_z>;
+def : Extract_Element <f32, v4f32, R600_Reg128, 3, sel_w>;
+
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 0, sel_x>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 1, sel_y>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 2, sel_z>;
+def : Insert_Element <f32, v4f32, R600_Reg32, R600_Reg128, 3, sel_w>;
+
+def : Extract_Element <i32, v4i32, R600_Reg128, 0, sel_x>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 1, sel_y>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 2, sel_z>;
+def : Extract_Element <i32, v4i32, R600_Reg128, 3, sel_w>;
+
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 0, sel_x>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 1, sel_y>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 2, sel_z>;
+def : Insert_Element <i32, v4i32, R600_Reg32, R600_Reg128, 3, sel_w>;
+
+def : Vector_Build <v4f32, R600_Reg128, f32, R600_Reg32>;
+def : Vector_Build <v4i32, R600_Reg128, i32, R600_Reg32>;
+
+// bitconvert patterns
+
+def : BitConvert <i32, f32, R600_Reg32>;
+def : BitConvert <f32, i32, R600_Reg32>;
+def : BitConvert <v4f32, v4i32, R600_Reg128>;
+def : BitConvert <v4i32, v4f32, R600_Reg128>;
+
+// DWORDADDR pattern
+def : DwordAddrPat <i32, R600_Reg32>;
+
+} // End isR600toCayman Predicate
diff --git a/lib/Target/R600/R600Intrinsics.td b/lib/Target/R600/R600Intrinsics.td
new file mode 100644
index 0000000000..3825bc4d3b
--- /dev/null
+++ b/lib/Target/R600/R600Intrinsics.td
@@ -0,0 +1,32 @@
+//===-- R600Intrinsics.td - R600 Intrinsic defs --------*- tablegen -*-----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +// R600 Intrinsic Definitions +// +//===----------------------------------------------------------------------===// + +let TargetPrefix = "R600", isTarget = 1 in { + def int_R600_load_input : Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>; + def int_R600_load_input_perspective : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>; + def int_R600_load_input_constant : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>; + def int_R600_load_input_linear : + Intrinsic<[llvm_float_ty], [llvm_i32_ty], [IntrReadMem]>; + def int_R600_store_stream_output : + Intrinsic<[], [llvm_float_ty, llvm_i32_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_color : + Intrinsic<[], [llvm_float_ty, llvm_i32_ty], []>; + def int_R600_store_pixel_depth : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_stencil : + Intrinsic<[], [llvm_float_ty], []>; + def int_R600_store_pixel_dummy : + Intrinsic<[], [], []>; +} diff --git a/lib/Target/R600/R600MachineFunctionInfo.cpp b/lib/Target/R600/R600MachineFunctionInfo.cpp new file mode 100644 index 0000000000..4eb5efa19f --- /dev/null +++ b/lib/Target/R600/R600MachineFunctionInfo.cpp @@ -0,0 +1,34 @@ +//===-- R600MachineFunctionInfo.cpp - R600 Machine Function Info-*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +/// \file +//===----------------------------------------------------------------------===// + +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600MachineFunctionInfo::R600MachineFunctionInfo(const MachineFunction &MF) + : MachineFunctionInfo(), + HasLinearInterpolation(false), + HasPerspectiveInterpolation(false) { + memset(Outputs, 0, sizeof(Outputs)); + memset(StreamOutputs, 0, sizeof(StreamOutputs)); + } + +unsigned R600MachineFunctionInfo::GetIJPerspectiveIndex() const { + assert(HasPerspectiveInterpolation); + return 0; +} + +unsigned R600MachineFunctionInfo::GetIJLinearIndex() const { + assert(HasLinearInterpolation); + if (HasPerspectiveInterpolation) + return 1; + else + return 0; +} diff --git a/lib/Target/R600/R600MachineFunctionInfo.h b/lib/Target/R600/R600MachineFunctionInfo.h new file mode 100644 index 0000000000..e97fb5be62 --- /dev/null +++ b/lib/Target/R600/R600MachineFunctionInfo.h @@ -0,0 +1,39 @@ +//===-- R600MachineFunctionInfo.h - R600 Machine Function Info ----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +/// \file +//===----------------------------------------------------------------------===// + +#ifndef R600MACHINEFUNCTIONINFO_H +#define R600MACHINEFUNCTIONINFO_H + +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/SelectionDAG.h" +#include <vector> + +namespace llvm { + +class R600MachineFunctionInfo : public MachineFunctionInfo { + +public: + R600MachineFunctionInfo(const MachineFunction &MF); + std::vector<unsigned> ReservedRegs; + SDNode *Outputs[16]; + SDNode *StreamOutputs[64][4]; + bool HasLinearInterpolation; + bool HasPerspectiveInterpolation; + + unsigned GetIJLinearIndex() const; + unsigned GetIJPerspectiveIndex() const; + +}; + +} // End llvm namespace + +#endif //R600MACHINEFUNCTIONINFO_H diff --git a/lib/Target/R600/R600RegisterInfo.cpp b/lib/Target/R600/R600RegisterInfo.cpp new file mode 100644 index 0000000000..a39f83dbac --- /dev/null +++ b/lib/Target/R600/R600RegisterInfo.cpp @@ -0,0 +1,89 @@ +//===-- R600RegisterInfo.cpp - R600 Register Information ------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief R600 implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + +#include "R600RegisterInfo.h" +#include "AMDGPUTargetMachine.h" +#include "R600Defines.h" +#include "R600MachineFunctionInfo.h" + +using namespace llvm; + +R600RegisterInfo::R600RegisterInfo(AMDGPUTargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPURegisterInfo(tm, tii), + TM(tm), + TII(tii) + { } + +BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + const R600MachineFunctionInfo * MFI = MF.getInfo<R600MachineFunctionInfo>(); + + Reserved.set(AMDGPU::ZERO); + Reserved.set(AMDGPU::HALF); + Reserved.set(AMDGPU::ONE); + Reserved.set(AMDGPU::ONE_INT); + Reserved.set(AMDGPU::NEG_HALF); + Reserved.set(AMDGPU::NEG_ONE); + Reserved.set(AMDGPU::PV_X); + Reserved.set(AMDGPU::ALU_LITERAL_X); + Reserved.set(AMDGPU::PREDICATE_BIT); + Reserved.set(AMDGPU::PRED_SEL_OFF); + Reserved.set(AMDGPU::PRED_SEL_ZERO); + Reserved.set(AMDGPU::PRED_SEL_ONE); + + for (TargetRegisterClass::iterator I = AMDGPU::R600_CReg32RegClass.begin(), + E = AMDGPU::R600_CReg32RegClass.end(); I != E; ++I) { + Reserved.set(*I); + } + + for (std::vector<unsigned>::const_iterator I = MFI->ReservedRegs.begin(), + E = MFI->ReservedRegs.end(); I != E; ++I) { + Reserved.set(*I); + } + + return Reserved; +} + +const TargetRegisterClass * +R600RegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { + switch (rc->getID()) { + case AMDGPU::GPRF32RegClassID: + case AMDGPU::GPRI32RegClassID: + return &AMDGPU::R600_Reg32RegClass; + default: return rc; + } +} + +unsigned R600RegisterInfo::getHWRegChan(unsigned reg) const { + return this->getEncodingValue(reg) >> HW_CHAN_SHIFT; +} + +const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::R600_TReg32RegClass; + } +} + +unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) const { + switch (Channel) { + default: assert(!"Invalid channel index"); return 0; + case 0: return AMDGPU::sel_x; + case 1: 
return AMDGPU::sel_y;
+  case 2: return AMDGPU::sel_z;
+  case 3: return AMDGPU::sel_w;
+  }
+}
diff --git a/lib/Target/R600/R600RegisterInfo.h b/lib/Target/R600/R600RegisterInfo.h
new file mode 100644
index 0000000000..c170ccb378
--- /dev/null
+++ b/lib/Target/R600/R600RegisterInfo.h
@@ -0,0 +1,55 @@
+//===-- R600RegisterInfo.h - R600 Register Info Interface ------*- C++ -*--===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for R600RegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef R600REGISTERINFO_H_
+#define R600REGISTERINFO_H_
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+class R600TargetMachine;
+class TargetInstrInfo;
+
+struct R600RegisterInfo : public AMDGPURegisterInfo {
+  AMDGPUTargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  R600RegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// \param RC is an AMDIL reg class.
+  ///
+  /// \returns the R600 reg class that is equivalent to \p RC.
+  virtual const TargetRegisterClass *getISARegClass(
+    const TargetRegisterClass *RC) const;
+
+  /// \brief get the HW encoding for a register's channel.
+  unsigned getHWRegChan(unsigned reg) const;
+
+  /// \brief get the register class of the specified type to use in the
+  /// CFGStructurizer
+  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+
+  /// \returns the sub reg enum value for the given \p Channel
+  /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sel_x)
+  unsigned getSubRegFromChannel(unsigned Channel) const;
+
+};
+
+} // End namespace llvm
+
+#endif // R600REGISTERINFO_H_
diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
new file mode 100644
index 0000000000..d3d6d25d29
--- /dev/null
+++ b/lib/Target/R600/R600RegisterInfo.td
@@ -0,0 +1,107 @@
+
+class R600Reg <string name, bits<16> encoding> : Register<name> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+class R600RegWithChan <string name, bits<9> sel, string chan> :
+    Register <name> {
+
+  field bits<2> chan_encoding = !if(!eq(chan, "X"), 0,
+                                !if(!eq(chan, "Y"), 1,
+                                !if(!eq(chan, "Z"), 2,
+                                !if(!eq(chan, "W"), 3, 0))));
+  let HWEncoding{8-0} = sel;
+  let HWEncoding{10-9} = chan_encoding;
+  let Namespace = "AMDGPU";
+}
+
+class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
+    RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [sel_x, sel_y, sel_z, sel_w];
+  let HWEncoding = encoding;
+}
+
+foreach Index = 0-127 in {
+  foreach Chan = [ "X", "Y", "Z", "W" ] in {
+    // 32-bit Temporary Registers
+    def T#Index#_#Chan : R600RegWithChan <"T"#Index#"."#Chan, Index, Chan>;
+
+    // 32-bit Constant Registers (There are more than 128; this is the number
+    // that is currently supported.)
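+    // As an illustration of the encoding above, C10_Z gets sel = 10 in
+    // HWEncoding{8-0} and chan_encoding = 2 in HWEncoding{10-9}.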
+ def C#Index#_#Chan : R600RegWithChan <"C"#Index#"."#Chan, Index, Chan>; + } + // 128-bit Temporary Registers + def T#Index#_XYZW : R600Reg_128 <"T"#Index#".XYZW", + [!cast<Register>("T"#Index#"_X"), + !cast<Register>("T"#Index#"_Y"), + !cast<Register>("T"#Index#"_Z"), + !cast<Register>("T"#Index#"_W")], + Index>; +} + +// Array Base Register holding input in FS +foreach Index = 448-464 in { + def ArrayBase#Index : R600Reg<"ARRAY_BASE", Index>; +} + + +// Special Registers + +def ZERO : R600Reg<"0.0", 248>; +def ONE : R600Reg<"1.0", 249>; +def NEG_ONE : R600Reg<"-1.0", 249>; +def ONE_INT : R600Reg<"1", 250>; +def HALF : R600Reg<"0.5", 252>; +def NEG_HALF : R600Reg<"-0.5", 252>; +def ALU_LITERAL_X : R600Reg<"literal.x", 253>; +def PV_X : R600Reg<"pv.x", 254>; +def PREDICATE_BIT : R600Reg<"PredicateBit", 0>; +def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>; +def PRED_SEL_ZERO : R600Reg<"Pred_sel_zero", 2>; +def PRED_SEL_ONE : R600Reg<"Pred_sel_one", 3>; + +def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "ArrayBase%u", 448, 464))>; + +def R600_CReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (interleave + (interleave (sequence "C%u_X", 0, 127), + (sequence "C%u_Z", 0, 127)), + (interleave (sequence "C%u_Y", 0, 127), + (sequence "C%u_W", 0, 127))))>; + +def R600_TReg32_X : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_X", 0, 127))>; + +def R600_TReg32_Y : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Y", 0, 127))>; + +def R600_TReg32_Z : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_Z", 0, 127))>; + +def R600_TReg32_W : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (sequence "T%u_W", 0, 127))>; + +def R600_TReg32 : RegisterClass <"AMDGPU", [f32, i32], 32, + (add (interleave + (interleave R600_TReg32_X, R600_TReg32_Z), + (interleave R600_TReg32_Y, R600_TReg32_W)))>; + +def R600_Reg32 : RegisterClass <"AMDGPU", [f32, i32], 32, (add + R600_TReg32, + R600_CReg32, + R600_ArrayBase, + ZERO, HALF, ONE, ONE_INT, PV_X, ALU_LITERAL_X, NEG_ONE, NEG_HALF)>; + +def R600_Predicate : RegisterClass <"AMDGPU", [i32], 32, (add + PRED_SEL_OFF, PRED_SEL_ZERO, PRED_SEL_ONE)>; + +def R600_Predicate_Bit: RegisterClass <"AMDGPU", [i32], 32, (add + PREDICATE_BIT)>; + +def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, + (add (sequence "T%u_XYZW", 0, 127))> { + let CopyCost = -1; +} diff --git a/lib/Target/R600/R600Schedule.td b/lib/Target/R600/R600Schedule.td new file mode 100644 index 0000000000..7ede181c51 --- /dev/null +++ b/lib/Target/R600/R600Schedule.td @@ -0,0 +1,36 @@ +//===-- R600Schedule.td - R600 Scheduling definitions ------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// R600 has a VLIW architecture. On pre-cayman cards there are 5 instruction +// slots ALU.X, ALU.Y, ALU.Z, ALU.W, and TRANS. For cayman cards, the TRANS +// slot has been removed. 
+//
+//===----------------------------------------------------------------------===//
+
+
+def ALU_X : FuncUnit;
+def ALU_Y : FuncUnit;
+def ALU_Z : FuncUnit;
+def ALU_W : FuncUnit;
+def TRANS : FuncUnit;
+
+def AnyALU : InstrItinClass;
+def VecALU : InstrItinClass;
+def TransALU : InstrItinClass;
+
+def R600_EG_Itin : ProcessorItineraries <
+  [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS, ALU_NULL],
+  [],
+  [
+    InstrItinData<AnyALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W, TRANS]>]>,
+    InstrItinData<VecALU, [InstrStage<1, [ALU_X, ALU_Y, ALU_Z, ALU_W]>]>,
+    InstrItinData<TransALU, [InstrStage<1, [TRANS]>]>,
+    InstrItinData<NullALU, [InstrStage<1, [ALU_NULL]>]>
+  ]
+>;
diff --git a/lib/Target/R600/SIAssignInterpRegs.cpp b/lib/Target/R600/SIAssignInterpRegs.cpp
new file mode 100644
index 0000000000..832e44d766
--- /dev/null
+++ b/lib/Target/R600/SIAssignInterpRegs.cpp
@@ -0,0 +1,152 @@
+//===-- SIAssignInterpRegs.cpp - Assign interpolation registers -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass maps the pseudo interpolation registers to the correct
+/// physical registers.
+//
+/// Prior to executing a fragment shader, the GPU loads interpolation
+/// parameters into physical registers. The specific physical register that
+/// each interpolation parameter ends up in depends on the type of the
+/// interpolation parameter as well as how many interpolation parameters are
+/// used by the shader.
+//
+//===----------------------------------------------------------------------===//
+
+
+
+#include "AMDGPU.h"
+#include "AMDIL.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SIAssignInterpRegsPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  TargetMachine &TM;
+
+  void addLiveIn(MachineFunction * MF, MachineRegisterInfo & MRI,
+                 unsigned physReg, unsigned virtReg);
+
+public:
+  SIAssignInterpRegsPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TM(tm) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const { return "SI Assign interpolation registers"; }
+};
+
+} // End anonymous namespace
+
+char SIAssignInterpRegsPass::ID = 0;
+
+#define INTERP_VALUES 16
+#define REQUIRED_VALUE_MAX_INDEX 7
+
+struct InterpInfo {
+  bool Enabled;
+  unsigned Regs[3];
+  unsigned RegCount;
+};
+
+
+FunctionPass *llvm::createSIAssignInterpRegsPass(TargetMachine &tm) {
+  return new SIAssignInterpRegsPass(tm);
+}
+
+bool SIAssignInterpRegsPass::runOnMachineFunction(MachineFunction &MF) {
+
+  struct InterpInfo InterpUse[INTERP_VALUES] = {
+    {false, {AMDGPU::PERSP_SAMPLE_I, AMDGPU::PERSP_SAMPLE_J}, 2},
+    {false, {AMDGPU::PERSP_CENTER_I, AMDGPU::PERSP_CENTER_J}, 2},
+    {false, {AMDGPU::PERSP_CENTROID_I, AMDGPU::PERSP_CENTROID_J}, 2},
+    {false, {AMDGPU::PERSP_I_W, AMDGPU::PERSP_J_W, AMDGPU::PERSP_1_W}, 3},
+    {false, {AMDGPU::LINEAR_SAMPLE_I, AMDGPU::LINEAR_SAMPLE_J}, 2},
+    {false, {AMDGPU::LINEAR_CENTER_I, AMDGPU::LINEAR_CENTER_J}, 2},
+    {false, {AMDGPU::LINEAR_CENTROID_I, AMDGPU::LINEAR_CENTROID_J}, 2},
+    {false, {AMDGPU::LINE_STIPPLE_TEX_COORD}, 1},
+    {false, {AMDGPU::POS_X_FLOAT}, 1},
+    {false, {AMDGPU::POS_Y_FLOAT}, 1},
+    {false,
{AMDGPU::POS_Z_FLOAT}, 1},
+    {false, {AMDGPU::POS_W_FLOAT}, 1},
+    {false, {AMDGPU::FRONT_FACE}, 1},
+    {false, {AMDGPU::ANCILLARY}, 1},
+    {false, {AMDGPU::SAMPLE_COVERAGE}, 1},
+    {false, {AMDGPU::POS_FIXED_PT}, 1}
+  };
+
+  SIMachineFunctionInfo * MFI = MF.getInfo<SIMachineFunctionInfo>();
+  // This pass is only needed for pixel shaders.
+  if (MFI->ShaderType != ShaderType::PIXEL) {
+    return false;
+  }
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  bool ForceEnable = true;
+
+  // First pass, mark the interpolation values that are used.
+  for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
+    for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
+                                                               RegIdx++) {
+      InterpUse[InterpIdx].Enabled = InterpUse[InterpIdx].Enabled ||
+                            !MRI.use_empty(InterpUse[InterpIdx].Regs[RegIdx]);
+      if (InterpUse[InterpIdx].Enabled &&
+          InterpIdx <= REQUIRED_VALUE_MAX_INDEX) {
+        ForceEnable = false;
+      }
+    }
+  }
+
+  // At least one interpolation mode must be enabled or else the GPU will hang.
+  if (ForceEnable) {
+    InterpUse[0].Enabled = true;
+  }
+
+  unsigned UsedVgprs = 0;
+
+  // Second pass, replace with VGPRs.
+  for (unsigned InterpIdx = 0; InterpIdx < INTERP_VALUES; InterpIdx++) {
+    if (!InterpUse[InterpIdx].Enabled) {
+      continue;
+    }
+    MFI->SPIPSInputAddr |= (1 << InterpIdx);
+
+    for (unsigned RegIdx = 0; RegIdx < InterpUse[InterpIdx].RegCount;
+                                                  RegIdx++, UsedVgprs++) {
+      unsigned NewReg = AMDGPU::VReg_32RegClass.getRegister(UsedVgprs);
+      unsigned VirtReg = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass);
+      MRI.replaceRegWith(InterpUse[InterpIdx].Regs[RegIdx], VirtReg);
+      addLiveIn(&MF, MRI, NewReg, VirtReg);
+    }
+  }
+
+  return false;
+}
+
+void SIAssignInterpRegsPass::addLiveIn(MachineFunction * MF,
+                                       MachineRegisterInfo & MRI,
+                                       unsigned physReg, unsigned virtReg) {
+  const TargetInstrInfo * TII = TM.getInstrInfo();
+  if (!MRI.isLiveIn(physReg)) {
+    MRI.addLiveIn(physReg, virtReg);
+    MF->front().addLiveIn(physReg);
+    BuildMI(MF->front(), MF->front().begin(), DebugLoc(),
+            TII->get(TargetOpcode::COPY), virtReg)
+            .addReg(physReg);
+  } else {
+    MRI.replaceRegWith(virtReg, MRI.getLiveInVirtReg(physReg));
+  }
+}
diff --git a/lib/Target/R600/SIFixSGPRLiveness.cpp b/lib/Target/R600/SIFixSGPRLiveness.cpp
new file mode 100644
index 0000000000..0fecd7a28e
--- /dev/null
+++ b/lib/Target/R600/SIFixSGPRLiveness.cpp
@@ -0,0 +1,179 @@
+//===-- SIFixSGPRLiveness.cpp - SGPR liveness adjustment ------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+///
+/// SGPRs are not affected by control flow. This pass adjusts SGPR liveness
+/// so that the register allocator can still correctly allocate them.
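+///
+/// A rough illustration (hypothetical blocks): with
+///
+///   bb0:  %sgpr = S_MOV_B32 ...   ; def before a branch
+///   bb1:  ...   = use of %sgpr    ; use on only one path
+///   bb2:  ...                     ; other path, no use
+///   bb3:                          ; join
+///
+/// both paths actually execute on the hardware, so the pass inserts KILL
+/// markers (at the common post dominator of the uses, or in dominated
+/// predecessors) to give the register allocator an accurate picture of the
+/// SGPR's live range.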
+// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineDominators.h" +#include "llvm/CodeGen/MachinePostDominators.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" + +using namespace llvm; + +namespace { + +class SIFixSGPRLiveness : public MachineFunctionPass { +private: + static char ID; + + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + MachineDominatorTree *MD; + MachinePostDominatorTree *MPD; + + bool isSGPR(const TargetRegisterClass *RegClass) { + return RegClass == &AMDGPU::SReg_1RegClass || + RegClass == &AMDGPU::SReg_32RegClass || + RegClass == &AMDGPU::SReg_64RegClass || + RegClass == &AMDGPU::SReg_128RegClass || + RegClass == &AMDGPU::SReg_256RegClass; + } + + void addKill(MachineBasicBlock::iterator I, unsigned Reg); + MachineBasicBlock *handleUses(unsigned VirtReg, MachineBasicBlock *Begin); + void handlePreds(MachineBasicBlock *Begin, MachineBasicBlock *End, + unsigned VirtReg); + + bool handleVirtReg(unsigned VirtReg); + +public: + SIFixSGPRLiveness(TargetMachine &tm); + + virtual bool runOnMachineFunction(MachineFunction &MF); + + virtual const char *getPassName() const { + return "SI fix SGPR liveness pass"; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const; +}; + +} // end anonymous namespace + +char SIFixSGPRLiveness::ID = 0; + +SIFixSGPRLiveness::SIFixSGPRLiveness(TargetMachine &tm): + MachineFunctionPass(ID), + TII(tm.getInstrInfo()) { + initializeLiveIntervalsPass(*PassRegistry::getPassRegistry()); +} + +void SIFixSGPRLiveness::getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired<MachineDominatorTree>(); + AU.addRequired<MachinePostDominatorTree>(); + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +void SIFixSGPRLiveness::addKill(MachineBasicBlock::iterator I, unsigned Reg) { + MachineBasicBlock *MBB = I->getParent(); + + BuildMI(*MBB, I, DebugLoc(), TII->get(TargetOpcode::KILL)).addReg(Reg); +} + +// Find the common post dominator of all uses +MachineBasicBlock *SIFixSGPRLiveness::handleUses(unsigned VirtReg, + MachineBasicBlock *Begin) { + MachineBasicBlock *LastUse = Begin, *End = Begin; + bool EndUsesReg = true; + + MachineRegisterInfo::use_iterator i, e; + for (i = MRI->use_begin(VirtReg), e = MRI->use_end(); i != e; ++i) { + MachineBasicBlock *MBB = i->getParent(); + if (LastUse == MBB) + continue; + + LastUse = MBB; + MBB = MPD->findNearestCommonDominator(End, MBB); + + if (MBB == LastUse) + EndUsesReg = true; + else if (MBB != End) + EndUsesReg = false; + + End = MBB; + } + + return EndUsesReg ? 
Begin : End;
+}
+
+// Handle predecessors separately; only add KILLs to dominated ones
+void SIFixSGPRLiveness::handlePreds(MachineBasicBlock *Begin,
+                                    MachineBasicBlock *End,
+                                    unsigned VirtReg) {
+  MachineBasicBlock::pred_iterator i, e;
+  for (i = End->pred_begin(), e = End->pred_end(); i != e; ++i) {
+
+    if (MD->dominates(End, *i))
+      continue; // ignore loops
+
+    if (MD->dominates(*i, Begin))
+      continue; // too far up, abort search
+
+    if (MD->dominates(Begin, *i)) {
+      // found end of lifetime
+      addKill((*i)->getFirstTerminator(), VirtReg);
+      continue;
+    }
+
+    handlePreds(Begin, *i, VirtReg);
+  }
+}
+
+bool SIFixSGPRLiveness::handleVirtReg(unsigned VirtReg) {
+
+  MachineInstr *Def = MRI->getVRegDef(VirtReg);
+  if (!Def || MRI->use_empty(VirtReg))
+    return false; // No definition or not used
+
+  MachineBasicBlock *Begin = Def->getParent();
+  MachineBasicBlock *End = handleUses(VirtReg, Begin);
+  if (Begin == End)
+    return false; // Defined and only used in the same block
+
+  if (MD->dominates(Begin, End)) {
+    // Lifetime dominates the end node, just kill it here
+    addKill(End->getFirstNonPHI(), VirtReg);
+  } else {
+    // only some predecessors are dominated, handle them separately
+    handlePreds(Begin, End, VirtReg);
+  }
+  return true;
+}
+
+bool SIFixSGPRLiveness::runOnMachineFunction(MachineFunction &MF) {
+  bool Changes = false;
+
+  MRI = &MF.getRegInfo();
+  MD = &getAnalysis<MachineDominatorTree>();
+  MPD = &getAnalysis<MachinePostDominatorTree>();
+
+  for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
+    unsigned VirtReg = TargetRegisterInfo::index2VirtReg(i);
+
+    const TargetRegisterClass *RegClass = MRI->getRegClass(VirtReg);
+    if (!isSGPR(RegClass))
+      continue;
+
+    Changes |= handleVirtReg(VirtReg);
+  }
+
+  return Changes;
+}
+
+FunctionPass *llvm::createSIFixSGPRLivenessPass(TargetMachine &tm) {
+  return new SIFixSGPRLiveness(tm);
+}
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
new file mode 100644
index 0000000000..292ce850f7
--- /dev/null
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -0,0 +1,442 @@
+//===-- SIISelLowering.cpp - SI DAG Lowering Implementation ---------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief Custom DAG lowering for SI +// +//===----------------------------------------------------------------------===// + +#include "SIISelLowering.h" +#include "AMDIL.h" +#include "AMDILIntrinsicInfo.h" +#include "SIInstrInfo.h" +#include "SIMachineFunctionInfo.h" +#include "SIRegisterInfo.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/SelectionDAG.h" + +using namespace llvm; + +SITargetLowering::SITargetLowering(TargetMachine &TM) : + AMDGPUTargetLowering(TM), + TII(static_cast<const SIInstrInfo*>(TM.getInstrInfo())) { + addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass); + addRegisterClass(MVT::f32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i32, &AMDGPU::VReg_32RegClass); + addRegisterClass(MVT::i64, &AMDGPU::SReg_64RegClass); + addRegisterClass(MVT::i1, &AMDGPU::SCCRegRegClass); + addRegisterClass(MVT::i1, &AMDGPU::VCCRegRegClass); + + addRegisterClass(MVT::v4i32, &AMDGPU::SReg_128RegClass); + addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass); + + computeRegisterProperties(); + + setOperationAction(ISD::AND, MVT::i1, Custom); + + setOperationAction(ISD::ADD, MVT::i64, Legal); + setOperationAction(ISD::ADD, MVT::i32, Legal); + + setOperationAction(ISD::BR_CC, MVT::i32, Custom); + + setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); + + // We need to custom lower loads from the USER_SGPR address space, so we can + // add the SGPRs as livein registers. + setOperationAction(ISD::LOAD, MVT::i32, Custom); + setOperationAction(ISD::LOAD, MVT::i64, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::f32, Custom); + setOperationAction(ISD::SELECT_CC, MVT::i32, Custom); + + setOperationAction(ISD::SELECT_CC, MVT::Other, Expand); + setTargetDAGCombine(ISD::SELECT_CC); + + setTargetDAGCombine(ISD::SETCC); +} + +MachineBasicBlock * SITargetLowering::EmitInstrWithCustomInserter( + MachineInstr * MI, MachineBasicBlock * BB) const { + const TargetInstrInfo * TII = getTargetMachine().getInstrInfo(); + MachineRegisterInfo & MRI = BB->getParent()->getRegInfo(); + MachineBasicBlock::iterator I = MI; + + if (TII->get(MI->getOpcode()).TSFlags & SIInstrFlags::NEED_WAIT) { + AppendS_WAITCNT(MI, *BB, llvm::next(I)); + return BB; + } + + switch (MI->getOpcode()) { + default: + return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB); + case AMDGPU::BRANCH: return BB; + case AMDGPU::CLAMP_SI: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + // VSRC1-2 are unused, but we still need to fill all the + // operand slots, so we just reuse the VSRC0 operand + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(1)) + .addImm(0) // ABS + .addImm(1) // CLAMP + .addImm(0) // OMOD + .addImm(0); // NEG + MI->eraseFromParent(); + break; + + case AMDGPU::FABS_SI: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + // VSRC1-2 are unused, but we still need to fill all the + // operand slots, so we just reuse the VSRC0 operand + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(1)) + .addImm(1) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD + .addImm(0); // NEG + MI->eraseFromParent(); + break; + + case AMDGPU::FNEG_SI: + BuildMI(*BB, I, BB->findDebugLoc(I), TII->get(AMDGPU::V_MOV_B32_e64)) + 
.addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(1)) + // VSRC1-2 are unused, but we still need to fill all the + // operand slots, so we just reuse the VSRC0 operand + .addOperand(MI->getOperand(1)) + .addOperand(MI->getOperand(1)) + .addImm(0) // ABS + .addImm(0) // CLAMP + .addImm(0) // OMOD + .addImm(1); // NEG + MI->eraseFromParent(); + break; + case AMDGPU::SHADER_TYPE: + BB->getParent()->getInfo<SIMachineFunctionInfo>()->ShaderType = + MI->getOperand(0).getImm(); + MI->eraseFromParent(); + break; + + case AMDGPU::SI_INTERP: + LowerSI_INTERP(MI, *BB, I, MRI); + break; + case AMDGPU::SI_INTERP_CONST: + LowerSI_INTERP_CONST(MI, *BB, I, MRI); + break; + case AMDGPU::SI_KIL: + LowerSI_KIL(MI, *BB, I, MRI); + break; + case AMDGPU::SI_WQM: + LowerSI_WQM(MI, *BB, I, MRI); + break; + case AMDGPU::SI_V_CNDLT: + LowerSI_V_CNDLT(MI, *BB, I, MRI); + break; + } + return BB; +} + +void SITargetLowering::AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I) const { + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WAITCNT)) + .addImm(0); +} + + +void SITargetLowering::LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_WQM_B64), AMDGPU::EXEC) + .addReg(AMDGPU::EXEC); + + MI->eraseFromParent(); +} + +void SITargetLowering::LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { + unsigned tmp = MRI.createVirtualRegister(&AMDGPU::VReg_32RegClass); + unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); + MachineOperand dst = MI->getOperand(0); + MachineOperand iReg = MI->getOperand(1); + MachineOperand jReg = MI->getOperand(2); + MachineOperand attr_chan = MI->getOperand(3); + MachineOperand attr = MI->getOperand(4); + MachineOperand params = MI->getOperand(5); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) + .addOperand(params); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P1_F32), tmp) + .addOperand(iReg) + .addOperand(attr_chan) + .addOperand(attr) + .addReg(M0); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_P2_F32)) + .addOperand(dst) + .addReg(tmp) + .addOperand(jReg) + .addOperand(attr_chan) + .addOperand(attr) + .addReg(M0); + + MI->eraseFromParent(); +} + +void SITargetLowering::LowerSI_INTERP_CONST(MachineInstr *MI, + MachineBasicBlock &BB, MachineBasicBlock::iterator I, + MachineRegisterInfo &MRI) const { + MachineOperand dst = MI->getOperand(0); + MachineOperand attr_chan = MI->getOperand(1); + MachineOperand attr = MI->getOperand(2); + MachineOperand params = MI->getOperand(3); + unsigned M0 = MRI.createVirtualRegister(&AMDGPU::M0RegRegClass); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::S_MOV_B32), M0) + .addOperand(params); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_INTERP_MOV_F32)) + .addOperand(dst) + .addOperand(attr_chan) + .addOperand(attr) + .addReg(M0); + + MI->eraseFromParent(); +} + +void SITargetLowering::LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { + // Clear this pixel from the exec mask if the operand is negative + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CMPX_LE_F32_e32), + AMDGPU::VCC) + .addReg(AMDGPU::SREG_LIT_0) + .addOperand(MI->getOperand(0)); + + MI->eraseFromParent(); +} + +void SITargetLowering::LowerSI_V_CNDLT(MachineInstr *MI, 
MachineBasicBlock &BB, + MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const { + unsigned VCC = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + + BuildMI(BB, I, BB.findDebugLoc(I), + TII->get(AMDGPU::V_CMP_GT_F32_e32), + VCC) + .addReg(AMDGPU::SREG_LIT_0) + .addOperand(MI->getOperand(1)); + + BuildMI(BB, I, BB.findDebugLoc(I), TII->get(AMDGPU::V_CNDMASK_B32_e32)) + .addOperand(MI->getOperand(0)) + .addOperand(MI->getOperand(3)) + .addOperand(MI->getOperand(2)) + .addReg(VCC); + + MI->eraseFromParent(); +} + +EVT SITargetLowering::getSetCCResultType(EVT VT) const { + return MVT::i1; +} + +//===----------------------------------------------------------------------===// +// Custom DAG Lowering Operations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { + switch (Op.getOpcode()) { + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + case ISD::BR_CC: return LowerBR_CC(Op, DAG); + case ISD::LOAD: return LowerLOAD(Op, DAG); + case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG); + case ISD::AND: return Loweri1ContextSwitch(Op, DAG, ISD::AND); + case ISD::INTRINSIC_WO_CHAIN: { + unsigned IntrinsicID = + cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); + EVT VT = Op.getValueType(); + switch (IntrinsicID) { + case AMDGPUIntrinsic::SI_vs_load_buffer_index: + return CreateLiveInRegister(DAG, &AMDGPU::VReg_32RegClass, + AMDGPU::VGPR0, VT); + default: return AMDGPUTargetLowering::LowerOperation(Op, DAG); + } + break; + } + } + return SDValue(); +} + +/// \brief The function is for lowering i1 operations on the +/// VCC register. +/// +/// In the VALU context, VCC is a one bit register, but in the +/// SALU context the VCC is a 64-bit register (1-bit per thread). Since only +/// the SALU can perform operations on the VCC register, we need to promote +/// the operand types from i1 to i64 in order for tablegen to be able to match +/// this operation to the correct SALU instruction. We do this promotion by +/// wrapping the operands in a CopyToReg node. +/// +SDValue SITargetLowering::Loweri1ContextSwitch(SDValue Op, + SelectionDAG &DAG, + unsigned VCCNode) const { + DebugLoc DL = Op.getDebugLoc(); + + SDValue OpNode = DAG.getNode(VCCNode, DL, MVT::i64, + DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, + Op.getOperand(0)), + DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i64, + Op.getOperand(1))); + + return DAG.getNode(SIISD::VCC_BITCAST, DL, MVT::i1, OpNode); +} + +SDValue SITargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue Chain = Op.getOperand(0); + SDValue CC = Op.getOperand(1); + SDValue LHS = Op.getOperand(2); + SDValue RHS = Op.getOperand(3); + SDValue JumpT = Op.getOperand(4); + SDValue CmpValue; + SDValue Result; + CmpValue = DAG.getNode( + ISD::SETCC, + Op.getDebugLoc(), + MVT::i1, + LHS, RHS, + CC); + + Result = DAG.getNode( + AMDGPUISD::BRANCH_COND, + CmpValue.getDebugLoc(), + MVT::Other, Chain, + JumpT, CmpValue); + return Result; +} + +SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + LoadSDNode *Ptr = dyn_cast<LoadSDNode>(Op); + + assert(Ptr); + + unsigned AddrSpace = Ptr->getPointerInfo().getAddrSpace(); + + // We only need to lower USER_SGPR address space loads + if (AddrSpace != AMDGPUAS::USER_SGPR_ADDRESS) { + return SDValue(); + } + + // Loads from the USER_SGPR address space can only have constant value + // pointers. 
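+  //
+  // Worked example (illustrative): for an i64 load TypeDwordWidth is 2, so a
+  // constant address of 4 gives SGPRIndex = 4 / 2 = 2, and the value is taken
+  // from the third register of SReg_64; no memory access is emitted, the
+  // SGPR pair is simply marked live-in below.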
+ ConstantSDNode *BasePtr = dyn_cast<ConstantSDNode>(Ptr->getBasePtr()); + assert(BasePtr); + + unsigned TypeDwordWidth = VT.getSizeInBits() / 32; + const TargetRegisterClass * dstClass; + switch (TypeDwordWidth) { + default: + assert(!"USER_SGPR value size not implemented"); + return SDValue(); + case 1: + dstClass = &AMDGPU::SReg_32RegClass; + break; + case 2: + dstClass = &AMDGPU::SReg_64RegClass; + break; + } + uint64_t Index = BasePtr->getZExtValue(); + assert(Index % TypeDwordWidth == 0 && "USER_SGPR not properly aligned"); + unsigned SGPRIndex = Index / TypeDwordWidth; + unsigned Reg = dstClass->getRegister(SGPRIndex); + + DAG.ReplaceAllUsesOfValueWith(Op, CreateLiveInRegister(DAG, dstClass, Reg, + VT)); + return SDValue(); +} + +SDValue SITargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { + SDValue LHS = Op.getOperand(0); + SDValue RHS = Op.getOperand(1); + SDValue True = Op.getOperand(2); + SDValue False = Op.getOperand(3); + SDValue CC = Op.getOperand(4); + EVT VT = Op.getValueType(); + DebugLoc DL = Op.getDebugLoc(); + + // Possible Min/Max pattern + SDValue MinMax = LowerMinMax(Op, DAG); + if (MinMax.getNode()) { + return MinMax; + } + + SDValue Cond = DAG.getNode(ISD::SETCC, DL, MVT::i1, LHS, RHS, CC); + return DAG.getNode(ISD::SELECT, DL, VT, Cond, True, False); +} + +//===----------------------------------------------------------------------===// +// Custom DAG optimizations +//===----------------------------------------------------------------------===// + +SDValue SITargetLowering::PerformDAGCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + DebugLoc DL = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + switch (N->getOpcode()) { + default: break; + case ISD::SELECT_CC: { + N->dump(); + ConstantSDNode *True, *False; + // i1 selectcc(l, r, -1, 0, cc) -> i1 setcc(l, r, cc) + if ((True = dyn_cast<ConstantSDNode>(N->getOperand(2))) + && (False = dyn_cast<ConstantSDNode>(N->getOperand(3))) + && True->isAllOnesValue() + && False->isNullValue() + && VT == MVT::i1) { + return DAG.getNode(ISD::SETCC, DL, VT, N->getOperand(0), + N->getOperand(1), N->getOperand(4)); + + } + break; + } + case ISD::SETCC: { + SDValue Arg0 = N->getOperand(0); + SDValue Arg1 = N->getOperand(1); + SDValue CC = N->getOperand(2); + ConstantSDNode * C = NULL; + ISD::CondCode CCOp = dyn_cast<CondCodeSDNode>(CC)->get(); + + // i1 setcc (sext(i1), 0, setne) -> i1 setcc(i1, 0, setne) + if (VT == MVT::i1 + && Arg0.getOpcode() == ISD::SIGN_EXTEND + && Arg0.getOperand(0).getValueType() == MVT::i1 + && (C = dyn_cast<ConstantSDNode>(Arg1)) + && C->isNullValue() + && CCOp == ISD::SETNE) { + return SimplifySetCC(VT, Arg0.getOperand(0), + DAG.getConstant(0, MVT::i1), CCOp, true, DCI, DL); + } + break; + } + } + return SDValue(); +} + +#define NODE_NAME_CASE(node) case SIISD::node: return #node; + +const char* SITargetLowering::getTargetNodeName(unsigned Opcode) const { + switch (Opcode) { + default: return AMDGPUTargetLowering::getTargetNodeName(Opcode); + NODE_NAME_CASE(VCC_AND) + NODE_NAME_CASE(VCC_BITCAST) + } +} diff --git a/lib/Target/R600/SIISelLowering.h b/lib/Target/R600/SIISelLowering.h new file mode 100644 index 0000000000..27c2a1c39a --- /dev/null +++ b/lib/Target/R600/SIISelLowering.h @@ -0,0 +1,62 @@ +//===-- SIISelLowering.h - SI DAG Lowering Interface ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief SI DAG Lowering interface definition
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef SIISELLOWERING_H
+#define SIISELLOWERING_H
+
+#include "AMDGPUISelLowering.h"
+#include "SIInstrInfo.h"
+
+namespace llvm {
+
+class SITargetLowering : public AMDGPUTargetLowering {
+  const SIInstrInfo * TII;
+
+  /// Memory reads and writes are synchronized using the S_WAITCNT instruction.
+  /// This function takes the most conservative approach and inserts an
+  /// S_WAITCNT instruction after every read and write.
+  void AppendS_WAITCNT(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I) const;
+  void LowerMOV_IMM(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, unsigned Opcode) const;
+  void LowerSI_INTERP(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+  void LowerSI_INTERP_CONST(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo &MRI) const;
+  void LowerSI_KIL(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+  void LowerSI_WQM(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+  void LowerSI_V_CNDLT(MachineInstr *MI, MachineBasicBlock &BB,
+              MachineBasicBlock::iterator I, MachineRegisterInfo & MRI) const;
+
+  SDValue Loweri1ContextSwitch(SDValue Op, SelectionDAG &DAG,
+                               unsigned VCCNode) const;
+  SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
+
+public:
+  SITargetLowering(TargetMachine &tm);
+  virtual MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr * MI,
+                                              MachineBasicBlock * BB) const;
+  virtual EVT getSetCCResultType(EVT VT) const;
+  virtual SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const;
+  virtual SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  virtual const char* getTargetNodeName(unsigned Opcode) const;
+};
+
+} // End namespace llvm
+
+#endif //SIISELLOWERING_H
diff --git a/lib/Target/R600/SIInstrFormats.td b/lib/Target/R600/SIInstrFormats.td
new file mode 100644
index 0000000000..aea3b5a888
--- /dev/null
+++ b/lib/Target/R600/SIInstrFormats.td
@@ -0,0 +1,146 @@
+//===-- SIInstrFormats.td - SI Instruction Formats ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Instruction format definitions.
+//
+// Instructions with _32 take 32-bit operands.
+// Instructions with _64 take 64-bit operands.
+//
+// VOP_* instructions can use either a 32-bit or 64-bit encoding. The 32-bit
+// encoding is the standard encoding, but instructions that make use of
+// any of the instruction modifiers must use the 64-bit encoding.
+//
+// Instructions with _e32 use the 32-bit encoding.
+// Instructions with _e64 use the 64-bit encoding.
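+//
+// Concretely, a multiclass such as VOP2_32 below expands a single opcode into
+// two defs: <name>_e32 with the compact VOP2 encoding, and <name>_e64 with
+// the VOP3 encoding, which carries the ABS/CLAMP/OMOD/NEG modifier fields.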
+// +//===----------------------------------------------------------------------===// + +class VOP3b_2IN <bits<9> op, string opName, RegisterClass dstClass, + RegisterClass src0Class, RegisterClass src1Class, + list<dag> pattern> + : VOP3b <op, (outs dstClass:$vdst), + (ins src0Class:$src0, src1Class:$src1, InstFlag:$src2, InstFlag:$sdst, + InstFlag:$omod, InstFlag:$neg), + opName, pattern +>; + + +class VOP3_1_32 <bits<9> op, string opName, list<dag> pattern> + : VOP3b_2IN <op, opName, SReg_1, AllReg_32, VReg_32, pattern>; + +class VOP3_32 <bits<9> op, string opName, list<dag> pattern> + : VOP3 <op, (outs VReg_32:$dst), (ins AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>; + +class VOP3_64 <bits<9> op, string opName, list<dag> pattern> + : VOP3 <op, (outs VReg_64:$dst), (ins AllReg_64:$src0, VReg_64:$src1, VReg_64:$src2, i32imm:$src3, i32imm:$src4, i32imm:$src5, i32imm:$src6), opName, pattern>; + + +class SOP1_32 <bits<8> op, string opName, list<dag> pattern> + : SOP1 <op, (outs SReg_32:$dst), (ins SReg_32:$src0), opName, pattern>; + +class SOP1_64 <bits<8> op, string opName, list<dag> pattern> + : SOP1 <op, (outs SReg_64:$dst), (ins SReg_64:$src0), opName, pattern>; + +class SOP2_32 <bits<7> op, string opName, list<dag> pattern> + : SOP2 <op, (outs SReg_32:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>; + +class SOP2_64 <bits<7> op, string opName, list<dag> pattern> + : SOP2 <op, (outs SReg_64:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + +class SOP2_VCC <bits<7> op, string opName, list<dag> pattern> + : SOP2 <op, (outs SReg_1:$vcc), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + +class VOP1_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOP1 < + op, (outs vrc:$dst), (ins arc:$src0), opName, pattern + >; + +multiclass VOP1_32 <bits<8> op, string opName, list<dag> pattern> { + def _e32: VOP1_Helper <op, VReg_32, AllReg_32, opName, pattern>; + def _e64 : VOP3_32 <{1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +multiclass VOP1_64 <bits<8> op, string opName, list<dag> pattern> { + + def _e32 : VOP1_Helper <op, VReg_64, AllReg_64, opName, pattern>; + + def _e64 : VOP3_64 < + {1, 1, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +class VOP2_Helper <bits<6> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOP2 < + op, (outs vrc:$dst), (ins arc:$src0, vrc:$src1), opName, pattern + >; + +multiclass VOP2_32 <bits<6> op, string opName, list<dag> pattern> { + + def _e32 : VOP2_Helper <op, VReg_32, AllReg_32, opName, pattern>; + + def _e64 : VOP3_32 <{1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +multiclass VOP2_64 <bits<6> op, string opName, list<dag> pattern> { + def _e32: VOP2_Helper <op, VReg_64, AllReg_64, opName, pattern>; + + def _e64 : VOP3_64 < + {1, 0, 0, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +class SOPK_32 <bits<5> op, string opName, list<dag> pattern> + : SOPK <op, (outs SReg_32:$dst), (ins i16imm:$src0), opName, pattern>; + +class SOPK_64 <bits<5> op, string opName, list<dag> pattern> + : SOPK <op, (outs SReg_64:$dst), (ins i16imm:$src0), opName, pattern>; + +class VOPC_Helper <bits<8> op, RegisterClass vrc, RegisterClass arc, + string opName, list<dag> pattern> : + VOPC < + op, (ins arc:$src0, vrc:$src1), opName, pattern + >; + +multiclass VOPC_32 <bits<9> op, string opName, 
list<dag> pattern> { + + def _e32 : VOPC_Helper < + {op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + VReg_32, AllReg_32, opName, pattern + >; + + def _e64 : VOP3_1_32 < + op, + opName, pattern + >; +} + +multiclass VOPC_64 <bits<8> op, string opName, list<dag> pattern> { + + def _e32 : VOPC_Helper <op, VReg_64, AllReg_64, opName, pattern>; + + def _e64 : VOP3_64 < + {0, op{7}, op{6}, op{5}, op{4}, op{3}, op{2}, op{1}, op{0}}, + opName, [] + >; +} + +class SOPC_32 <bits<7> op, string opName, list<dag> pattern> + : SOPC <op, (outs SCCReg:$dst), (ins SReg_32:$src0, SReg_32:$src1), opName, pattern>; + +class SOPC_64 <bits<7> op, string opName, list<dag> pattern> + : SOPC <op, (outs SCCReg:$dst), (ins SReg_64:$src0, SReg_64:$src1), opName, pattern>; + diff --git a/lib/Target/R600/SIInstrInfo.cpp b/lib/Target/R600/SIInstrInfo.cpp new file mode 100644 index 0000000000..4aa4a45fb5 --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.cpp @@ -0,0 +1,89 @@ +//===-- SIInstrInfo.cpp - SI Instruction Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI Implementation of TargetInstrInfo. +// +//===----------------------------------------------------------------------===// + + +#include "SIInstrInfo.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" + +#include <stdio.h> + +using namespace llvm; + +SIInstrInfo::SIInstrInfo(AMDGPUTargetMachine &tm) + : AMDGPUInstrInfo(tm), + RI(tm, *this) + { } + +const SIRegisterInfo &SIInstrInfo::getRegisterInfo() const { + return RI; +} + +void +SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, DebugLoc DL, + unsigned DestReg, unsigned SrcReg, + bool KillSrc) const { + // If we are trying to copy to or from SCC, there is a bug somewhere else in + // the backend. While it may be theoretically possible to do this, it should + // never be necessary. 
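+  //
+  // For orientation, the cases below reduce to three copies:
+  //   SGPR64 <- SGPR64         : S_MOV_B64
+  //   VGPR32 <- VGPR32/SGPR32  : V_MOV_B32_e32
+  //   SGPR32 <- SGPR32         : S_MOV_B32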
+  assert(DestReg != AMDGPU::SCC && SrcReg != AMDGPU::SCC);
+
+  if (AMDGPU::SReg_64RegClass.contains(DestReg)) {
+    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+  } else if (AMDGPU::VReg_32RegClass.contains(DestReg)) {
+    assert(AMDGPU::VReg_32RegClass.contains(SrcReg) ||
+           AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+  } else {
+    assert(AMDGPU::SReg_32RegClass.contains(DestReg));
+    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
+            .addReg(SrcReg, getKillRegState(KillSrc));
+  }
+}
+
+MachineInstr * SIInstrInfo::getMovImmInstr(MachineFunction *MF, unsigned DstReg,
+                                           int64_t Imm) const {
+  MachineInstr * MI = MF->CreateMachineInstr(get(AMDGPU::V_MOV_IMM_I32), DebugLoc());
+  MachineInstrBuilder(MI).addReg(DstReg, RegState::Define);
+  MachineInstrBuilder(MI).addImm(Imm);
+
+  return MI;
+
+}
+
+bool SIInstrInfo::isMov(unsigned Opcode) const {
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::S_MOV_B32:
+  case AMDGPU::S_MOV_B64:
+  case AMDGPU::V_MOV_B32_e32:
+  case AMDGPU::V_MOV_B32_e64:
+  case AMDGPU::V_MOV_IMM_F32:
+  case AMDGPU::V_MOV_IMM_I32:
+  case AMDGPU::S_MOV_IMM_I32:
+    return true;
+  }
+}
+
+bool
+SIInstrInfo::isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const {
+  return RC != &AMDGPU::EXECRegRegClass;
+}
diff --git a/lib/Target/R600/SIInstrInfo.h b/lib/Target/R600/SIInstrInfo.h
new file mode 100644
index 0000000000..631f6c00cc
--- /dev/null
+++ b/lib/Target/R600/SIInstrInfo.h
@@ -0,0 +1,62 @@
+//===-- SIInstrInfo.h - SI Instruction Info Interface ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIInstrInfo.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIINSTRINFO_H
+#define SIINSTRINFO_H
+
+#include "AMDGPUInstrInfo.h"
+#include "SIRegisterInfo.h"
+
+namespace llvm {
+
+class SIInstrInfo : public AMDGPUInstrInfo {
+private:
+  const SIRegisterInfo RI;
+
+public:
+  explicit SIInstrInfo(AMDGPUTargetMachine &tm);
+
+  const SIRegisterInfo &getRegisterInfo() const;
+
+  virtual void copyPhysReg(MachineBasicBlock &MBB,
+                           MachineBasicBlock::iterator MI, DebugLoc DL,
+                           unsigned DestReg, unsigned SrcReg,
+                           bool KillSrc) const;
+
+  /// \returns the encoding type of this instruction.
+  unsigned getEncodingType(const MachineInstr &MI) const;
+
+  /// \returns the size of this instruction's encoding in number of bytes.
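+  /// (All SI formats in this target are either a 4-byte Enc32 layout, e.g.
+  /// SOP1/SOP2/VOP1/VOP2/VOPC/SMRD, or an 8-byte Enc64 layout, e.g.
+  /// VOP3/MUBUF/MTBUF/MIMG/EXP.)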
+ unsigned getEncodingBytes(const MachineInstr &MI) const; + + virtual MachineInstr * getMovImmInstr(MachineFunction *MF, unsigned DstReg, + int64_t Imm) const; + + virtual unsigned getIEQOpcode() const { assert(!"Implement"); return 0;} + virtual bool isMov(unsigned Opcode) const; + + virtual bool isSafeToMoveRegClassDefs(const TargetRegisterClass *RC) const; + }; + +} // End namespace llvm + +namespace SIInstrFlags { + enum Flags { + // First 4 bits are the instruction encoding + NEED_WAIT = 1 << 4 + }; +} + +#endif //SIINSTRINFO_H diff --git a/lib/Target/R600/SIInstrInfo.td b/lib/Target/R600/SIInstrInfo.td new file mode 100644 index 0000000000..873a451e99 --- /dev/null +++ b/lib/Target/R600/SIInstrInfo.td @@ -0,0 +1,589 @@ +//===-- SIInstrInfo.td - SI Instruction Encodings ---------*- tablegen -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// SI DAG Profiles +//===----------------------------------------------------------------------===// +def SDTVCCBinaryOp : SDTypeProfile<1, 2, [ + SDTCisInt<0>, SDTCisInt<1>, SDTCisSameAs<1, 2> +]>; + +//===----------------------------------------------------------------------===// +// SI DAG Nodes +//===----------------------------------------------------------------------===// + +// and operation on 64-bit wide vcc +def SIsreg1_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, + [SDNPCommutative, SDNPAssociative] +>; + +// Special bitcast node for sharing VCC register between VALU and SALU +def SIsreg1_bitcast : SDNode<"SIISD::VCC_BITCAST", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> +>; + +// and operation on 64-bit wide vcc +def SIvcc_and : SDNode<"SIISD::VCC_AND", SDTVCCBinaryOp, + [SDNPCommutative, SDNPAssociative] +>; + +// Special bitcast node for sharing VCC register between VALU and SALU +def SIvcc_bitcast : SDNode<"SIISD::VCC_BITCAST", + SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisInt<1>]> +>; + +class InstSI <dag outs, dag ins, string asm, list<dag> pattern> : + AMDGPUInst<outs, ins, asm, pattern> { + + field bits<4> EncodingType = 0; + field bits<1> NeedWait = 0; + + let TSFlags{3-0} = EncodingType; + let TSFlags{4} = NeedWait; + +} + +class Enc32 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + field bits<32> Inst; +} + +class Enc64 <dag outs, dag ins, string asm, list<dag> pattern> : + InstSI <outs, ins, asm, pattern> { + + field bits<64> Inst; +} + +class SIOperand <ValueType vt, dag opInfo>: Operand <vt> { + let EncoderMethod = "encodeOperand"; + let MIOperandInfo = opInfo; +} + +def IMM16bit : ImmLeaf < + i16, + [{return isInt<16>(Imm);}] +>; + +def IMM8bit : ImmLeaf < + i32, + [{return (int32_t)Imm >= 0 && (int32_t)Imm <= 0xff;}] +>; + +def IMM12bit : ImmLeaf < + i16, + [{return (int16_t)Imm >= 0 && (int16_t)Imm <= 0xfff;}] +>; + +def IMM32bitIn64bit : ImmLeaf < + i64, + [{return isInt<32>(Imm);}] +>; + +class GPR4Align <RegisterClass rc> : Operand <vAny> { + let EncoderMethod = "GPR4AlignEncode"; + let MIOperandInfo = (ops rc:$reg); +} + +class GPR2Align <RegisterClass rc, ValueType vt> : Operand <vt> { + let EncoderMethod = "GPR2AlignEncode"; + let MIOperandInfo = (ops rc:$reg); +} + +def SMRDmemrr : Operand<iPTR> { + let MIOperandInfo = (ops SReg_64, SReg_32); + let EncoderMethod = 
"GPR2AlignEncode"; +} + +def SMRDmemri : Operand<iPTR> { + let MIOperandInfo = (ops SReg_64, i32imm); + let EncoderMethod = "SMRDmemriEncode"; +} + +def ADDR_Reg : ComplexPattern<i64, 2, "SelectADDRReg", [], []>; +def ADDR_Offset8 : ComplexPattern<i64, 2, "SelectADDR8BitOffset", [], []>; + +let Uses = [EXEC] in { + +def EXP : Enc64< + (outs), + (ins i32imm:$en, i32imm:$tgt, i32imm:$compr, i32imm:$done, i32imm:$vm, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + "EXP $en, $tgt, $compr, $done, $vm, $src0, $src1, $src2, $src3", + [] > { + + bits<4> EN; + bits<6> TGT; + bits<1> COMPR; + bits<1> DONE; + bits<1> VM; + bits<8> VSRC0; + bits<8> VSRC1; + bits<8> VSRC2; + bits<8> VSRC3; + + let Inst{3-0} = EN; + let Inst{9-4} = TGT; + let Inst{10} = COMPR; + let Inst{11} = DONE; + let Inst{12} = VM; + let Inst{31-26} = 0x3e; + let Inst{39-32} = VSRC0; + let Inst{47-40} = VSRC1; + let Inst{55-48} = VSRC2; + let Inst{63-56} = VSRC3; + let EncodingType = 0; //SIInstrEncodingType::EXP + + let NeedWait = 1; + let usesCustomInserter = 1; +} + +class MIMG <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + bits<8> VDATA; + bits<4> DMASK; + bits<1> UNORM; + bits<1> GLC; + bits<1> DA; + bits<1> R128; + bits<1> TFE; + bits<1> LWE; + bits<1> SLC; + bits<8> VADDR; + bits<5> SRSRC; + bits<5> SSAMP; + + let Inst{11-8} = DMASK; + let Inst{12} = UNORM; + let Inst{13} = GLC; + let Inst{14} = DA; + let Inst{15} = R128; + let Inst{16} = TFE; + let Inst{17} = LWE; + let Inst{24-18} = op; + let Inst{25} = SLC; + let Inst{31-26} = 0x3c; + let Inst{39-32} = VADDR; + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{57-53} = SSAMP; + + let EncodingType = 2; //SIInstrEncodingType::MIMG + + let NeedWait = 1; + let usesCustomInserter = 1; +} + +class MTBUF <bits<3> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64<outs, ins, asm, pattern> { + + bits<8> VDATA; + bits<12> OFFSET; + bits<1> OFFEN; + bits<1> IDXEN; + bits<1> GLC; + bits<1> ADDR64; + bits<4> DFMT; + bits<3> NFMT; + bits<8> VADDR; + bits<5> SRSRC; + bits<1> SLC; + bits<1> TFE; + bits<8> SOFFSET; + + let Inst{11-0} = OFFSET; + let Inst{12} = OFFEN; + let Inst{13} = IDXEN; + let Inst{14} = GLC; + let Inst{15} = ADDR64; + let Inst{18-16} = op; + let Inst{22-19} = DFMT; + let Inst{25-23} = NFMT; + let Inst{31-26} = 0x3a; //encoding + let Inst{39-32} = VADDR; + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; + let EncodingType = 3; //SIInstrEncodingType::MTBUF + + let NeedWait = 1; + let usesCustomInserter = 1; + let neverHasSideEffects = 1; +} + +class MUBUF <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64<outs, ins, asm, pattern> { + + bits<8> VDATA; + bits<12> OFFSET; + bits<1> OFFEN; + bits<1> IDXEN; + bits<1> GLC; + bits<1> ADDR64; + bits<1> LDS; + bits<8> VADDR; + bits<5> SRSRC; + bits<1> SLC; + bits<1> TFE; + bits<8> SOFFSET; + + let Inst{11-0} = OFFSET; + let Inst{12} = OFFEN; + let Inst{13} = IDXEN; + let Inst{14} = GLC; + let Inst{15} = ADDR64; + let Inst{16} = LDS; + let Inst{24-18} = op; + let Inst{31-26} = 0x38; //encoding + let Inst{39-32} = VADDR; + let Inst{47-40} = VDATA; + let Inst{52-48} = SRSRC; + let Inst{54} = SLC; + let Inst{55} = TFE; + let Inst{63-56} = SOFFSET; + let EncodingType = 4; //SIInstrEncodingType::MUBUF + + let NeedWait = 1; + let usesCustomInserter = 1; + let neverHasSideEffects = 1; +} + +} // End Uses = [EXEC] + +class SMRD <bits<5> op, 
dag outs, dag ins, string asm, list<dag> pattern> : + Enc32<outs, ins, asm, pattern> { + + bits<7> SDST; + bits<15> PTR; + bits<8> OFFSET = PTR{7-0}; + bits<1> IMM = PTR{8}; + bits<6> SBASE = PTR{14-9}; + + let Inst{7-0} = OFFSET; + let Inst{8} = IMM; + let Inst{14-9} = SBASE; + let Inst{21-15} = SDST; + let Inst{26-22} = op; + let Inst{31-27} = 0x18; //encoding + let EncodingType = 5; //SIInstrEncodingType::SMRD + + let NeedWait = 1; + let usesCustomInserter = 1; +} + +class SOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32<outs, ins, asm, pattern> { + + bits<7> SDST; + bits<8> SSRC0; + + let Inst{7-0} = SSRC0; + let Inst{15-8} = op; + let Inst{22-16} = SDST; + let Inst{31-23} = 0x17d; //encoding; + let EncodingType = 6; //SIInstrEncodingType::SOP1 + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class SOP2 <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<7> SDST; + bits<8> SSRC0; + bits<8> SSRC1; + + let Inst{7-0} = SSRC0; + let Inst{15-8} = SSRC1; + let Inst{22-16} = SDST; + let Inst{29-23} = op; + let Inst{31-30} = 0x2; // encoding + let EncodingType = 7; // SIInstrEncodingType::SOP2 + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class SOPC <bits<7> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32<outs, ins, asm, pattern> { + + bits<8> SSRC0; + bits<8> SSRC1; + + let Inst{7-0} = SSRC0; + let Inst{15-8} = SSRC1; + let Inst{22-16} = op; + let Inst{31-23} = 0x17e; + let EncodingType = 8; // SIInstrEncodingType::SOPC + + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class SOPK <bits<5> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins , asm, pattern> { + + bits <7> SDST; + bits <16> SIMM16; + + let Inst{15-0} = SIMM16; + let Inst{22-16} = SDST; + let Inst{27-23} = op; + let Inst{31-28} = 0xb; //encoding + let EncodingType = 9; // SIInstrEncodingType::SOPK + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class SOPP <bits<7> op, dag ins, string asm, list<dag> pattern> : Enc32 < + (outs), + ins, + asm, + pattern > { + + bits <16> SIMM16; + + let Inst{15-0} = SIMM16; + let Inst{22-16} = op; + let Inst{31-23} = 0x17f; // encoding + let EncodingType = 10; // SIInstrEncodingType::SOPP + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +let Uses = [EXEC] in { + +class VINTRP <bits <2> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<8> VSRC; + bits<2> ATTRCHAN; + bits<6> ATTR; + + let Inst{7-0} = VSRC; + let Inst{9-8} = ATTRCHAN; + let Inst{15-10} = ATTR; + let Inst{17-16} = op; + let Inst{25-18} = VDST; + let Inst{31-26} = 0x32; // encoding + let EncodingType = 11; // SIInstrEncodingType::VINTRP + + let neverHasSideEffects = 1; + let mayLoad = 1; + let mayStore = 0; +} + +class VOP1 <bits<8> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + + let Inst{8-0} = SRC0; + let Inst{16-9} = op; + let Inst{24-17} = VDST; + let Inst{31-25} = 0x3f; //encoding + + let EncodingType = 12; // SIInstrEncodingType::VOP1 + let PostEncoderMethod = "VOPPostEncode"; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class VOP2 <bits<6> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc32 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + bits<8> VSRC1; + 
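+  // Note the asymmetry (for orientation): SRC0 is 9 bits wide so it can also
+  // name an SGPR or inline constant (AllReg_32 in the helpers above), while
+  // VSRC1 is 8 bits and must be a VGPR.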
+ let Inst{8-0} = SRC0; + let Inst{16-9} = VSRC1; + let Inst{24-17} = VDST; + let Inst{30-25} = op; + let Inst{31} = 0x0; //encoding + + let EncodingType = 13; // SIInstrEncodingType::VOP2 + let PostEncoderMethod = "VOPPostEncode"; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class VOP3 <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + bits<9> SRC1; + bits<9> SRC2; + bits<3> ABS; + bits<1> CLAMP; + bits<2> OMOD; + bits<3> NEG; + + let Inst{7-0} = VDST; + let Inst{10-8} = ABS; + let Inst{11} = CLAMP; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = SRC0; + let Inst{49-41} = SRC1; + let Inst{58-50} = SRC2; + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + + let EncodingType = 14; // SIInstrEncodingType::VOP3 + let PostEncoderMethod = "VOPPostEncode"; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class VOP3b <bits<9> op, dag outs, dag ins, string asm, list<dag> pattern> : + Enc64 <outs, ins, asm, pattern> { + + bits<8> VDST; + bits<9> SRC0; + bits<9> SRC1; + bits<9> SRC2; + bits<7> SDST; + bits<2> OMOD; + bits<3> NEG; + + let Inst{7-0} = VDST; + let Inst{14-8} = SDST; + let Inst{25-17} = op; + let Inst{31-26} = 0x34; //encoding + let Inst{40-32} = SRC0; + let Inst{49-41} = SRC1; + let Inst{58-50} = SRC2; + let Inst{60-59} = OMOD; + let Inst{63-61} = NEG; + + let EncodingType = 14; // SIInstrEncodingType::VOP3 + let PostEncoderMethod = "VOPPostEncode"; + + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +class VOPC <bits<8> op, dag ins, string asm, list<dag> pattern> : + Enc32 <(outs VCCReg:$dst), ins, asm, pattern> { + + bits<9> SRC0; + bits<8> VSRC1; + + let Inst{8-0} = SRC0; + let Inst{16-9} = VSRC1; + let Inst{24-17} = op; + let Inst{31-25} = 0x3e; + + let EncodingType = 15; //SIInstrEncodingType::VOPC + let PostEncoderMethod = "VOPPostEncode"; + let DisableEncoding = "$dst"; + let mayLoad = 0; + let mayStore = 0; + let hasSideEffects = 0; +} + +} // End Uses = [EXEC] + +class MIMG_Load_Helper <bits<7> op, string asm> : MIMG < + op, + (outs VReg_128:$vdata), + (ins i32imm:$dmask, i1imm:$unorm, i1imm:$glc, i1imm:$da, i1imm:$r128, + i1imm:$tfe, i1imm:$lwe, i1imm:$slc, VReg_128:$vaddr, + GPR4Align<SReg_256>:$srsrc, GPR4Align<SReg_128>:$ssamp), + asm, + []> { + let mayLoad = 1; + let mayStore = 0; +} + +class MUBUF_Load_Helper <bits<7> op, string asm, RegisterClass regClass> : MUBUF < + op, + (outs regClass:$dst), + (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i1imm:$lds, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, i1imm:$slc, + i1imm:$tfe, SReg_32:$soffset), + asm, + []> { + let mayLoad = 1; + let mayStore = 0; +} + +class MTBUF_Load_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < + op, + (outs regClass:$dst), + (ins i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, i1imm:$addr64, + i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, GPR4Align<SReg_128>:$srsrc, + i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), + asm, + []> { + let mayLoad = 1; + let mayStore = 0; +} + +class MTBUF_Store_Helper <bits<3> op, string asm, RegisterClass regClass> : MTBUF < + op, + (outs), + (ins regClass:$vdata, i16imm:$offset, i1imm:$offen, i1imm:$idxen, i1imm:$glc, + i1imm:$addr64, i8imm:$dfmt, i8imm:$nfmt, VReg_32:$vaddr, + GPR4Align<SReg_128>:$srsrc, i1imm:$slc, i1imm:$tfe, SReg_32:$soffset), + asm, + []> { + let mayStore = 1; + let mayLoad = 0; +} + +multiclass SMRD_Helper 
<bits<5> op, string asm, RegisterClass dstClass,
+                        ValueType vt> {
+  def _IMM : SMRD <
+              op,
+              (outs dstClass:$dst),
+              (ins SMRDmemri:$src0),
+              asm,
+              [(set (vt dstClass:$dst), (constant_load ADDR_Offset8:$src0))]
+  >;
+
+  def _SGPR : SMRD <
+              op,
+              (outs dstClass:$dst),
+              (ins SMRDmemrr:$src0),
+              asm,
+              [(set (vt dstClass:$dst), (constant_load ADDR_Reg:$src0))]
+  >;
+}
+
+multiclass SMRD_32 <bits<5> op, string asm, RegisterClass dstClass> {
+  defm _F32 : SMRD_Helper <op, asm, dstClass, f32>;
+  defm _I32 : SMRD_Helper <op, asm, dstClass, i32>;
+}
+
+include "SIInstrFormats.td"
+include "SIInstructions.td"
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
new file mode 100644
index 0000000000..008652f55e
--- /dev/null
+++ b/lib/Target/R600/SIInstructions.td
@@ -0,0 +1,1306 @@
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This file was originally auto-generated from a GPU register header file and
+// all the instruction definitions were originally commented out. Instructions
+// that are not yet supported remain commented out.
+//===----------------------------------------------------------------------===//
+
+def isSI : Predicate<"Subtarget.device()"
+                     "->getGeneration() == AMDGPUDeviceInfo::HD7XXX">;
+
+let Predicates = [isSI] in {
+
+let neverHasSideEffects = 1 in {
+def S_MOV_B32 : SOP1_32 <0x00000003, "S_MOV_B32", []>;
+def S_MOV_B64 : SOP1_64 <0x00000004, "S_MOV_B64", []>;
+def S_CMOV_B32 : SOP1_32 <0x00000005, "S_CMOV_B32", []>;
+def S_CMOV_B64 : SOP1_64 <0x00000006, "S_CMOV_B64", []>;
+def S_NOT_B32 : SOP1_32 <0x00000007, "S_NOT_B32", []>;
+def S_NOT_B64 : SOP1_64 <0x00000008, "S_NOT_B64", []>;
+def S_WQM_B32 : SOP1_32 <0x00000009, "S_WQM_B32", []>;
+def S_WQM_B64 : SOP1_64 <0x0000000a, "S_WQM_B64", []>;
+def S_BREV_B32 : SOP1_32 <0x0000000b, "S_BREV_B32", []>;
+def S_BREV_B64 : SOP1_64 <0x0000000c, "S_BREV_B64", []>;
+} // End neverHasSideEffects = 1
+////def S_BCNT0_I32_B32 : SOP1_BCNT0 <0x0000000d, "S_BCNT0_I32_B32", []>;
+////def S_BCNT0_I32_B64 : SOP1_BCNT0 <0x0000000e, "S_BCNT0_I32_B64", []>;
+////def S_BCNT1_I32_B32 : SOP1_BCNT1 <0x0000000f, "S_BCNT1_I32_B32", []>;
+////def S_BCNT1_I32_B64 : SOP1_BCNT1 <0x00000010, "S_BCNT1_I32_B64", []>;
+////def S_FF0_I32_B32 : SOP1_FF0 <0x00000011, "S_FF0_I32_B32", []>;
+////def S_FF0_I32_B64 : SOP1_FF0 <0x00000012, "S_FF0_I32_B64", []>;
+////def S_FF1_I32_B32 : SOP1_FF1 <0x00000013, "S_FF1_I32_B32", []>;
+////def S_FF1_I32_B64 : SOP1_FF1 <0x00000014, "S_FF1_I32_B64", []>;
+//def S_FLBIT_I32_B32 : SOP1_32 <0x00000015, "S_FLBIT_I32_B32", []>;
+//def S_FLBIT_I32_B64 : SOP1_32 <0x00000016, "S_FLBIT_I32_B64", []>;
+def S_FLBIT_I32 : SOP1_32 <0x00000017, "S_FLBIT_I32", []>;
+//def S_FLBIT_I32_I64 : SOP1_32 <0x00000018, "S_FLBIT_I32_I64", []>;
+//def S_SEXT_I32_I8 : SOP1_32 <0x00000019, "S_SEXT_I32_I8", []>;
+//def S_SEXT_I32_I16 : SOP1_32 <0x0000001a, "S_SEXT_I32_I16", []>;
+////def S_BITSET0_B32 : SOP1_BITSET0 <0x0000001b, "S_BITSET0_B32", []>;
+////def S_BITSET0_B64 : SOP1_BITSET0 <0x0000001c, "S_BITSET0_B64", []>;
+////def S_BITSET1_B32 : SOP1_BITSET1 <0x0000001d, "S_BITSET1_B32", []>;
+////def S_BITSET1_B64 : SOP1_BITSET1 <0x0000001e, "S_BITSET1_B64", []>;
+def S_GETPC_B64 : SOP1_64 <0x0000001f, "S_GETPC_B64", []>;
+def
S_SETPC_B64 : SOP1_64 <0x00000020, "S_SETPC_B64", []>; +def S_SWAPPC_B64 : SOP1_64 <0x00000021, "S_SWAPPC_B64", []>; +def S_RFE_B64 : SOP1_64 <0x00000022, "S_RFE_B64", []>; + +let hasSideEffects = 1, Uses = [EXEC], Defs = [EXEC] in { + +def S_AND_SAVEEXEC_B64 : SOP1_64 <0x00000024, "S_AND_SAVEEXEC_B64", []>; +def S_OR_SAVEEXEC_B64 : SOP1_64 <0x00000025, "S_OR_SAVEEXEC_B64", []>; +def S_XOR_SAVEEXEC_B64 : SOP1_64 <0x00000026, "S_XOR_SAVEEXEC_B64", []>; +def S_ANDN2_SAVEEXEC_B64 : SOP1_64 <0x00000027, "S_ANDN2_SAVEEXEC_B64", []>; +def S_ORN2_SAVEEXEC_B64 : SOP1_64 <0x00000028, "S_ORN2_SAVEEXEC_B64", []>; +def S_NAND_SAVEEXEC_B64 : SOP1_64 <0x00000029, "S_NAND_SAVEEXEC_B64", []>; +def S_NOR_SAVEEXEC_B64 : SOP1_64 <0x0000002a, "S_NOR_SAVEEXEC_B64", []>; +def S_XNOR_SAVEEXEC_B64 : SOP1_64 <0x0000002b, "S_XNOR_SAVEEXEC_B64", []>; + +} // End hasSideEffects = 1 + +def S_QUADMASK_B32 : SOP1_32 <0x0000002c, "S_QUADMASK_B32", []>; +def S_QUADMASK_B64 : SOP1_64 <0x0000002d, "S_QUADMASK_B64", []>; +def S_MOVRELS_B32 : SOP1_32 <0x0000002e, "S_MOVRELS_B32", []>; +def S_MOVRELS_B64 : SOP1_64 <0x0000002f, "S_MOVRELS_B64", []>; +def S_MOVRELD_B32 : SOP1_32 <0x00000030, "S_MOVRELD_B32", []>; +def S_MOVRELD_B64 : SOP1_64 <0x00000031, "S_MOVRELD_B64", []>; +//def S_CBRANCH_JOIN : SOP1_ <0x00000032, "S_CBRANCH_JOIN", []>; +def S_MOV_REGRD_B32 : SOP1_32 <0x00000033, "S_MOV_REGRD_B32", []>; +def S_ABS_I32 : SOP1_32 <0x00000034, "S_ABS_I32", []>; +def S_MOV_FED_B32 : SOP1_32 <0x00000035, "S_MOV_FED_B32", []>; +def S_MOVK_I32 : SOPK_32 <0x00000000, "S_MOVK_I32", []>; +def S_CMOVK_I32 : SOPK_32 <0x00000002, "S_CMOVK_I32", []>; + +/* +This instruction is disabled for now until we can figure out how to teach +the instruction selector to correctly use the S_CMP* vs V_CMP* +instructions. 
+ +When this instruction is enabled the code generator sometimes produces this +invalid sequence: + +SCC = S_CMPK_EQ_I32 SGPR0, imm +VCC = COPY SCC +VGPR0 = V_CNDMASK VCC, VGPR0, VGPR1 + +def S_CMPK_EQ_I32 : SOPK < + 0x00000003, (outs SCCReg:$dst), (ins SReg_32:$src0, i32imm:$src1), + "S_CMPK_EQ_I32", + [(set SCCReg:$dst, (setcc SReg_32:$src0, imm:$src1, SETEQ))] +>; +*/ + +def S_CMPK_LG_I32 : SOPK_32 <0x00000004, "S_CMPK_LG_I32", []>; +def S_CMPK_GT_I32 : SOPK_32 <0x00000005, "S_CMPK_GT_I32", []>; +def S_CMPK_GE_I32 : SOPK_32 <0x00000006, "S_CMPK_GE_I32", []>; +def S_CMPK_LT_I32 : SOPK_32 <0x00000007, "S_CMPK_LT_I32", []>; +def S_CMPK_LE_I32 : SOPK_32 <0x00000008, "S_CMPK_LE_I32", []>; +def S_CMPK_EQ_U32 : SOPK_32 <0x00000009, "S_CMPK_EQ_U32", []>; +def S_CMPK_LG_U32 : SOPK_32 <0x0000000a, "S_CMPK_LG_U32", []>; +def S_CMPK_GT_U32 : SOPK_32 <0x0000000b, "S_CMPK_GT_U32", []>; +def S_CMPK_GE_U32 : SOPK_32 <0x0000000c, "S_CMPK_GE_U32", []>; +def S_CMPK_LT_U32 : SOPK_32 <0x0000000d, "S_CMPK_LT_U32", []>; +def S_CMPK_LE_U32 : SOPK_32 <0x0000000e, "S_CMPK_LE_U32", []>; +def S_ADDK_I32 : SOPK_32 <0x0000000f, "S_ADDK_I32", []>; +def S_MULK_I32 : SOPK_32 <0x00000010, "S_MULK_I32", []>; +//def S_CBRANCH_I_FORK : SOPK_ <0x00000011, "S_CBRANCH_I_FORK", []>; +def S_GETREG_B32 : SOPK_32 <0x00000012, "S_GETREG_B32", []>; +def S_SETREG_B32 : SOPK_32 <0x00000013, "S_SETREG_B32", []>; +def S_GETREG_REGRD_B32 : SOPK_32 <0x00000014, "S_GETREG_REGRD_B32", []>; +//def S_SETREG_IMM32_B32 : SOPK_32 <0x00000015, "S_SETREG_IMM32_B32", []>; +//def EXP : EXP_ <0x00000000, "EXP", []>; + +defm V_CMP_F_F32 : VOPC_32 <0x00000000, "V_CMP_F_F32", []>; +defm V_CMP_LT_F32 : VOPC_32 <0x00000001, "V_CMP_LT_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), + (V_CMP_LT_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_EQ_F32 : VOPC_32 <0x00000002, "V_CMP_EQ_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), + (V_CMP_EQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LE_F32 : VOPC_32 <0x00000003, "V_CMP_LE_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), + (V_CMP_LE_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GT_F32 : VOPC_32 <0x00000004, "V_CMP_GT_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), + (V_CMP_GT_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LG_F32 : VOPC_32 <0x00000005, "V_CMP_LG_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), + (V_CMP_LG_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GE_F32 : VOPC_32 <0x00000006, "V_CMP_GE_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), + (V_CMP_GE_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_O_F32 : VOPC_32 <0x00000007, "V_CMP_O_F32", []>; +defm V_CMP_U_F32 : VOPC_32 <0x00000008, "V_CMP_U_F32", []>; +defm V_CMP_NGE_F32 : VOPC_32 <0x00000009, "V_CMP_NGE_F32", []>; +defm V_CMP_NLG_F32 : VOPC_32 <0x0000000a, "V_CMP_NLG_F32", []>; +defm V_CMP_NGT_F32 : VOPC_32 <0x0000000b, "V_CMP_NGT_F32", []>; +defm V_CMP_NLE_F32 : VOPC_32 <0x0000000c, "V_CMP_NLE_F32", []>; +defm V_CMP_NEQ_F32 : VOPC_32 <0x0000000d, "V_CMP_NEQ_F32", []>; +def : Pat < + (i1 (setcc (f32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), + (V_CMP_NEQ_F32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_NLT_F32 : VOPC_32 <0x0000000e, "V_CMP_NLT_F32", []>; +defm V_CMP_TRU_F32 : VOPC_32 <0x0000000f, "V_CMP_TRU_F32", []>; + +//Side effect is writing to EXEC 
+let hasSideEffects = 1 in { + +defm V_CMPX_F_F32 : VOPC_32 <0x00000010, "V_CMPX_F_F32", []>; +defm V_CMPX_LT_F32 : VOPC_32 <0x00000011, "V_CMPX_LT_F32", []>; +defm V_CMPX_EQ_F32 : VOPC_32 <0x00000012, "V_CMPX_EQ_F32", []>; +defm V_CMPX_LE_F32 : VOPC_32 <0x00000013, "V_CMPX_LE_F32", []>; +defm V_CMPX_GT_F32 : VOPC_32 <0x00000014, "V_CMPX_GT_F32", []>; +defm V_CMPX_LG_F32 : VOPC_32 <0x00000015, "V_CMPX_LG_F32", []>; +defm V_CMPX_GE_F32 : VOPC_32 <0x00000016, "V_CMPX_GE_F32", []>; +defm V_CMPX_O_F32 : VOPC_32 <0x00000017, "V_CMPX_O_F32", []>; +defm V_CMPX_U_F32 : VOPC_32 <0x00000018, "V_CMPX_U_F32", []>; +defm V_CMPX_NGE_F32 : VOPC_32 <0x00000019, "V_CMPX_NGE_F32", []>; +defm V_CMPX_NLG_F32 : VOPC_32 <0x0000001a, "V_CMPX_NLG_F32", []>; +defm V_CMPX_NGT_F32 : VOPC_32 <0x0000001b, "V_CMPX_NGT_F32", []>; +defm V_CMPX_NLE_F32 : VOPC_32 <0x0000001c, "V_CMPX_NLE_F32", []>; +defm V_CMPX_NEQ_F32 : VOPC_32 <0x0000001d, "V_CMPX_NEQ_F32", []>; +defm V_CMPX_NLT_F32 : VOPC_32 <0x0000001e, "V_CMPX_NLT_F32", []>; +defm V_CMPX_TRU_F32 : VOPC_32 <0x0000001f, "V_CMPX_TRU_F32", []>; + +} // End hasSideEffects = 1 + +defm V_CMP_F_F64 : VOPC_64 <0x00000020, "V_CMP_F_F64", []>; +defm V_CMP_LT_F64 : VOPC_64 <0x00000021, "V_CMP_LT_F64", []>; +defm V_CMP_EQ_F64 : VOPC_64 <0x00000022, "V_CMP_EQ_F64", []>; +defm V_CMP_LE_F64 : VOPC_64 <0x00000023, "V_CMP_LE_F64", []>; +defm V_CMP_GT_F64 : VOPC_64 <0x00000024, "V_CMP_GT_F64", []>; +defm V_CMP_LG_F64 : VOPC_64 <0x00000025, "V_CMP_LG_F64", []>; +defm V_CMP_GE_F64 : VOPC_64 <0x00000026, "V_CMP_GE_F64", []>; +defm V_CMP_O_F64 : VOPC_64 <0x00000027, "V_CMP_O_F64", []>; +defm V_CMP_U_F64 : VOPC_64 <0x00000028, "V_CMP_U_F64", []>; +defm V_CMP_NGE_F64 : VOPC_64 <0x00000029, "V_CMP_NGE_F64", []>; +defm V_CMP_NLG_F64 : VOPC_64 <0x0000002a, "V_CMP_NLG_F64", []>; +defm V_CMP_NGT_F64 : VOPC_64 <0x0000002b, "V_CMP_NGT_F64", []>; +defm V_CMP_NLE_F64 : VOPC_64 <0x0000002c, "V_CMP_NLE_F64", []>; +defm V_CMP_NEQ_F64 : VOPC_64 <0x0000002d, "V_CMP_NEQ_F64", []>; +defm V_CMP_NLT_F64 : VOPC_64 <0x0000002e, "V_CMP_NLT_F64", []>; +defm V_CMP_TRU_F64 : VOPC_64 <0x0000002f, "V_CMP_TRU_F64", []>; + +//Side effect is writing to EXEC +let hasSideEffects = 1 in { + +defm V_CMPX_F_F64 : VOPC_64 <0x00000030, "V_CMPX_F_F64", []>; +defm V_CMPX_LT_F64 : VOPC_64 <0x00000031, "V_CMPX_LT_F64", []>; +defm V_CMPX_EQ_F64 : VOPC_64 <0x00000032, "V_CMPX_EQ_F64", []>; +defm V_CMPX_LE_F64 : VOPC_64 <0x00000033, "V_CMPX_LE_F64", []>; +defm V_CMPX_GT_F64 : VOPC_64 <0x00000034, "V_CMPX_GT_F64", []>; +defm V_CMPX_LG_F64 : VOPC_64 <0x00000035, "V_CMPX_LG_F64", []>; +defm V_CMPX_GE_F64 : VOPC_64 <0x00000036, "V_CMPX_GE_F64", []>; +defm V_CMPX_O_F64 : VOPC_64 <0x00000037, "V_CMPX_O_F64", []>; +defm V_CMPX_U_F64 : VOPC_64 <0x00000038, "V_CMPX_U_F64", []>; +defm V_CMPX_NGE_F64 : VOPC_64 <0x00000039, "V_CMPX_NGE_F64", []>; +defm V_CMPX_NLG_F64 : VOPC_64 <0x0000003a, "V_CMPX_NLG_F64", []>; +defm V_CMPX_NGT_F64 : VOPC_64 <0x0000003b, "V_CMPX_NGT_F64", []>; +defm V_CMPX_NLE_F64 : VOPC_64 <0x0000003c, "V_CMPX_NLE_F64", []>; +defm V_CMPX_NEQ_F64 : VOPC_64 <0x0000003d, "V_CMPX_NEQ_F64", []>; +defm V_CMPX_NLT_F64 : VOPC_64 <0x0000003e, "V_CMPX_NLT_F64", []>; +defm V_CMPX_TRU_F64 : VOPC_64 <0x0000003f, "V_CMPX_TRU_F64", []>; + +} // End hasSideEffects = 1 + +defm V_CMPS_F_F32 : VOPC_32 <0x00000040, "V_CMPS_F_F32", []>; +defm V_CMPS_LT_F32 : VOPC_32 <0x00000041, "V_CMPS_LT_F32", []>; +defm V_CMPS_EQ_F32 : VOPC_32 <0x00000042, "V_CMPS_EQ_F32", []>; +defm V_CMPS_LE_F32 : VOPC_32 <0x00000043, "V_CMPS_LE_F32", []>; +defm V_CMPS_GT_F32 
: VOPC_32 <0x00000044, "V_CMPS_GT_F32", []>; +defm V_CMPS_LG_F32 : VOPC_32 <0x00000045, "V_CMPS_LG_F32", []>; +defm V_CMPS_GE_F32 : VOPC_32 <0x00000046, "V_CMPS_GE_F32", []>; +defm V_CMPS_O_F32 : VOPC_32 <0x00000047, "V_CMPS_O_F32", []>; +defm V_CMPS_U_F32 : VOPC_32 <0x00000048, "V_CMPS_U_F32", []>; +defm V_CMPS_NGE_F32 : VOPC_32 <0x00000049, "V_CMPS_NGE_F32", []>; +defm V_CMPS_NLG_F32 : VOPC_32 <0x0000004a, "V_CMPS_NLG_F32", []>; +defm V_CMPS_NGT_F32 : VOPC_32 <0x0000004b, "V_CMPS_NGT_F32", []>; +defm V_CMPS_NLE_F32 : VOPC_32 <0x0000004c, "V_CMPS_NLE_F32", []>; +defm V_CMPS_NEQ_F32 : VOPC_32 <0x0000004d, "V_CMPS_NEQ_F32", []>; +defm V_CMPS_NLT_F32 : VOPC_32 <0x0000004e, "V_CMPS_NLT_F32", []>; +defm V_CMPS_TRU_F32 : VOPC_32 <0x0000004f, "V_CMPS_TRU_F32", []>; +defm V_CMPSX_F_F32 : VOPC_32 <0x00000050, "V_CMPSX_F_F32", []>; +defm V_CMPSX_LT_F32 : VOPC_32 <0x00000051, "V_CMPSX_LT_F32", []>; +defm V_CMPSX_EQ_F32 : VOPC_32 <0x00000052, "V_CMPSX_EQ_F32", []>; +defm V_CMPSX_LE_F32 : VOPC_32 <0x00000053, "V_CMPSX_LE_F32", []>; +defm V_CMPSX_GT_F32 : VOPC_32 <0x00000054, "V_CMPSX_GT_F32", []>; +defm V_CMPSX_LG_F32 : VOPC_32 <0x00000055, "V_CMPSX_LG_F32", []>; +defm V_CMPSX_GE_F32 : VOPC_32 <0x00000056, "V_CMPSX_GE_F32", []>; +defm V_CMPSX_O_F32 : VOPC_32 <0x00000057, "V_CMPSX_O_F32", []>; +defm V_CMPSX_U_F32 : VOPC_32 <0x00000058, "V_CMPSX_U_F32", []>; +defm V_CMPSX_NGE_F32 : VOPC_32 <0x00000059, "V_CMPSX_NGE_F32", []>; +defm V_CMPSX_NLG_F32 : VOPC_32 <0x0000005a, "V_CMPSX_NLG_F32", []>; +defm V_CMPSX_NGT_F32 : VOPC_32 <0x0000005b, "V_CMPSX_NGT_F32", []>; +defm V_CMPSX_NLE_F32 : VOPC_32 <0x0000005c, "V_CMPSX_NLE_F32", []>; +defm V_CMPSX_NEQ_F32 : VOPC_32 <0x0000005d, "V_CMPSX_NEQ_F32", []>; +defm V_CMPSX_NLT_F32 : VOPC_32 <0x0000005e, "V_CMPSX_NLT_F32", []>; +defm V_CMPSX_TRU_F32 : VOPC_32 <0x0000005f, "V_CMPSX_TRU_F32", []>; +defm V_CMPS_F_F64 : VOPC_64 <0x00000060, "V_CMPS_F_F64", []>; +defm V_CMPS_LT_F64 : VOPC_64 <0x00000061, "V_CMPS_LT_F64", []>; +defm V_CMPS_EQ_F64 : VOPC_64 <0x00000062, "V_CMPS_EQ_F64", []>; +defm V_CMPS_LE_F64 : VOPC_64 <0x00000063, "V_CMPS_LE_F64", []>; +defm V_CMPS_GT_F64 : VOPC_64 <0x00000064, "V_CMPS_GT_F64", []>; +defm V_CMPS_LG_F64 : VOPC_64 <0x00000065, "V_CMPS_LG_F64", []>; +defm V_CMPS_GE_F64 : VOPC_64 <0x00000066, "V_CMPS_GE_F64", []>; +defm V_CMPS_O_F64 : VOPC_64 <0x00000067, "V_CMPS_O_F64", []>; +defm V_CMPS_U_F64 : VOPC_64 <0x00000068, "V_CMPS_U_F64", []>; +defm V_CMPS_NGE_F64 : VOPC_64 <0x00000069, "V_CMPS_NGE_F64", []>; +defm V_CMPS_NLG_F64 : VOPC_64 <0x0000006a, "V_CMPS_NLG_F64", []>; +defm V_CMPS_NGT_F64 : VOPC_64 <0x0000006b, "V_CMPS_NGT_F64", []>; +defm V_CMPS_NLE_F64 : VOPC_64 <0x0000006c, "V_CMPS_NLE_F64", []>; +defm V_CMPS_NEQ_F64 : VOPC_64 <0x0000006d, "V_CMPS_NEQ_F64", []>; +defm V_CMPS_NLT_F64 : VOPC_64 <0x0000006e, "V_CMPS_NLT_F64", []>; +defm V_CMPS_TRU_F64 : VOPC_64 <0x0000006f, "V_CMPS_TRU_F64", []>; +defm V_CMPSX_F_F64 : VOPC_64 <0x00000070, "V_CMPSX_F_F64", []>; +defm V_CMPSX_LT_F64 : VOPC_64 <0x00000071, "V_CMPSX_LT_F64", []>; +defm V_CMPSX_EQ_F64 : VOPC_64 <0x00000072, "V_CMPSX_EQ_F64", []>; +defm V_CMPSX_LE_F64 : VOPC_64 <0x00000073, "V_CMPSX_LE_F64", []>; +defm V_CMPSX_GT_F64 : VOPC_64 <0x00000074, "V_CMPSX_GT_F64", []>; +defm V_CMPSX_LG_F64 : VOPC_64 <0x00000075, "V_CMPSX_LG_F64", []>; +defm V_CMPSX_GE_F64 : VOPC_64 <0x00000076, "V_CMPSX_GE_F64", []>; +defm V_CMPSX_O_F64 : VOPC_64 <0x00000077, "V_CMPSX_O_F64", []>; +defm V_CMPSX_U_F64 : VOPC_64 <0x00000078, "V_CMPSX_U_F64", []>; +defm V_CMPSX_NGE_F64 : VOPC_64 <0x00000079, 
"V_CMPSX_NGE_F64", []>; +defm V_CMPSX_NLG_F64 : VOPC_64 <0x0000007a, "V_CMPSX_NLG_F64", []>; +defm V_CMPSX_NGT_F64 : VOPC_64 <0x0000007b, "V_CMPSX_NGT_F64", []>; +defm V_CMPSX_NLE_F64 : VOPC_64 <0x0000007c, "V_CMPSX_NLE_F64", []>; +defm V_CMPSX_NEQ_F64 : VOPC_64 <0x0000007d, "V_CMPSX_NEQ_F64", []>; +defm V_CMPSX_NLT_F64 : VOPC_64 <0x0000007e, "V_CMPSX_NLT_F64", []>; +defm V_CMPSX_TRU_F64 : VOPC_64 <0x0000007f, "V_CMPSX_TRU_F64", []>; +defm V_CMP_F_I32 : VOPC_32 <0x00000080, "V_CMP_F_I32", []>; +defm V_CMP_LT_I32 : VOPC_32 <0x00000081, "V_CMP_LT_I32", []>; +def : Pat < + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LT)), + (V_CMP_LT_I32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_EQ_I32 : VOPC_32 <0x00000082, "V_CMP_EQ_I32", []>; +def : Pat < + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_EQ)), + (V_CMP_EQ_I32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_LE_I32 : VOPC_32 <0x00000083, "V_CMP_LE_I32", []>; +def : Pat < + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_LE)), + (V_CMP_LE_I32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GT_I32 : VOPC_32 <0x00000084, "V_CMP_GT_I32", []>; +def : Pat < + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GT)), + (V_CMP_GT_I32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_NE_I32 : VOPC_32 <0x00000085, "V_CMP_NE_I32", []>; +def : Pat < + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_NE)), + (V_CMP_NE_I32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_GE_I32 : VOPC_32 <0x00000086, "V_CMP_GE_I32", []>; +def : Pat < + (i1 (setcc (i32 AllReg_32:$src0), VReg_32:$src1, COND_GE)), + (V_CMP_GE_I32_e64 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_CMP_T_I32 : VOPC_32 <0x00000087, "V_CMP_T_I32", []>; + +let hasSideEffects = 1 in { + +defm V_CMPX_F_I32 : VOPC_32 <0x00000090, "V_CMPX_F_I32", []>; +defm V_CMPX_LT_I32 : VOPC_32 <0x00000091, "V_CMPX_LT_I32", []>; +defm V_CMPX_EQ_I32 : VOPC_32 <0x00000092, "V_CMPX_EQ_I32", []>; +defm V_CMPX_LE_I32 : VOPC_32 <0x00000093, "V_CMPX_LE_I32", []>; +defm V_CMPX_GT_I32 : VOPC_32 <0x00000094, "V_CMPX_GT_I32", []>; +defm V_CMPX_NE_I32 : VOPC_32 <0x00000095, "V_CMPX_NE_I32", []>; +defm V_CMPX_GE_I32 : VOPC_32 <0x00000096, "V_CMPX_GE_I32", []>; +defm V_CMPX_T_I32 : VOPC_32 <0x00000097, "V_CMPX_T_I32", []>; + +} // End hasSideEffects + +defm V_CMP_F_I64 : VOPC_64 <0x000000a0, "V_CMP_F_I64", []>; +defm V_CMP_LT_I64 : VOPC_64 <0x000000a1, "V_CMP_LT_I64", []>; +defm V_CMP_EQ_I64 : VOPC_64 <0x000000a2, "V_CMP_EQ_I64", []>; +defm V_CMP_LE_I64 : VOPC_64 <0x000000a3, "V_CMP_LE_I64", []>; +defm V_CMP_GT_I64 : VOPC_64 <0x000000a4, "V_CMP_GT_I64", []>; +defm V_CMP_NE_I64 : VOPC_64 <0x000000a5, "V_CMP_NE_I64", []>; +defm V_CMP_GE_I64 : VOPC_64 <0x000000a6, "V_CMP_GE_I64", []>; +defm V_CMP_T_I64 : VOPC_64 <0x000000a7, "V_CMP_T_I64", []>; + +let hasSideEffects = 1 in { + +defm V_CMPX_F_I64 : VOPC_64 <0x000000b0, "V_CMPX_F_I64", []>; +defm V_CMPX_LT_I64 : VOPC_64 <0x000000b1, "V_CMPX_LT_I64", []>; +defm V_CMPX_EQ_I64 : VOPC_64 <0x000000b2, "V_CMPX_EQ_I64", []>; +defm V_CMPX_LE_I64 : VOPC_64 <0x000000b3, "V_CMPX_LE_I64", []>; +defm V_CMPX_GT_I64 : VOPC_64 <0x000000b4, "V_CMPX_GT_I64", []>; +defm V_CMPX_NE_I64 : VOPC_64 <0x000000b5, "V_CMPX_NE_I64", []>; +defm V_CMPX_GE_I64 : VOPC_64 <0x000000b6, "V_CMPX_GE_I64", []>; +defm V_CMPX_T_I64 : VOPC_64 <0x000000b7, "V_CMPX_T_I64", []>; + +} // End hasSideEffects + +defm V_CMP_F_U32 : VOPC_32 <0x000000c0, "V_CMP_F_U32", []>; +defm V_CMP_LT_U32 : VOPC_32 <0x000000c1, "V_CMP_LT_U32", []>; +defm V_CMP_EQ_U32 : VOPC_32 <0x000000c2, 
"V_CMP_EQ_U32", []>; +defm V_CMP_LE_U32 : VOPC_32 <0x000000c3, "V_CMP_LE_U32", []>; +defm V_CMP_GT_U32 : VOPC_32 <0x000000c4, "V_CMP_GT_U32", []>; +defm V_CMP_NE_U32 : VOPC_32 <0x000000c5, "V_CMP_NE_U32", []>; +defm V_CMP_GE_U32 : VOPC_32 <0x000000c6, "V_CMP_GE_U32", []>; +defm V_CMP_T_U32 : VOPC_32 <0x000000c7, "V_CMP_T_U32", []>; + +let hasSideEffects = 1 in { + +defm V_CMPX_F_U32 : VOPC_32 <0x000000d0, "V_CMPX_F_U32", []>; +defm V_CMPX_LT_U32 : VOPC_32 <0x000000d1, "V_CMPX_LT_U32", []>; +defm V_CMPX_EQ_U32 : VOPC_32 <0x000000d2, "V_CMPX_EQ_U32", []>; +defm V_CMPX_LE_U32 : VOPC_32 <0x000000d3, "V_CMPX_LE_U32", []>; +defm V_CMPX_GT_U32 : VOPC_32 <0x000000d4, "V_CMPX_GT_U32", []>; +defm V_CMPX_NE_U32 : VOPC_32 <0x000000d5, "V_CMPX_NE_U32", []>; +defm V_CMPX_GE_U32 : VOPC_32 <0x000000d6, "V_CMPX_GE_U32", []>; +defm V_CMPX_T_U32 : VOPC_32 <0x000000d7, "V_CMPX_T_U32", []>; + +} // End hasSideEffects + +defm V_CMP_F_U64 : VOPC_64 <0x000000e0, "V_CMP_F_U64", []>; +defm V_CMP_LT_U64 : VOPC_64 <0x000000e1, "V_CMP_LT_U64", []>; +defm V_CMP_EQ_U64 : VOPC_64 <0x000000e2, "V_CMP_EQ_U64", []>; +defm V_CMP_LE_U64 : VOPC_64 <0x000000e3, "V_CMP_LE_U64", []>; +defm V_CMP_GT_U64 : VOPC_64 <0x000000e4, "V_CMP_GT_U64", []>; +defm V_CMP_NE_U64 : VOPC_64 <0x000000e5, "V_CMP_NE_U64", []>; +defm V_CMP_GE_U64 : VOPC_64 <0x000000e6, "V_CMP_GE_U64", []>; +defm V_CMP_T_U64 : VOPC_64 <0x000000e7, "V_CMP_T_U64", []>; +defm V_CMPX_F_U64 : VOPC_64 <0x000000f0, "V_CMPX_F_U64", []>; +defm V_CMPX_LT_U64 : VOPC_64 <0x000000f1, "V_CMPX_LT_U64", []>; +defm V_CMPX_EQ_U64 : VOPC_64 <0x000000f2, "V_CMPX_EQ_U64", []>; +defm V_CMPX_LE_U64 : VOPC_64 <0x000000f3, "V_CMPX_LE_U64", []>; +defm V_CMPX_GT_U64 : VOPC_64 <0x000000f4, "V_CMPX_GT_U64", []>; +defm V_CMPX_NE_U64 : VOPC_64 <0x000000f5, "V_CMPX_NE_U64", []>; +defm V_CMPX_GE_U64 : VOPC_64 <0x000000f6, "V_CMPX_GE_U64", []>; +defm V_CMPX_T_U64 : VOPC_64 <0x000000f7, "V_CMPX_T_U64", []>; +defm V_CMP_CLASS_F32 : VOPC_32 <0x00000088, "V_CMP_CLASS_F32", []>; +defm V_CMPX_CLASS_F32 : VOPC_32 <0x00000098, "V_CMPX_CLASS_F32", []>; +defm V_CMP_CLASS_F64 : VOPC_64 <0x000000a8, "V_CMP_CLASS_F64", []>; +defm V_CMPX_CLASS_F64 : VOPC_64 <0x000000b8, "V_CMPX_CLASS_F64", []>; +//def BUFFER_LOAD_FORMAT_X : MUBUF_ <0x00000000, "BUFFER_LOAD_FORMAT_X", []>; +//def BUFFER_LOAD_FORMAT_XY : MUBUF_ <0x00000001, "BUFFER_LOAD_FORMAT_XY", []>; +//def BUFFER_LOAD_FORMAT_XYZ : MUBUF_ <0x00000002, "BUFFER_LOAD_FORMAT_XYZ", []>; +def BUFFER_LOAD_FORMAT_XYZW : MUBUF_Load_Helper <0x00000003, "BUFFER_LOAD_FORMAT_XYZW", VReg_128>; +//def BUFFER_STORE_FORMAT_X : MUBUF_ <0x00000004, "BUFFER_STORE_FORMAT_X", []>; +//def BUFFER_STORE_FORMAT_XY : MUBUF_ <0x00000005, "BUFFER_STORE_FORMAT_XY", []>; +//def BUFFER_STORE_FORMAT_XYZ : MUBUF_ <0x00000006, "BUFFER_STORE_FORMAT_XYZ", []>; +//def BUFFER_STORE_FORMAT_XYZW : MUBUF_ <0x00000007, "BUFFER_STORE_FORMAT_XYZW", []>; +//def BUFFER_LOAD_UBYTE : MUBUF_ <0x00000008, "BUFFER_LOAD_UBYTE", []>; +//def BUFFER_LOAD_SBYTE : MUBUF_ <0x00000009, "BUFFER_LOAD_SBYTE", []>; +//def BUFFER_LOAD_USHORT : MUBUF_ <0x0000000a, "BUFFER_LOAD_USHORT", []>; +//def BUFFER_LOAD_SSHORT : MUBUF_ <0x0000000b, "BUFFER_LOAD_SSHORT", []>; +//def BUFFER_LOAD_DWORD : MUBUF_ <0x0000000c, "BUFFER_LOAD_DWORD", []>; +//def BUFFER_LOAD_DWORDX2 : MUBUF_DWORDX2 <0x0000000d, "BUFFER_LOAD_DWORDX2", []>; +//def BUFFER_LOAD_DWORDX4 : MUBUF_DWORDX4 <0x0000000e, "BUFFER_LOAD_DWORDX4", []>; +//def BUFFER_STORE_BYTE : MUBUF_ <0x00000018, "BUFFER_STORE_BYTE", []>; +//def BUFFER_STORE_SHORT : MUBUF_ <0x0000001a, 
"BUFFER_STORE_SHORT", []>; +//def BUFFER_STORE_DWORD : MUBUF_ <0x0000001c, "BUFFER_STORE_DWORD", []>; +//def BUFFER_STORE_DWORDX2 : MUBUF_DWORDX2 <0x0000001d, "BUFFER_STORE_DWORDX2", []>; +//def BUFFER_STORE_DWORDX4 : MUBUF_DWORDX4 <0x0000001e, "BUFFER_STORE_DWORDX4", []>; +//def BUFFER_ATOMIC_SWAP : MUBUF_ <0x00000030, "BUFFER_ATOMIC_SWAP", []>; +//def BUFFER_ATOMIC_CMPSWAP : MUBUF_ <0x00000031, "BUFFER_ATOMIC_CMPSWAP", []>; +//def BUFFER_ATOMIC_ADD : MUBUF_ <0x00000032, "BUFFER_ATOMIC_ADD", []>; +//def BUFFER_ATOMIC_SUB : MUBUF_ <0x00000033, "BUFFER_ATOMIC_SUB", []>; +//def BUFFER_ATOMIC_RSUB : MUBUF_ <0x00000034, "BUFFER_ATOMIC_RSUB", []>; +//def BUFFER_ATOMIC_SMIN : MUBUF_ <0x00000035, "BUFFER_ATOMIC_SMIN", []>; +//def BUFFER_ATOMIC_UMIN : MUBUF_ <0x00000036, "BUFFER_ATOMIC_UMIN", []>; +//def BUFFER_ATOMIC_SMAX : MUBUF_ <0x00000037, "BUFFER_ATOMIC_SMAX", []>; +//def BUFFER_ATOMIC_UMAX : MUBUF_ <0x00000038, "BUFFER_ATOMIC_UMAX", []>; +//def BUFFER_ATOMIC_AND : MUBUF_ <0x00000039, "BUFFER_ATOMIC_AND", []>; +//def BUFFER_ATOMIC_OR : MUBUF_ <0x0000003a, "BUFFER_ATOMIC_OR", []>; +//def BUFFER_ATOMIC_XOR : MUBUF_ <0x0000003b, "BUFFER_ATOMIC_XOR", []>; +//def BUFFER_ATOMIC_INC : MUBUF_ <0x0000003c, "BUFFER_ATOMIC_INC", []>; +//def BUFFER_ATOMIC_DEC : MUBUF_ <0x0000003d, "BUFFER_ATOMIC_DEC", []>; +//def BUFFER_ATOMIC_FCMPSWAP : MUBUF_ <0x0000003e, "BUFFER_ATOMIC_FCMPSWAP", []>; +//def BUFFER_ATOMIC_FMIN : MUBUF_ <0x0000003f, "BUFFER_ATOMIC_FMIN", []>; +//def BUFFER_ATOMIC_FMAX : MUBUF_ <0x00000040, "BUFFER_ATOMIC_FMAX", []>; +//def BUFFER_ATOMIC_SWAP_X2 : MUBUF_X2 <0x00000050, "BUFFER_ATOMIC_SWAP_X2", []>; +//def BUFFER_ATOMIC_CMPSWAP_X2 : MUBUF_X2 <0x00000051, "BUFFER_ATOMIC_CMPSWAP_X2", []>; +//def BUFFER_ATOMIC_ADD_X2 : MUBUF_X2 <0x00000052, "BUFFER_ATOMIC_ADD_X2", []>; +//def BUFFER_ATOMIC_SUB_X2 : MUBUF_X2 <0x00000053, "BUFFER_ATOMIC_SUB_X2", []>; +//def BUFFER_ATOMIC_RSUB_X2 : MUBUF_X2 <0x00000054, "BUFFER_ATOMIC_RSUB_X2", []>; +//def BUFFER_ATOMIC_SMIN_X2 : MUBUF_X2 <0x00000055, "BUFFER_ATOMIC_SMIN_X2", []>; +//def BUFFER_ATOMIC_UMIN_X2 : MUBUF_X2 <0x00000056, "BUFFER_ATOMIC_UMIN_X2", []>; +//def BUFFER_ATOMIC_SMAX_X2 : MUBUF_X2 <0x00000057, "BUFFER_ATOMIC_SMAX_X2", []>; +//def BUFFER_ATOMIC_UMAX_X2 : MUBUF_X2 <0x00000058, "BUFFER_ATOMIC_UMAX_X2", []>; +//def BUFFER_ATOMIC_AND_X2 : MUBUF_X2 <0x00000059, "BUFFER_ATOMIC_AND_X2", []>; +//def BUFFER_ATOMIC_OR_X2 : MUBUF_X2 <0x0000005a, "BUFFER_ATOMIC_OR_X2", []>; +//def BUFFER_ATOMIC_XOR_X2 : MUBUF_X2 <0x0000005b, "BUFFER_ATOMIC_XOR_X2", []>; +//def BUFFER_ATOMIC_INC_X2 : MUBUF_X2 <0x0000005c, "BUFFER_ATOMIC_INC_X2", []>; +//def BUFFER_ATOMIC_DEC_X2 : MUBUF_X2 <0x0000005d, "BUFFER_ATOMIC_DEC_X2", []>; +//def BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_X2 <0x0000005e, "BUFFER_ATOMIC_FCMPSWAP_X2", []>; +//def BUFFER_ATOMIC_FMIN_X2 : MUBUF_X2 <0x0000005f, "BUFFER_ATOMIC_FMIN_X2", []>; +//def BUFFER_ATOMIC_FMAX_X2 : MUBUF_X2 <0x00000060, "BUFFER_ATOMIC_FMAX_X2", []>; +//def BUFFER_WBINVL1_SC : MUBUF_WBINVL1 <0x00000070, "BUFFER_WBINVL1_SC", []>; +//def BUFFER_WBINVL1 : MUBUF_WBINVL1 <0x00000071, "BUFFER_WBINVL1", []>; +//def TBUFFER_LOAD_FORMAT_X : MTBUF_ <0x00000000, "TBUFFER_LOAD_FORMAT_X", []>; +//def TBUFFER_LOAD_FORMAT_XY : MTBUF_ <0x00000001, "TBUFFER_LOAD_FORMAT_XY", []>; +//def TBUFFER_LOAD_FORMAT_XYZ : MTBUF_ <0x00000002, "TBUFFER_LOAD_FORMAT_XYZ", []>; +def TBUFFER_LOAD_FORMAT_XYZW : MTBUF_Load_Helper <0x00000003, "TBUFFER_LOAD_FORMAT_XYZW", VReg_128>; +//def TBUFFER_STORE_FORMAT_X : MTBUF_ <0x00000004, "TBUFFER_STORE_FORMAT_X", []>; +//def 
TBUFFER_STORE_FORMAT_XY : MTBUF_ <0x00000005, "TBUFFER_STORE_FORMAT_XY", []>; +//def TBUFFER_STORE_FORMAT_XYZ : MTBUF_ <0x00000006, "TBUFFER_STORE_FORMAT_XYZ", []>; +//def TBUFFER_STORE_FORMAT_XYZW : MTBUF_ <0x00000007, "TBUFFER_STORE_FORMAT_XYZW", []>; + +defm S_LOAD_DWORD : SMRD_32 <0x00000000, "S_LOAD_DWORD", SReg_32>; + +//def S_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000001, "S_LOAD_DWORDX2", []>; +defm S_LOAD_DWORDX4 : SMRD_Helper <0x00000002, "S_LOAD_DWORDX4", SReg_128, v4i32>; +defm S_LOAD_DWORDX8 : SMRD_Helper <0x00000003, "S_LOAD_DWORDX8", SReg_256, v8i32>; +//def S_LOAD_DWORDX16 : SMRD_DWORDX16 <0x00000004, "S_LOAD_DWORDX16", []>; +//def S_BUFFER_LOAD_DWORD : SMRD_ <0x00000008, "S_BUFFER_LOAD_DWORD", []>; +//def S_BUFFER_LOAD_DWORDX2 : SMRD_DWORDX2 <0x00000009, "S_BUFFER_LOAD_DWORDX2", []>; +//def S_BUFFER_LOAD_DWORDX4 : SMRD_DWORDX4 <0x0000000a, "S_BUFFER_LOAD_DWORDX4", []>; +//def S_BUFFER_LOAD_DWORDX8 : SMRD_DWORDX8 <0x0000000b, "S_BUFFER_LOAD_DWORDX8", []>; +//def S_BUFFER_LOAD_DWORDX16 : SMRD_DWORDX16 <0x0000000c, "S_BUFFER_LOAD_DWORDX16", []>; + +//def S_MEMTIME : SMRD_ <0x0000001e, "S_MEMTIME", []>; +//def S_DCACHE_INV : SMRD_ <0x0000001f, "S_DCACHE_INV", []>; +//def IMAGE_LOAD : MIMG_NoPattern_ <"IMAGE_LOAD", 0x00000000>; +//def IMAGE_LOAD_MIP : MIMG_NoPattern_ <"IMAGE_LOAD_MIP", 0x00000001>; +//def IMAGE_LOAD_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_PCK", 0x00000002>; +//def IMAGE_LOAD_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_PCK_SGN", 0x00000003>; +//def IMAGE_LOAD_MIP_PCK : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK", 0x00000004>; +//def IMAGE_LOAD_MIP_PCK_SGN : MIMG_NoPattern_ <"IMAGE_LOAD_MIP_PCK_SGN", 0x00000005>; +//def IMAGE_STORE : MIMG_NoPattern_ <"IMAGE_STORE", 0x00000008>; +//def IMAGE_STORE_MIP : MIMG_NoPattern_ <"IMAGE_STORE_MIP", 0x00000009>; +//def IMAGE_STORE_PCK : MIMG_NoPattern_ <"IMAGE_STORE_PCK", 0x0000000a>; +//def IMAGE_STORE_MIP_PCK : MIMG_NoPattern_ <"IMAGE_STORE_MIP_PCK", 0x0000000b>; +//def IMAGE_GET_RESINFO : MIMG_NoPattern_ <"IMAGE_GET_RESINFO", 0x0000000e>; +//def IMAGE_ATOMIC_SWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_SWAP", 0x0000000f>; +//def IMAGE_ATOMIC_CMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_CMPSWAP", 0x00000010>; +//def IMAGE_ATOMIC_ADD : MIMG_NoPattern_ <"IMAGE_ATOMIC_ADD", 0x00000011>; +//def IMAGE_ATOMIC_SUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_SUB", 0x00000012>; +//def IMAGE_ATOMIC_RSUB : MIMG_NoPattern_ <"IMAGE_ATOMIC_RSUB", 0x00000013>; +//def IMAGE_ATOMIC_SMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMIN", 0x00000014>; +//def IMAGE_ATOMIC_UMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMIN", 0x00000015>; +//def IMAGE_ATOMIC_SMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_SMAX", 0x00000016>; +//def IMAGE_ATOMIC_UMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_UMAX", 0x00000017>; +//def IMAGE_ATOMIC_AND : MIMG_NoPattern_ <"IMAGE_ATOMIC_AND", 0x00000018>; +//def IMAGE_ATOMIC_OR : MIMG_NoPattern_ <"IMAGE_ATOMIC_OR", 0x00000019>; +//def IMAGE_ATOMIC_XOR : MIMG_NoPattern_ <"IMAGE_ATOMIC_XOR", 0x0000001a>; +//def IMAGE_ATOMIC_INC : MIMG_NoPattern_ <"IMAGE_ATOMIC_INC", 0x0000001b>; +//def IMAGE_ATOMIC_DEC : MIMG_NoPattern_ <"IMAGE_ATOMIC_DEC", 0x0000001c>; +//def IMAGE_ATOMIC_FCMPSWAP : MIMG_NoPattern_ <"IMAGE_ATOMIC_FCMPSWAP", 0x0000001d>; +//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMIN", 0x0000001e>; +//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"IMAGE_ATOMIC_FMAX", 0x0000001f>; +def IMAGE_SAMPLE : MIMG_Load_Helper <0x00000020, "IMAGE_SAMPLE">; +//def IMAGE_SAMPLE_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL", 0x00000021>; +def IMAGE_SAMPLE_D : MIMG_Load_Helper <0x00000022, 
"IMAGE_SAMPLE_D">; +//def IMAGE_SAMPLE_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL", 0x00000023>; +def IMAGE_SAMPLE_L : MIMG_Load_Helper <0x00000024, "IMAGE_SAMPLE_L">; +def IMAGE_SAMPLE_B : MIMG_Load_Helper <0x00000025, "IMAGE_SAMPLE_B">; +//def IMAGE_SAMPLE_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL", 0x00000026>; +//def IMAGE_SAMPLE_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ", 0x00000027>; +//def IMAGE_SAMPLE_C : MIMG_NoPattern_ <"IMAGE_SAMPLE_C", 0x00000028>; +//def IMAGE_SAMPLE_C_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL", 0x00000029>; +//def IMAGE_SAMPLE_C_D : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D", 0x0000002a>; +//def IMAGE_SAMPLE_C_D_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL", 0x0000002b>; +//def IMAGE_SAMPLE_C_L : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L", 0x0000002c>; +//def IMAGE_SAMPLE_C_B : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B", 0x0000002d>; +//def IMAGE_SAMPLE_C_B_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL", 0x0000002e>; +//def IMAGE_SAMPLE_C_LZ : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ", 0x0000002f>; +//def IMAGE_SAMPLE_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_O", 0x00000030>; +//def IMAGE_SAMPLE_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CL_O", 0x00000031>; +//def IMAGE_SAMPLE_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_O", 0x00000032>; +//def IMAGE_SAMPLE_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_D_CL_O", 0x00000033>; +//def IMAGE_SAMPLE_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_L_O", 0x00000034>; +//def IMAGE_SAMPLE_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_O", 0x00000035>; +//def IMAGE_SAMPLE_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_B_CL_O", 0x00000036>; +//def IMAGE_SAMPLE_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_LZ_O", 0x00000037>; +//def IMAGE_SAMPLE_C_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_O", 0x00000038>; +//def IMAGE_SAMPLE_C_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CL_O", 0x00000039>; +//def IMAGE_SAMPLE_C_D_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_O", 0x0000003a>; +//def IMAGE_SAMPLE_C_D_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_D_CL_O", 0x0000003b>; +//def IMAGE_SAMPLE_C_L_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_L_O", 0x0000003c>; +//def IMAGE_SAMPLE_C_B_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_O", 0x0000003d>; +//def IMAGE_SAMPLE_C_B_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_B_CL_O", 0x0000003e>; +//def IMAGE_SAMPLE_C_LZ_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_LZ_O", 0x0000003f>; +//def IMAGE_GATHER4 : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4", 0x00000040>; +//def IMAGE_GATHER4_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL", 0x00000041>; +//def IMAGE_GATHER4_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L", 0x00000044>; +//def IMAGE_GATHER4_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B", 0x00000045>; +//def IMAGE_GATHER4_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL", 0x00000046>; +//def IMAGE_GATHER4_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ", 0x00000047>; +//def IMAGE_GATHER4_C : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C", 0x00000048>; +//def IMAGE_GATHER4_C_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL", 0x00000049>; +//def IMAGE_GATHER4_C_L : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L", 0x0000004c>; +//def IMAGE_GATHER4_C_B : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B", 0x0000004d>; +//def IMAGE_GATHER4_C_B_CL : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL", 0x0000004e>; +//def IMAGE_GATHER4_C_LZ : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ", 0x0000004f>; +//def IMAGE_GATHER4_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_O", 0x00000050>; +//def IMAGE_GATHER4_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_CL_O", 0x00000051>; +//def IMAGE_GATHER4_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_L_O", 0x00000054>; +//def 
IMAGE_GATHER4_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_O", 0x00000055>; +//def IMAGE_GATHER4_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_B_CL_O", 0x00000056>; +//def IMAGE_GATHER4_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_LZ_O", 0x00000057>; +//def IMAGE_GATHER4_C_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_O", 0x00000058>; +//def IMAGE_GATHER4_C_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_CL_O", 0x00000059>; +//def IMAGE_GATHER4_C_L_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_L_O", 0x0000005c>; +//def IMAGE_GATHER4_C_B_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_O", 0x0000005d>; +//def IMAGE_GATHER4_C_B_CL_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_B_CL_O", 0x0000005e>; +//def IMAGE_GATHER4_C_LZ_O : MIMG_NoPattern_GATHER4 <"IMAGE_GATHER4_C_LZ_O", 0x0000005f>; +//def IMAGE_GET_LOD : MIMG_NoPattern_ <"IMAGE_GET_LOD", 0x00000060>; +//def IMAGE_SAMPLE_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD", 0x00000068>; +//def IMAGE_SAMPLE_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL", 0x00000069>; +//def IMAGE_SAMPLE_C_CD : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD", 0x0000006a>; +//def IMAGE_SAMPLE_C_CD_CL : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL", 0x0000006b>; +//def IMAGE_SAMPLE_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_O", 0x0000006c>; +//def IMAGE_SAMPLE_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_CD_CL_O", 0x0000006d>; +//def IMAGE_SAMPLE_C_CD_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_O", 0x0000006e>; +//def IMAGE_SAMPLE_C_CD_CL_O : MIMG_NoPattern_ <"IMAGE_SAMPLE_C_CD_CL_O", 0x0000006f>; +//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"IMAGE_RSRC256", 0x0000007e>; +//def IMAGE_SAMPLER : MIMG_NoPattern_ <"IMAGE_SAMPLER", 0x0000007f>; +//def V_NOP : VOP1_ <0x00000000, "V_NOP", []>; + +let neverHasSideEffects = 1 in { +defm V_MOV_B32 : VOP1_32 <0x00000001, "V_MOV_B32", []>; +} // End neverHasSideEffects +defm V_READFIRSTLANE_B32 : VOP1_32 <0x00000002, "V_READFIRSTLANE_B32", []>; +//defm V_CVT_I32_F64 : VOP1_32 <0x00000003, "V_CVT_I32_F64", []>; +//defm V_CVT_F64_I32 : VOP1_64 <0x00000004, "V_CVT_F64_I32", []>; +defm V_CVT_F32_I32 : VOP1_32 <0x00000005, "V_CVT_F32_I32", + [(set VReg_32:$dst, (sint_to_fp AllReg_32:$src0))] +>; +//defm V_CVT_F32_U32 : VOP1_32 <0x00000006, "V_CVT_F32_U32", []>; +//defm V_CVT_U32_F32 : VOP1_32 <0x00000007, "V_CVT_U32_F32", []>; +defm V_CVT_I32_F32 : VOP1_32 <0x00000008, "V_CVT_I32_F32", + [(set VReg_32:$dst, (fp_to_sint AllReg_32:$src0))] +>; +defm V_MOV_FED_B32 : VOP1_32 <0x00000009, "V_MOV_FED_B32", []>; +////def V_CVT_F16_F32 : VOP1_F16 <0x0000000a, "V_CVT_F16_F32", []>; +//defm V_CVT_F32_F16 : VOP1_32 <0x0000000b, "V_CVT_F32_F16", []>; +//defm V_CVT_RPI_I32_F32 : VOP1_32 <0x0000000c, "V_CVT_RPI_I32_F32", []>; +//defm V_CVT_FLR_I32_F32 : VOP1_32 <0x0000000d, "V_CVT_FLR_I32_F32", []>; +//defm V_CVT_OFF_F32_I4 : VOP1_32 <0x0000000e, "V_CVT_OFF_F32_I4", []>; +//defm V_CVT_F32_F64 : VOP1_32 <0x0000000f, "V_CVT_F32_F64", []>; +//defm V_CVT_F64_F32 : VOP1_64 <0x00000010, "V_CVT_F64_F32", []>; +//defm V_CVT_F32_UBYTE0 : VOP1_32 <0x00000011, "V_CVT_F32_UBYTE0", []>; +//defm V_CVT_F32_UBYTE1 : VOP1_32 <0x00000012, "V_CVT_F32_UBYTE1", []>; +//defm V_CVT_F32_UBYTE2 : VOP1_32 <0x00000013, "V_CVT_F32_UBYTE2", []>; +//defm V_CVT_F32_UBYTE3 : VOP1_32 <0x00000014, "V_CVT_F32_UBYTE3", []>; +//defm V_CVT_U32_F64 : VOP1_32 <0x00000015, "V_CVT_U32_F64", []>; +//defm V_CVT_F64_U32 : VOP1_64 <0x00000016, "V_CVT_F64_U32", []>; +defm V_FRACT_F32 : VOP1_32 <0x00000020, "V_FRACT_F32", + [(set VReg_32:$dst, (AMDGPUfract AllReg_32:$src0))] +>; +defm V_TRUNC_F32 : VOP1_32 <0x00000021, 
"V_TRUNC_F32", []>; +defm V_CEIL_F32 : VOP1_32 <0x00000022, "V_CEIL_F32", []>; +defm V_RNDNE_F32 : VOP1_32 <0x00000023, "V_RNDNE_F32", + [(set VReg_32:$dst, (frint AllReg_32:$src0))] +>; +defm V_FLOOR_F32 : VOP1_32 <0x00000024, "V_FLOOR_F32", + [(set VReg_32:$dst, (ffloor AllReg_32:$src0))] +>; +defm V_EXP_F32 : VOP1_32 <0x00000025, "V_EXP_F32", + [(set VReg_32:$dst, (fexp2 AllReg_32:$src0))] +>; +defm V_LOG_CLAMP_F32 : VOP1_32 <0x00000026, "V_LOG_CLAMP_F32", []>; +defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", []>; +defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; +defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; +defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", + [(set VReg_32:$dst, (fdiv FP_ONE, AllReg_32:$src0))] +>; +defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; +defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; +defm V_RSQ_LEGACY_F32 : VOP1_32 < + 0x0000002d, "V_RSQ_LEGACY_F32", + [(set VReg_32:$dst, (int_AMDGPU_rsq AllReg_32:$src0))] +>; +defm V_RSQ_F32 : VOP1_32 <0x0000002e, "V_RSQ_F32", []>; +defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", []>; +defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; +defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", []>; +defm V_RSQ_CLAMP_F64 : VOP1_64 <0x00000032, "V_RSQ_CLAMP_F64", []>; +defm V_SQRT_F32 : VOP1_32 <0x00000033, "V_SQRT_F32", []>; +defm V_SQRT_F64 : VOP1_64 <0x00000034, "V_SQRT_F64", []>; +defm V_SIN_F32 : VOP1_32 <0x00000035, "V_SIN_F32", []>; +defm V_COS_F32 : VOP1_32 <0x00000036, "V_COS_F32", []>; +defm V_NOT_B32 : VOP1_32 <0x00000037, "V_NOT_B32", []>; +defm V_BFREV_B32 : VOP1_32 <0x00000038, "V_BFREV_B32", []>; +defm V_FFBH_U32 : VOP1_32 <0x00000039, "V_FFBH_U32", []>; +defm V_FFBL_B32 : VOP1_32 <0x0000003a, "V_FFBL_B32", []>; +defm V_FFBH_I32 : VOP1_32 <0x0000003b, "V_FFBH_I32", []>; +//defm V_FREXP_EXP_I32_F64 : VOP1_32 <0x0000003c, "V_FREXP_EXP_I32_F64", []>; +defm V_FREXP_MANT_F64 : VOP1_64 <0x0000003d, "V_FREXP_MANT_F64", []>; +defm V_FRACT_F64 : VOP1_64 <0x0000003e, "V_FRACT_F64", []>; +//defm V_FREXP_EXP_I32_F32 : VOP1_32 <0x0000003f, "V_FREXP_EXP_I32_F32", []>; +defm V_FREXP_MANT_F32 : VOP1_32 <0x00000040, "V_FREXP_MANT_F32", []>; +//def V_CLREXCP : VOP1_ <0x00000041, "V_CLREXCP", []>; +defm V_MOVRELD_B32 : VOP1_32 <0x00000042, "V_MOVRELD_B32", []>; +defm V_MOVRELS_B32 : VOP1_32 <0x00000043, "V_MOVRELS_B32", []>; +defm V_MOVRELSD_B32 : VOP1_32 <0x00000044, "V_MOVRELSD_B32", []>; + +def V_INTERP_P1_F32 : VINTRP < + 0x00000000, + (outs VReg_32:$dst), + (ins VReg_32:$i, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_P1_F32", + []> { + let DisableEncoding = "$m0"; +} + +def V_INTERP_P2_F32 : VINTRP < + 0x00000001, + (outs VReg_32:$dst), + (ins VReg_32:$src0, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_P2_F32", + []> { + + let Constraints = "$src0 = $dst"; + let DisableEncoding = "$src0,$m0"; + +} + +def V_INTERP_MOV_F32 : VINTRP < + 0x00000002, + (outs VReg_32:$dst), + (ins i32imm:$attr_chan, i32imm:$attr, M0Reg:$m0), + "V_INTERP_MOV_F32", + []> { + let VSRC = 0; + let DisableEncoding = "$m0"; +} + +//def S_NOP : SOPP_ <0x00000000, "S_NOP", []>; + +let isTerminator = 1 in { + +def S_ENDPGM : SOPP <0x00000001, (ins), "S_ENDPGM", + [(IL_retflag)]> { + let SIMM16 = 0; + let isBarrier = 1; + let hasCtrlDep = 1; +} + +let isBranch = 1 in { +def S_BRANCH : SOPP < + 0x00000002, (ins brtarget:$target), "S_BRANCH", + [] +>; + +let DisableEncoding = "$scc" in { +def S_CBRANCH_SCC0 : SOPP < + 
0x00000004, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC0", [] +>; +def S_CBRANCH_SCC1 : SOPP < + 0x00000005, (ins brtarget:$target, SCCReg:$scc), + "S_CBRANCH_SCC1", + [] +>; +} // End DisableEncoding = "$scc" + +def S_CBRANCH_VCCZ : SOPP < + 0x00000006, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCZ", + [] +>; +def S_CBRANCH_VCCNZ : SOPP < + 0x00000007, (ins brtarget:$target, VCCReg:$vcc), + "S_CBRANCH_VCCNZ", + [] +>; + +let DisableEncoding = "$exec" in { +def S_CBRANCH_EXECZ : SOPP < + 0x00000008, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECZ", + [] +>; +def S_CBRANCH_EXECNZ : SOPP < + 0x00000009, (ins brtarget:$target, EXECReg:$exec), + "S_CBRANCH_EXECNZ", + [] +>; +} // End DisableEncoding = "$exec" + + +} // End isBranch = 1 +} // End isTerminator = 1 + +//def S_BARRIER : SOPP_ <0x0000000a, "S_BARRIER", []>; +let hasSideEffects = 1 in { +def S_WAITCNT : SOPP <0x0000000c, (ins i32imm:$simm16), "S_WAITCNT $simm16", + [] +>; +} // End hasSideEffects +//def S_SETHALT : SOPP_ <0x0000000d, "S_SETHALT", []>; +//def S_SLEEP : SOPP_ <0x0000000e, "S_SLEEP", []>; +//def S_SETPRIO : SOPP_ <0x0000000f, "S_SETPRIO", []>; +//def S_SENDMSG : SOPP_ <0x00000010, "S_SENDMSG", []>; +//def S_SENDMSGHALT : SOPP_ <0x00000011, "S_SENDMSGHALT", []>; +//def S_TRAP : SOPP_ <0x00000012, "S_TRAP", []>; +//def S_ICACHE_INV : SOPP_ <0x00000013, "S_ICACHE_INV", []>; +//def S_INCPERFLEVEL : SOPP_ <0x00000014, "S_INCPERFLEVEL", []>; +//def S_DECPERFLEVEL : SOPP_ <0x00000015, "S_DECPERFLEVEL", []>; +//def S_TTRACEDATA : SOPP_ <0x00000016, "S_TTRACEDATA", []>; + +def V_CNDMASK_B32_e32 : VOP2 <0x00000000, (outs VReg_32:$dst), + (ins AllReg_32:$src0, VReg_32:$src1, VCCReg:$vcc), "V_CNDMASK_B32_e32", + [] +>{ + let DisableEncoding = "$vcc"; +} + +def V_CNDMASK_B32_e64 : VOP3 <0x00000100, (outs VReg_32:$dst), + (ins VReg_32:$src0, VReg_32:$src1, SReg_1:$src2, InstFlag:$abs, InstFlag:$clamp, InstFlag:$omod, InstFlag:$neg), + "V_CNDMASK_B32_e64", + [(set (i32 VReg_32:$dst), (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0))] +>; + +//f32 pattern for V_CNDMASK_B32_e64 +def : Pat < + (f32 (select SReg_1:$src2, VReg_32:$src1, VReg_32:$src0)), + (V_CNDMASK_B32_e64 VReg_32:$src0, VReg_32:$src1, SReg_1:$src2) +>; + +defm V_READLANE_B32 : VOP2_32 <0x00000001, "V_READLANE_B32", []>; +defm V_WRITELANE_B32 : VOP2_32 <0x00000002, "V_WRITELANE_B32", []>; + +defm V_ADD_F32 : VOP2_32 <0x00000003, "V_ADD_F32", []>; +def : Pat < + (f32 (fadd AllReg_32:$src0, VReg_32:$src1)), + (V_ADD_F32_e32 AllReg_32:$src0, VReg_32:$src1) +>; + +defm V_SUB_F32 : VOP2_32 <0x00000004, "V_SUB_F32", []>; +def : Pat < + (f32 (fsub AllReg_32:$src0, VReg_32:$src1)), + (V_SUB_F32_e32 AllReg_32:$src0, VReg_32:$src1) +>; +defm V_SUBREV_F32 : VOP2_32 <0x00000005, "V_SUBREV_F32", []>; +defm V_MAC_LEGACY_F32 : VOP2_32 <0x00000006, "V_MAC_LEGACY_F32", []>; +defm V_MUL_LEGACY_F32 : VOP2_32 < + 0x00000007, "V_MUL_LEGACY_F32", + [(set VReg_32:$dst, (int_AMDGPU_mul AllReg_32:$src0, VReg_32:$src1))] +>; + +defm V_MUL_F32 : VOP2_32 <0x00000008, "V_MUL_F32", + [(set VReg_32:$dst, (fmul AllReg_32:$src0, VReg_32:$src1))] +>; +//defm V_MUL_I32_I24 : VOP2_32 <0x00000009, "V_MUL_I32_I24", []>; +//defm V_MUL_HI_I32_I24 : VOP2_32 <0x0000000a, "V_MUL_HI_I32_I24", []>; +//defm V_MUL_U32_U24 : VOP2_32 <0x0000000b, "V_MUL_U32_U24", []>; +//defm V_MUL_HI_U32_U24 : VOP2_32 <0x0000000c, "V_MUL_HI_U32_U24", []>; +defm V_MIN_LEGACY_F32 : VOP2_32 <0x0000000d, "V_MIN_LEGACY_F32", + [(set VReg_32:$dst, (AMDGPUfmin AllReg_32:$src0, VReg_32:$src1))] +>; + +defm 
V_MAX_LEGACY_F32 : VOP2_32 <0x0000000e, "V_MAX_LEGACY_F32", + [(set VReg_32:$dst, (AMDGPUfmax AllReg_32:$src0, VReg_32:$src1))] +>; +defm V_MIN_F32 : VOP2_32 <0x0000000f, "V_MIN_F32", []>; +defm V_MAX_F32 : VOP2_32 <0x00000010, "V_MAX_F32", []>; +defm V_MIN_I32 : VOP2_32 <0x00000011, "V_MIN_I32", []>; +defm V_MAX_I32 : VOP2_32 <0x00000012, "V_MAX_I32", []>; +defm V_MIN_U32 : VOP2_32 <0x00000013, "V_MIN_U32", []>; +defm V_MAX_U32 : VOP2_32 <0x00000014, "V_MAX_U32", []>; +defm V_LSHR_B32 : VOP2_32 <0x00000015, "V_LSHR_B32", []>; +defm V_LSHRREV_B32 : VOP2_32 <0x00000016, "V_LSHRREV_B32", []>; +defm V_ASHR_I32 : VOP2_32 <0x00000017, "V_ASHR_I32", []>; +defm V_ASHRREV_I32 : VOP2_32 <0x00000018, "V_ASHRREV_I32", []>; +defm V_LSHL_B32 : VOP2_32 <0x00000019, "V_LSHL_B32", []>; +defm V_LSHLREV_B32 : VOP2_32 <0x0000001a, "V_LSHLREV_B32", []>; +defm V_AND_B32 : VOP2_32 <0x0000001b, "V_AND_B32", + [(set VReg_32:$dst, (and AllReg_32:$src0, VReg_32:$src1))] +>; +defm V_OR_B32 : VOP2_32 <0x0000001c, "V_OR_B32", + [(set VReg_32:$dst, (or AllReg_32:$src0, VReg_32:$src1))] +>; +defm V_XOR_B32 : VOP2_32 <0x0000001d, "V_XOR_B32", + [(set VReg_32:$dst, (xor AllReg_32:$src0, VReg_32:$src1))] +>; +defm V_BFM_B32 : VOP2_32 <0x0000001e, "V_BFM_B32", []>; +defm V_MAC_F32 : VOP2_32 <0x0000001f, "V_MAC_F32", []>; +defm V_MADMK_F32 : VOP2_32 <0x00000020, "V_MADMK_F32", []>; +defm V_MADAK_F32 : VOP2_32 <0x00000021, "V_MADAK_F32", []>; +//defm V_BCNT_U32_B32 : VOP2_32 <0x00000022, "V_BCNT_U32_B32", []>; +//defm V_MBCNT_LO_U32_B32 : VOP2_32 <0x00000023, "V_MBCNT_LO_U32_B32", []>; +//defm V_MBCNT_HI_U32_B32 : VOP2_32 <0x00000024, "V_MBCNT_HI_U32_B32", []>; +let Defs = [VCC] in { // Carry-out goes to VCC +defm V_ADD_I32 : VOP2_32 <0x00000025, "V_ADD_I32", + [(set VReg_32:$dst, (add (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] +>; +defm V_SUB_I32 : VOP2_32 <0x00000026, "V_SUB_I32", + [(set VReg_32:$dst, (sub (i32 AllReg_32:$src0), (i32 VReg_32:$src1)))] +>; +} // End Defs = [VCC] +defm V_SUBREV_I32 : VOP2_32 <0x00000027, "V_SUBREV_I32", []>; +defm V_ADDC_U32 : VOP2_32 <0x00000028, "V_ADDC_U32", []>; +defm V_SUBB_U32 : VOP2_32 <0x00000029, "V_SUBB_U32", []>; +defm V_SUBBREV_U32 : VOP2_32 <0x0000002a, "V_SUBBREV_U32", []>; +defm V_LDEXP_F32 : VOP2_32 <0x0000002b, "V_LDEXP_F32", []>; +////def V_CVT_PKACCUM_U8_F32 : VOP2_U8 <0x0000002c, "V_CVT_PKACCUM_U8_F32", []>; +////def V_CVT_PKNORM_I16_F32 : VOP2_I16 <0x0000002d, "V_CVT_PKNORM_I16_F32", []>; +////def V_CVT_PKNORM_U16_F32 : VOP2_U16 <0x0000002e, "V_CVT_PKNORM_U16_F32", []>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_32 <0x0000002f, "V_CVT_PKRTZ_F16_F32", + [(set VReg_32:$dst, (int_SI_packf16 AllReg_32:$src0, VReg_32:$src1))] +>; +////def V_CVT_PK_U16_U32 : VOP2_U16 <0x00000030, "V_CVT_PK_U16_U32", []>; +////def V_CVT_PK_I16_I32 : VOP2_I16 <0x00000031, "V_CVT_PK_I16_I32", []>; +def S_CMP_EQ_I32 : SOPC_32 <0x00000000, "S_CMP_EQ_I32", []>; +def S_CMP_LG_I32 : SOPC_32 <0x00000001, "S_CMP_LG_I32", []>; +def S_CMP_GT_I32 : SOPC_32 <0x00000002, "S_CMP_GT_I32", []>; +def S_CMP_GE_I32 : SOPC_32 <0x00000003, "S_CMP_GE_I32", []>; +def S_CMP_LT_I32 : SOPC_32 <0x00000004, "S_CMP_LT_I32", []>; +def S_CMP_LE_I32 : SOPC_32 <0x00000005, "S_CMP_LE_I32", []>; +def S_CMP_EQ_U32 : SOPC_32 <0x00000006, "S_CMP_EQ_U32", []>; +def S_CMP_LG_U32 : SOPC_32 <0x00000007, "S_CMP_LG_U32", []>; +def S_CMP_GT_U32 : SOPC_32 <0x00000008, "S_CMP_GT_U32", []>; +def S_CMP_GE_U32 : SOPC_32 <0x00000009, "S_CMP_GE_U32", []>; +def S_CMP_LT_U32 : SOPC_32 <0x0000000a, "S_CMP_LT_U32", []>; +def S_CMP_LE_U32 : SOPC_32 
<0x0000000b, "S_CMP_LE_U32", []>; +////def S_BITCMP0_B32 : SOPC_BITCMP0 <0x0000000c, "S_BITCMP0_B32", []>; +////def S_BITCMP1_B32 : SOPC_BITCMP1 <0x0000000d, "S_BITCMP1_B32", []>; +////def S_BITCMP0_B64 : SOPC_BITCMP0 <0x0000000e, "S_BITCMP0_B64", []>; +////def S_BITCMP1_B64 : SOPC_BITCMP1 <0x0000000f, "S_BITCMP1_B64", []>; +//def S_SETVSKIP : SOPC_ <0x00000010, "S_SETVSKIP", []>; + +let neverHasSideEffects = 1 in { + +def V_MAD_LEGACY_F32 : VOP3_32 <0x00000140, "V_MAD_LEGACY_F32", []>; +def V_MAD_F32 : VOP3_32 <0x00000141, "V_MAD_F32", []>; +//def V_MAD_I32_I24 : VOP3_32 <0x00000142, "V_MAD_I32_I24", []>; +//def V_MAD_U32_U24 : VOP3_32 <0x00000143, "V_MAD_U32_U24", []>; + +} // End neverHasSideEffects +def V_CUBEID_F32 : VOP3_32 <0x00000144, "V_CUBEID_F32", []>; +def V_CUBESC_F32 : VOP3_32 <0x00000145, "V_CUBESC_F32", []>; +def V_CUBETC_F32 : VOP3_32 <0x00000146, "V_CUBETC_F32", []>; +def V_CUBEMA_F32 : VOP3_32 <0x00000147, "V_CUBEMA_F32", []>; +def V_BFE_U32 : VOP3_32 <0x00000148, "V_BFE_U32", []>; +def V_BFE_I32 : VOP3_32 <0x00000149, "V_BFE_I32", []>; +def V_BFI_B32 : VOP3_32 <0x0000014a, "V_BFI_B32", []>; +def V_FMA_F32 : VOP3_32 <0x0000014b, "V_FMA_F32", []>; +def V_FMA_F64 : VOP3_64 <0x0000014c, "V_FMA_F64", []>; +//def V_LERP_U8 : VOP3_U8 <0x0000014d, "V_LERP_U8", []>; +def V_ALIGNBIT_B32 : VOP3_32 <0x0000014e, "V_ALIGNBIT_B32", []>; +def V_ALIGNBYTE_B32 : VOP3_32 <0x0000014f, "V_ALIGNBYTE_B32", []>; +def V_MULLIT_F32 : VOP3_32 <0x00000150, "V_MULLIT_F32", []>; +////def V_MIN3_F32 : VOP3_MIN3 <0x00000151, "V_MIN3_F32", []>; +////def V_MIN3_I32 : VOP3_MIN3 <0x00000152, "V_MIN3_I32", []>; +////def V_MIN3_U32 : VOP3_MIN3 <0x00000153, "V_MIN3_U32", []>; +////def V_MAX3_F32 : VOP3_MAX3 <0x00000154, "V_MAX3_F32", []>; +////def V_MAX3_I32 : VOP3_MAX3 <0x00000155, "V_MAX3_I32", []>; +////def V_MAX3_U32 : VOP3_MAX3 <0x00000156, "V_MAX3_U32", []>; +////def V_MED3_F32 : VOP3_MED3 <0x00000157, "V_MED3_F32", []>; +////def V_MED3_I32 : VOP3_MED3 <0x00000158, "V_MED3_I32", []>; +////def V_MED3_U32 : VOP3_MED3 <0x00000159, "V_MED3_U32", []>; +//def V_SAD_U8 : VOP3_U8 <0x0000015a, "V_SAD_U8", []>; +//def V_SAD_HI_U8 : VOP3_U8 <0x0000015b, "V_SAD_HI_U8", []>; +//def V_SAD_U16 : VOP3_U16 <0x0000015c, "V_SAD_U16", []>; +def V_SAD_U32 : VOP3_32 <0x0000015d, "V_SAD_U32", []>; +////def V_CVT_PK_U8_F32 : VOP3_U8 <0x0000015e, "V_CVT_PK_U8_F32", []>; +def V_DIV_FIXUP_F32 : VOP3_32 <0x0000015f, "V_DIV_FIXUP_F32", []>; +def V_DIV_FIXUP_F64 : VOP3_64 <0x00000160, "V_DIV_FIXUP_F64", []>; +def V_LSHL_B64 : VOP3_64 <0x00000161, "V_LSHL_B64", []>; +def V_LSHR_B64 : VOP3_64 <0x00000162, "V_LSHR_B64", []>; +def V_ASHR_I64 : VOP3_64 <0x00000163, "V_ASHR_I64", []>; +def V_ADD_F64 : VOP3_64 <0x00000164, "V_ADD_F64", []>; +def V_MUL_F64 : VOP3_64 <0x00000165, "V_MUL_F64", []>; +def V_MIN_F64 : VOP3_64 <0x00000166, "V_MIN_F64", []>; +def V_MAX_F64 : VOP3_64 <0x00000167, "V_MAX_F64", []>; +def V_LDEXP_F64 : VOP3_64 <0x00000168, "V_LDEXP_F64", []>; +def V_MUL_LO_U32 : VOP3_32 <0x00000169, "V_MUL_LO_U32", []>; +def V_MUL_HI_U32 : VOP3_32 <0x0000016a, "V_MUL_HI_U32", []>; +def V_MUL_LO_I32 : VOP3_32 <0x0000016b, "V_MUL_LO_I32", []>; +def V_MUL_HI_I32 : VOP3_32 <0x0000016c, "V_MUL_HI_I32", []>; +def V_DIV_SCALE_F32 : VOP3_32 <0x0000016d, "V_DIV_SCALE_F32", []>; +def V_DIV_SCALE_F64 : VOP3_64 <0x0000016e, "V_DIV_SCALE_F64", []>; +def V_DIV_FMAS_F32 : VOP3_32 <0x0000016f, "V_DIV_FMAS_F32", []>; +def V_DIV_FMAS_F64 : VOP3_64 <0x00000170, "V_DIV_FMAS_F64", []>; +//def V_MSAD_U8 : VOP3_U8 <0x00000171, "V_MSAD_U8", []>; +//def 
V_QSAD_U8 : VOP3_U8 <0x00000172, "V_QSAD_U8", []>; +//def V_MQSAD_U8 : VOP3_U8 <0x00000173, "V_MQSAD_U8", []>; +def V_TRIG_PREOP_F64 : VOP3_64 <0x00000174, "V_TRIG_PREOP_F64", []>; +def S_ADD_U32 : SOP2_32 <0x00000000, "S_ADD_U32", []>; +def S_SUB_U32 : SOP2_32 <0x00000001, "S_SUB_U32", []>; +def S_ADD_I32 : SOP2_32 <0x00000002, "S_ADD_I32", []>; +def S_SUB_I32 : SOP2_32 <0x00000003, "S_SUB_I32", []>; +def S_ADDC_U32 : SOP2_32 <0x00000004, "S_ADDC_U32", []>; +def S_SUBB_U32 : SOP2_32 <0x00000005, "S_SUBB_U32", []>; +def S_MIN_I32 : SOP2_32 <0x00000006, "S_MIN_I32", []>; +def S_MIN_U32 : SOP2_32 <0x00000007, "S_MIN_U32", []>; +def S_MAX_I32 : SOP2_32 <0x00000008, "S_MAX_I32", []>; +def S_MAX_U32 : SOP2_32 <0x00000009, "S_MAX_U32", []>; + +def S_CSELECT_B32 : SOP2 < + 0x0000000a, (outs SReg_32:$dst), + (ins SReg_32:$src0, SReg_32:$src1, SCCReg:$scc), "S_CSELECT_B32", + [(set (i32 SReg_32:$dst), (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1))] +>; + +def S_CSELECT_B64 : SOP2_64 <0x0000000b, "S_CSELECT_B64", []>; + +// f32 pattern for S_CSELECT_B32 +def : Pat < + (f32 (select SCCReg:$scc, SReg_32:$src0, SReg_32:$src1)), + (S_CSELECT_B32 SReg_32:$src0, SReg_32:$src1, SCCReg:$scc) +>; + +def S_AND_B32 : SOP2_32 <0x0000000e, "S_AND_B32", []>; + +def S_AND_B64 : SOP2_64 <0x0000000f, "S_AND_B64", + [(set SReg_64:$dst, (and SReg_64:$src0, SReg_64:$src1))] +>; +def S_AND_VCC : SOP2_VCC <0x0000000f, "S_AND_B64", + [(set SReg_1:$vcc, (SIvcc_and SReg_64:$src0, SReg_64:$src1))] +>; +def S_OR_B32 : SOP2_32 <0x00000010, "S_OR_B32", []>; +def S_OR_B64 : SOP2_64 <0x00000011, "S_OR_B64", []>; +def S_XOR_B32 : SOP2_32 <0x00000012, "S_XOR_B32", []>; +def S_XOR_B64 : SOP2_64 <0x00000013, "S_XOR_B64", []>; +def S_ANDN2_B32 : SOP2_32 <0x00000014, "S_ANDN2_B32", []>; +def S_ANDN2_B64 : SOP2_64 <0x00000015, "S_ANDN2_B64", []>; +def S_ORN2_B32 : SOP2_32 <0x00000016, "S_ORN2_B32", []>; +def S_ORN2_B64 : SOP2_64 <0x00000017, "S_ORN2_B64", []>; +def S_NAND_B32 : SOP2_32 <0x00000018, "S_NAND_B32", []>; +def S_NAND_B64 : SOP2_64 <0x00000019, "S_NAND_B64", []>; +def S_NOR_B32 : SOP2_32 <0x0000001a, "S_NOR_B32", []>; +def S_NOR_B64 : SOP2_64 <0x0000001b, "S_NOR_B64", []>; +def S_XNOR_B32 : SOP2_32 <0x0000001c, "S_XNOR_B32", []>; +def S_XNOR_B64 : SOP2_64 <0x0000001d, "S_XNOR_B64", []>; +def S_LSHL_B32 : SOP2_32 <0x0000001e, "S_LSHL_B32", []>; +def S_LSHL_B64 : SOP2_64 <0x0000001f, "S_LSHL_B64", []>; +def S_LSHR_B32 : SOP2_32 <0x00000020, "S_LSHR_B32", []>; +def S_LSHR_B64 : SOP2_64 <0x00000021, "S_LSHR_B64", []>; +def S_ASHR_I32 : SOP2_32 <0x00000022, "S_ASHR_I32", []>; +def S_ASHR_I64 : SOP2_64 <0x00000023, "S_ASHR_I64", []>; +def S_BFM_B32 : SOP2_32 <0x00000024, "S_BFM_B32", []>; +def S_BFM_B64 : SOP2_64 <0x00000025, "S_BFM_B64", []>; +def S_MUL_I32 : SOP2_32 <0x00000026, "S_MUL_I32", []>; +def S_BFE_U32 : SOP2_32 <0x00000027, "S_BFE_U32", []>; +def S_BFE_I32 : SOP2_32 <0x00000028, "S_BFE_I32", []>; +def S_BFE_U64 : SOP2_64 <0x00000029, "S_BFE_U64", []>; +def S_BFE_I64 : SOP2_64 <0x0000002a, "S_BFE_I64", []>; +//def S_CBRANCH_G_FORK : SOP2_ <0x0000002b, "S_CBRANCH_G_FORK", []>; +def S_ABSDIFF_I32 : SOP2_32 <0x0000002c, "S_ABSDIFF_I32", []>; + +class V_MOV_IMM <Operand immType, SDNode immNode> : InstSI < + (outs VReg_32:$dst), + (ins immType:$src0), + "V_MOV_IMM", + [(set VReg_32:$dst, (immNode:$src0))] +>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +def V_MOV_IMM_I32 : V_MOV_IMM<i32imm, imm>; +def V_MOV_IMM_F32 : V_MOV_IMM<f32imm, fpimm>; + +def S_MOV_IMM_I32 : InstSI < + (outs SReg_32:$dst), + (ins i32imm:$src0), 
+ "S_MOV_IMM_I32", + [(set SReg_32:$dst, (imm:$src0))] +>; + +// i64 immediates aren't really supported in hardware, but LLVM will use the i64 +// type for indices on load and store instructions. The pattern for +// S_MOV_IMM_I64 will only match i64 immediates that can fit into 32-bits, +// which the hardware can handle. +def S_MOV_IMM_I64 : InstSI < + (outs SReg_64:$dst), + (ins i64imm:$src0), + "S_MOV_IMM_I64 $dst, $src0", + [(set SReg_64:$dst, (IMM32bitIn64bit:$src0))] +>; + +} // End isCodeGenOnly, isPseudo = 1 + +class SI_LOAD_LITERAL<Operand ImmType> : + Enc32 <(outs), (ins ImmType:$imm), "LOAD_LITERAL $imm", []> { + + bits<32> imm; + let Inst{31-0} = imm; +} + +def SI_LOAD_LITERAL_I32 : SI_LOAD_LITERAL<i32imm>; +def SI_LOAD_LITERAL_F32 : SI_LOAD_LITERAL<f32imm>; + +let isCodeGenOnly = 1, isPseudo = 1 in { + +def SET_M0 : InstSI < + (outs SReg_32:$dst), + (ins i32imm:$src0), + "SET_M0", + [(set SReg_32:$dst, (int_SI_set_M0 imm:$src0))] +>; + +def LOAD_CONST : AMDGPUShaderInst < + (outs GPRF32:$dst), + (ins i32imm:$src), + "LOAD_CONST $dst, $src", + [(set GPRF32:$dst, (int_AMDGPU_load_const imm:$src))] +>; + +let usesCustomInserter = 1 in { + +def SI_V_CNDLT : InstSI < + (outs VReg_32:$dst), + (ins VReg_32:$src0, VReg_32:$src1, VReg_32:$src2), + "SI_V_CNDLT $dst, $src0, $src1, $src2", + [(set VReg_32:$dst, (int_AMDGPU_cndlt VReg_32:$src0, VReg_32:$src1, VReg_32:$src2))] +>; + +def SI_INTERP : InstSI < + (outs VReg_32:$dst), + (ins VReg_32:$i, VReg_32:$j, i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), + "SI_INTERP $dst, $i, $j, $attr_chan, $attr, $params", + [] +>; + +def SI_INTERP_CONST : InstSI < + (outs VReg_32:$dst), + (ins i32imm:$attr_chan, i32imm:$attr, SReg_32:$params), + "SI_INTERP_CONST $dst, $attr_chan, $attr, $params", + [(set VReg_32:$dst, (int_SI_fs_interp_constant imm:$attr_chan, + imm:$attr, SReg_32:$params))] +>; + +def SI_KIL : InstSI < + (outs), + (ins VReg_32:$src), + "SI_KIL $src", + [(int_AMDGPU_kill VReg_32:$src)] +>; + +def SI_WQM : InstSI < + (outs), + (ins), + "SI_WQM", + [(int_SI_wqm)] +>; + +} // end usesCustomInserter + +// SI Psuedo branch instructions. These are used by the CFG structurizer pass +// and should be lowered to ISA instructions prior to codegen. 
+ +let isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0, + hasSideEffects = 0 in { +def SI_IF_NZ : InstSI < + (outs), + (ins brtarget:$target, SReg_1:$vcc), + "SI_BRANCH_NZ", + [(IL_brcond bb:$target, SReg_1:$vcc)] +>; + +def SI_IF_Z : InstSI < + (outs), + (ins brtarget:$target, SReg_1:$vcc), + "SI_BRANCH_Z", + [] +>; +} // end isBranch = 1, isTerminator = 1, mayLoad = 0, mayStore = 0, + // hasSideEffects = 0 +} // end isCodeGenOnly, isPseudo + +/* int_SI_vs_load_input */ +def : Pat< + (int_SI_vs_load_input SReg_128:$tlst, IMM12bit:$attr_offset, + VReg_32:$buf_idx_vgpr), + (BUFFER_LOAD_FORMAT_XYZW imm:$attr_offset, 0, 1, 0, 0, 0, + VReg_32:$buf_idx_vgpr, SReg_128:$tlst, + 0, 0, (i32 SREG_LIT_0)) +>; + +/* int_SI_export */ +def : Pat < + (int_SI_export imm:$en, imm:$vm, imm:$done, imm:$tgt, imm:$compr, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3), + (EXP imm:$en, imm:$tgt, imm:$compr, imm:$done, imm:$vm, + VReg_32:$src0, VReg_32:$src1, VReg_32:$src2, VReg_32:$src3) +>; + +/* int_SI_sample */ +def : Pat < + (int_SI_sample imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler), + (IMAGE_SAMPLE imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, + SReg_256:$rsrc, SReg_128:$sampler) +>; + +/* int_SI_sample_lod */ +def : Pat < + (int_SI_sample_lod imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler), + (IMAGE_SAMPLE_L imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, + SReg_256:$rsrc, SReg_128:$sampler) +>; + +/* int_SI_sample_bias */ +def : Pat < + (int_SI_sample_bias imm:$writemask, VReg_128:$coord, SReg_256:$rsrc, SReg_128:$sampler), + (IMAGE_SAMPLE_B imm:$writemask, 0, 0, 0, 0, 0, 0, 0, VReg_128:$coord, + SReg_256:$rsrc, SReg_128:$sampler) +>; + +def CLAMP_SI : CLAMP<VReg_32>; +def FABS_SI : FABS<VReg_32>; +def FNEG_SI : FNEG<VReg_32>; + +def : Extract_Element <f32, v4f32, VReg_128, 0, sel_x>; +def : Extract_Element <f32, v4f32, VReg_128, 1, sel_y>; +def : Extract_Element <f32, v4f32, VReg_128, 2, sel_z>; +def : Extract_Element <f32, v4f32, VReg_128, 3, sel_w>; + +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 4, sel_x>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 5, sel_y>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 6, sel_z>; +def : Insert_Element <f32, v4f32, VReg_32, VReg_128, 7, sel_w>; + +def : Vector_Build <v4f32, VReg_128, f32, VReg_32>; +def : Vector_Build <v4i32, SReg_128, i32, SReg_32>; + +def : BitConvert <i32, f32, SReg_32>; +def : BitConvert <i32, f32, VReg_32>; + +def : BitConvert <f32, i32, SReg_32>; +def : BitConvert <f32, i32, VReg_32>; + +def : Pat < + (i64 (SIsreg1_bitcast SReg_1:$vcc)), + (S_MOV_B64 (COPY_TO_REGCLASS SReg_1:$vcc, SReg_64)) +>; + +def : Pat < + (i1 (SIsreg1_bitcast SReg_64:$vcc)), + (COPY_TO_REGCLASS SReg_64:$vcc, SReg_1) +>; + +def : Pat < + (i64 (SIvcc_bitcast VCCReg:$vcc)), + (S_MOV_B64 (COPY_TO_REGCLASS VCCReg:$vcc, SReg_64)) +>; + +def : Pat < + (i1 (SIvcc_bitcast SReg_64:$vcc)), + (COPY_TO_REGCLASS SReg_64:$vcc, VCCReg) +>; + +/********** ====================== **********/ +/********** Interpolation Patterns **********/ +/********** ====================== **********/ + +def : Pat < + (int_SI_fs_interp_linear_center imm:$attr_chan, imm:$attr, SReg_32:$params), + (SI_INTERP (f32 LINEAR_CENTER_I), (f32 LINEAR_CENTER_J), imm:$attr_chan, + imm:$attr, SReg_32:$params) +>; + +def : Pat < + (int_SI_fs_interp_linear_centroid imm:$attr_chan, imm:$attr, SReg_32:$params), + (SI_INTERP (f32 LINEAR_CENTROID_I), (f32 LINEAR_CENTROID_J), imm:$attr_chan, + imm:$attr, SReg_32:$params)
+>; + +def : Pat < + (int_SI_fs_interp_persp_center imm:$attr_chan, imm:$attr, SReg_32:$params), + (SI_INTERP (f32 PERSP_CENTER_I), (f32 PERSP_CENTER_J), imm:$attr_chan, + imm:$attr, SReg_32:$params) +>; + +def : Pat < + (int_SI_fs_interp_persp_centroid imm:$attr_chan, imm:$attr, SReg_32:$params), + (SI_INTERP (f32 PERSP_CENTROID_I), (f32 PERSP_CENTROID_J), imm:$attr_chan, + imm:$attr, SReg_32:$params) +>; + +def : Pat < + (int_SI_fs_read_face), + (f32 FRONT_FACE) +>; + +def : Pat < + (int_SI_fs_read_pos 0), + (f32 POS_X_FLOAT) +>; + +def : Pat < + (int_SI_fs_read_pos 1), + (f32 POS_Y_FLOAT) +>; + +def : Pat < + (int_SI_fs_read_pos 2), + (f32 POS_Z_FLOAT) +>; + +def : Pat < + (int_SI_fs_read_pos 3), + (f32 POS_W_FLOAT) +>; + +/********** ================== **********/ +/********** Intrinsic Patterns **********/ +/********** ================== **********/ + +/* llvm.AMDGPU.pow */ +/* XXX: We are using IEEE MUL, not the 0 * anything = 0 MUL, is this correct? */ +def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_F32_e32, VReg_32>; + +def : Pat < + (int_AMDGPU_div AllReg_32:$src0, AllReg_32:$src1), + (V_MUL_LEGACY_F32_e32 AllReg_32:$src0, (V_RCP_LEGACY_F32_e32 AllReg_32:$src1)) +>; + +def : Pat< + (fdiv AllReg_32:$src0, AllReg_32:$src1), + (V_MUL_F32_e32 AllReg_32:$src0, (V_RCP_F32_e32 AllReg_32:$src1)) +>; + +def : Pat < + (int_AMDGPU_kilp), + (SI_KIL (V_MOV_IMM_I32 0xbf800000)) +>; + +def : Pat < + (int_AMDGPU_cube VReg_128:$src), + (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), + (V_CUBETC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), + (EXTRACT_SUBREG VReg_128:$src, sel_y), + (EXTRACT_SUBREG VReg_128:$src, sel_z), + 0, 0, 0, 0), sel_x), + (V_CUBESC_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), + (EXTRACT_SUBREG VReg_128:$src, sel_y), + (EXTRACT_SUBREG VReg_128:$src, sel_z), + 0, 0, 0, 0), sel_y), + (V_CUBEMA_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), + (EXTRACT_SUBREG VReg_128:$src, sel_y), + (EXTRACT_SUBREG VReg_128:$src, sel_z), + 0, 0, 0, 0), sel_z), + (V_CUBEID_F32 (EXTRACT_SUBREG VReg_128:$src, sel_x), + (EXTRACT_SUBREG VReg_128:$src, sel_y), + (EXTRACT_SUBREG VReg_128:$src, sel_z), + 0, 0, 0, 0), sel_w) +>; + +/********** ================== **********/ +/********** VOP3 Patterns **********/ +/********** ================== **********/ + +def : Pat <(f32 (IL_mad AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2)), + (V_MAD_LEGACY_F32 AllReg_32:$src0, VReg_32:$src1, VReg_32:$src2, + 0, 0, 0, 0)>; + +} // End isSI predicate diff --git a/lib/Target/R600/SIIntrinsics.td b/lib/Target/R600/SIIntrinsics.td new file mode 100644 index 0000000000..1008fc42cc --- /dev/null +++ b/lib/Target/R600/SIIntrinsics.td @@ -0,0 +1,42 @@ +//===-- SIIntrinsics.td - SI Intrinsic defs ----------------*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+// SI Intrinsic Definitions
+//
+//===----------------------------------------------------------------------===//
+
+
+let TargetPrefix = "SI", isTarget = 1 in {
+
+  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
+  def int_SI_export : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty, llvm_float_ty], []>;
+  /* XXX: We may need a separate intrinsic here for loading integer values */
+  def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_i64_ty, llvm_i32_ty], []>;
+  def int_SI_vs_load_buffer_index : Intrinsic <[llvm_i32_ty], [], [IntrNoMem]>;
+  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_v4i32_ty, llvm_i16_ty, llvm_i32_ty], [IntrReadMem]>;
+  def int_SI_wqm : Intrinsic <[], [], []>;
+
+  def int_SI_sample : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty], [IntrReadMem]>;
+  def int_SI_sample_bias : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty], [IntrReadMem]>;
+  def int_SI_sample_lod : Intrinsic <[llvm_v4f32_ty], [llvm_i32_ty, llvm_v4f32_ty, llvm_v8i32_ty, llvm_v4i32_ty], [IntrReadMem]>;
+
+  /* Interpolation Intrinsics */
+
+  def int_SI_set_M0 : Intrinsic <[llvm_i32_ty], [llvm_i32_ty]>;
+  class Interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrReadMem]>;
+
+  def int_SI_fs_interp_linear_center : Interp;
+  def int_SI_fs_interp_linear_centroid : Interp;
+  def int_SI_fs_interp_persp_center : Interp;
+  def int_SI_fs_interp_persp_centroid : Interp;
+  def int_SI_fs_interp_constant : Interp;
+
+  def int_SI_fs_read_face : Intrinsic <[llvm_float_ty], [], [IntrNoMem]>;
+  def int_SI_fs_read_pos : Intrinsic <[llvm_float_ty], [llvm_i32_ty], [IntrNoMem]>;
+}
diff --git a/lib/Target/R600/SILowerControlFlow.cpp b/lib/Target/R600/SILowerControlFlow.cpp
new file mode 100644
index 0000000000..277b647f67
--- /dev/null
+++ b/lib/Target/R600/SILowerControlFlow.cpp
@@ -0,0 +1,191 @@
+//===-- SILowerControlFlow.cpp - Use predicates for control flow ----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass lowers the pseudo control flow instructions (SI_IF_NZ,
+/// ELSE, ENDIF) to predicated instructions.
+///
+/// All control flow (except loops) is handled using predicated instructions
+/// and a predicate stack. Each Scalar ALU controls the operations of 64
+/// Vector ALUs. The Scalar ALU can update the predicate for any of the Vector
+/// ALUs by writing to the 64-bit EXEC register (each bit corresponds to a
+/// single vector ALU). Typically, for predicates, a vector ALU will write
+/// to its bit of the VCC register (like EXEC, VCC is 64 bits wide, with one
+/// bit per Vector ALU) and then the Scalar ALU will AND the VCC register with
+/// the EXEC register to update the predicates.
+///
+/// For example:
+/// %VCC = V_CMP_GT_F32 %VGPR1, %VGPR2
+/// SI_IF_NZ %VCC
+///   %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0
+/// ELSE
+///   %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0
+/// ENDIF
+///
+/// becomes:
+///
+/// %SGPR0 = S_AND_SAVEEXEC_B64 %VCC  // Save and update the exec mask
+/// %SGPR0 = S_XOR_B64 %SGPR0, %EXEC  // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label0            // This instruction is an
+///                                   // optimization which allows us to
+///                                   // branch if all the bits of
+///                                   // EXEC are zero.
+/// %VGPR0 = V_ADD_F32 %VGPR0, %VGPR0 // Do the IF block of the branch
+///
+/// label0:
+/// %SGPR0 = S_OR_SAVEEXEC_B64 %EXEC  // Set up the exec mask for the ELSE block
+/// %EXEC = S_XOR_B64 %SGPR0, %EXEC   // Clear live bits from saved exec mask
+/// S_CBRANCH_EXECZ label1            // Use our branch optimization
+///                                   // instruction again.
+/// %VGPR0 = V_SUB_F32 %VGPR0, %VGPR0 // Do the ELSE block
+/// label1:
+/// %EXEC = S_OR_B64 %EXEC, %SGPR0    // Re-enable saved exec mask bits
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerControlFlowPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const TargetInstrInfo *TII;
+  std::vector<unsigned> PredicateStack;
+  std::vector<unsigned> UnusedRegisters;
+
+  unsigned allocReg();
+  void freeReg(unsigned Reg);
+
+public:
+  SILowerControlFlowPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI Lower control flow instructions";
+  }
+
+};
+
+} // End anonymous namespace
+
+char SILowerControlFlowPass::ID = 0;
+
+FunctionPass *llvm::createSILowerControlFlowPass(TargetMachine &tm) {
+  return new SILowerControlFlowPass(tm);
+}
+
+bool SILowerControlFlowPass::runOnMachineFunction(MachineFunction &MF) {
+
+  // Find all the unused registers that can be used for the predicate stack.
+ for (TargetRegisterClass::iterator I = AMDGPU::SReg_64RegClass.begin(), + S = AMDGPU::SReg_64RegClass.end(); + I != S; ++I) { + unsigned Reg = *I; + if (!MF.getRegInfo().isPhysRegUsed(Reg)) { + UnusedRegisters.insert(UnusedRegisters.begin(), Reg); + } + } + + for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end(); + BB != BB_E; ++BB) { + MachineBasicBlock &MBB = *BB; + for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I); + I != MBB.end(); I = Next) { + Next = llvm::next(I); + MachineInstr &MI = *I; + unsigned Reg; + switch (MI.getOpcode()) { + default: break; + case AMDGPU::SI_IF_NZ: + Reg = allocReg(); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_AND_SAVEEXEC_B64), + Reg) + .addOperand(MI.getOperand(0)); // VCC + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), + Reg) + .addReg(Reg) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + PredicateStack.push_back(Reg); + break; + + case AMDGPU::ELSE: + Reg = PredicateStack.back(); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_SAVEEXEC_B64), + Reg) + .addReg(Reg); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_XOR_B64), + AMDGPU::EXEC) + .addReg(Reg) + .addReg(AMDGPU::EXEC); + MI.eraseFromParent(); + break; + + case AMDGPU::ENDIF: + Reg = PredicateStack.back(); + PredicateStack.pop_back(); + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_OR_B64), + AMDGPU::EXEC) + .addReg(AMDGPU::EXEC) + .addReg(Reg); + freeReg(Reg); + + if (MF.getInfo<SIMachineFunctionInfo>()->ShaderType == ShaderType::PIXEL && + PredicateStack.empty()) { + // If the exec mask is non-zero, skip the next two instructions + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_CBRANCH_EXECNZ)) + .addImm(3) + .addReg(AMDGPU::EXEC); + + // Exec mask is zero: Export to NULL target... + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::EXP)) + .addImm(0) + .addImm(0x09) // V_008DFC_SQ_EXP_NULL + .addImm(0) + .addImm(1) + .addImm(1) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0) + .addReg(AMDGPU::SREG_LIT_0); + + // ... and terminate wavefront + BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::S_ENDPGM)); + } + MI.eraseFromParent(); + break; + } + } + } + return true; +} + +unsigned SILowerControlFlowPass::allocReg() { + + assert(!UnusedRegisters.empty() && "Ran out of registers for predicate stack"); + unsigned Reg = UnusedRegisters.back(); + UnusedRegisters.pop_back(); + return Reg; +} + +void SILowerControlFlowPass::freeReg(unsigned Reg) { + + UnusedRegisters.push_back(Reg); +} diff --git a/lib/Target/R600/SILowerLiteralConstants.cpp b/lib/Target/R600/SILowerLiteralConstants.cpp new file mode 100644 index 0000000000..c0411e9b4d --- /dev/null +++ b/lib/Target/R600/SILowerLiteralConstants.cpp @@ -0,0 +1,108 @@ +//===-- SILowerLiteralConstants.cpp - Lower intrs using literal constants--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
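An aside before the next file: the S_AND_SAVEEXEC / S_XOR / S_OR sequence
documented at the top of SILowerControlFlow.cpp is easier to follow as plain
bit arithmetic. A minimal C++ model of one if/else region, with uint64_t
values standing in for EXEC, VCC, and the saved SGPR (an illustrative sketch,
not part of the patch):

    #include <cstdint>

    void ifElseModel(uint64_t &exec, uint64_t vcc) {
      uint64_t sgpr = exec;    // S_AND_SAVEEXEC_B64: save exec...
      exec &= vcc;             // ...and keep only the lanes where VCC is set
      sgpr ^= exec;            // S_XOR_B64: sgpr now holds the ELSE lanes
      // ... the IF block runs under `exec` ...
      uint64_t ifLanes = exec; // S_OR_SAVEEXEC_B64: save the IF lanes...
      exec |= sgpr;            // ...and re-enable the whole region
      exec ^= ifLanes;         // S_XOR_B64: only the ELSE lanes stay active
      sgpr = ifLanes;
      // ... the ELSE block runs under `exec` ...
      exec |= sgpr;            // S_OR_B64 (ENDIF): re-enable all region lanes
    }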
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief This pass performs the following transformation on instructions with
+/// literal constants:
+///
+/// %VGPR0 = V_MOV_IMM_I32 1
+///
+/// becomes:
+///
+/// BUNDLE
+///   * %VGPR = V_MOV_B32_e32 SI_LITERAL_CONSTANT
+///   * SI_LOAD_LITERAL 1
+///
+/// The resulting sequence matches exactly how the hardware handles immediate
+/// operands, so this transformation greatly simplifies the code generator.
+///
+/// Only the *_MOV_IMM_* instructions support immediate operands at the
+/// moment, but when support for immediate operands is added to other
+/// instructions, they will be lowered here as well.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+
+using namespace llvm;
+
+namespace {
+
+class SILowerLiteralConstantsPass : public MachineFunctionPass {
+
+private:
+  static char ID;
+  const TargetInstrInfo *TII;
+
+public:
+  SILowerLiteralConstantsPass(TargetMachine &tm) :
+    MachineFunctionPass(ID), TII(tm.getInstrInfo()) { }
+
+  virtual bool runOnMachineFunction(MachineFunction &MF);
+
+  const char *getPassName() const {
+    return "SI Lower literal constants pass";
+  }
+};
+
+} // End anonymous namespace
+
+char SILowerLiteralConstantsPass::ID = 0;
+
+FunctionPass *llvm::createSILowerLiteralConstantsPass(TargetMachine &tm) {
+  return new SILowerLiteralConstantsPass(tm);
+}
+
+bool SILowerLiteralConstantsPass::runOnMachineFunction(MachineFunction &MF) {
+  for (MachineFunction::iterator BB = MF.begin(), BB_E = MF.end();
+       BB != BB_E; ++BB) {
+    MachineBasicBlock &MBB = *BB;
+    for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
+         I != MBB.end(); I = Next) {
+      Next = llvm::next(I);
+      MachineInstr &MI = *I;
+      switch (MI.getOpcode()) {
+      default: break;
+      case AMDGPU::S_MOV_IMM_I32:
+      case AMDGPU::S_MOV_IMM_I64:
+      case AMDGPU::V_MOV_IMM_F32:
+      case AMDGPU::V_MOV_IMM_I32: {
+        unsigned MovOpcode;
+        unsigned LoadLiteralOpcode;
+        MachineOperand LiteralOp = MI.getOperand(1);
+        if (AMDGPU::VReg_32RegClass.contains(MI.getOperand(0).getReg())) {
+          MovOpcode = AMDGPU::V_MOV_B32_e32;
+        } else {
+          MovOpcode = AMDGPU::S_MOV_B32;
+        }
+        if (LiteralOp.isImm()) {
+          LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_I32;
+        } else {
+          LoadLiteralOpcode = AMDGPU::SI_LOAD_LITERAL_F32;
+        }
+        MIBundleBuilder Bundle(MBB, I);
+        Bundle
+          .append(BuildMI(MF, MBB.findDebugLoc(I), TII->get(MovOpcode),
+                          MI.getOperand(0).getReg())
+                  .addReg(AMDGPU::SI_LITERAL_CONSTANT))
+          .append(BuildMI(MF, MBB.findDebugLoc(I),
+                          TII->get(LoadLiteralOpcode))
+                  .addOperand(MI.getOperand(1)));
+        llvm::finalizeBundle(MBB, Bundle.begin());
+        MI.eraseFromParent();
+        break;
+      }
+      }
+    }
+  }
+  return false;
}
diff --git a/lib/Target/R600/SIMachineFunctionInfo.cpp b/lib/Target/R600/SIMachineFunctionInfo.cpp
new file mode 100644
index 0000000000..7e59b42749
--- /dev/null
+++ b/lib/Target/R600/SIMachineFunctionInfo.cpp
@@ -0,0 +1,20 @@
+//===-- SIMachineFunctionInfo.cpp - SI Machine Function Info -------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+// +/// \file +//===----------------------------------------------------------------------===// + + +#include "SIMachineFunctionInfo.h" + +using namespace llvm; + +SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) + : MachineFunctionInfo(), + SPIPSInputAddr(0), + ShaderType(0) + { } diff --git a/lib/Target/R600/SIMachineFunctionInfo.h b/lib/Target/R600/SIMachineFunctionInfo.h new file mode 100644 index 0000000000..47271f5a1e --- /dev/null +++ b/lib/Target/R600/SIMachineFunctionInfo.h @@ -0,0 +1,34 @@ +//===- SIMachineFunctionInfo.h - SIMachineFunctionInfo interface -*- C++ -*-==// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +// +//===----------------------------------------------------------------------===// + + +#ifndef SIMACHINEFUNCTIONINFO_H_ +#define SIMACHINEFUNCTIONINFO_H_ + +#include "llvm/CodeGen/MachineFunction.h" + +namespace llvm { + +/// This class keeps track of the SPI_SP_INPUT_ADDR config register, which +/// tells the hardware which interpolation parameters to load. +class SIMachineFunctionInfo : public MachineFunctionInfo { +public: + SIMachineFunctionInfo(const MachineFunction &MF); + unsigned SPIPSInputAddr; + unsigned ShaderType; +}; + +} // End namespace llvm + + +#endif //_SIMACHINEFUNCTIONINFO_H_ diff --git a/lib/Target/R600/SIRegisterInfo.cpp b/lib/Target/R600/SIRegisterInfo.cpp new file mode 100644 index 0000000000..88275c523f --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.cpp @@ -0,0 +1,48 @@ +//===-- SIRegisterInfo.cpp - SI Register Information ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file +/// \brief SI implementation of the TargetRegisterInfo class. +// +//===----------------------------------------------------------------------===// + + +#include "SIRegisterInfo.h" +#include "AMDGPUTargetMachine.h" + +using namespace llvm; + +SIRegisterInfo::SIRegisterInfo(AMDGPUTargetMachine &tm, + const TargetInstrInfo &tii) +: AMDGPURegisterInfo(tm, tii), + TM(tm), + TII(tii) + { } + +BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { + BitVector Reserved(getNumRegs()); + return Reserved; +} + +const TargetRegisterClass * +SIRegisterInfo::getISARegClass(const TargetRegisterClass * rc) const { + switch (rc->getID()) { + case AMDGPU::GPRF32RegClassID: + return &AMDGPU::VReg_32RegClass; + default: return rc; + } +} + +const TargetRegisterClass * SIRegisterInfo::getCFGStructurizerRegClass( + MVT VT) const { + switch(VT.SimpleTy) { + default: + case MVT::i32: return &AMDGPU::VReg_32RegClass; + } +} diff --git a/lib/Target/R600/SIRegisterInfo.h b/lib/Target/R600/SIRegisterInfo.h new file mode 100644 index 0000000000..40171e4450 --- /dev/null +++ b/lib/Target/R600/SIRegisterInfo.h @@ -0,0 +1,47 @@ +//===-- SIRegisterInfo.h - SI Register Info Interface ----------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Interface definition for SIRegisterInfo
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef SIREGISTERINFO_H_
+#define SIREGISTERINFO_H_
+
+#include "AMDGPURegisterInfo.h"
+
+namespace llvm {
+
+class AMDGPUTargetMachine;
+class TargetInstrInfo;
+
+struct SIRegisterInfo : public AMDGPURegisterInfo {
+  AMDGPUTargetMachine &TM;
+  const TargetInstrInfo &TII;
+
+  SIRegisterInfo(AMDGPUTargetMachine &tm, const TargetInstrInfo &tii);
+
+  virtual BitVector getReservedRegs(const MachineFunction &MF) const;
+
+  /// \param RC is an AMDIL reg class.
+  ///
+  /// \returns the SI register class that is equivalent to \p RC.
+  virtual const TargetRegisterClass *
+    getISARegClass(const TargetRegisterClass *RC) const;
+
+  /// \brief Get the register class of the specified type to use in the
+  /// CFGStructurizer.
+  virtual const TargetRegisterClass * getCFGStructurizerRegClass(MVT VT) const;
+};
+
+} // End namespace llvm
+
+#endif // SIREGISTERINFO_H_
diff --git a/lib/Target/R600/SIRegisterInfo.td b/lib/Target/R600/SIRegisterInfo.td
new file mode 100644
index 0000000000..e52311ab8a
--- /dev/null
+++ b/lib/Target/R600/SIRegisterInfo.td
@@ -0,0 +1,167 @@
+
+let Namespace = "AMDGPU" in {
+  def low : SubRegIndex;
+  def high : SubRegIndex;
+
+  def sub0 : SubRegIndex;
+  def sub1 : SubRegIndex;
+  def sub2 : SubRegIndex;
+  def sub3 : SubRegIndex;
+  def sub4 : SubRegIndex;
+  def sub5 : SubRegIndex;
+  def sub6 : SubRegIndex;
+  def sub7 : SubRegIndex;
+}
+
+class SIReg <string n, bits<16> encoding = 0> : Register<n> {
+  let Namespace = "AMDGPU";
+  let HWEncoding = encoding;
+}
+
+class SI_64 <string n, list<Register> subregs, bits<16> encoding> : RegisterWithSubRegs<n, subregs> {
+  let Namespace = "AMDGPU";
+  let SubRegIndices = [low, high];
+  let HWEncoding = encoding;
+}
+
+class SGPR_32 <bits<16> num, string name> : SIReg<name, num>;
+
+class VGPR_32 <bits<16> num, string name> : SIReg<name, num>;
+
+// Special Registers
+def VCC : SIReg<"VCC", 106>;
+def EXEC_LO : SIReg <"EXEC LO", 126>;
+def EXEC_HI : SIReg <"EXEC HI", 127>;
+def EXEC : SI_64<"EXEC", [EXEC_LO, EXEC_HI], 126>;
+def SCC : SIReg<"SCC", 253>;
+def SREG_LIT_0 : SIReg <"S LIT 0", 128>;
+def SI_LITERAL_CONSTANT : SIReg<"LITERAL CONSTANT", 255>;
+def M0 : SIReg <"M0", 124>;
+
+// Interpolation registers
+def PERSP_SAMPLE_I : SIReg <"PERSP_SAMPLE_I">;
+def PERSP_SAMPLE_J : SIReg <"PERSP_SAMPLE_J">;
+def PERSP_CENTER_I : SIReg <"PERSP_CENTER_I">;
+def PERSP_CENTER_J : SIReg <"PERSP_CENTER_J">;
+def PERSP_CENTROID_I : SIReg <"PERSP_CENTROID_I">;
+def PERSP_CENTROID_J : SIReg <"PERSP_CENTROID_J">;
+def PERSP_I_W : SIReg <"PERSP_I_W">;
+def PERSP_J_W : SIReg <"PERSP_J_W">;
+def PERSP_1_W : SIReg <"PERSP_1_W">;
+def LINEAR_SAMPLE_I : SIReg <"LINEAR_SAMPLE_I">;
+def LINEAR_SAMPLE_J : SIReg <"LINEAR_SAMPLE_J">;
+def LINEAR_CENTER_I : SIReg <"LINEAR_CENTER_I">;
+def LINEAR_CENTER_J : SIReg <"LINEAR_CENTER_J">;
+def LINEAR_CENTROID_I : SIReg <"LINEAR_CENTROID_I">;
+def LINEAR_CENTROID_J : SIReg <"LINEAR_CENTROID_J">;
+def LINE_STIPPLE_TEX_COORD : SIReg <"LINE_STIPPLE_TEX_COORD">;
+def POS_X_FLOAT : SIReg <"POS_X_FLOAT">;
+def POS_Y_FLOAT : SIReg <"POS_Y_FLOAT">;
+def POS_Z_FLOAT : SIReg <"POS_Z_FLOAT">;
+def POS_W_FLOAT : SIReg <"POS_W_FLOAT">;
+def FRONT_FACE : SIReg <"FRONT_FACE">;
+def ANCILLARY : SIReg <"ANCILLARY">;
+def SAMPLE_COVERAGE : SIReg <"SAMPLE_COVERAGE">;
+def POS_FIXED_PT : SIReg
<"POS_FIXED_PT">; + +// SGPR 32-bit registers +foreach Index = 0-101 in { + def SGPR#Index : SGPR_32 <Index, "SGPR"#Index>; +} + +def SGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "SGPR%u", 0, 101))>; + +// SGPR 64-bit registers +def SGPR_64 : RegisterTuples<[low, high], + [(add (decimate SGPR_32, 2)), + (add(decimate (rotl SGPR_32, 1), 2))]>; + +// SGPR 128-bit registers +def SGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w], + [(add (decimate SGPR_32, 4)), + (add (decimate (rotl SGPR_32, 1), 4)), + (add (decimate (rotl SGPR_32, 2), 4)), + (add (decimate (rotl SGPR_32, 3), 4))]>; + +// SGPR 256-bit registers +def SGPR_256 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7], + [(add (decimate SGPR_32, 8)), + (add (decimate (rotl SGPR_32, 1), 8)), + (add (decimate (rotl SGPR_32, 2), 8)), + (add (decimate (rotl SGPR_32, 3), 8)), + (add (decimate (rotl SGPR_32, 4), 8)), + (add (decimate (rotl SGPR_32, 5), 8)), + (add (decimate (rotl SGPR_32, 6), 8)), + (add (decimate (rotl SGPR_32, 7), 8))]>; + +// VGPR 32-bit registers +foreach Index = 0-255 in { + def VGPR#Index : VGPR_32 <Index, "VGPR"#Index>; +} + +def VGPR_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add (sequence "VGPR%u", 0, 255))>; + +// VGPR 64-bit registers +def VGPR_64 : RegisterTuples<[low, high], + [(add (decimate VGPR_32, 2)), + (add (decimate (rotl VGPR_32, 1), 2))]>; + +// VGPR 128-bit registers +def VGPR_128 : RegisterTuples<[sel_x, sel_y, sel_z, sel_w], + [(add (decimate VGPR_32, 4)), + (add (decimate (rotl VGPR_32, 1), 4)), + (add (decimate (rotl VGPR_32, 2), 4)), + (add (decimate (rotl VGPR_32, 3), 4))]>; + +// Register class for all scalar registers (SGPRs + Special Registers) +def SReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add SGPR_32, SREG_LIT_0, M0, EXEC_LO, EXEC_HI) +>; + +def SReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add SGPR_64, VCC, EXEC)>; + +def SReg_1 : RegisterClass<"AMDGPU", [i1], 1, (add VCC, SGPR_64, EXEC)>; + +def SReg_128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128, (add SGPR_128)>; + +def SReg_256 : RegisterClass<"AMDGPU", [v8i32], 256, (add SGPR_256)>; + +// Register class for all vector registers (VGPRs + Interploation Registers) +def VReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, + (add VGPR_32, + PERSP_SAMPLE_I, PERSP_SAMPLE_J, + PERSP_CENTER_I, PERSP_CENTER_J, + PERSP_CENTROID_I, PERSP_CENTROID_J, + PERSP_I_W, PERSP_J_W, PERSP_1_W, + LINEAR_SAMPLE_I, LINEAR_SAMPLE_J, + LINEAR_CENTER_I, LINEAR_CENTER_J, + LINEAR_CENTROID_I, LINEAR_CENTROID_J, + LINE_STIPPLE_TEX_COORD, + POS_X_FLOAT, + POS_Y_FLOAT, + POS_Z_FLOAT, + POS_W_FLOAT, + FRONT_FACE, + ANCILLARY, + SAMPLE_COVERAGE, + POS_FIXED_PT + ) +>; + +def VReg_64 : RegisterClass<"AMDGPU", [i64], 64, (add VGPR_64)>; + +def VReg_128 : RegisterClass<"AMDGPU", [v4f32], 128, (add VGPR_128)>; + +// AllReg_* - A set of all scalar and vector registers of a given width. 
+def AllReg_32 : RegisterClass<"AMDGPU", [f32, i32], 32, (add VReg_32, SReg_32)>;
+
+def AllReg_64 : RegisterClass<"AMDGPU", [f64, i64], 64, (add SReg_64, VReg_64)>;
+
+// Special register classes for predicates and the M0 register
+def SCCReg : RegisterClass<"AMDGPU", [i1], 1, (add SCC)>;
+def VCCReg : RegisterClass<"AMDGPU", [i1], 1, (add VCC)>;
+def EXECReg : RegisterClass<"AMDGPU", [i1], 1, (add EXEC)>;
+def M0Reg : RegisterClass<"AMDGPU", [i32], 32, (add M0)>;
+
diff --git a/lib/Target/R600/SISchedule.td b/lib/Target/R600/SISchedule.td
new file mode 100644
index 0000000000..28b65b8258
--- /dev/null
+++ b/lib/Target/R600/SISchedule.td
@@ -0,0 +1,15 @@
+//===-- SISchedule.td - SI Scheduling definitions ------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// TODO: This is just a placeholder for now.
+//
+//===----------------------------------------------------------------------===//
+
+
+def SI_Itin : ProcessorItineraries <[], [], []>;
diff --git a/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
new file mode 100644
index 0000000000..46b1f18c62
--- /dev/null
+++ b/lib/Target/R600/TargetInfo/AMDGPUTargetInfo.cpp
@@ -0,0 +1,26 @@
+//===-- TargetInfo/AMDGPUTargetInfo.cpp - TargetInfo for AMDGPU -----------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+/// \brief The target for the AMDGPU backend
+Target llvm::TheAMDGPUTarget;
+
+/// \brief Extern function to initialize the targets for the AMDGPU backend
+extern "C" void LLVMInitializeR600TargetInfo() {
+  RegisterTarget<Triple::r600, false>
+    R600(TheAMDGPUTarget, "r600", "AMD GPUs HD2XXX-HD6XXX");
+}
diff --git a/lib/Target/R600/TargetInfo/CMakeLists.txt b/lib/Target/R600/TargetInfo/CMakeLists.txt
new file mode 100644
index 0000000000..3d1584eba3
--- /dev/null
+++ b/lib/Target/R600/TargetInfo/CMakeLists.txt
@@ -0,0 +1,7 @@
+include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. )
+
+add_llvm_library(LLVMR600Info
+  AMDGPUTargetInfo.cpp
+  )
+
+add_dependencies(LLVMR600Info AMDGPUCommonTableGen intrinsics_gen)
diff --git a/lib/Target/R600/TargetInfo/LLVMBuild.txt b/lib/Target/R600/TargetInfo/LLVMBuild.txt
new file mode 100644
index 0000000000..4c6fea4aa0
--- /dev/null
+++ b/lib/Target/R600/TargetInfo/LLVMBuild.txt
@@ -0,0 +1,23 @@
+;===- ./lib/Target/R600/TargetInfo/LLVMBuild.txt --------------*- Conf -*--===;
+;
+; The LLVM Compiler Infrastructure
+;
+; This file is distributed under the University of Illinois Open Source
+; License. See LICENSE.TXT for details.
+;
+;===------------------------------------------------------------------------===;
+;
+; This is an LLVMBuild description file for the components in this subdirectory.
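With the TargetInfo library registered, tools can locate the new backend
through the target registry. A hypothetical lookup, assuming
LLVMInitializeR600TargetInfo above has already been called (a sketch, not
part of the patch):

    #include "llvm/Support/TargetRegistry.h"
    #include <string>

    const llvm::Target *findR600Target() {
      std::string Error;
      // "r600" matches the name registered by RegisterTarget above.
      return llvm::TargetRegistry::lookupTarget("r600", Error);
    }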
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = R600Info +parent = R600 +required_libraries = MC Support +add_to_library_groups = R600 diff --git a/lib/Target/R600/TargetInfo/Makefile b/lib/Target/R600/TargetInfo/Makefile new file mode 100644 index 0000000000..b8ac4e7823 --- /dev/null +++ b/lib/Target/R600/TargetInfo/Makefile @@ -0,0 +1,15 @@ +##===- lib/Target/AMDGPU/TargetInfo/Makefile ----------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## +LEVEL = ../../../.. +LIBRARYNAME = LLVMR600Info + +# Hack: we need to include 'main' target directory to grab private headers +CPPFLAGS = -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/README.txt b/lib/Target/README.txt index 8165f5b8cc..a9aab86abd 100644 --- a/lib/Target/README.txt +++ b/lib/Target/README.txt @@ -262,22 +262,7 @@ unsigned countbits_slow(unsigned v) { c += v & 1; return c; } -unsigned countbits_fast(unsigned v){ - unsigned c; - for (c = 0; v; c++) - v &= v - 1; // clear the least significant bit set - return c; -} -BITBOARD = unsigned long long -int PopCnt(register BITBOARD a) { - register int c=0; - while(a) { - c++; - a &= a - 1; - } - return c; -} unsigned int popcount(unsigned int input) { unsigned int count = 0; for (unsigned int i = 0; i < 4 * 8; i++) diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 467edadc7e..5f2c75ed55 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -279,9 +279,9 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst, MCInst &Res) const { Res.setOpcode(RelaxedOp); } -/// writeNopData - Write optimal nops to the output file for the \p Count -/// bytes. This returns the number of bytes written. It may return 0 if -/// the \p Count is more than the maximum optimal nops. +/// \brief Write a sequence of optimal nops to the output, covering \p Count +/// bytes. +/// \return - true on success, false on failure bool X86AsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { static const uint8_t Nops[10][10] = { // nop diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index e3c22d9c3b..b9d8cf7645 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -155,7 +155,8 @@ def : Proc<"pentium3m", [FeatureSSE1, FeatureSlowBTMem]>; def : Proc<"pentium-m", [FeatureSSE2, FeatureSlowBTMem]>; def : Proc<"pentium4", [FeatureSSE2]>; def : Proc<"pentium4m", [FeatureSSE2, FeatureSlowBTMem]>; -def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem]>; +def : Proc<"x86-64", [FeatureSSE2, Feature64Bit, FeatureSlowBTMem, + FeatureFastUAMem]>; def : Proc<"yonah", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"prescott", [FeatureSSE3, FeatureSlowBTMem]>; def : Proc<"nocona", [FeatureSSE3, FeatureCMPXCHG16B, diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 90bee41e35..34ca24f6d1 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1369,21 +1369,20 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty) const { /// lowering. 
If DstAlign is zero that means it's safe because the destination
/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
/// means there isn't a need to check it against the alignment requirement,
-/// probably because the source does not need to be loaded. If
-/// 'IsZeroVal' is true, that means it's safe to return a
-/// non-scalar-integer type, e.g. empty string source, constant, or loaded
-/// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
-/// constant so it does not need to be loaded.
+/// probably because the source does not need to be loaded. If 'IsMemset' is
+/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+/// source is constant so it does not need to be loaded.
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
 EVT
 X86TargetLowering::getOptimalMemOpType(uint64_t Size,
                                        unsigned DstAlign, unsigned SrcAlign,
-                                       bool IsZeroVal,
+                                       bool IsMemset, bool ZeroMemset,
                                        bool MemcpyStrSrc,
                                        MachineFunction &MF) const {
   const Function *F = MF.getFunction();
-  if (IsZeroVal &&
+  if ((!IsMemset || ZeroMemset) &&
       !F->getFnAttributes().hasAttribute(Attributes::NoImplicitFloat)) {
     if (Size >= 16 &&
         (Subtarget->isUnalignedMemAccessFast() ||
@@ -1412,6 +1411,14 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
   return MVT::i32;
 }
 
+bool X86TargetLowering::isSafeMemOpType(MVT VT) const {
+  if (VT == MVT::f32)
+    return X86ScalarSSEf32;
+  else if (VT == MVT::f64)
+    return X86ScalarSSEf64;
+  return true;
+}
+
 bool
 X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const {
   if (Fast)
@@ -10090,6 +10097,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) {
     return DAG.getNode(X86ISD::PMULUDQ, dl, Op.getValueType(),
                        Op.getOperand(1), Op.getOperand(2));
 
+  // SSE2/AVX2 sub with unsigned saturation intrinsics
+  case Intrinsic::x86_sse2_psubus_b:
+  case Intrinsic::x86_sse2_psubus_w:
+  case Intrinsic::x86_avx2_psubus_b:
+  case Intrinsic::x86_avx2_psubus_w:
+    return DAG.getNode(X86ISD::SUBUS, dl, Op.getValueType(),
+                       Op.getOperand(1), Op.getOperand(2));
+
   // SSE3/AVX horizontal add/sub intrinsics
   case Intrinsic::x86_sse3_hadd_ps:
   case Intrinsic::x86_sse3_hadd_pd:
@@ -11735,6 +11750,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
                                            SmallVectorImpl<SDValue>&Results,
                                            SelectionDAG &DAG) const {
   DebugLoc dl = N->getDebugLoc();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   switch (N->getOpcode()) {
   default:
     llvm_unreachable("Do not know how to custom type legalize this operation!");
@@ -11784,6 +11800,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
     return;
   }
   case ISD::FP_ROUND: {
+    if (!TLI.isTypeLegal(N->getOperand(0).getValueType()))
+        return;
     SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
     Results.push_back(V);
     return;
@@ -11951,6 +11969,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::PSIGN:              return "X86ISD::PSIGN";
   case X86ISD::BLENDV:             return "X86ISD::BLENDV";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
+  case X86ISD::SUBUS:              return "X86ISD::SUBUS";
   case X86ISD::HADD:               return "X86ISD::HADD";
   case X86ISD::HSUB:               return "X86ISD::HSUB";
   case X86ISD::FHADD:              return "X86ISD::FHADD";
@@ -12007,7 +12026,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::OR:                 return "X86ISD::OR";
   case X86ISD::XOR:                return "X86ISD::XOR";
   case X86ISD::AND:                return "X86ISD::AND";
-
case X86ISD::ANDN: return "X86ISD::ANDN"; case X86ISD::BLSI: return "X86ISD::BLSI"; case X86ISD::BLSMSK: return "X86ISD::BLSMSK"; case X86ISD::BLSR: return "X86ISD::BLSR"; @@ -14903,6 +14921,65 @@ static SDValue PerformSELECTCombine(SDNode *N, SelectionDAG &DAG, } } + // Match VSELECTs into subs with unsigned saturation. + if (!DCI.isBeforeLegalize() && + N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC && + // psubus is available in SSE2 and AVX2 for i8 and i16 vectors. + ((Subtarget->hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) || + (Subtarget->hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)))) { + ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get(); + + // Check if one of the arms of the VSELECT is a zero vector. If it's on the + // left side invert the predicate to simplify logic below. + SDValue Other; + if (ISD::isBuildVectorAllZeros(LHS.getNode())) { + Other = RHS; + CC = ISD::getSetCCInverse(CC, true); + } else if (ISD::isBuildVectorAllZeros(RHS.getNode())) { + Other = LHS; + } + + if (Other.getNode() && Other->getNumOperands() == 2 && + DAG.isEqualTo(Other->getOperand(0), Cond.getOperand(0))) { + SDValue OpLHS = Other->getOperand(0), OpRHS = Other->getOperand(1); + SDValue CondRHS = Cond->getOperand(1); + + // Look for a general sub with unsigned saturation first. + // x >= y ? x-y : 0 --> subus x, y + // x > y ? x-y : 0 --> subus x, y + if ((CC == ISD::SETUGE || CC == ISD::SETUGT) && + Other->getOpcode() == ISD::SUB && DAG.isEqualTo(OpRHS, CondRHS)) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + + // If the RHS is a constant we have to reverse the const canonicalization. + // x > C-1 ? x+-C : 0 --> subus x, C + if (CC == ISD::SETUGT && Other->getOpcode() == ISD::ADD && + isSplatVector(CondRHS.getNode()) && isSplatVector(OpRHS.getNode())) { + APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); + if (CondRHS.getConstantOperandVal(0) == -A-1) { + SmallVector<SDValue, 32> V(VT.getVectorNumElements(), + DAG.getConstant(-A, VT.getScalarType())); + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, + DAG.getNode(ISD::BUILD_VECTOR, DL, VT, + V.data(), V.size())); + } + } + + // Another special case: If C was a sign bit, the sub has been + // canonicalized into a xor. + // FIXME: Would it be better to use ComputeMaskedBits to determine whether + // it's safe to decanonicalize the xor? + // x s< 0 ? x^C : 0 --> subus x, C + if (CC == ISD::SETLT && Other->getOpcode() == ISD::XOR && + ISD::isBuildVectorAllZeros(CondRHS.getNode()) && + isSplatVector(OpRHS.getNode())) { + APInt A = cast<ConstantSDNode>(OpRHS.getOperand(0))->getAPIntValue(); + if (A.isSignBit()) + return DAG.getNode(X86ISD::SUBUS, DL, VT, OpLHS, OpRHS); + } + } + } + // If we know that this node is legal then we know that it is going to be // matched by one of the SSE/AVX BLEND instructions. These instructions only // depend on the highest bit in each word. 
Try to use SimplifyDemandedBits
@@ -15554,7 +15631,7 @@ static SDValue PerformAndCombine(SDNode *N, SelectionDAG &DAG,
 
   EVT VT = N->getValueType(0);
 
-  // Create ANDN, BLSI, and BLSR instructions
+  // Create BLSI and BLSR instructions
   // BLSI is X & (-X)
   // BLSR is X & (X-1)
   if (Subtarget->hasBMI() && (VT == MVT::i32 || VT == MVT::i64)) {
     SDValue N0 = N->getOperand(0);
     SDValue N1 = N->getOperand(1);
     DebugLoc DL = N->getDebugLoc();
 
-    // Check LHS for not
-    if (N0.getOpcode() == ISD::XOR && isAllOnes(N0.getOperand(1)))
-      return DAG.getNode(X86ISD::ANDN, DL, VT, N0.getOperand(0), N1);
-    // Check RHS for not
-    if (N1.getOpcode() == ISD::XOR && isAllOnes(N1.getOperand(1)))
-      return DAG.getNode(X86ISD::ANDN, DL, VT, N1.getOperand(0), N0);
-
     // Check LHS for neg
     if (N0.getOpcode() == ISD::SUB && N0.getOperand(1) == N1 &&
         isZero(N0.getOperand(0)))
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index a515be23ef..c51460bc42 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -182,6 +182,9 @@ namespace llvm {
       /// BLENDI - Blend where the selector is an immediate.
       BLENDI,
 
+      // SUBUS - Integer sub with unsigned saturation.
+      SUBUS,
+
       /// HADD - Integer horizontal add.
       HADD,
 
@@ -270,8 +273,6 @@ namespace llvm {
       ADD, SUB, ADC, SBB, SMUL,
       INC, DEC, OR, XOR, AND,
 
-      ANDN, // ANDN - Bitwise AND NOT with FLAGS results.
-
       BLSI,   // BLSI - Extract lowest set isolated bit
       BLSMSK, // BLSMSK - Get mask up to lowest set bit
       BLSR,   // BLSR - Reset lowest set bit
@@ -494,18 +495,25 @@ namespace llvm {
     /// lowering. If DstAlign is zero that means it's safe because the
     /// destination alignment can satisfy any constraint. Similarly if SrcAlign
     /// is zero it means there isn't a need to check it against the alignment
     /// requirement,
-    /// probably because the source does not need to be loaded. If
-    /// 'IsZeroVal' is true, that means it's safe to return a
-    /// non-scalar-integer type, e.g. empty string source, constant, or loaded
-    /// from memory. 'MemcpyStrSrc' indicates whether the memcpy source is
-    /// constant so it does not need to be loaded.
+    /// probably because the source does not need to be loaded. If 'IsMemset' is
+    /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
+    /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
+    /// source is constant so it does not need to be loaded.
     /// It returns EVT::Other if the type should be determined using generic
     /// target-independent logic.
     virtual EVT
-    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
-                        bool IsZeroVal, bool MemcpyStrSrc,
+    getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
+                        bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
                         MachineFunction &MF) const;
 
+    /// isSafeMemOpType - Returns true if it's safe to use load / store of the
+    /// specified type to expand memcpy / memset inline. This is mostly true
+    /// for all types except for some special cases. For example, on X86
+    /// targets without SSE2 f64 load / store are done with fldl / fstpl which
+    /// also does type conversion. Note the specified type doesn't have to be
+    /// legal as the hook is used before type legalization.
+    virtual bool isSafeMemOpType(MVT VT) const;
+
     /// allowsUnalignedMemoryAccesses - Returns true if the target allows
     /// unaligned memory accesses of the specified type. Returns whether it
     /// is "fast" by reference in the second argument.
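For context on the new SUBUS node: the VSELECT combine added above turns the
standard C idiom for unsigned saturating subtraction into a single
PSUBUS instruction per vector. A scalar reference of the idiom the combine
looks for, "x >= y ? x - y : 0" (a sketch; the backend matches the
vectorized form of this loop):

    #include <cstddef>
    #include <cstdint>

    // One uint16_t lane; a loop over lanes like this is what becomes
    // PSUBUSW / VPSUBUSW after vectorization.
    static inline uint16_t subus16(uint16_t x, uint16_t y) {
      return x >= y ? x - y : 0;
    }

    void saturatingSub(const uint16_t *a, const uint16_t *b, uint16_t *out,
                       size_t n) {
      for (size_t i = 0; i != n; ++i)
        out[i] = subus16(a[i], b[i]);
    }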
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f790611b8f..478c42d657 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -1204,12 +1204,12 @@ multiclass bmi_andn<string mnemonic, RegisterClass RC, X86MemOperand x86memop, PatFrag ld_frag> { def rr : I<0xF2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, EFLAGS, (X86andn_flag RC:$src1, RC:$src2))], + [(set RC:$dst, EFLAGS, (X86and_flag (not RC:$src1), RC:$src2))], IIC_BIN_NONMEM>; def rm : I<0xF2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, EFLAGS, - (X86andn_flag RC:$src1, (ld_frag addr:$src2)))], IIC_BIN_MEM>; + (X86and_flag (not RC:$src1), (ld_frag addr:$src2)))], IIC_BIN_MEM>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { @@ -1217,6 +1217,17 @@ let Predicates = [HasBMI], Defs = [EFLAGS] in { defm ANDN64 : bmi_andn<"andn{q}", GR64, i64mem, loadi64>, T8, VEX_4V, VEX_W; } +let Predicates = [HasBMI] in { + def : Pat<(and (not GR32:$src1), GR32:$src2), + (ANDN32rr GR32:$src1, GR32:$src2)>; + def : Pat<(and (not GR64:$src1), GR64:$src2), + (ANDN64rr GR64:$src1, GR64:$src2)>; + def : Pat<(and (not GR32:$src1), (loadi32 addr:$src2)), + (ANDN32rm GR32:$src1, addr:$src2)>; + def : Pat<(and (not GR64:$src1), (loadi64 addr:$src2)), + (ANDN64rm GR64:$src1, addr:$src2)>; +} + //===----------------------------------------------------------------------===// // MULX Instruction // diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 09ab995166..7d16d2741d 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -128,6 +128,7 @@ def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>, SDTCisSameAs<2, 1>]>; +def X86subus : SDNode<"X86ISD::SUBUS", SDTIntBinOp>; def X86ptest : SDNode<"X86ISD::PTEST", SDTX86CmpPTest>; def X86testp : SDNode<"X86ISD::TESTP", SDTX86CmpPTest>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 4c61b32cac..d02e12fc4f 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -562,7 +562,23 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE }, { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE }, - // BMI/BMI2 foldable instructions + // BMI/BMI2/LZCNT/POPCNT foldable instructions + { X86::BEXTR32rr, X86::BEXTR32rm, 0 }, + { X86::BEXTR64rr, X86::BEXTR64rm, 0 }, + { X86::BLSI32rr, X86::BLSI32rm, 0 }, + { X86::BLSI64rr, X86::BLSI64rm, 0 }, + { X86::BLSMSK32rr, X86::BLSMSK32rm, 0 }, + { X86::BLSMSK64rr, X86::BLSMSK64rm, 0 }, + { X86::BLSR32rr, X86::BLSR32rm, 0 }, + { X86::BLSR64rr, X86::BLSR64rm, 0 }, + { X86::BZHI32rr, X86::BZHI32rm, 0 }, + { X86::BZHI64rr, X86::BZHI64rm, 0 }, + { X86::LZCNT16rr, X86::LZCNT16rm, 0 }, + { X86::LZCNT32rr, X86::LZCNT32rm, 0 }, + { X86::LZCNT64rr, X86::LZCNT64rm, 0 }, + { X86::POPCNT16rr, X86::POPCNT16rm, 0 }, + { X86::POPCNT32rr, X86::POPCNT32rm, 0 }, + { X86::POPCNT64rr, X86::POPCNT64rm, 0 }, { X86::RORX32ri, X86::RORX32mi, 0 }, { X86::RORX64ri, X86::RORX64mi, 0 }, { X86::SARX32rr, X86::SARX32rm, 0 }, @@ -571,6 +587,9 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::SHRX64rr, X86::SHRX64rm, 0 }, { X86::SHLX32rr, 
X86::SHLX32rm, 0 }, { X86::SHLX64rr, X86::SHLX64rm, 0 }, + { X86::TZCNT16rr, X86::TZCNT16rm, 0 }, + { X86::TZCNT32rr, X86::TZCNT32rm, 0 }, + { X86::TZCNT64rr, X86::TZCNT64rm, 0 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl1); i != e; ++i) { @@ -1156,8 +1175,14 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::VFMSUBADDPD4rrY, X86::VFMSUBADDPD4mrY, TB_ALIGN_32 }, // BMI/BMI2 foldable instructions + { X86::ANDN32rr, X86::ANDN32rm, 0 }, + { X86::ANDN64rr, X86::ANDN64rm, 0 }, { X86::MULX32rr, X86::MULX32rm, 0 }, { X86::MULX64rr, X86::MULX64rm, 0 }, + { X86::PDEP32rr, X86::PDEP32rm, 0 }, + { X86::PDEP64rr, X86::PDEP64rm, 0 }, + { X86::PEXT32rr, X86::PEXT32rm, 0 }, + { X86::PEXT64rr, X86::PEXT64rm, 0 }, }; for (unsigned i = 0, e = array_lengthof(OpTbl2); i != e; ++i) { @@ -3152,19 +3177,15 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::SUB8ri: case X86::SUB64rr: case X86::SUB32rr: case X86::SUB16rr: case X86::SUB8rr: case X86::SUB64rm: case X86::SUB32rm: case X86::SUB16rm: case X86::SUB8rm: - case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: - case X86::DEC64m: case X86::DEC32m: case X86::DEC16m: case X86::DEC8m: + case X86::DEC64r: case X86::DEC32r: case X86::DEC16r: case X86::DEC8r: case X86::DEC64_32r: case X86::DEC64_16r: - case X86::DEC64_32m: case X86::DEC64_16m: case X86::ADD64ri32: case X86::ADD64ri8: case X86::ADD32ri: case X86::ADD32ri8: case X86::ADD16ri: case X86::ADD16ri8: case X86::ADD8ri: case X86::ADD64rr: case X86::ADD32rr: case X86::ADD16rr: case X86::ADD8rr: case X86::ADD64rm: case X86::ADD32rm: case X86::ADD16rm: case X86::ADD8rm: - case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: - case X86::INC64m: case X86::INC32m: case X86::INC16m: case X86::INC8m: + case X86::INC64r: case X86::INC32r: case X86::INC16r: case X86::INC8r: case X86::INC64_32r: case X86::INC64_16r: - case X86::INC64_32m: case X86::INC64_16m: case X86::AND64ri32: case X86::AND64ri8: case X86::AND32ri: case X86::AND32ri8: case X86::AND16ri: case X86::AND16ri8: case X86::AND8ri: case X86::AND64rr: case X86::AND32rr: @@ -3180,6 +3201,8 @@ inline static bool isDefConvertible(MachineInstr *MI) { case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::ANDN32rr: case X86::ANDN32rm: + case X86::ANDN64rr: case X86::ANDN64rm: return true; } } diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index cdf1c8935f..cf7acfb81d 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -247,9 +247,9 @@ def X86and_flag : SDNode<"X86ISD::AND", SDTBinaryArithWithFlags, [SDNPCommutative]>; def X86andn_flag : SDNode<"X86ISD::ANDN", SDTBinaryArithWithFlags>; -def X86blsi_flag : SDNode<"X86ISD::BLSI", SDTUnaryArithWithFlags>; -def X86blsmsk_flag : SDNode<"X86ISD::BLSMSK", SDTUnaryArithWithFlags>; -def X86blsr_flag : SDNode<"X86ISD::BLSR", SDTUnaryArithWithFlags>; +def X86blsi : SDNode<"X86ISD::BLSI", SDTIntUnaryOp>; +def X86blsmsk : SDNode<"X86ISD::BLSMSK", SDTIntUnaryOp>; +def X86blsr : SDNode<"X86ISD::BLSR", SDTIntUnaryOp>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; @@ -1605,26 +1605,26 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, PatFrag ld_frag> { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, EFLAGS, (OpNode RC:$src))]>, T8, VEX_4V; + [(set RC:$dst, (OpNode RC:$src)), 
(implicit EFLAGS)]>, T8, VEX_4V; def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), - [(set RC:$dst, EFLAGS, (OpNode (ld_frag addr:$src)))]>, + [(set RC:$dst, (OpNode (ld_frag addr:$src))), (implicit EFLAGS)]>, T8, VEX_4V; } let Predicates = [HasBMI], Defs = [EFLAGS] in { defm BLSR32 : bmi_bls<"blsr{l}", MRM1r, MRM1m, GR32, i32mem, - X86blsr_flag, loadi32>; + X86blsr, loadi32>; defm BLSR64 : bmi_bls<"blsr{q}", MRM1r, MRM1m, GR64, i64mem, - X86blsr_flag, loadi64>, VEX_W; + X86blsr, loadi64>, VEX_W; defm BLSMSK32 : bmi_bls<"blsmsk{l}", MRM2r, MRM2m, GR32, i32mem, - X86blsmsk_flag, loadi32>; + X86blsmsk, loadi32>; defm BLSMSK64 : bmi_bls<"blsmsk{q}", MRM2r, MRM2m, GR64, i64mem, - X86blsmsk_flag, loadi64>, VEX_W; + X86blsmsk, loadi64>, VEX_W; defm BLSI32 : bmi_bls<"blsi{l}", MRM3r, MRM3m, GR32, i32mem, - X86blsi_flag, loadi32>; + X86blsi, loadi32>; defm BLSI64 : bmi_bls<"blsi{q}", MRM3r, MRM3m, GR64, i64mem, - X86blsi_flag, loadi64>, VEX_W; + X86blsi, loadi64>, VEX_W; } multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 1912a936ce..54032fe97f 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -3724,6 +3724,12 @@ defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64, i128mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V; +defm VPSUBUSB : PDI_binop_rm<0xD8, "vpsubusb", X86subus, v16i8, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V; +defm VPSUBUSW : PDI_binop_rm<0xD9, "vpsubusw", X86subus, v8i16, VR128, + memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V; defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V; @@ -3735,12 +3741,6 @@ defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, defm VPSUBSW : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_sse2_psubs_w, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSB : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_sse2_psubus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; -defm VPSUBUSW : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_sse2_psubus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V; defm VPADDSB : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_sse2_padds_b, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V; @@ -3804,6 +3804,12 @@ defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64, i256mem, SSE_INTALUQ_ITINS_P, 0, 0>, VEX_4V, VEX_L; +defm VPSUBUSBY : PDI_binop_rm<0xD8, "vpsubusb", X86subus, v32i8, VR256, + memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V, VEX_L; +defm VPSUBUSWY : PDI_binop_rm<0xD9, "vpsubusw", X86subus, v16i16, VR256, + memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, + VEX_4V, VEX_L; defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32, VR256, memopv4i64, i256mem, SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L; @@ -3815,12 +3821,6 @@ defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b, defm VPSUBSWY : PDI_binop_rm_int<0xE9, "vpsubsw" , int_x86_avx2_psubs_w, VR256, memopv4i64, i256mem, SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; 
-defm VPSUBUSBY : PDI_binop_rm_int<0xD8, "vpsubusb", int_x86_avx2_psubus_b, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; -defm VPSUBUSWY : PDI_binop_rm_int<0xD9, "vpsubusw", int_x86_avx2_psubus_w, - VR256, memopv4i64, i256mem, - SSE_INTALU_ITINS_P, 0, 0>, VEX_4V, VEX_L; defm VPADDSBY : PDI_binop_rm_int<0xEC, "vpaddsb" , int_x86_avx2_padds_b, VR256, memopv4i64, i256mem, SSE_INTALU_ITINS_P, 1, 0>, VEX_4V, VEX_L; @@ -3884,6 +3884,10 @@ defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P>; defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64, i128mem, SSE_INTALUQ_ITINS_P>; +defm PSUBUSB : PDI_binop_rm<0xD8, "psubusb", X86subus, v16i8, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; +defm PSUBUSW : PDI_binop_rm<0xD9, "psubusw", X86subus, v8i16, VR128, memopv2i64, + i128mem, SSE_INTALU_ITINS_P>; defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128, memopv2i64, i128mem, SSE_INTMUL_ITINS_P, 1>; @@ -3894,12 +3898,6 @@ defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b, defm PSUBSW : PDI_binop_rm_int<0xE9, "psubsw" , int_x86_sse2_psubs_w, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P>; -defm PSUBUSB : PDI_binop_rm_int<0xD8, "psubusb", int_x86_sse2_psubus_b, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; -defm PSUBUSW : PDI_binop_rm_int<0xD9, "psubusw", int_x86_sse2_psubus_w, - VR128, memopv2i64, i128mem, - SSE_INTALU_ITINS_P>; defm PADDSB : PDI_binop_rm_int<0xEC, "paddsb" , int_x86_sse2_padds_b, VR128, memopv2i64, i128mem, SSE_INTALU_ITINS_P, 1>; diff --git a/lib/Target/XCore/CMakeLists.txt b/lib/Target/XCore/CMakeLists.txt index ca94f03a64..099ad390d2 100644 --- a/lib/Target/XCore/CMakeLists.txt +++ b/lib/Target/XCore/CMakeLists.txt @@ -2,6 +2,7 @@ set(LLVM_TARGET_DEFINITIONS XCore.td) tablegen(LLVM XCoreGenRegisterInfo.inc -gen-register-info) tablegen(LLVM XCoreGenInstrInfo.inc -gen-instr-info) +tablegen(LLVM XCoreGenDisassemblerTables.inc -gen-disassembler) tablegen(LLVM XCoreGenAsmWriter.inc -gen-asm-writer) tablegen(LLVM XCoreGenDAGISel.inc -gen-dag-isel) tablegen(LLVM XCoreGenCallingConv.inc -gen-callingconv) @@ -15,6 +16,7 @@ add_llvm_target(XCoreCodeGen XCoreISelDAGToDAG.cpp XCoreISelLowering.cpp XCoreMachineFunctionInfo.cpp + XCoreMCInstLower.cpp XCoreRegisterInfo.cpp XCoreSubtarget.cpp XCoreTargetMachine.cpp @@ -24,5 +26,7 @@ add_llvm_target(XCoreCodeGen add_dependencies(LLVMXCoreCodeGen intrinsics_gen) +add_subdirectory(Disassembler) +add_subdirectory(InstPrinter) add_subdirectory(TargetInfo) add_subdirectory(MCTargetDesc) diff --git a/lib/Target/XCore/Disassembler/CMakeLists.txt b/lib/Target/XCore/Disassembler/CMakeLists.txt new file mode 100644 index 0000000000..cdc5d993b8 --- /dev/null +++ b/lib/Target/XCore/Disassembler/CMakeLists.txt @@ -0,0 +1,5 @@ +add_llvm_library(LLVMXCoreDisassembler + XCoreDisassembler.cpp + ) + +add_dependencies(LLVMXCoreDisassembler XCoreCommonTableGen) diff --git a/lib/Target/XCore/Disassembler/LLVMBuild.txt b/lib/Target/XCore/Disassembler/LLVMBuild.txt new file mode 100644 index 0000000000..028de2cb34 --- /dev/null +++ b/lib/Target/XCore/Disassembler/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/XCore/Disassembler/LLVMBuild.txt ------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. 
+; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = XCoreDisassembler +parent = XCore +required_libraries = MC Support XCoreInfo +add_to_library_groups = XCore diff --git a/lib/Target/XCore/Disassembler/Makefile b/lib/Target/XCore/Disassembler/Makefile new file mode 100644 index 0000000000..4caffdd1da --- /dev/null +++ b/lib/Target/XCore/Disassembler/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/XCore/Disassembler/Makefile --------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMXCoreDisassembler + +# Hack: we need to include 'main' XCore target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp new file mode 100644 index 0000000000..094f18ceee --- /dev/null +++ b/lib/Target/XCore/Disassembler/XCoreDisassembler.cpp @@ -0,0 +1,324 @@ +//===- XCoreDisassembler.cpp - Disassembler for XCore -----------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file is part of the XCore Disassembler. +/// +//===----------------------------------------------------------------------===// + +#include "XCore.h" +#include "XCoreRegisterInfo.h" +#include "llvm/MC/MCDisassembler.h" +#include "llvm/MC/MCFixedLenDisassembler.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/MemoryObject.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +typedef MCDisassembler::DecodeStatus DecodeStatus; + +namespace { + +/// \brief A disassembler class for XCore. +class XCoreDisassembler : public MCDisassembler { + const MCRegisterInfo *RegInfo; +public: + XCoreDisassembler(const MCSubtargetInfo &STI, const MCRegisterInfo *Info) : + MCDisassembler(STI), RegInfo(Info) {} + + /// \brief See MCDisassembler. + virtual DecodeStatus getInstruction(MCInst &instr, + uint64_t &size, + const MemoryObject ®ion, + uint64_t address, + raw_ostream &vStream, + raw_ostream &cStream) const; + + const MCRegisterInfo *getRegInfo() const { return RegInfo; } +}; +} + +static bool readInstruction16(const MemoryObject ®ion, + uint64_t address, + uint64_t &size, + uint16_t &insn) { + uint8_t Bytes[4]; + + // We want to read exactly 2 Bytes of data. + if (region.readBytes(address, 2, Bytes, NULL) == -1) { + size = 0; + return false; + } + // Encoded as a little-endian 16-bit word in the stream. + insn = (Bytes[0] << 0) | (Bytes[1] << 8); + return true; +} + +static bool readInstruction32(const MemoryObject ®ion, + uint64_t address, + uint64_t &size, + uint32_t &insn) { + uint8_t Bytes[4]; + + // We want to read exactly 4 Bytes of data. 
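+  // For example, the bytes {0x78, 0x56, 0x34, 0x12} assemble to the 32-bit
+  // word 0x12345678 below: least-significant byte first.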
+ if (region.readBytes(address, 4, Bytes, NULL) == -1) { + size = 0; + return false; + } + // Encoded as a little-endian 32-bit word in the stream. + insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | + (Bytes[3] << 24); + return true; +} + +static unsigned getReg(const void *D, unsigned RC, unsigned RegNo) { + const XCoreDisassembler *Dis = static_cast<const XCoreDisassembler*>(D); + return *(Dis->getRegInfo()->getRegClass(RC).begin() + RegNo); +} + +static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder); + +static DecodeStatus Decode2RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeR2RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus Decode2RSrcDstInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeRUSInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeRUSBitpInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeRUSSrcDstBitpInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeL2RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +static DecodeStatus DecodeLR2RInstruction(MCInst &Inst, + unsigned Insn, + uint64_t Address, + const void *Decoder); + +#include "XCoreGenDisassemblerTables.inc" + +static DecodeStatus DecodeGRRegsRegisterClass(MCInst &Inst, + unsigned RegNo, + uint64_t Address, + const void *Decoder) +{ + if (RegNo > 11) + return MCDisassembler::Fail; + unsigned Reg = getReg(Decoder, XCore::GRRegsRegClassID, RegNo); + Inst.addOperand(MCOperand::CreateReg(Reg)); + return MCDisassembler::Success; +} + +static DecodeStatus DecodeBitpOperand(MCInst &Inst, unsigned Val, + uint64_t Address, const void *Decoder) { + if (Val > 11) + return MCDisassembler::Fail; + static unsigned Values[] = { + 32 /*bpw*/, 1, 2, 3, 4, 5, 6, 7, 8, 16, 24, 32 + }; + Inst.addOperand(MCOperand::CreateImm(Values[Val])); + return MCDisassembler::Success; +} + +static DecodeStatus +Decode2OpInstruction(unsigned Insn, unsigned &Op1, unsigned &Op2) { + unsigned Combined = fieldFromInstruction(Insn, 6, 5) + + fieldFromInstruction(Insn, 5, 1) * 5 - 27; + if (Combined >= 9) + return MCDisassembler::Fail; + + unsigned Op1High = Combined % 3; + unsigned Op2High = Combined / 3; + Op1 = (Op1High << 2) | fieldFromInstruction(Insn, 2, 2); + Op2 = (Op2High << 2) | fieldFromInstruction(Insn, 0, 2); + return MCDisassembler::Success; +} + +static DecodeStatus +Decode2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op2, Op1); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, 
Decoder); + } + return S; +} + +static DecodeStatus +Decode2RSrcDstInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeRUSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + Inst.addOperand(MCOperand::CreateImm(Op2)); + } + return S; +} + +static DecodeStatus +DecodeRUSBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeBitpOperand(Inst, Op2, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeRUSSrcDstBitpInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(Insn, Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeBitpOperand(Inst, Op2, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeL2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16), + Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + } + return S; +} + +static DecodeStatus +DecodeLR2RInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, + const void *Decoder) { + unsigned Op1, Op2; + DecodeStatus S = Decode2OpInstruction(fieldFromInstruction(Insn, 0, 16), + Op1, Op2); + if (S == MCDisassembler::Success) { + DecodeGRRegsRegisterClass(Inst, Op2, Address, Decoder); + DecodeGRRegsRegisterClass(Inst, Op1, Address, Decoder); + } + return S; +} + +MCDisassembler::DecodeStatus +XCoreDisassembler::getInstruction(MCInst &instr, + uint64_t &Size, + const MemoryObject &Region, + uint64_t Address, + raw_ostream &vStream, + raw_ostream &cStream) const { + uint16_t insn16; + + if (!readInstruction16(Region, Address, Size, insn16)) { + return Fail; + } + + // Calling the auto-generated decoder function. + DecodeStatus Result = decodeInstruction(DecoderTable16, instr, insn16, + Address, this, STI); + if (Result != Fail) { + Size = 2; + return Result; + } + + uint32_t insn32; + + if (!readInstruction32(Region, Address, Size, insn32)) { + return Fail; + } + + // Calling the auto-generated decoder function. + Result = decodeInstruction(DecoderTable32, instr, insn32, Address, this, STI); + if (Result != Fail) { + Size = 4; + return Result; + } + + return Fail; +} + +namespace llvm { + extern Target TheXCoreTarget; +} + +static MCDisassembler *createXCoreDisassembler(const Target &T, + const MCSubtargetInfo &STI) { + return new XCoreDisassembler(STI, T.createMCRegInfo("")); +} + +extern "C" void LLVMInitializeXCoreDisassembler() { + // Register the disassembler. 
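+ // Once registered here, the XCore disassembler becomes reachable through the generic TargetRegistry lookup that MC tools such as llvm-objdump use; nothing target-specific is needed on the tool side.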
+ TargetRegistry::RegisterMCDisassembler(TheXCoreTarget, + createXCoreDisassembler); } diff --git a/lib/Target/XCore/InstPrinter/CMakeLists.txt b/lib/Target/XCore/InstPrinter/CMakeLists.txt new file mode 100644 index 0000000000..930e733cd7 --- /dev/null +++ b/lib/Target/XCore/InstPrinter/CMakeLists.txt @@ -0,0 +1,7 @@ +include_directories( ${CMAKE_CURRENT_BINARY_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/.. ) + +add_llvm_library(LLVMXCoreAsmPrinter + XCoreInstPrinter.cpp + ) + +add_dependencies(LLVMXCoreAsmPrinter XCoreCommonTableGen) diff --git a/lib/Target/XCore/InstPrinter/LLVMBuild.txt b/lib/Target/XCore/InstPrinter/LLVMBuild.txt new file mode 100644 index 0000000000..8750bc7ace --- /dev/null +++ b/lib/Target/XCore/InstPrinter/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Target/XCore/InstPrinter/LLVMBuild.txt -------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = XCoreAsmPrinter +parent = XCore +required_libraries = MC Support +add_to_library_groups = XCore diff --git a/lib/Target/XCore/InstPrinter/Makefile b/lib/Target/XCore/InstPrinter/Makefile new file mode 100644 index 0000000000..1c1c61299c --- /dev/null +++ b/lib/Target/XCore/InstPrinter/Makefile @@ -0,0 +1,16 @@ +##===- lib/Target/XCore/InstPrinter/Makefile ---------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../../.. +LIBRARYNAME = LLVMXCoreAsmPrinter + +# Hack: we need to include 'main' xcore target directory to grab private headers +CPP.Flags += -I$(PROJ_OBJ_DIR)/.. -I$(PROJ_SRC_DIR)/.. + +include $(LEVEL)/Makefile.common diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp new file mode 100644 index 0000000000..1592351c38 --- /dev/null +++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp @@ -0,0 +1,97 @@ +//===-- XCoreInstPrinter.cpp - Convert XCore MCInst to assembly syntax ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This class prints an XCore MCInst to a .s file.
+// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asm-printer" +#include "XCoreInstPrinter.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCSymbol.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#include "XCoreGenAsmWriter.inc" + +void XCoreInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { + OS << StringRef(getRegisterName(RegNo)).lower(); +} + +void XCoreInstPrinter::printInst(const MCInst *MI, raw_ostream &O, + StringRef Annot) { + printInstruction(MI, O); + printAnnotation(O, Annot); +} + +void XCoreInstPrinter:: +printInlineJT(const MCInst *MI, int opNum, raw_ostream &O) { + report_fatal_error("can't handle InlineJT"); +} + +void XCoreInstPrinter:: +printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O) { + report_fatal_error("can't handle InlineJT32"); +} + +static void printExpr(const MCExpr *Expr, raw_ostream &OS) { + int Offset = 0; + const MCSymbolRefExpr *SRE; + + if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) { + SRE = dyn_cast<MCSymbolRefExpr>(BE->getLHS()); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); + assert(SRE && CE && "Binary expression must be sym+const."); + Offset = CE->getValue(); + } else { + SRE = dyn_cast<MCSymbolRefExpr>(Expr); + assert(SRE && "Unexpected MCExpr type."); + } + assert(SRE->getKind() == MCSymbolRefExpr::VK_None); + + OS << SRE->getSymbol(); + + if (Offset) { + if (Offset > 0) + OS << '+'; + OS << Offset; + } +} + +void XCoreInstPrinter:: +printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { + const MCOperand &Op = MI->getOperand(OpNo); + if (Op.isReg()) { + printRegName(O, Op.getReg()); + return; + } + + if (Op.isImm()) { + O << Op.getImm(); + return; + } + + assert(Op.isExpr() && "unknown operand kind in printOperand"); + printExpr(Op.getExpr(), O); +} + +void XCoreInstPrinter:: +printMemOperand(const MCInst *MI, int opNum, raw_ostream &O) { + printOperand(MI, opNum, O); + + if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0) + return; + + O << "+"; + printOperand(MI, opNum+1, O); +} diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h new file mode 100644 index 0000000000..772c515b5c --- /dev/null +++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h @@ -0,0 +1,44 @@ +//== XCoreInstPrinter.h - Convert XCore MCInst to assembly syntax -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains the declaration of the XCoreInstPrinter class, +/// which is used to print XCore MCInst to a .s file. +/// +//===----------------------------------------------------------------------===// + +#ifndef XCOREINSTPRINTER_H +#define XCOREINSTPRINTER_H +#include "llvm/MC/MCInstPrinter.h" + +namespace llvm { + +class TargetMachine; + +class XCoreInstPrinter : public MCInstPrinter { +public: + XCoreInstPrinter(const MCAsmInfo &MAI, const MCInstrInfo &MII, + const MCRegisterInfo &MRI) + : MCInstPrinter(MAI, MII, MRI) {} + + // Autogenerated by tblgen. 
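+  // (Their definitions live in XCoreGenAsmWriter.inc, which the AsmWriter
+  // TableGen backend generates and XCoreInstPrinter.cpp includes.)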
+ void printInstruction(const MCInst *MI, raw_ostream &O); + static const char *getRegisterName(unsigned RegNo); + + virtual void printRegName(raw_ostream &OS, unsigned RegNo) const; + virtual void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot); +private: + void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O); + void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O); + void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O); + void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O); +}; +} // end namespace llvm + +#endif diff --git a/lib/Target/XCore/LLVMBuild.txt b/lib/Target/XCore/LLVMBuild.txt index 53b4a9e3f5..59e64ad085 100644 --- a/lib/Target/XCore/LLVMBuild.txt +++ b/lib/Target/XCore/LLVMBuild.txt @@ -16,13 +16,14 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = MCTargetDesc TargetInfo +subdirectories = Disassembler InstPrinter MCTargetDesc TargetInfo [component_0] type = TargetGroup name = XCore parent = Target has_asmprinter = 1 +has_disassembler = 1 [component_1] type = Library diff --git a/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt b/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt index a80c939b43..8213f9e428 100644 --- a/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt +++ b/lib/Target/XCore/MCTargetDesc/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = XCoreDesc parent = XCore -required_libraries = MC XCoreInfo +required_libraries = MC XCoreAsmPrinter XCoreInfo add_to_library_groups = XCore diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index bbfdd4356f..048f9ebe05 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -13,6 +13,7 @@ #include "XCoreMCTargetDesc.h" #include "XCoreMCAsmInfo.h" +#include "InstPrinter/XCoreInstPrinter.h" #include "llvm/MC/MCCodeGenInfo.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -69,6 +70,15 @@ static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM, return X; } +static MCInstPrinter *createXCoreMCInstPrinter(const Target &T, + unsigned SyntaxVariant, + const MCAsmInfo &MAI, + const MCInstrInfo &MII, + const MCRegisterInfo &MRI, + const MCSubtargetInfo &STI) { + return new XCoreInstPrinter(MAI, MII, MRI); +} + // Force static initialization. extern "C" void LLVMInitializeXCoreTargetMC() { // Register the MC asm info. @@ -87,4 +97,8 @@ extern "C" void LLVMInitializeXCoreTargetMC() { // Register the MC subtarget info. TargetRegistry::RegisterMCSubtargetInfo(TheXCoreTarget, createXCoreMCSubtargetInfo); + + // Register the MCInstPrinter + TargetRegistry::RegisterMCInstPrinter(TheXCoreTarget, + createXCoreMCInstPrinter); } diff --git a/lib/Target/XCore/Makefile b/lib/Target/XCore/Makefile index b823c4ed37..92ddc88608 100644 --- a/lib/Target/XCore/Makefile +++ b/lib/Target/XCore/Makefile @@ -14,10 +14,10 @@ TARGET = XCore # Make sure that tblgen is run, first thing. 
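# Each XCoreGen*.inc is emitted by the matching TableGen backend; the newly
# added XCoreGenDisassemblerTables.inc comes from the -gen-disassembler
# backend and carries the DecoderTable16/DecoderTable32 tables that
# XCoreDisassembler::getInstruction consults.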
BUILT_SOURCES = XCoreGenRegisterInfo.inc XCoreGenInstrInfo.inc \ XCoreGenAsmWriter.inc \ - XCoreGenDAGISel.inc XCoreGenCallingConv.inc \ - XCoreGenSubtargetInfo.inc + XCoreGenDAGISel.inc XCoreGenCallingConv.inc \ + XCoreGenDisassemblerTables.inc XCoreGenSubtargetInfo.inc -DIRS = TargetInfo MCTargetDesc +DIRS = Disassembler InstPrinter TargetInfo MCTargetDesc include $(LEVEL)/Makefile.common diff --git a/lib/Target/XCore/XCore.td b/lib/Target/XCore/XCore.td index 04a1dd5e95..e9a6d88fd6 100644 --- a/lib/Target/XCore/XCore.td +++ b/lib/Target/XCore/XCore.td @@ -41,7 +41,13 @@ def : Proc<"xs1b-generic", []>; // Declare the target which we are implementing //===----------------------------------------------------------------------===// +def XCoreAsmWriter : AsmWriter { + string AsmWriterClassName = "InstPrinter"; + bit isMCAsmWriter = 1; +} + def XCore : Target { // Pull in Instruction Info: let InstructionSet = XCoreInstrInfo; + let AssemblyWriters = [XCoreAsmWriter]; } diff --git a/lib/Target/XCore/XCoreAsmPrinter.cpp b/lib/Target/XCore/XCoreAsmPrinter.cpp index 6760641efe..474d3aa215 100644 --- a/lib/Target/XCore/XCoreAsmPrinter.cpp +++ b/lib/Target/XCore/XCoreAsmPrinter.cpp @@ -15,6 +15,8 @@ #define DEBUG_TYPE "asm-printer" #include "XCore.h" #include "XCoreInstrInfo.h" +#include "XCoreMCInstLower.h" +#include "InstPrinter/XCoreInstPrinter.h" #include "XCoreSubtarget.h" #include "XCoreTargetMachine.h" #include "llvm/ADT/SmallString.h" @@ -30,6 +32,7 @@ #include "llvm/DebugInfo.h" #include "llvm/DerivedTypes.h" #include "llvm/MC/MCAsmInfo.h" +#include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/Module.h" @@ -52,16 +55,17 @@ static cl::opt<unsigned> MaxThreads("xcore-max-threads", cl::Optional, namespace { class XCoreAsmPrinter : public AsmPrinter { const XCoreSubtarget &Subtarget; + XCoreMCInstLower MCInstLowering; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); public: explicit XCoreAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) - : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()){} + : AsmPrinter(TM, Streamer), Subtarget(TM.getSubtarget<XCoreSubtarget>()), + MCInstLowering(*this) {} virtual const char *getPassName() const { return "XCore Assembly Printer"; } - void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &O); void printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O, const std::string &directive = ".jmptable"); void printInlineJT32(const MachineInstr *MI, int opNum, raw_ostream &O) { @@ -75,18 +79,14 @@ namespace { void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV); virtual void EmitGlobalVariable(const GlobalVariable *GV); - void printInstruction(const MachineInstr *MI, raw_ostream &O); // autogen'd. 
- static const char *getRegisterName(unsigned RegNo); - void EmitFunctionEntryLabel(); void EmitInstruction(const MachineInstr *MI); + void EmitFunctionBodyStart(); void EmitFunctionBodyEnd(); virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const; }; } // end of anonymous namespace -#include "XCoreGenAsmWriter.inc" - void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) { assert(((GV->hasExternalLinkage() || GV->hasWeakLinkage()) || @@ -177,6 +177,10 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { OutStreamer.EmitRawText("\t.cc_bottom " + Twine(GVSym->getName()) + ".data"); } +void XCoreAsmPrinter::EmitFunctionBodyStart() { + MCInstLowering.Initialize(Mang, &MF->getContext()); +} + /// EmitFunctionBodyEnd - Targets can override this to emit stuff after /// the last basic block in the function. void XCoreAsmPrinter::EmitFunctionBodyEnd() { @@ -192,17 +196,6 @@ void XCoreAsmPrinter::EmitFunctionEntryLabel() { OutStreamer.EmitLabel(CurrentFnSym); } -void XCoreAsmPrinter::printMemOperand(const MachineInstr *MI, int opNum, - raw_ostream &O) { - printOperand(MI, opNum, O); - - if (MI->getOperand(opNum+1).isImm() && MI->getOperand(opNum+1).getImm() == 0) - return; - - O << "+"; - printOperand(MI, opNum+1, O); -} - void XCoreAsmPrinter:: printInlineJT(const MachineInstr *MI, int opNum, raw_ostream &O, const std::string &directive) { @@ -225,7 +218,7 @@ void XCoreAsmPrinter::printOperand(const MachineInstr *MI, int opNum, const MachineOperand &MO = MI->getOperand(opNum); switch (MO.getType()) { case MachineOperand::MO_Register: - O << getRegisterName(MO.getReg()); + O << XCoreInstPrinter::getRegisterName(MO.getReg()); break; case MachineOperand::MO_Immediate: O << MO.getImm(); @@ -270,7 +263,7 @@ bool XCoreAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo, return AsmPrinter::PrintAsmOperand(MI, OpNo, AsmVariant, ExtraCode, O); } -printOperand(MI, OpNo, O); + printOperand(MI, OpNo, O); return false; } @@ -317,15 +310,30 @@ void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) { } case XCore::ADD_2rus: if (MI->getOperand(2).getImm() == 0) { - O << "\tmov " << getRegisterName(MI->getOperand(0).getReg()) << ", " - << getRegisterName(MI->getOperand(1).getReg()); + O << "\tmov " + << XCoreInstPrinter::getRegisterName(MI->getOperand(0).getReg()) << ", " + << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg()); OutStreamer.EmitRawText(O.str()); return; } break; + case XCore::BR_JT: + case XCore::BR_JT32: + O << "\tbru " + << XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg()) << '\n'; + if (MI->getOpcode() == XCore::BR_JT) + printInlineJT(MI, 0, O); + else + printInlineJT32(MI, 0, O); + O << '\n'; + OutStreamer.EmitRawText(O.str()); + return; } - printInstruction(MI, O); - OutStreamer.EmitRawText(O.str()); + + MCInst TmpInst; + MCInstLowering.Lower(MI, TmpInst); + + OutStreamer.EmitInstruction(TmpInst); } // Force static initialization. 
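The XCoreInstrFormats.td changes below wire each format class to the decoder hooks declared in XCoreDisassembler.cpp above. As a reference for the operand packing that Decode2OpInstruction reverses, here is a minimal standalone sketch (illustrative only, not part of the patch; the function name is invented):

static bool decode2OpSketch(unsigned Insn, unsigned &Op1, unsigned &Op2) {
  // The low two bits of each 4-bit operand sit in bits 3:2 and 1:0. The
  // high two bits (0..2 each, since GRRegs stops at r11) are packed as
  // Op2High * 3 + Op1High, biased by 27, into bits 10:6 plus 5 * bit 5.
  unsigned Combined = ((Insn >> 6) & 0x1f) + ((Insn >> 5) & 1) * 5 - 27;
  if (Combined >= 9)     // unsigned underflow also rejects encodings < 27
    return false;
  Op1 = ((Combined % 3) << 2) | ((Insn >> 2) & 3);
  Op2 = ((Combined / 3) << 2) | (Insn & 3);
  return true;
}

For example, bits 10:6 = 28 with bit 5 clear give Combined = 1; with bits 3:2 = 0b10 and bits 1:0 = 0b11 the two operands decode to r6 and r3.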
diff --git a/lib/Target/XCore/XCoreInstrFormats.td b/lib/Target/XCore/XCoreInstrFormats.td index 1963a70fb3..44ac45c72f 100644 --- a/lib/Target/XCore/XCoreInstrFormats.td +++ b/lib/Target/XCore/XCoreInstrFormats.td @@ -10,7 +10,7 @@ //===----------------------------------------------------------------------===// // Instruction format superclass //===----------------------------------------------------------------------===// -class InstXCore<dag outs, dag ins, string asmstr, list<dag> pattern> +class InstXCore<int sz, dag outs, dag ins, string asmstr, list<dag> pattern> : Instruction { field bits<32> Inst; @@ -19,102 +19,143 @@ class InstXCore<dag outs, dag ins, string asmstr, list<dag> pattern> dag InOperandList = ins; let AsmString = asmstr; let Pattern = pattern; + let Size = sz; + field bits<32> SoftFail = 0; } // XCore pseudo instructions format class PseudoInstXCore<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern>; + : InstXCore<0, outs, ins, asmstr, pattern> { + let isPseudo = 1; +} //===----------------------------------------------------------------------===// // Instruction formats //===----------------------------------------------------------------------===// class _F3R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<2, outs, ins, asmstr, pattern> { } class _FL3R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } class _F2RUS<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<2, outs, ins, asmstr, pattern> { } class _FL2RUS<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } class _FRU6<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<2, outs, ins, asmstr, pattern> { } class _FLRU6<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } class _FU6<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<2, outs, ins, asmstr, pattern> { } class _FLU6<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } class _FU10<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<2, outs, ins, asmstr, pattern> { } class _FLU10<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { +} + +class _F2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstXCore<2, outs, ins, asmstr, pattern> { + let Inst{15-11} = opc{5-1}; + let Inst{4} = opc{0}; + let DecoderMethod = "Decode2RInstruction"; +} + +// 2R with first operand as both a source and a destination. 
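+// (Decode2RSrcDstInstruction, declared above, adds the first register twice,
+// so the decoded MCInst carries it as both the tied destination and the
+// first source operand.)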
+class _F2RSrcDst<bits<6> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> : _F2R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "Decode2RSrcDstInstruction"; +} + +// Same as 2R with last two operands swapped +class _FR2R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : _F2R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeR2RInstruction"; } -class _F2R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; +class _FRUS<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstXCore<2, outs, ins, asmstr, pattern> { + let Inst{15-11} = opc{5-1}; + let Inst{4} = opc{0}; + let DecoderMethod = "DecodeRUSInstruction"; } -class _FRUS<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; +// RUS with bitp operand +class _FRUSBitp<bits<6> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> + : _FRUS<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeRUSBitpInstruction"; } -class _FL2R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; +// RUS with first operand as both a source and a destination and a bitp second +// operand +class _FRUSSrcDstBitp<bits<6> opc, dag outs, dag ins, string asmstr, + list<dag> pattern> + : _FRUS<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeRUSSrcDstBitpInstruction"; } -class _F1R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; +class _FL2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstXCore<4, outs, ins, asmstr, pattern> { + let Inst{31-27} = opc{9-5}; + let Inst{26-20} = 0b1111110; + let Inst{19-16} = opc{4-1}; + + let Inst{15-11} = 0b11111; + let Inst{4} = opc{0}; + let DecoderMethod = "DecodeL2RInstruction"; +} + +// Same as L2R with last two operands swapped +class _FLR2R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : _FL2R<opc, outs, ins, asmstr, pattern> { + let DecoderMethod = "DecodeLR2RInstruction"; +} + +class _F1R<bits<6> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstXCore<2, outs, ins, asmstr, pattern> { + bits<4> a; + + let Inst{15-11} = opc{5-1}; + let Inst{10-5} = 0b111111; + let Inst{4} = opc{0}; + let Inst{3-0} = a; } -class _F0R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; +class _F0R<bits<10> opc, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstXCore<2, outs, ins, asmstr, pattern> { + let Inst{15-11} = opc{9-5}; + let Inst{10-5} = 0b111111; + let Inst{4-0} = opc{4-0}; } class _L4R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } class _L5R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } class _L6R<dag outs, dag ins, string asmstr, list<dag> pattern> - : InstXCore<outs, ins, asmstr, pattern> { - let Inst{31-0} = 0; + : InstXCore<4, outs, ins, asmstr, pattern> { } diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index 3e7666bdb9..95b076fdb4 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -344,10 +344,9 @@ 
multiclass FU10_LU10_np<string OpcStr> { // Two operand short -class F2R_np<string OpcStr> : _F2R< - (outs GRRegs:$dst), (ins GRRegs:$b), - !strconcat(OpcStr, " $dst, $b"), - []>; +class F2R_np<bits<6> opc, string OpcStr> : + _F2R<opc, (outs GRRegs:$dst), (ins GRRegs:$b), + !strconcat(OpcStr, " $dst, $b"), []>; // Two operand long @@ -357,23 +356,23 @@ class F2R_np<string OpcStr> : _F2R< let Defs = [SP], Uses = [SP] in { def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt), - "${:comment} ADJCALLSTACKDOWN $amt", + "# ADJCALLSTACKDOWN $amt", [(callseq_start timm:$amt)]>; def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2), - "${:comment} ADJCALLSTACKUP $amt1", + "# ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; } def LDWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr), - "${:comment} LDWFI $dst, $addr", + "# LDWFI $dst, $addr", [(set GRRegs:$dst, (load ADDRspii:$addr))]>; def LDAWFI : PseudoInstXCore<(outs GRRegs:$dst), (ins MEMii:$addr), - "${:comment} LDAWFI $dst, $addr", + "# LDAWFI $dst, $addr", [(set GRRegs:$dst, ADDRspii:$addr)]>; def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr), - "${:comment} STWFI $src, $addr", + "# STWFI $src, $addr", [(store GRRegs:$src, ADDRspii:$addr)]>; // SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after @@ -381,7 +380,7 @@ def STWFI : PseudoInstXCore<(outs), (ins GRRegs:$src, MEMii:$addr), let usesCustomInserter = 1 in { def SELECT_CC : PseudoInstXCore<(outs GRRegs:$dst), (ins GRRegs:$cond, GRRegs:$T, GRRegs:$F), - "${:comment} SELECT_CC PSEUDO!", + "# SELECT_CC PSEUDO!", [(set GRRegs:$dst, (select GRRegs:$cond, GRRegs:$T, GRRegs:$F))]>; } @@ -753,210 +752,210 @@ def BL_lu10 : _FLU10< // Two operand short // TODO eet, eef, tsetmr -def NOT : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), - "not $dst, $b", - [(set GRRegs:$dst, (not GRRegs:$b))]>; +def NOT : _F2R<0b100010, (outs GRRegs:$dst), (ins GRRegs:$b), + "not $dst, $b", [(set GRRegs:$dst, (not GRRegs:$b))]>; -def NEG : _F2R<(outs GRRegs:$dst), (ins GRRegs:$b), - "neg $dst, $b", - [(set GRRegs:$dst, (ineg GRRegs:$b))]>; +def NEG : _F2R<0b100100, (outs GRRegs:$dst), (ins GRRegs:$b), + "neg $dst, $b", [(set GRRegs:$dst, (ineg GRRegs:$b))]>; let Constraints = "$src1 = $dst" in { -def SEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), - "sext $dst, $src2", - [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, - immBitp:$src2))]>; - -def SEXT_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2), - "sext $dst, $src2", - [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, - GRRegs:$src2))]>; - -def ZEXT_rus : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), - "zext $dst, $src2", - [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, - immBitp:$src2))]>; - -def ZEXT_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2), - "zext $dst, $src2", - [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, - GRRegs:$src2))]>; - -def ANDNOT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2), - "andnot $dst, $src2", - [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>; +def SEXT_rus : + _FRUSSrcDstBitp<0b001101, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), + "sext $dst, $src2", + [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, + immBitp:$src2))]>; + +def SEXT_2r : + _F2RSrcDst<0b001100, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2), + "sext $dst, $src2", + [(set GRRegs:$dst, (int_xcore_sext GRRegs:$src1, GRRegs:$src2))]>; + +def ZEXT_rus : + 
_FRUSSrcDstBitp<0b010001, (outs GRRegs:$dst), (ins GRRegs:$src1, i32imm:$src2), + "zext $dst, $src2", + [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, + immBitp:$src2))]>; + +def ZEXT_2r : + _F2RSrcDst<0b010000, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2), + "zext $dst, $src2", + [(set GRRegs:$dst, (int_xcore_zext GRRegs:$src1, GRRegs:$src2))]>; + +def ANDNOT_2r : + _F2RSrcDst<0b001010, (outs GRRegs:$dst), (ins GRRegs:$src1, GRRegs:$src2), + "andnot $dst, $src2", + [(set GRRegs:$dst, (and GRRegs:$src1, (not GRRegs:$src2)))]>; } let isReMaterializable = 1, neverHasSideEffects = 1 in -def MKMSK_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$size), - "mkmsk $dst, $size", - []>; +def MKMSK_rus : _FRUSBitp<0b101001, (outs GRRegs:$dst), (ins i32imm:$size), + "mkmsk $dst, $size", []>; -def MKMSK_2r : _FRUS<(outs GRRegs:$dst), (ins GRRegs:$size), - "mkmsk $dst, $size", - [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>; +def MKMSK_2r : _F2R<0b101000, (outs GRRegs:$dst), (ins GRRegs:$size), + "mkmsk $dst, $size", + [(set GRRegs:$dst, (add (shl 1, GRRegs:$size), -1))]>; -def GETR_rus : _FRUS<(outs GRRegs:$dst), (ins i32imm:$type), - "getr $dst, $type", - [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>; +def GETR_rus : _FRUS<0b100000, (outs GRRegs:$dst), (ins i32imm:$type), + "getr $dst, $type", + [(set GRRegs:$dst, (int_xcore_getr immUs:$type))]>; -def GETTS_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), - "getts $dst, res[$r]", - [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>; +def GETTS_2r : _F2R<0b001110, (outs GRRegs:$dst), (ins GRRegs:$r), + "getts $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_getts GRRegs:$r))]>; -def SETPT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "setpt res[$r], $val", - [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>; +def SETPT_2r : _FR2R<0b001111, (outs), (ins GRRegs:$r, GRRegs:$val), + "setpt res[$r], $val", + [(int_xcore_setpt GRRegs:$r, GRRegs:$val)]>; -def OUTCT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "outct res[$r], $val", - [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>; +def OUTCT_2r : _F2R<0b010010, (outs), (ins GRRegs:$r, GRRegs:$val), + "outct res[$r], $val", + [(int_xcore_outct GRRegs:$r, GRRegs:$val)]>; -def OUTCT_rus : _F2R<(outs), (ins GRRegs:$r, i32imm:$val), - "outct res[$r], $val", - [(int_xcore_outct GRRegs:$r, immUs:$val)]>; +def OUTCT_rus : _FRUS<0b010011, (outs), (ins GRRegs:$r, i32imm:$val), + "outct res[$r], $val", + [(int_xcore_outct GRRegs:$r, immUs:$val)]>; -def OUTT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "outt res[$r], $val", - [(int_xcore_outt GRRegs:$r, GRRegs:$val)]>; +def OUTT_2r : _FR2R<0b000011, (outs), (ins GRRegs:$r, GRRegs:$val), + "outt res[$r], $val", + [(int_xcore_outt GRRegs:$r, GRRegs:$val)]>; -def OUT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "out res[$r], $val", - [(int_xcore_out GRRegs:$r, GRRegs:$val)]>; +def OUT_2r : _FR2R<0b101010, (outs), (ins GRRegs:$r, GRRegs:$val), + "out res[$r], $val", + [(int_xcore_out GRRegs:$r, GRRegs:$val)]>; let Constraints = "$src = $dst" in -def OUTSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src), - "outshr res[$r], $src", - [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, - GRRegs:$src))]>; +def OUTSHR_2r : + _F2RSrcDst<0b101011, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r), + "outshr res[$r], $src", + [(set GRRegs:$dst, (int_xcore_outshr GRRegs:$r, GRRegs:$src))]>; -def INCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), - "inct $dst, res[$r]", - [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>; +def INCT_2r : _F2R<0b100001, 
(outs GRRegs:$dst), (ins GRRegs:$r), + "inct $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_inct GRRegs:$r))]>; -def INT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), - "int $dst, res[$r]", - [(set GRRegs:$dst, (int_xcore_int GRRegs:$r))]>; +def INT_2r : _F2R<0b100011, (outs GRRegs:$dst), (ins GRRegs:$r), + "int $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_int GRRegs:$r))]>; -def IN_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), +def IN_2r : _F2R<0b101100, (outs GRRegs:$dst), (ins GRRegs:$r), "in $dst, res[$r]", [(set GRRegs:$dst, (int_xcore_in GRRegs:$r))]>; let Constraints = "$src = $dst" in -def INSHR_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r, GRRegs:$src), - "inshr $dst, res[$r]", - [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, - GRRegs:$src))]>; +def INSHR_2r : + _F2RSrcDst<0b101101, (outs GRRegs:$dst), (ins GRRegs:$src, GRRegs:$r), + "inshr $dst, res[$r]", + [(set GRRegs:$dst, (int_xcore_inshr GRRegs:$r, GRRegs:$src))]>; -def CHKCT_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "chkct res[$r], $val", - [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>; +def CHKCT_2r : _F2R<0b110010, (outs), (ins GRRegs:$r, GRRegs:$val), + "chkct res[$r], $val", + [(int_xcore_chkct GRRegs:$r, GRRegs:$val)]>; -def CHKCT_rus : _F2R<(outs), (ins GRRegs:$r, i32imm:$val), - "chkct res[$r], $val", - [(int_xcore_chkct GRRegs:$r, immUs:$val)]>; +def CHKCT_rus : _FRUSBitp<0b110011, (outs), (ins GRRegs:$r, i32imm:$val), + "chkct res[$r], $val", + [(int_xcore_chkct GRRegs:$r, immUs:$val)]>; -def TESTCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src), +def TESTCT_2r : _F2R<0b101111, (outs GRRegs:$dst), (ins GRRegs:$src), "testct $dst, res[$src]", [(set GRRegs:$dst, (int_xcore_testct GRRegs:$src))]>; -def TESTWCT_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$src), +def TESTWCT_2r : _F2R<0b110001, (outs GRRegs:$dst), (ins GRRegs:$src), "testwct $dst, res[$src]", [(set GRRegs:$dst, (int_xcore_testwct GRRegs:$src))]>; -def SETD_2r : _F2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "setd res[$r], $val", - [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>; +def SETD_2r : _FR2R<0b000101, (outs), (ins GRRegs:$r, GRRegs:$val), + "setd res[$r], $val", + [(int_xcore_setd GRRegs:$r, GRRegs:$val)]>; -def GETST_2r : _F2R<(outs GRRegs:$dst), (ins GRRegs:$r), +def SETPSC_l2r : _FR2R<0b110000, (outs), (ins GRRegs:$src1, GRRegs:$src2), + "setpsc res[$src1], $src2", + [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>; + +def GETST_2r : _F2R<0b000001, (outs GRRegs:$dst), (ins GRRegs:$r), "getst $dst, res[$r]", [(set GRRegs:$dst, (int_xcore_getst GRRegs:$r))]>; -def INITSP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), +def INITSP_2r : _F2R<0b000100, (outs), (ins GRRegs:$src, GRRegs:$t), "init t[$t]:sp, $src", [(int_xcore_initsp GRRegs:$t, GRRegs:$src)]>; -def INITPC_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), +def INITPC_2r : _F2R<0b000000, (outs), (ins GRRegs:$src, GRRegs:$t), "init t[$t]:pc, $src", [(int_xcore_initpc GRRegs:$t, GRRegs:$src)]>; -def INITCP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), +def INITCP_2r : _F2R<0b000110, (outs), (ins GRRegs:$src, GRRegs:$t), "init t[$t]:cp, $src", [(int_xcore_initcp GRRegs:$t, GRRegs:$src)]>; -def INITDP_2r : _F2R<(outs), (ins GRRegs:$t, GRRegs:$src), +def INITDP_2r : _F2R<0b000010, (outs), (ins GRRegs:$src, GRRegs:$t), "init t[$t]:dp, $src", [(int_xcore_initdp GRRegs:$t, GRRegs:$src)]>; +def PEEK_2r : _F2R<0b101110, (outs GRRegs:$dst), (ins GRRegs:$src), + "peek $dst, res[$src]", + [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>; + +def ENDIN_2r : _F2R<0b100101, (outs GRRegs:$dst), 
(ins GRRegs:$src), + "endin $dst, res[$src]", + [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>; + // Two operand long // getd, testlcl -def BITREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), - "bitrev $dst, $src", - [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>; +def BITREV_l2r : _FL2R<0b0000011000, (outs GRRegs:$dst), (ins GRRegs:$src), + "bitrev $dst, $src", + [(set GRRegs:$dst, (int_xcore_bitrev GRRegs:$src))]>; -def BYTEREV_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), - "byterev $dst, $src", - [(set GRRegs:$dst, (bswap GRRegs:$src))]>; +def BYTEREV_l2r : _FL2R<0b0000011001, (outs GRRegs:$dst), (ins GRRegs:$src), + "byterev $dst, $src", + [(set GRRegs:$dst, (bswap GRRegs:$src))]>; -def CLZ_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), - "clz $dst, $src", - [(set GRRegs:$dst, (ctlz GRRegs:$src))]>; +def CLZ_l2r : _FL2R<0b000111000, (outs GRRegs:$dst), (ins GRRegs:$src), + "clz $dst, $src", + [(set GRRegs:$dst, (ctlz GRRegs:$src))]>; -def SETC_l2r : _FL2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "setc res[$r], $val", - [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>; +def SETC_l2r : _FL2R<0b0010111001, (outs), (ins GRRegs:$r, GRRegs:$val), + "setc res[$r], $val", + [(int_xcore_setc GRRegs:$r, GRRegs:$val)]>; -def SETTW_l2r : _FL2R<(outs), (ins GRRegs:$r, GRRegs:$val), - "settw res[$r], $val", - [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>; +def SETTW_l2r : _FLR2R<0b0010011001, (outs), (ins GRRegs:$r, GRRegs:$val), + "settw res[$r], $val", + [(int_xcore_settw GRRegs:$r, GRRegs:$val)]>; -def GETPS_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), - "get $dst, ps[$src]", - [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>; +def GETPS_l2r : _FL2R<0b0001011001, (outs GRRegs:$dst), (ins GRRegs:$src), + "get $dst, ps[$src]", + [(set GRRegs:$dst, (int_xcore_getps GRRegs:$src))]>; -def SETPS_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), - "set ps[$src1], $src2", - [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>; +def SETPS_l2r : _FLR2R<0b0001111000, (outs), (ins GRRegs:$src1, GRRegs:$src2), + "set ps[$src1], $src2", + [(int_xcore_setps GRRegs:$src1, GRRegs:$src2)]>; -def INITLR_l2r : _FL2R<(outs), (ins GRRegs:$t, GRRegs:$src), +def INITLR_l2r : _FL2R<0b0001011000, (outs), (ins GRRegs:$src, GRRegs:$t), "init t[$t]:lr, $src", [(int_xcore_initlr GRRegs:$t, GRRegs:$src)]>; -def SETCLK_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), - "setclk res[$src1], $src2", - [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>; - -def SETRDY_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), - "setrdy res[$src1], $src2", - [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>; - -def SETPSC_l2r : _FL2R<(outs), (ins GRRegs:$src1, GRRegs:$src2), - "setpsc res[$src1], $src2", - [(int_xcore_setpsc GRRegs:$src1, GRRegs:$src2)]>; - -def PEEK_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), - "peek $dst, res[$src]", - [(set GRRegs:$dst, (int_xcore_peek GRRegs:$src))]>; +def SETCLK_l2r : _FLR2R<0b0000111001, (outs), (ins GRRegs:$src1, GRRegs:$src2), + "setclk res[$src1], $src2", + [(int_xcore_setclk GRRegs:$src1, GRRegs:$src2)]>; -def ENDIN_l2r : _FL2R<(outs GRRegs:$dst), (ins GRRegs:$src), - "endin $dst, res[$src]", - [(set GRRegs:$dst, (int_xcore_endin GRRegs:$src))]>; +def SETRDY_l2r : _FLR2R<0b0010111000, (outs), (ins GRRegs:$src1, GRRegs:$src2), + "setrdy res[$src1], $src2", + [(int_xcore_setrdy GRRegs:$src1, GRRegs:$src2)]>; // One operand short // TODO edu, eeu, waitet, waitef, tstart, clrtp // setdp, setcp, setev, kcall // dgetreg -def MSYNC_1r : _F1R<(outs), (ins GRRegs:$i), - 
"msync res[$i]", - [(int_xcore_msync GRRegs:$i)]>; -def MJOIN_1r : _F1R<(outs), (ins GRRegs:$i), - "mjoin res[$i]", - [(int_xcore_mjoin GRRegs:$i)]>; +def MSYNC_1r : _F1R<0b000111, (outs), (ins GRRegs:$a), + "msync res[$a]", + [(int_xcore_msync GRRegs:$a)]>; +def MJOIN_1r : _F1R<0b000101, (outs), (ins GRRegs:$a), + "mjoin res[$a]", + [(int_xcore_mjoin GRRegs:$a)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in -def BAU_1r : _F1R<(outs), (ins GRRegs:$addr), - "bau $addr", - [(brind GRRegs:$addr)]>; +def BAU_1r : _F1R<0b001001, (outs), (ins GRRegs:$a), + "bau $a", + [(brind GRRegs:$a)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1 in def BR_JT : PseudoInstXCore<(outs), (ins InlineJT:$t, GRRegs:$i), @@ -969,80 +968,80 @@ def BR_JT32 : PseudoInstXCore<(outs), (ins InlineJT32:$t, GRRegs:$i), [(XCoreBR_JT32 tjumptable:$t, GRRegs:$i)]>; let Defs=[SP], neverHasSideEffects=1 in -def SETSP_1r : _F1R<(outs), (ins GRRegs:$src), - "set sp, $src", +def SETSP_1r : _F1R<0b001011, (outs), (ins GRRegs:$a), + "set sp, $a", []>; let hasCtrlDep = 1 in -def ECALLT_1r : _F1R<(outs), (ins GRRegs:$src), - "ecallt $src", +def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a), + "ecallt $a", []>; let hasCtrlDep = 1 in -def ECALLF_1r : _F1R<(outs), (ins GRRegs:$src), - "ecallf $src", +def ECALLF_1r : _F1R<0b010010, (outs), (ins GRRegs:$a), + "ecallf $a", []>; let isCall=1, // All calls clobber the link register and the non-callee-saved registers: Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in { -def BLA_1r : _F1R<(outs), (ins GRRegs:$addr), - "bla $addr", - [(XCoreBranchLink GRRegs:$addr)]>; +def BLA_1r : _F1R<0b001000, (outs), (ins GRRegs:$a), + "bla $a", + [(XCoreBranchLink GRRegs:$a)]>; } -def SYNCR_1r : _F1R<(outs), (ins GRRegs:$r), - "syncr res[$r]", - [(int_xcore_syncr GRRegs:$r)]>; +def SYNCR_1r : _F1R<0b100001, (outs), (ins GRRegs:$a), + "syncr res[$a]", + [(int_xcore_syncr GRRegs:$a)]>; -def FREER_1r : _F1R<(outs), (ins GRRegs:$r), - "freer res[$r]", - [(int_xcore_freer GRRegs:$r)]>; +def FREER_1r : _F1R<0b000100, (outs), (ins GRRegs:$a), + "freer res[$a]", + [(int_xcore_freer GRRegs:$a)]>; let Uses=[R11] in { -def SETV_1r : _F1R<(outs), (ins GRRegs:$r), - "setv res[$r], r11", - [(int_xcore_setv GRRegs:$r, R11)]>; +def SETV_1r : _F1R<0b010001, (outs), (ins GRRegs:$a), + "setv res[$a], r11", + [(int_xcore_setv GRRegs:$a, R11)]>; -def SETEV_1r : _F1R<(outs), (ins GRRegs:$r), - "setev res[$r], r11", - [(int_xcore_setev GRRegs:$r, R11)]>; +def SETEV_1r : _F1R<0b001111, (outs), (ins GRRegs:$a), + "setev res[$a], r11", + [(int_xcore_setev GRRegs:$a, R11)]>; } -def EEU_1r : _F1R<(outs), (ins GRRegs:$r), - "eeu res[$r]", - [(int_xcore_eeu GRRegs:$r)]>; +def EEU_1r : _F1R<0b000001, (outs), (ins GRRegs:$a), + "eeu res[$a]", + [(int_xcore_eeu GRRegs:$a)]>; // Zero operand short // TODO freet, ldspc, stspc, ldssr, stssr, ldsed, stsed, // stet, getkep, getksp, setkep, getid, kret, dcall, dret, // dentsp, drestsp -def CLRE_0R : _F0R<(outs), (ins), "clre", [(int_xcore_clre)]>; +def CLRE_0R : _F0R<0b0000001101, (outs), (ins), "clre", [(int_xcore_clre)]>; let Defs = [R11] in { -def GETID_0R : _F0R<(outs), (ins), +def GETID_0R : _F0R<0b0001001110, (outs), (ins), "get r11, id", [(set R11, (int_xcore_getid))]>; -def GETED_0R : _F0R<(outs), (ins), +def GETED_0R : _F0R<0b0000111110, (outs), (ins), "get r11, ed", [(set R11, (int_xcore_geted))]>; -def GETET_0R : _F0R<(outs), (ins), +def GETET_0R : _F0R<0b0000111111, (outs), (ins), "get r11, et", [(set R11, (int_xcore_getet))]>; } -def 
SSYNC_0r : _F0R<(outs), (ins), +def SSYNC_0r : _F0R<0b0000001110, (outs), (ins), "ssync", [(int_xcore_ssync)]>; let isBranch=1, isIndirectBranch=1, isTerminator=1, isBarrier = 1, hasSideEffects = 1 in -def WAITEU_0R : _F0R<(outs), (ins), - "waiteu", - [(brind (int_xcore_waitevent))]>; +def WAITEU_0R : _F0R<0b0000001100, (outs), (ins), + "waiteu", + [(brind (int_xcore_waitevent))]>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns diff --git a/lib/Target/XCore/XCoreMCInstLower.cpp b/lib/Target/XCore/XCoreMCInstLower.cpp new file mode 100644 index 0000000000..f96eda9fcb --- /dev/null +++ b/lib/Target/XCore/XCoreMCInstLower.cpp @@ -0,0 +1,117 @@ +//===-- XCoreMCInstLower.cpp - Convert XCore MachineInstr to MCInst -------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file contains code to lower XCore MachineInstrs to their +/// corresponding MCInst records. +/// +//===----------------------------------------------------------------------===// +#include "XCoreMCInstLower.h" +#include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInst.h" +#include "llvm/Target/Mangler.h" + +using namespace llvm; + +XCoreMCInstLower::XCoreMCInstLower(class AsmPrinter &asmprinter) +: Printer(asmprinter) {} + +void XCoreMCInstLower::Initialize(Mangler *M, MCContext *C) { + Mang = M; + Ctx = C; +} + +MCOperand XCoreMCInstLower::LowerSymbolOperand(const MachineOperand &MO, + MachineOperandType MOTy, + unsigned Offset) const { + MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_None; + const MCSymbol *Symbol; + + switch (MOTy) { + case MachineOperand::MO_MachineBasicBlock: + Symbol = MO.getMBB()->getSymbol(); + break; + case MachineOperand::MO_GlobalAddress: + Symbol = Mang->getSymbol(MO.getGlobal()); + Offset += MO.getOffset(); + break; + case MachineOperand::MO_BlockAddress: + Symbol = Printer.GetBlockAddressSymbol(MO.getBlockAddress()); + Offset += MO.getOffset(); + break; + case MachineOperand::MO_ExternalSymbol: + Symbol = Printer.GetExternalSymbolSymbol(MO.getSymbolName()); + Offset += MO.getOffset(); + break; + case MachineOperand::MO_JumpTableIndex: + Symbol = Printer.GetJTISymbol(MO.getIndex()); + break; + case MachineOperand::MO_ConstantPoolIndex: + Symbol = Printer.GetCPISymbol(MO.getIndex()); + Offset += MO.getOffset(); + break; + default: + llvm_unreachable("<unknown operand type>"); + } + + const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, *Ctx); + + if (!Offset) + return MCOperand::CreateExpr(MCSym); + + // Assume offset is never negative. + assert(Offset > 0); + + const MCConstantExpr *OffsetExpr = MCConstantExpr::Create(Offset, *Ctx); + const MCBinaryExpr *Add = MCBinaryExpr::CreateAdd(MCSym, OffsetExpr, *Ctx); + return MCOperand::CreateExpr(Add); +} + +MCOperand XCoreMCInstLower::LowerOperand(const MachineOperand &MO, + unsigned offset) const { + MachineOperandType MOTy = MO.getType(); + + switch (MOTy) { + default: llvm_unreachable("unknown operand type"); + case MachineOperand::MO_Register: + // Ignore all implicit register operands. 
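+    // (Implicit operands, such as the LR/SP effects listed on call
+    // instructions, only model register liveness; they have no assembly
+    // syntax and must not become MCOperands.)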
+ if (MO.isImplicit()) break; + return MCOperand::CreateReg(MO.getReg()); + case MachineOperand::MO_Immediate: + return MCOperand::CreateImm(MO.getImm() + offset); + case MachineOperand::MO_MachineBasicBlock: + case MachineOperand::MO_GlobalAddress: + case MachineOperand::MO_ExternalSymbol: + case MachineOperand::MO_JumpTableIndex: + case MachineOperand::MO_ConstantPoolIndex: + case MachineOperand::MO_BlockAddress: + return LowerSymbolOperand(MO, MOTy, offset); + case MachineOperand::MO_RegisterMask: + break; + } + + return MCOperand(); +} + +void XCoreMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { + OutMI.setOpcode(MI->getOpcode()); + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + const MachineOperand &MO = MI->getOperand(i); + MCOperand MCOp = LowerOperand(MO); + + if (MCOp.isValid()) + OutMI.addOperand(MCOp); + } +} diff --git a/lib/Target/XCore/XCoreMCInstLower.h b/lib/Target/XCore/XCoreMCInstLower.h new file mode 100644 index 0000000000..28e702bb98 --- /dev/null +++ b/lib/Target/XCore/XCoreMCInstLower.h @@ -0,0 +1,42 @@ +//===-- XCoreMCInstLower.h - Lower MachineInstr to MCInst ------*- C++ -*--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef XCOREMCINSTLOWER_H +#define XCOREMCINSTLOWER_H +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Support/Compiler.h" + +namespace llvm { + class MCContext; + class MCInst; + class MCOperand; + class MachineInstr; + class MachineFunction; + class Mangler; + class AsmPrinter; + +/// \brief This class is used to lower a MachineInstr into an MCInst. +class LLVM_LIBRARY_VISIBILITY XCoreMCInstLower { + typedef MachineOperand::MachineOperandType MachineOperandType; + MCContext *Ctx; + Mangler *Mang; + AsmPrinter &Printer; +public: + XCoreMCInstLower(class AsmPrinter &asmprinter); + void Initialize(Mangler *mang, MCContext *C); + void Lower(const MachineInstr *MI, MCInst &OutMI) const; + MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const; + +private: + MCOperand LowerSymbolOperand(const MachineOperand &MO, + MachineOperandType MOTy, unsigned Offset) const; +}; +} + +#endif diff --git a/lib/Target/XCore/XCoreRegisterInfo.td b/lib/Target/XCore/XCoreRegisterInfo.td index 9edfda1f50..4c771e9700 100644 --- a/lib/Target/XCore/XCoreRegisterInfo.td +++ b/lib/Target/XCore/XCoreRegisterInfo.td @@ -45,10 +45,10 @@ def LR : Ri<15, "lr">, DwarfRegNum<[15]>; def GRRegs : RegisterClass<"XCore", [i32], 32, // Return values and arguments (add R0, R1, R2, R3, - // Not preserved across procedure calls - R11, // Callee save - R4, R5, R6, R7, R8, R9, R10)>; + R4, R5, R6, R7, R8, R9, R10, + // Not preserved across procedure calls + R11)>; // Reserved def RRegs : RegisterClass<"XCore", [i32], 32, (add CP, DP, SP, LR)> { diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp index bd8fa66d52..b636414250 100644 --- a/lib/Transforms/IPO/Inliner.cpp +++ b/lib/Transforms/IPO/Inliner.cpp @@ -214,11 +214,13 @@ unsigned Inliner::getInlineThreshold(CallSite CS) const { OptSizeThreshold < thres) thres = OptSizeThreshold; - // Listen to the inlinehint attribute when it would increase the threshold. + // Listen to the inlinehint attribute when it would increase the threshold + // and the caller does not need to minimize its size.
Function *Callee = CS.getCalledFunction(); bool InlineHint = Callee && !Callee->isDeclaration() && Callee->getFnAttributes().hasAttribute(Attributes::InlineHint); - if (InlineHint && HintThreshold > thres) + if (InlineHint && HintThreshold > thres + && !Caller->getFnAttributes().hasAttribute(Attributes::MinSize)) thres = HintThreshold; return thres; diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp index b2cd3a765a..bd94f0a252 100644 --- a/lib/Transforms/IPO/Internalize.cpp +++ b/lib/Transforms/IPO/Internalize.cpp @@ -48,7 +48,7 @@ namespace { public: static char ID; // Pass identification, replacement for typeid explicit InternalizePass(); - explicit InternalizePass(const std::vector <const char *>& exportList); + explicit InternalizePass(ArrayRef<const char *> exportList); void LoadFile(const char *Filename); virtual bool runOnModule(Module &M); @@ -72,10 +72,10 @@ InternalizePass::InternalizePass() ExternalNames.insert(APIList.begin(), APIList.end()); } -InternalizePass::InternalizePass(const std::vector<const char *>&exportList) +InternalizePass::InternalizePass(ArrayRef<const char *> exportList) : ModulePass(ID){ initializeInternalizePassPass(*PassRegistry::getPassRegistry()); - for(std::vector<const char *>::const_iterator itr = exportList.begin(); + for(ArrayRef<const char *>::const_iterator itr = exportList.begin(); itr != exportList.end(); itr++) { ExternalNames.insert(*itr); } @@ -173,6 +173,6 @@ ModulePass *llvm::createInternalizePass() { return new InternalizePass(); } -ModulePass *llvm::createInternalizePass(const std::vector <const char *> &el) { +ModulePass *llvm::createInternalizePass(ArrayRef<const char *> el) { return new InternalizePass(el); } diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index d8257e64d8..47223c3b35 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -37,10 +37,10 @@ static Constant *SubOne(ConstantInt *C) { static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) { if (!V->hasOneUse() || !V->getType()->isIntegerTy()) return 0; - + Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return 0; - + if (I->getOpcode() == Instruction::Mul) if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) return I->getOperand(0); @@ -64,22 +64,22 @@ static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) { bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS) { // There are different heuristics we can use for this. Here are some simple // ones. - - // Add has the property that adding any two 2's complement numbers can only + + // Add has the property that adding any two 2's complement numbers can only // have one carry bit which can change a sign. As such, if LHS and RHS each // have at least two sign bits, we know that the addition of the two values // will sign extend fine. if (ComputeNumSignBits(LHS) > 1 && ComputeNumSignBits(RHS) > 1) return true; - - + + // If one of the operands only has one non-zero bit, and if the other operand // has a known-zero bit in a more significant place than it (not including the // sign bit) the ripple may go up to and fill the zero, but won't change the // sign. For example, (X & ~4) + 1. - + // TODO: Implement. 
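  // (Sketch of the reasoning for the example above: in (X & ~4) + 1 any
  // carry out of bits 1:0 is absorbed by the known-zero bit 2, so it can
  // never reach the sign bit.)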
- + return false; } @@ -100,7 +100,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { const APInt &Val = CI->getValue(); if (Val.isSignBit()) return BinaryOperator::CreateXor(LHS, RHS); - + // See if SimplifyDemandedBits can simplify this. This handles stuff like // (X & 254)+1 -> (X&254)|1 if (SimplifyDemandedInstructionBits(I)) @@ -110,7 +110,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS)) if (ZI->getSrcTy()->isIntegerTy(1)) return SelectInst::Create(ZI->getOperand(0), AddOne(CI), CI); - + Value *XorLHS = 0; ConstantInt *XorRHS = 0; if (match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) { uint32_t TySizeBits = I.getType()->getScalarSizeInBits(); @@ -124,13 +124,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { else if (XorRHS->getValue().isPowerOf2()) ExtendAmt = TySizeBits - XorRHS->getValue().logBase2() - 1; } - + if (ExtendAmt) { APInt Mask = APInt::getHighBitsSet(TySizeBits, ExtendAmt); if (!MaskedValueIsZero(XorLHS, Mask)) ExtendAmt = 0; } - + if (ExtendAmt) { Constant *ShAmt = ConstantInt::get(I.getType(), ExtendAmt); Value *NewShl = Builder->CreateShl(XorLHS, ShAmt, "sext"); @@ -175,7 +175,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Value *NewAdd = Builder->CreateAdd(LHSV, RHSV, "sum"); return BinaryOperator::CreateNeg(NewAdd); } - + return BinaryOperator::CreateSub(RHS, LHSV); } @@ -209,7 +209,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { APInt RHSKnownOne(IT->getBitWidth(), 0); APInt RHSKnownZero(IT->getBitWidth(), 0); ComputeMaskedBits(RHS, RHSKnownZero, RHSKnownOne); - + // No bits in common -> bitwise or. if ((LHSKnownZero|RHSKnownZero).isAllOnesValue()) return BinaryOperator::CreateOr(LHS, RHS); @@ -251,7 +251,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { // See if all bits from the first bit set in the Add RHS up are included // in the mask. First, get the rightmost bit. const APInt &AddRHSV = CRHS->getValue(); - + // Form a mask of all bits from the lowest bit added through the top. APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1)); @@ -289,7 +289,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (match(FV, m_Zero()) && match(TV, m_Sub(m_Value(N), m_Specific(A)))) // Fold the add into the true select value. return SelectInst::Create(SI->getCondition(), N, A); - + if (match(TV, m_Zero()) && match(FV, m_Sub(m_Value(N), m_Specific(A)))) // Fold the add into the false select value. return SelectInst::Create(SI->getCondition(), A, N); @@ -301,18 +301,18 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (SExtInst *LHSConv = dyn_cast<SExtInst>(LHS)) { // (add (sext x), cst) --> (sext (add x, cst')) if (ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS)) { - Constant *CI = + Constant *CI = ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType()); if (LHSConv->hasOneUse() && ConstantExpr::getSExt(CI, I.getType()) == RHSC && WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI)) { // Insert the new, smaller add. 
- Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), + Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), CI, "addconv"); return new SExtInst(NewAdd, I.getType()); } } - + // (add (sext x), (sext y)) --> (sext (add int x, y)) if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) { // Only do this if x/y have the same type, if at least one of them has a @@ -323,7 +323,7 @@ WillNotOverflowSignedAdd(LHSConv->getOperand(0), RHSConv->getOperand(0))) { // Insert the new integer add. - Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), + Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv"); return new SExtInst(NewAdd, I.getType()); } @@ -351,18 +351,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); - if (Constant *RHSC = dyn_cast<Constant>(RHS)) { - // X + 0 --> X - if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) { - if (CFP->isExactlyValue(ConstantFP::getNegativeZero - (I.getType())->getValueAPF())) - return ReplaceInstUsesWith(I, LHS); - } + if (Value *V = SimplifyFAddInst(LHS, RHS, I.getFastMathFlags(), TD)) + return ReplaceInstUsesWith(I, V); - if (isa<PHINode>(LHS)) - if (Instruction *NV = FoldOpIntoPhi(I)) - return NV; - } + if (isa<Constant>(RHS) && isa<PHINode>(LHS)) + if (Instruction *NV = FoldOpIntoPhi(I)) + return NV; // -A + B --> B - A // -A + -B --> -(A + B) @@ -374,11 +368,6 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { if (Value *V = dyn_castFNegVal(RHS)) return BinaryOperator::CreateFSub(LHS, V); - // Check for X+0.0. Simplify it to X if we know X is not -0.0. - if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) - if (CFP->getValueAPF().isPosZero() && CannotBeNegativeZero(LHS)) - return ReplaceInstUsesWith(I, LHS); - // Check for (fadd double (sitofp x), y), see if we can merge this into an // integer add followed by a promotion. if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) { @@ -388,7 +377,7 @@ // requires a constant pool load, and generally allows the add to be better // instcombined. if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) { - Constant *CI = + Constant *CI = ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType()); if (LHSConv->hasOneUse() && ConstantExpr::getSIToFP(CI, I.getType()) == CFP && @@ -399,7 +388,7 @@ return new SIToFPInst(NewAdd, I.getType()); } } - + // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y)) if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) { // Only do this if x/y have the same type, if at least one of them has a @@ -410,13 +399,13 @@ WillNotOverflowSignedAdd(LHSConv->getOperand(0), RHSConv->getOperand(0))) { // Insert the new integer add. - Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), + Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0), RHSConv->getOperand(0),"addconv"); return new SIToFPInst(NewAdd, I.getType()); } } } - + return Changed ? &I : 0; } @@ -428,7 +417,7 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, Type *Ty) { assert(TD && "Must have target data info for this"); - + // If LHS is a gep based on RHS or RHS is a gep based on LHS, we can optimize // this.
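  // (For instance, in &A[10] - &A[0] both operands are geps off the same
  // base, so the difference folds to a constant; visitSub below cites this
  // exact example.)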
bool Swapped = false; @@ -451,7 +440,7 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, } } } - + if (GEPOperator *RHSGEP = dyn_cast<GEPOperator>(RHS)) { // X - (gep X, ...) if (RHSGEP->getOperand(0) == LHS) { @@ -467,16 +456,16 @@ Value *InstCombiner::OptimizePointerDifference(Value *LHS, Value *RHS, } } } - + // Avoid duplicating the arithmetic if GEP2 has non-constant indices and // multiple users. if (GEP1 == 0 || (GEP2 != 0 && !GEP2->hasAllConstantIndices() && !GEP2->hasOneUse())) return 0; - + // Emit the offset of the GEP as an intptr_t. Value *Result = EmitGEPOffset(GEP1); - + // If we had a constant expression GEP on the other side offsetting the // pointer, subtract it from the offset we have. if (GEP2) { @@ -517,7 +506,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { // Replace (-1 - A) with (~A). if (match(Op0, m_AllOnes())) return BinaryOperator::CreateNot(Op1); - + if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) { // C - ~X == X + (1+C) Value *X = 0; @@ -553,18 +542,18 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { return &I; } - + { Value *Y; // X-(X+Y) == -Y X-(Y+X) == -Y if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) || match(Op1, m_Add(m_Value(Y), m_Specific(Op0)))) return BinaryOperator::CreateNeg(Y); - + // (X-Y)-X == -Y if (match(Op0, m_Sub(m_Specific(Op1), m_Value(Y)))) return BinaryOperator::CreateNeg(Y); } - + if (Op1->hasOneUse()) { Value *X = 0, *Y = 0, *Z = 0; Constant *C = 0; @@ -581,7 +570,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { match(Op1, m_And(m_Specific(Op0), m_Value(Y)))) return BinaryOperator::CreateAnd(Op0, Builder->CreateNot(Y, Y->getName() + ".not")); - + // 0 - (X sdiv C) -> (X sdiv -C) if (match(Op1, m_SDiv(m_Value(X), m_Constant(C))) && match(Op0, m_Zero())) @@ -604,14 +593,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { C = ConstantExpr::getSub(One, ConstantExpr::getShl(One, CI)); return BinaryOperator::CreateMul(Op0, C); } - + // X - A*-B -> X + A*B // X - -A*B -> X + A*B Value *A, *B; if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) || match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B)))) return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B)); - + // X - A*CI -> X + A*-CI // X - CI*A -> X + A*-CI if (match(Op1, m_Mul(m_Value(A), m_ConstantInt(CI))) || @@ -630,7 +619,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { if (X == dyn_castFoldableMul(Op1, C2)) return BinaryOperator::CreateMul(X, ConstantExpr::getSub(C1, C2)); } - + // Optimize differences of pointers into the same array into a size. Consider: // &A[10] - &A[0]: we should compile this to "10". if (TD) { @@ -639,20 +628,23 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) { match(Op1, m_PtrToInt(m_Value(RHSOp)))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) return ReplaceInstUsesWith(I, Res); - + // trunc(p)-trunc(q) -> trunc(p-q) if (match(Op0, m_Trunc(m_PtrToInt(m_Value(LHSOp)))) && match(Op1, m_Trunc(m_PtrToInt(m_Value(RHSOp))))) if (Value *Res = OptimizePointerDifference(LHSOp, RHSOp, I.getType())) return ReplaceInstUsesWith(I, Res); } - + return 0; } Instruction *InstCombiner::visitFSub(BinaryOperator &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); + if (Value *V = SimplifyFSubInst(Op0, Op1, I.getFastMathFlags(), TD)) + return ReplaceInstUsesWith(I, V); + // If this is a 'B = x-(-A)', change to B = x+A...
if (Value *V = dyn_castFNegVal(Op1)) return BinaryOperator::CreateFAdd(Op0, V); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 784742f274..ba4a57329d 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -16,9 +16,11 @@ #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/DataLayout.h" #include "llvm/Support/CallSite.h" +#include "llvm/Support/PatternMatch.h" #include "llvm/Transforms/Utils/BuildLibCalls.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +using namespace PatternMatch; STATISTIC(NumSimplified, "Number of library calls simplified"); @@ -276,25 +278,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { return ReplaceInstUsesWith(CI, ConstantInt::get(CI.getType(), Size)); return 0; } - case Intrinsic::bswap: + case Intrinsic::bswap: { + Value *IIOperand = II->getArgOperand(0); + Value *X = 0; + // bswap(bswap(x)) -> x - if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) - if (Operand->getIntrinsicID() == Intrinsic::bswap) - return ReplaceInstUsesWith(CI, Operand->getArgOperand(0)); + if (match(IIOperand, m_BSwap(m_Value(X)))) + return ReplaceInstUsesWith(CI, X); // bswap(trunc(bswap(x))) -> trunc(lshr(x, c)) - if (TruncInst *TI = dyn_cast<TruncInst>(II->getArgOperand(0))) { - if (IntrinsicInst *Operand = dyn_cast<IntrinsicInst>(TI->getOperand(0))) - if (Operand->getIntrinsicID() == Intrinsic::bswap) { - unsigned C = Operand->getType()->getPrimitiveSizeInBits() - - TI->getType()->getPrimitiveSizeInBits(); - Value *CV = ConstantInt::get(Operand->getType(), C); - Value *V = Builder->CreateLShr(Operand->getArgOperand(0), CV); - return new TruncInst(V, TI->getType()); - } + if (match(IIOperand, m_Trunc(m_BSwap(m_Value(X))))) { + unsigned C = X->getType()->getPrimitiveSizeInBits() - + IIOperand->getType()->getPrimitiveSizeInBits(); + Value *CV = ConstantInt::get(X->getType(), C); + Value *V = Builder->CreateLShr(X, CV); + return new TruncInst(V, IIOperand->getType()); } - break; + break; + } + case Intrinsic::powi: if (ConstantInt *Power = dyn_cast<ConstantInt>(II->getArgOperand(1))) { // powi(x, 0) -> 1.0 @@ -693,7 +695,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) { if (Splat->isOne()) { if (Zext) return CastInst::CreateZExtOrBitCast(Arg0, II->getType()); - // else + // else return CastInst::CreateSExtOrBitCast(Arg0, II->getType()); } } @@ -899,7 +901,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { new StoreInst(ConstantInt::getTrue(Callee->getContext()), UndefValue::get(Type::getInt1PtrTy(Callee->getContext())), OldCall); - // If OldCall dues not return void then replaceAllUsesWith undef. + // If OldCall does not return void then replaceAllUsesWith undef. // This allows ValueHandlers and custom metadata to adjust itself. if (!OldCall->getType()->isVoidTy()) ReplaceInstUsesWith(*OldCall, UndefValue::get(OldCall->getType())); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 5cd611c420..964297a5ea 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -37,7 +37,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { if (match(V, m_LShr(m_OneUse(m_Shl(m_Value(PowerOf2), m_Value(A))), m_Value(B))) && // The "1" can be any value known to be a power of 2.
- isPowerOfTwo(PowerOf2, IC.getDataLayout())) { + isKnownToBeAPowerOfTwo(PowerOf2)) { A = IC.Builder->CreateSub(A, B); return IC.Builder->CreateShl(PowerOf2, A); } @@ -45,8 +45,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it // inexact. Similarly for <<. if (BinaryOperator *I = dyn_cast<BinaryOperator>(V)) - if (I->isLogicalShift() && - isPowerOfTwo(I->getOperand(0), IC.getDataLayout())) { + if (I->isLogicalShift() && isKnownToBeAPowerOfTwo(I->getOperand(0))) { // We know that this is an exact/nuw shift and that the input is a // non-zero context as well. if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC)) { @@ -296,20 +295,11 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); - // Simplify mul instructions with a constant RHS. - if (Constant *Op1C = dyn_cast<Constant>(Op1)) { - if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1C)) { - // "In IEEE floating point, x*1 is not equivalent to x for nans. However, - // ANSI says we can drop signals, so we can do this anyway." (from GCC) - if (Op1F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); // Eliminate 'fmul double %X, 1.0' - } else if (ConstantDataVector *Op1V = dyn_cast<ConstantDataVector>(Op1C)) { - // As above, vector X*splat(1.0) -> X in all defined cases. - if (ConstantFP *F = dyn_cast_or_null<ConstantFP>(Op1V->getSplatValue())) - if (F->isExactlyValue(1.0)) - return ReplaceInstUsesWith(I, Op0); - } + if (Value *V = SimplifyFMulInst(Op0, Op1, I.getFastMathFlags(), TD)) + return ReplaceInstUsesWith(I, V); + // Simplify mul instructions with a constant RHS. + if (isa<Constant>(Op1)) { // Try to fold constant mul into select arguments. if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) @@ -351,6 +341,38 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } + // X * cond ? 1.0 : 0.0 => cond ? X : 0.0 + if (I.hasNoNaNs() && I.hasNoSignedZeros()) { + Value *V0 = I.getOperand(0); + Value *V1 = I.getOperand(1); + Value *Cond, *SLHS, *SRHS; + bool Match = false; + + if (match(V0, m_Select(m_Value(Cond), m_Value(SLHS), m_Value(SRHS)))) { + Match = true; + } else if (match(V1, m_Select(m_Value(Cond), m_Value(SLHS), + m_Value(SRHS)))) { + Match = true; + std::swap(V0, V1); + } + + if (Match) { + ConstantFP *C0 = dyn_cast<ConstantFP>(SLHS); + ConstantFP *C1 = dyn_cast<ConstantFP>(SRHS); + + if (C0 && C1 && + ((C0->isZero() && C1->isExactlyValue(1.0)) || + (C1->isZero() && C0->isExactlyValue(1.0)))) { + Value *T; + if (C0->isZero()) + T = Builder->CreateSelect(Cond, SLHS, V1); + else + T = Builder->CreateSelect(Cond, V1, SRHS); + return ReplaceInstUsesWith(I, T); + } + } + } + return Changed ? 
&I : 0; } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 08aedb3200..13653183a7 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -858,8 +858,8 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, Value *VarX = Shr->getOperand(0); Type *Ty = VarX->getType(); - APInt BitMask1(Ty->getIntegerBitWidth(), (uint64_t)-1); - APInt BitMask2(Ty->getIntegerBitWidth(), (uint64_t)-1); + APInt BitMask1(APInt::getAllOnesValue(Ty->getIntegerBitWidth())); + APInt BitMask2(APInt::getAllOnesValue(Ty->getIntegerBitWidth())); bool isLshr = (Shr->getOpcode() == Instruction::LShr); BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) : @@ -891,6 +891,8 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr, Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt); New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) : BinaryOperator::CreateAShr(VarX, Amt); + if (cast<BinaryOperator>(Shr)->isExact()) + New->setIsExact(true); } return InsertNewInstWith(New, *Shl); diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index f095cff33c..e0c610ffa4 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Triple.h" #include "llvm/DataLayout.h" +#include "llvm/DIBuilder.h" #include "llvm/Function.h" #include "llvm/IRBuilder.h" #include "llvm/InlineAsm.h" @@ -38,6 +39,7 @@ #include "llvm/Support/system_error.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include "llvm/Type.h" #include <algorithm> @@ -1158,6 +1160,7 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) { SmallVector<Instruction*, 8> RetVec; uint64_t TotalSize = 0; bool HavePoisonedAllocas = false; + DIBuilder DIB(*F.getParent()); // Filter out Alloca instructions we want (and can) handle. // Collect Ret instructions. @@ -1228,6 +1231,7 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) { Value *NewAllocaPtr = IRB.CreateIntToPtr( IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)), AI->getType()); + replaceDbgDeclareForAlloca(AI, NewAllocaPtr, DIB); AI->replaceAllUsesWith(NewAllocaPtr); // Analyze lifetime intrinsics only for static allocas we handle. if (CheckLifetime) diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 947a2e3b12..59902269a0 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -76,6 +76,7 @@ static const uint64_t kShadowMask32 = 1ULL << 31; static const uint64_t kShadowMask64 = 1ULL << 46; static const uint64_t kOriginOffset32 = 1ULL << 30; static const uint64_t kOriginOffset64 = 1ULL << 45; +static const uint64_t kShadowTLSAlignment = 8; // This is an important flag that makes the reports much more // informative at the cost of greater slowdown. Not fully implemented @@ -132,14 +133,14 @@ namespace { /// MemorySanitizer: instrument the code in module to find /// uninitialized reads. 
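// [Editorial sketch] The class of bug the pass above reports; hypothetical
// example code, not part of the patch.
bool Flag;
void useOfUninit() {
  int x;          // shadow(x) starts all-ones: fully uninitialized
  int y = x + 1;  // arithmetic only propagates the shadow; no report yet
  if (y > 0)      // branching on a poisoned value is the reported use
    Flag = true;
}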
class MemorySanitizer : public FunctionPass { -public: + public: MemorySanitizer() : FunctionPass(ID), TD(0), WarningFn(0) { } const char *getPassName() const { return "MemorySanitizer"; } bool runOnFunction(Function &F); bool doInitialization(Module &M); static char ID; // Pass identification, replacement for typeid. -private: + private: void initializeCallbacks(Module &M); DataLayout *TD; @@ -241,8 +242,8 @@ void MemorySanitizer::initializeCallbacks(Module &M) { MsanPoisonStackFn = M.getOrInsertFunction( "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); MemmoveFn = M.getOrInsertFunction( - "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), - IntptrTy, NULL); + "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, NULL); MemcpyFn = M.getOrInsertFunction( "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, NULL); @@ -377,7 +378,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { // An unfortunate workaround for asymmetric lowering of va_arg stuff. // See a comment in visitCallSite for more details. - static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 + static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 static const unsigned AMD64FpEndOffset = 176; struct ShadowOriginAndInsertPoint { @@ -409,7 +410,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = getShadow(Val); Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB); - StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); + StoreInst *NewSI = + IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment()); DEBUG(dbgs() << " STORE: " << *NewSI << "\n"); (void)NewSI; // If the store is volatile, add a check. @@ -420,7 +422,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { if (ClTrackOrigins) { if (ClStoreCleanOrigin || isa<StructType>(Shadow->getType())) { - IRB.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRB), I.getAlignment()); + IRB.CreateStore(getOrigin(Val), getOriginPtr(Addr, IRB)); } else { Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB); @@ -434,10 +436,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Cmp = IRB.CreateICmpNE(ConvertedShadow, getCleanShadow(ConvertedShadow), "_mscmp"); Instruction *CheckTerm = - SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false, MS.OriginStoreWeights); - IRBuilder<> IRBNewBlock(CheckTerm); - IRBNewBlock.CreateAlignedStore(getOrigin(Val), - getOriginPtr(Addr, IRBNewBlock), I.getAlignment()); + SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false, + MS.OriginStoreWeights); + IRBuilder<> IRBNew(CheckTerm); + IRBNew.CreateStore(getOrigin(Val), getOriginPtr(Addr, IRBNew)); } } } @@ -768,7 +770,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns)); } - //------------------- Visitors. + // ------------------- Visitors. 
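// [Editorial sketch] How a direct shadow mapping of the kind suggested by the
// kShadowMask64 = 1ULL << 46 constant above can work: clearing a single
// address bit lands on the shadow copy of any application byte with one AND.
// This standalone model is an assumption for illustration, not the patch's
// getShadowPtr implementation.
#include <cstdint>
uint64_t exampleShadowAddr(uint64_t AppAddr) {
  const uint64_t kShadowMask64 = 1ULL << 46;
  return AppAddr & ~kShadowMask64; // hypothetical app-to-shadow translation
}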
/// \brief Instrument LoadInst /// @@ -786,7 +788,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { insertCheck(I.getPointerOperand(), &I); if (ClTrackOrigins) - setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), I.getAlignment())); + setOrigin(&I, IRB.CreateLoad(getOriginPtr(Addr, IRB))); } /// \brief Instrument StoreInst @@ -918,67 +920,133 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { setOriginForNaryOp(I); } - /// \brief Propagate origin for an instruction. + /// \brief Default propagation of shadow and/or origin. /// - /// This is a general case of origin propagation. For an Nary operation, - /// is set to the origin of an argument that is not entirely initialized. - /// If there is more than one such arguments, the rightmost of them is picked. - /// It does not matter which one is picked if all arguments are initialized. - void setOriginForNaryOp(Instruction &I) { - if (!ClTrackOrigins) return; - IRBuilder<> IRB(&I); - Value *Origin = getOrigin(&I, 0); - for (unsigned Op = 1, n = I.getNumOperands(); Op < n; ++Op) { - Value *S = convertToShadowTyNoVec(getShadow(&I, Op), IRB); - Origin = IRB.CreateSelect(IRB.CreateICmpNE(S, getCleanShadow(S)), - getOrigin(&I, Op), Origin); + /// This class implements the general case of shadow propagation, used in all + /// cases where we don't know and/or don't care about what the operation + /// actually does. It converts all input shadow values to a common type + /// (extending or truncating as necessary), and bitwise OR's them. + /// + /// This is much cheaper than inserting checks (i.e. requiring inputs to be + /// fully initialized), and less prone to false positives. + /// + /// This class also implements the general case of origin propagation. For an + /// N-ary operation, result origin is set to the origin of an argument that is + /// not entirely initialized. If there is more than one such argument, the + /// rightmost of them is picked. It does not matter which one is picked if all + /// arguments are initialized. + template <bool CombineShadow> + class Combiner { + Value *Shadow; + Value *Origin; + IRBuilder<> &IRB; + MemorySanitizerVisitor *MSV; + + public: + Combiner(MemorySanitizerVisitor *MSV, IRBuilder<> &IRB) : + Shadow(0), Origin(0), IRB(IRB), MSV(MSV) {} + + /// \brief Add a pair of shadow and origin values to the mix. + Combiner &Add(Value *OpShadow, Value *OpOrigin) { + if (CombineShadow) { + assert(OpShadow); + if (!Shadow) + Shadow = OpShadow; + else { + OpShadow = MSV->CreateShadowCast(IRB, OpShadow, Shadow->getType()); + Shadow = IRB.CreateOr(Shadow, OpShadow, "_msprop"); + } + } + + if (ClTrackOrigins) { + assert(OpOrigin); + if (!Origin) { + Origin = OpOrigin; + } else { + Value *FlatShadow = MSV->convertToShadowTyNoVec(OpShadow, IRB); + Value *Cond = IRB.CreateICmpNE(FlatShadow, + MSV->getCleanShadow(FlatShadow)); + Origin = IRB.CreateSelect(Cond, OpOrigin, Origin); + } + } + return *this; } - setOrigin(&I, Origin); - } - /// \brief Propagate shadow for a binary operation. - /// - /// Shadow = Shadow0 | Shadow1, all 3 must have the same type. - /// Bitwise OR is selected as an operation that will never lose even a bit of - /// poison. - void handleShadowOrBinary(Instruction &I) { + /// \brief Add an application value to the mix. + Combiner &Add(Value *V) { + Value *OpShadow = MSV->getShadow(V); + Value *OpOrigin = ClTrackOrigins ?
MSV->getOrigin(V) : 0; + return Add(OpShadow, OpOrigin); + } + + /// \brief Set the current combined values as the given instruction's shadow + /// and origin. + void Done(Instruction *I) { + if (CombineShadow) { + assert(Shadow); + Shadow = MSV->CreateShadowCast(IRB, Shadow, MSV->getShadowTy(I)); + MSV->setShadow(I, Shadow); + } + if (ClTrackOrigins) { + assert(Origin); + MSV->setOrigin(I, Origin); + } + } + }; + + typedef Combiner<true> ShadowAndOriginCombiner; + typedef Combiner<false> OriginCombiner; + + /// \brief Propagate origin for arbitrary operation. + void setOriginForNaryOp(Instruction &I) { + if (!ClTrackOrigins) return; IRBuilder<> IRB(&I); - Value *Shadow0 = getShadow(&I, 0); - Value *Shadow1 = getShadow(&I, 1); - setShadow(&I, IRB.CreateOr(Shadow0, Shadow1, "_msprop")); - setOriginForNaryOp(I); + OriginCombiner OC(this, IRB); + for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI) + OC.Add(OI->get()); + OC.Done(&I); + } + + size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) { + return Ty->isVectorTy() ? + Ty->getVectorNumElements() * Ty->getScalarSizeInBits() : + Ty->getPrimitiveSizeInBits(); + } + + /// \brief Cast between two shadow types, extending or truncating as + /// necessary. + Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy) { + Type *srcTy = V->getType(); + if (dstTy->isIntegerTy() && srcTy->isIntegerTy()) + return IRB.CreateIntCast(V, dstTy, false); + if (dstTy->isVectorTy() && srcTy->isVectorTy() && + dstTy->getVectorNumElements() == srcTy->getVectorNumElements()) + return IRB.CreateIntCast(V, dstTy, false); + size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); + size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); + Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits)); + Value *V2 = + IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), false); + return IRB.CreateBitCast(V2, dstTy); + // TODO: handle struct types. } /// \brief Propagate shadow for arbitrary operation. - /// - /// This is a general case of shadow propagation, used in all cases where we - /// don't know and/or care about what the operation actually does. - /// It converts all input shadow values to a common type (extending or - /// truncating as necessary), and bitwise OR's them. - /// - /// This is much cheaper than inserting checks (i.e. requiring inputs to be - /// fully initialized), and less prone to false positives. - // FIXME: is the casting actually correct? - // FIXME: merge this with handleShadowOrBinary. 
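// [Editorial sketch] Why OR is the right combiner: a set shadow bit means
// "possibly uninitialized", and OR can only add poison, never drop it.
// Hypothetical standalone model, not part of the patch.
#include <cstdint>
uint32_t combineShadows(uint32_t S0, uint32_t S1) {
  return S0 | S1; // poison from either operand survives into the result
}
// e.g. combineShadows(0x0000FF00, 0x000000F0) == 0x0000FFF0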
void handleShadowOr(Instruction &I) { IRBuilder<> IRB(&I); - Value *Shadow = getShadow(&I, 0); - for (unsigned Op = 1, n = I.getNumOperands(); Op < n; ++Op) - Shadow = IRB.CreateOr( - Shadow, IRB.CreateIntCast(getShadow(&I, Op), Shadow->getType(), false), - "_msprop"); - Shadow = IRB.CreateIntCast(Shadow, getShadowTy(&I), false); - setShadow(&I, Shadow); - setOriginForNaryOp(I); + ShadowAndOriginCombiner SC(this, IRB); + for (Instruction::op_iterator OI = I.op_begin(); OI != I.op_end(); ++OI) + SC.Add(OI->get()); + SC.Done(&I); } - void visitFAdd(BinaryOperator &I) { handleShadowOrBinary(I); } - void visitFSub(BinaryOperator &I) { handleShadowOrBinary(I); } - void visitFMul(BinaryOperator &I) { handleShadowOrBinary(I); } - void visitAdd(BinaryOperator &I) { handleShadowOrBinary(I); } - void visitSub(BinaryOperator &I) { handleShadowOrBinary(I); } - void visitXor(BinaryOperator &I) { handleShadowOrBinary(I); } - void visitMul(BinaryOperator &I) { handleShadowOrBinary(I); } + void visitFAdd(BinaryOperator &I) { handleShadowOr(I); } + void visitFSub(BinaryOperator &I) { handleShadowOr(I); } + void visitFMul(BinaryOperator &I) { handleShadowOr(I); } + void visitAdd(BinaryOperator &I) { handleShadowOr(I); } + void visitSub(BinaryOperator &I) { handleShadowOr(I); } + void visitXor(BinaryOperator &I) { handleShadowOr(I); } + void visitMul(BinaryOperator &I) { handleShadowOr(I); } void handleDiv(Instruction &I) { IRBuilder<> IRB(&I); @@ -1155,9 +1223,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { void visitIntrinsicInst(IntrinsicInst &I) { switch (I.getIntrinsicID()) { case llvm::Intrinsic::bswap: - handleBswap(I); break; + handleBswap(I); + break; default: - visitInstruction(I); break; + visitInstruction(I); + break; } } @@ -1226,7 +1296,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Size, Alignment); } else { Size = MS.TD->getTypeAllocSize(A->getType()); - Store = IRB.CreateStore(ArgShadow, ArgShadowBase); + Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase, + kShadowTLSAlignment); } if (ClTrackOrigins) IRB.CreateStore(getOrigin(A), @@ -1248,7 +1319,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { IRBuilder<> IRBBefore(&I); // Until we have full dynamic coverage, make sure the retval shadow is 0.
Value *Base = getShadowPtrForRetval(&I, IRBBefore); - IRBBefore.CreateStore(getCleanShadow(&I), Base); + IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base, kShadowTLSAlignment); Instruction *NextInsn = 0; if (CS.isCall()) { NextInsn = I.getNextNode(); @@ -1267,8 +1338,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { "Could not find insertion point for retval shadow load"); } IRBuilder<> IRBAfter(NextInsn); - setShadow(&I, IRBAfter.CreateLoad(getShadowPtrForRetval(&I, IRBAfter), - "_msret")); + Value *RetvalShadow = + IRBAfter.CreateAlignedLoad(getShadowPtrForRetval(&I, IRBAfter), + kShadowTLSAlignment, "_msret"); + setShadow(&I, RetvalShadow); if (ClTrackOrigins) setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter))); } @@ -1280,7 +1353,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { Value *Shadow = getShadow(RetVal); Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB); DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n"); - IRB.CreateStore(Shadow, ShadowPtr); + IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment); if (ClTrackOrigins) IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB)); } @@ -1407,7 +1480,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { struct VarArgAMD64Helper : public VarArgHelper { // An unfortunate workaround for asymmetric lowering of va_arg stuff. // See a comment in visitCallSite for more details. - static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 + static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7 static const unsigned AMD64FpEndOffset = 176; Function &F; @@ -1471,7 +1544,7 @@ struct VarArgAMD64Helper : public VarArgHelper { Base = getShadowPtrForVAArgument(A, IRB, OverflowOffset); OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8); } - IRB.CreateStore(MSV.getShadow(A), Base); + IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment); } Constant *OverflowSize = ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset); diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 1c220ca0f6..a0ee849a03 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -2150,7 +2150,7 @@ static bool isIntegerWideningViable(const DataLayout &TD, !canConvertValue(TD, ValueTy, AllocaTy)) return false; } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { - if (MI->isVolatile()) + if (MI->isVolatile() || !isa<Constant>(MI->getLength())) return false; if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { const AllocaPartitioning::MemTransferOffsets &MTO @@ -2223,6 +2223,84 @@ static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, return V; } +static Value *extractVector(IRBuilder<> &IRB, Value *V, + unsigned BeginIndex, unsigned EndIndex, + const Twine &Name) { + VectorType *VecTy = cast<VectorType>(V->getType()); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + + if (NumElements == VecTy->getNumElements()) + return V; + + if (NumElements == 1) { + V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), + Name + ".extract"); + DEBUG(dbgs() << " extract: " << *V << "\n"); + return V; + } + + SmallVector<Constant*, 8> Mask; + Mask.reserve(NumElements); + for (unsigned i = BeginIndex; i != EndIndex; ++i) + Mask.push_back(IRB.getInt32(i)); + V = 
IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(Mask), + Name + ".extract"); + DEBUG(dbgs() << " shuffle: " << *V << "\n"); + return V; +} + +static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V, + unsigned BeginIndex, const Twine &Name) { + VectorType *VecTy = cast<VectorType>(Old->getType()); + assert(VecTy && "Can only insert a vector into a vector"); + + VectorType *Ty = dyn_cast<VectorType>(V->getType()); + if (!Ty) { + // Single element to insert. + V = IRB.CreateInsertElement(Old, V, IRB.getInt32(BeginIndex), + Name + ".insert"); + DEBUG(dbgs() << " insert: " << *V << "\n"); + return V; + } + + assert(Ty->getNumElements() <= VecTy->getNumElements() && + "Too many elements!"); + if (Ty->getNumElements() == VecTy->getNumElements()) { + assert(V->getType() == VecTy && "Vector type mismatch"); + return V; + } + unsigned EndIndex = BeginIndex + Ty->getNumElements(); + + // When inserting a smaller vector into the larger to store, we first + // use a shuffle vector to widen it with undef elements, and then + // a second shuffle vector to select between the loaded vector and the + // incoming vector. + SmallVector<Constant*, 8> Mask; + Mask.reserve(VecTy->getNumElements()); + for (unsigned i = 0; i != VecTy->getNumElements(); ++i) + if (i >= BeginIndex && i < EndIndex) + Mask.push_back(IRB.getInt32(i - BeginIndex)); + else + Mask.push_back(UndefValue::get(IRB.getInt32Ty())); + V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), + ConstantVector::get(Mask), + Name + ".expand"); + DEBUG(dbgs() << " shuffle1: " << *V << "\n"); + + Mask.clear(); + for (unsigned i = 0; i != VecTy->getNumElements(); ++i) + if (i >= BeginIndex && i < EndIndex) + Mask.push_back(IRB.getInt32(i)); + else + Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); + V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask), + Name + "insert"); + DEBUG(dbgs() << " shuffle2: " << *V << "\n"); + return V; +} + namespace { /// \brief Visitor to rewrite instructions using a partition of an alloca to /// use a new alloca. 
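// [Editorial sketch] A standalone model of the two masks insertVector builds
// when inserting an M-element vector at BeginIndex into an N-element vector;
// -1 stands for an undef lane. Hypothetical code mirroring the logic above,
// not part of the patch.
#include <vector>
std::vector<int> widenMask(unsigned N, unsigned B, unsigned E) {
  std::vector<int> M;
  for (unsigned i = 0; i != N; ++i)
    M.push_back(i >= B && i < E ? int(i - B) : -1); // place V's lanes, pad undef
  return M;
}
std::vector<int> selectMask(unsigned N, unsigned B, unsigned E) {
  std::vector<int> M;
  for (unsigned i = 0; i != N; ++i)
    M.push_back(i >= B && i < E ? int(i) : int(i + N)); // V's lanes, else Old's
  return M;
}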
@@ -2388,29 +2466,14 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB, LoadInst &LI, Value *OldOp) { - Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); + Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) { unsigned BeginIndex = getIndex(BeginOffset); unsigned EndIndex = getIndex(EndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); - unsigned NumElements = EndIndex - BeginIndex; - assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - if (NumElements == 1) { - V = IRB.CreateExtractElement(V, IRB.getInt32(BeginIndex), - getName(".extract")); - DEBUG(dbgs() << " extract: " << *V << "\n"); - } else if (NumElements < VecTy->getNumElements()) { - SmallVector<Constant*, 8> Mask; - Mask.reserve(NumElements); - for (unsigned i = BeginIndex; i != EndIndex; ++i) - Mask.push_back(IRB.getInt32(i)); - V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - getName(".extract")); - DEBUG(dbgs() << " shuffle: " << *V << "\n"); - } - return V; + + Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec")); } Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { @@ -2457,7 +2520,7 @@ private: bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(IRB, LI, OldOp); + V = rewriteVectorizedLoadInst(IRB); } else if (IntTy && LI.getType()->isIntegerTy()) { V = rewriteIntegerLoad(IRB, LI); } else if (BeginOffset == NewAllocaBeginOffset && @@ -2518,44 +2581,12 @@ private: : VectorType::get(ElementTy, NumElements); if (V->getType() != PartitionTy) V = convertValue(TD, IRB, V, PartitionTy); - if (NumElements < VecTy->getNumElements()) { - // We need to mix in the existing elements. - LoadInst *LI = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); - if (NumElements == 1) { - V = IRB.CreateInsertElement(LI, V, IRB.getInt32(BeginIndex), - getName(".insert")); - DEBUG(dbgs() << " insert: " << *V << "\n"); - } else { - // When inserting a smaller vector into the larger to store, we first - // use a shuffle vector to widen it with undef elements, and then - // a second shuffle vector to select between the loaded vector and the - // incoming vector. - SmallVector<Constant*, 8> Mask; - Mask.reserve(VecTy->getNumElements()); - for (unsigned i = 0; i != VecTy->getNumElements(); ++i) - if (i >= BeginIndex && i < EndIndex) - Mask.push_back(IRB.getInt32(i - BeginIndex)); - else - Mask.push_back(UndefValue::get(IRB.getInt32Ty())); - V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), - ConstantVector::get(Mask), - getName(".expand")); - DEBUG(dbgs() << " shuffle1: " << *V << "\n"); - - Mask.clear(); - for (unsigned i = 0; i != VecTy->getNumElements(); ++i) - if (i >= BeginIndex && i < EndIndex) - Mask.push_back(IRB.getInt32(i)); - else - Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); - V = IRB.CreateShuffleVector(V, LI, ConstantVector::get(Mask), - getName("insert")); - DEBUG(dbgs() << " shuffle2: " << *V << "\n"); - } - } else { - V = convertValue(TD, IRB, V, VecTy); - } + + // Mix in the existing elements. 
+ Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + V = insertVector(IRB, Old, V, BeginIndex, getName(".vec")); + StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); @@ -2607,7 +2638,7 @@ private: TD.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); assert(V->getType()->getIntegerBitWidth() == - TD.getTypeSizeInBits(OldAI.getAllocatedType()) && + TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && "Only alloca-wide stores can be split and recomposed"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, @@ -2639,6 +2670,51 @@ private: return NewSI->getPointerOperand() == &NewAI && !SI.isVolatile(); } + /// \brief Compute an integer value from splatting an i8 across the given + /// number of bytes. + /// + /// Note that this routine assumes an i8 is a byte. If that isn't true, don't + /// call this routine. + /// FIXME: Heed the advice above. + /// + /// \param V The i8 value to splat. + /// \param Size The number of bytes in the output (assuming i8 is one byte) + Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { + assert(Size > 0 && "Expected a positive number of bytes."); + IntegerType *VTy = cast<IntegerType>(V->getType()); + assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); + if (Size == 1) + return V; + + Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); + V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), + ConstantExpr::getUDiv( + Constant::getAllOnesValue(SplatIntTy), + ConstantExpr::getZExt( + Constant::getAllOnesValue(V->getType()), + SplatIntTy)), + getName(".isplat")); + return V; + } + + /// \brief Compute a vector splat for a given element value. + Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) { + assert(NumElements > 0 && "Cannot splat to an empty vector."); + + // First insert it into a one-element vector so we can shuffle it. It is + // really silly that LLVM's IR requires this in order to form a splat. + Value *Undef = UndefValue::get(VectorType::get(V->getType(), 1)); + V = IRB.CreateInsertElement(Undef, V, IRB.getInt32(0), + getName(".splatinsert")); + + // Shuffle the value across the desired number of elements. + SmallVector<Constant*, 8> Mask(NumElements, IRB.getInt32(0)); + V = IRB.CreateShuffleVector(V, Undef, ConstantVector::get(Mask), + getName(".splat")); + DEBUG(dbgs() << " splat: " << *V << "\n"); + return V; + } + bool visitMemSetInst(MemSetInst &II) { DEBUG(dbgs() << " original: " << II << "\n"); IRBuilder<> IRB(&II); @@ -2667,7 +2743,8 @@ private: (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaEndOffset || !AllocaTy->isSingleValueType() || - !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)))) { + !TD.isLegalInteger(TD.getTypeSizeInBits(ScalarTy)) || + TD.getTypeSizeInBits(ScalarTy)%8 != 0)) { Type *SizeTy = II.getLength()->getType(); Constant *Size = ConstantInt::get(SizeTy, EndOffset - BeginOffset); CallInst *New @@ -2683,53 +2760,62 @@ private: // If we can represent this as a simple value, we have to build the actual // value to store, which requires expanding the byte present in memset to // a sensible representation for the alloca type. This is essentially - // splatting the byte to a sufficiently wide integer, bitcasting to the - // desired scalar type, and splatting it across any desired vector type.
+ // splatting the byte to a sufficiently wide integer, splatting it across + // any desired vector width, and bitcasting to the final type. uint64_t Size = EndOffset - BeginOffset; - Value *V = II.getValue(); - IntegerType *VTy = cast<IntegerType>(V->getType()); - Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); - if (Size*8 > VTy->getBitWidth()) - V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), - ConstantExpr::getUDiv( - Constant::getAllOnesValue(SplatIntTy), - ConstantExpr::getZExt( - Constant::getAllOnesValue(V->getType()), - SplatIntTy)), - getName(".isplat")); - - // If this is an element-wide memset of a vectorizable alloca, insert it. - if (VecTy && (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset)) { - if (V->getType() != ScalarTy) - V = convertValue(TD, IRB, V, ScalarTy); - StoreInst *Store = IRB.CreateAlignedStore( - IRB.CreateInsertElement(IRB.CreateAlignedLoad(&NewAI, - NewAI.getAlignment(), - getName(".load")), - V, IRB.getInt32(getIndex(BeginOffset)), - getName(".insert")), - &NewAI, NewAI.getAlignment()); - (void)Store; - DEBUG(dbgs() << " to: " << *Store << "\n"); - return true; - } + Value *V = getIntegerSplat(IRB, II.getValue(), Size); + + if (VecTy) { + // If this is a memset of a vectorized alloca, insert it. + assert(ElementTy == ScalarTy); + + unsigned BeginIndex = getIndex(BeginOffset); + unsigned EndIndex = getIndex(EndOffset); + assert(EndIndex > BeginIndex && "Empty vector!"); + unsigned NumElements = EndIndex - BeginIndex; + assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); + + Value *Splat = getIntegerSplat(IRB, II.getValue(), + TD.getTypeSizeInBits(ElementTy)/8); + Splat = convertValue(TD, IRB, Splat, ElementTy); + if (NumElements > 1) + Splat = getVectorSplat(IRB, Splat, NumElements); - // If this is a memset on an alloca where we can widen stores, insert the - // set integer. - if (IntTy && (BeginOffset > NewAllocaBeginOffset || - EndOffset < NewAllocaEndOffset)) { - assert(!II.isVolatile()); Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".oldload")); - Old = convertValue(TD, IRB, Old, IntTy); - assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); - uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); - } + V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec")); + } else if (IntTy) { + // If this is a memset on an alloca where we can widen stores, insert the + // set integer. + assert(!II.isVolatile()); - if (V->getType() != AllocaTy) + V = getIntegerSplat(IRB, II.getValue(), Size); + + if (IntTy && (BeginOffset != NewAllocaBeginOffset || + EndOffset != NewAllocaEndOffset)) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Old = convertValue(TD, IRB, Old, IntTy); + assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); + uint64_t Offset = BeginOffset - NewAllocaBeginOffset; + V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); + } else { + assert(V->getType() == IntTy && + "Wrong type for an alloca wide integer!"); + } V = convertValue(TD, IRB, V, AllocaTy); + } else { + // Established these invariants above.
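// [Editorial sketch] A standalone check of the multiply trick getIntegerSplat
// uses: (2^(8*Size) - 1) / 0xFF is the 0x0101...01 pattern, so multiplying by
// it replicates the byte. Hypothetical code, not part of the patch.
#include <cassert>
#include <cstdint>
uint64_t splatByte(uint8_t V, unsigned Size) {
  assert(Size >= 1 && Size <= 8 && "model is limited to 64 bits");
  uint64_t AllOnes = Size == 8 ? ~0ULL : (1ULL << (8 * Size)) - 1;
  return uint64_t(V) * (AllOnes / 0xFF); // e.g. splatByte(0xAB, 4) == 0xABABABAB
}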
+ assert(BeginOffset == NewAllocaBeginOffset); + assert(EndOffset == NewAllocaEndOffset); + + V = getIntegerSplat(IRB, II.getValue(), + TD.getTypeSizeInBits(ScalarTy)/8); + if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) + V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements()); + + V = convertValue(TD, IRB, V, AllocaTy); + } Value *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), II.isVolatile()); @@ -2814,37 +2900,22 @@ private: // Record this instruction for deletion. Pass.DeadInsts.insert(&II); - bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && - EndOffset == NewAllocaEndOffset; - bool IsVectorElement = VecTy && !IsWholeAlloca; - uint64_t Size = EndOffset - BeginOffset; - IntegerType *SubIntTy - = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; - - Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() - : II.getRawDest()->getType(); - if (!EmitMemCpy) { - if (IsVectorElement) - OtherPtrTy = VecTy->getElementType()->getPointerTo(); - else if (IntTy && !IsWholeAlloca) - OtherPtrTy = SubIntTy->getPointerTo(); - else - OtherPtrTy = NewAI.getType(); - } - - // Compute the other pointer, folding as much as possible to produce - // a single, simple GEP in most cases. - Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); - OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, - getName("." + OtherPtr->getName())); - // Strip all inbounds GEPs and pointer casts to try to dig out any root // alloca that should be re-examined after rewriting this instruction. + Value *OtherPtr = IsDest ? II.getRawSource() : II.getRawDest(); if (AllocaInst *AI = dyn_cast<AllocaInst>(OtherPtr->stripInBoundsOffsets())) Pass.Worklist.insert(AI); if (EmitMemCpy) { + Type *OtherPtrTy = IsDest ? II.getRawSource()->getType() + : II.getRawDest()->getType(); + + // Compute the other pointer, folding as much as possible to produce + // a single, simple GEP in most cases. + OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); + Value *OurPtr = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() : II.getRawSource()->getType()); @@ -2865,18 +2936,38 @@ private: if (!Align) Align = 1; - Value *SrcPtr = OtherPtr; + bool IsWholeAlloca = BeginOffset == NewAllocaBeginOffset && + EndOffset == NewAllocaEndOffset; + uint64_t Size = EndOffset - BeginOffset; + unsigned BeginIndex = VecTy ? getIndex(BeginOffset) : 0; + unsigned EndIndex = VecTy ? getIndex(EndOffset) : 0; + unsigned NumElements = EndIndex - BeginIndex; + IntegerType *SubIntTy + = IntTy ? Type::getIntNTy(IntTy->getContext(), Size*8) : 0; + + Type *OtherPtrTy = NewAI.getType(); + if (VecTy && !IsWholeAlloca) { + if (NumElements == 1) + OtherPtrTy = VecTy->getElementType(); + else + OtherPtrTy = VectorType::get(VecTy->getElementType(), NumElements); + + OtherPtrTy = OtherPtrTy->getPointerTo(); + } else if (IntTy && !IsWholeAlloca) { + OtherPtrTy = SubIntTy->getPointerTo(); + } + + Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, + getName("." + OtherPtr->getName())); Value *DstPtr = &NewAI; if (!IsDest) std::swap(SrcPtr, DstPtr); Value *Src; - if (IsVectorElement && !IsDest) { - // We have to extract rather than load. 
- Src = IRB.CreateExtractElement( - IRB.CreateAlignedLoad(SrcPtr, Align, getName(".copyload")), - IRB.getInt32(getIndex(BeginOffset)), - getName(".copyextract")); + if (VecTy && !IsWholeAlloca && !IsDest) { + Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")); + Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")); @@ -2889,7 +2980,11 @@ private: getName(".copyload")); } - if (IntTy && !IsWholeAlloca && IsDest) { + if (VecTy && !IsWholeAlloca && IsDest) { + Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".oldload")); + Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); + } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".oldload")); Old = convertValue(TD, IRB, Old, IntTy); @@ -2899,14 +2994,6 @@ private: Src = convertValue(TD, IRB, Src, NewAllocaTy); } - if (IsVectorElement && IsDest) { - // We have to insert into a loaded copy before storing. - Src = IRB.CreateInsertElement( - IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), getName(".load")), - Src, IRB.getInt32(getIndex(BeginOffset)), - getName(".insert")); - } - StoreInst *Store = cast<StoreInst>( IRB.CreateAlignedStore(Src, DstPtr, Align, II.isVolatile())); (void)Store; diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index 9160f04fe2..3af62ebcef 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -111,13 +111,11 @@ static bool markAliveBlocks(BasicBlock *BB, SmallVector<BasicBlock*, 128> Worklist; Worklist.push_back(BB); + Reachable.insert(BB); bool Changed = false; do { BB = Worklist.pop_back_val(); - if (!Reachable.insert(BB)) - continue; - // Do a quick scan of the basic block, turning any obviously unreachable // instructions into LLVM unreachable insts. The instruction combining pass // canonicalizes unreachable insts into stores to null or undef. @@ -176,7 +174,8 @@ static bool markAliveBlocks(BasicBlock *BB, Changed |= ConstantFoldTerminator(BB, true); for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI) - Worklist.push_back(*SI); + if (Reachable.insert(*SI)) + Worklist.push_back(*SI); } while (!Worklist.empty()); return Changed; } diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 0e56817a1b..58d973a61a 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -928,3 +928,38 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) { return 0; } + +bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress, + DIBuilder &Builder) { + DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI); + if (!DDI) + return false; + DIVariable DIVar(DDI->getVariable()); + if (!DIVar.Verify()) + return false; + + // Create a copy of the original DIDescriptor for the user variable, appending + // a "deref" operation to the list of address elements, as the new + // llvm.dbg.declare will take a value storing the address of the variable's + // memory, not the alloca itself.
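// [Editorial sketch] The worklist idiom the markAliveBlocks change above
// adopts: mark a block reachable when it is pushed, not when it is popped, so
// no block ever enters the worklist twice. Hypothetical standalone model.
#include <map>
#include <set>
#include <vector>
std::set<int> reachableFrom(int Entry, const std::multimap<int, int> &Succ) {
  std::set<int> Reachable;
  std::vector<int> Worklist;
  Worklist.push_back(Entry);
  Reachable.insert(Entry);
  do {
    int BB = Worklist.back();
    Worklist.pop_back();
    auto Range = Succ.equal_range(BB);
    for (auto I = Range.first; I != Range.second; ++I)
      if (Reachable.insert(I->second).second) // first sighting: enqueue once
        Worklist.push_back(I->second);
  } while (!Worklist.empty());
  return Reachable;
}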
+ Type *Int64Ty = Type::getInt64Ty(AI->getContext()); + SmallVector<Value*, 4> NewDIVarAddress; + if (DIVar.hasComplexAddress()) { + for (unsigned i = 0, n = DIVar.getNumAddrElements(); i < n; ++i) { + NewDIVarAddress.push_back( + ConstantInt::get(Int64Ty, DIVar.getAddrElement(i))); + } + } + NewDIVarAddress.push_back(ConstantInt::get(Int64Ty, DIBuilder::OpDeref)); + DIVariable NewDIVar = Builder.createComplexVariable( + DIVar.getTag(), DIVar.getContext(), DIVar.getName(), + DIVar.getFile(), DIVar.getLineNumber(), DIVar.getType(), + NewDIVarAddress, DIVar.getArgNumber()); + + // Insert llvm.dbg.declare in the same basic block as the original alloca, + // and remove old llvm.dbg.declare. + BasicBlock *BB = AI->getParent(); + Builder.insertDeclare(NewAllocaAddress, NewDIVar, BB); + DDI->eraseFromParent(); + return true; +} diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index feeececedb..d143f919ce 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -44,16 +44,17 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, cl::desc("Sets the SIMD width. Zero is autoselect.")); static cl::opt<bool> -EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden, +EnableIfConversion("enable-if-conversion", cl::init(false), cl::Hidden, cl::desc("Enable if-conversion during vectorization.")); namespace { /// The LoopVectorize Pass. struct LoopVectorize : public LoopPass { - static char ID; // Pass identification, replacement for typeid + /// Pass identification, replacement for typeid + static char ID; - LoopVectorize() : LoopPass(ID) { + explicit LoopVectorize() : LoopPass(ID) { initializeLoopVectorizePass(*PassRegistry::getPassRegistry()); } @@ -85,28 +86,27 @@ struct LoopVectorize : public LoopPass { } // Select the preferred vectorization factor. - unsigned VF = 1; - if (VectorizationFactor == 0) { - const VectorTargetTransformInfo *VTTI = 0; - if (TTI) - VTTI = TTI->getVectorTargetTransformInfo(); - // Use the cost model. - LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); - VF = CM.findBestVectorizationFactor(); - - if (VF == 1) { - DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); - return false; - } - - } else { - // Use the user command flag. - VF = VectorizationFactor; + const VectorTargetTransformInfo *VTTI = 0; + if (TTI) + VTTI = TTI->getVectorTargetTransformInfo(); + // Use the cost model. + LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + + // Check the function attributes to find out if this function should be + // optimized for size. + Function *F = L->getHeader()->getParent(); + Attributes::AttrVal SzAttr = Attributes::OptimizeForSize; + bool OptForSize = F->getFnAttributes().hasAttribute(SzAttr); + + unsigned VF = CM.selectVectorizationFactor(OptForSize, VectorizationFactor); + + if (VF == 1) { + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + return false; } DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ") in "<< - L->getHeader()->getParent()->getParent()->getModuleIdentifier()<< - "\n"); + F->getParent()->getModuleIdentifier()<<"\n"); // If we decided that it is *legal* to vectorize the loop then do it. InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF); @@ -407,27 +407,27 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { the vectorized instructions while the old loop will continue to run the scalar remainder. - [ ] <-- vector loop bypass.
- / | - / v + [ ] <-- vector loop bypass. + / | + / v | [ ] <-- vector pre header. | | | v | [ ] \ | [ ]_| <-- vector loop. | | - \ v - >[ ] <--- middle-block. - / | - / v + \ v + >[ ] <--- middle-block. + / | + / v | [ ] <--- new preheader. | | | v | [ ] \ | [ ]_| <-- old scalar loop to handle remainder. - \ | - \ v - >[ ] <-- exit block. + \ | + \ v + >[ ] <-- exit block. ... */ @@ -954,7 +954,7 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. - Value *Cond = createBlockInMask(P->getIncomingBlock(0)); + Value *Cond = createEdgeMask(P->getIncomingBlock(0), P->getParent()); WidenMap[P] = Builder.CreateSelect(Cond, getVectorValue(P->getIncomingValue(0)), @@ -1204,8 +1204,20 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { - /// Vectorize bitcasts. CastInst *CI = dyn_cast<CastInst>(it); + /// Optimize the special case where the source is the induction + /// variable. Notice that we can only optimize the 'trunc' case + /// because: a. FP conversions lose precision, b. sext/zext may wrap, + /// c. other casts depend on pointer size. + if (CI->getOperand(0) == OldInduction && + it->getOpcode() == Instruction::Trunc) { + Value *ScalarCast = Builder.CreateCast(CI->getOpcode(), Induction, + CI->getType()); + Value *Broadcasted = getBroadcastInstrs(ScalarCast); + WidenMap[it] = getConsecutiveVector(Broadcasted); + break; + } + /// Vectorize casts. Value *A = getVectorValue(it->getOperand(0)); Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF); WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy); @@ -1263,6 +1275,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) { BasicBlock *BB = LoopBlocks[i]; + // We don't support switch statements inside loops. + if (!isa<BranchInst>(BB->getTerminator())) + return false; + // We must have at most two predecessors because we need to convert // all PHIs to selects. unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); @@ -1832,6 +1848,15 @@ LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return NoInduction; } +bool LoopVectorizationLegality::isInductionVariable(const Value *V) { + Value *In0 = const_cast<Value*>(V); + PHINode *PN = dyn_cast_or_null<PHINode>(In0); + if (!PN) + return false; + + return Inductions.count(PN); +} + bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB) { assert(TheLoop->contains(BB) && "Unknown block used"); @@ -1846,7 +1871,7 @@ bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) { if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow()) return false; - // The isntructions below can trap. + // The instructions below can trap. switch (it->getOpcode()) { default: continue; case Instruction::UDiv: @@ -1870,7 +1895,48 @@ bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) { } unsigned -LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { +LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize, + unsigned UserVF) { + if (OptForSize && Legal->getRuntimePointerCheck()->Need) { + DEBUG(dbgs() << "LV: Aborting. Runtime ptr check is required in Os.\n"); + return 1; + } + + // Find the trip count. 
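// [Editorial sketch] A standalone model of the -Os trip-count logic that
// follows: pick a width that leaves no scalar tail, or give up (VF = 1).
// Hypothetical code mirroring selectVectorizationFactor, not part of the patch.
unsigned modelSelectVF(unsigned TC, unsigned MaxVectorSize) {
  if (TC < 2) return 1;              // trip count unknown or too small
  unsigned VF = TC % MaxVectorSize;  // remainder-free width, if one exists
  if (VF == 0) VF = MaxVectorSize;   // TC divides evenly: use the maximum
  return VF < 2 ? 1 : VF;            // any remainder would need a tail loop
}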
+ unsigned TC = SE->getSmallConstantTripCount(TheLoop, TheLoop->getLoopLatch()); + DEBUG(dbgs() << "LV: Found trip count:"<<TC<<"\n"); + + unsigned VF = MaxVectorSize; + + // If we optimize the program for size, avoid creating the tail loop. + if (OptForSize) { + // If we are unable to calculate the trip count then don't try to vectorize. + if (TC < 2) { + DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + return 1; + } + + // Find the maximum SIMD width that can fit within the trip count. + VF = TC % MaxVectorSize; + + if (VF == 0) + VF = MaxVectorSize; + + // If the trip count that we found modulo the vectorization factor is not + // zero then we require a tail. + if (VF < 2) { + DEBUG(dbgs() << "LV: Aborting. A tail loop is required in Os.\n"); + return 1; + } + } + + if (UserVF != 0) { + assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two"); + DEBUG(dbgs() << "LV: Using user VF "<<UserVF<<".\n"); + + return UserVF; + } + if (!VTTI) { DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); return 1; @@ -2052,6 +2118,13 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::Trunc: case Instruction::FPTrunc: case Instruction::BitCast: { + // We optimize the truncation of induction variable. + // The cost of these is the same as the scalar operation. + if (I->getOpcode() == Instruction::Trunc && + Legal->isInductionVariable(I->getOperand(0))) + return VTTI->getCastInstrCost(I->getOpcode(), I->getType(), + I->getOperand(0)->getType()); + Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF); return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy); } diff --git a/lib/Transforms/Vectorize/LoopVectorize.h b/lib/Transforms/Vectorize/LoopVectorize.h index 9d6d80e22b..e5ef29052e 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.h +++ b/lib/Transforms/Vectorize/LoopVectorize.h @@ -320,6 +320,9 @@ public: /// Returns the induction variables found in the loop. InductionList *getInductionVars() { return &Inductions; } + /// Returns True if V is an induction variable in this loop. + bool isInductionVariable(const Value *V); + /// Return true if the block BB needs to be predicated in order for the loop /// to be vectorized. bool blockNeedsPredication(BasicBlock *BB); @@ -420,10 +423,11 @@ public: const VectorTargetTransformInfo *Vtti): TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } - /// Returns the most profitable vectorization factor for the loop that is - /// smaller or equal to the VF argument. This method checks every power - /// of two up to VF. - unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize); + /// Returns the most profitable vectorization factor in powers of two. + /// This method checks every power of two up to VF. If UserVF is not ZERO + /// then this vectorization factor will be selected if vectorization is + /// possible. + unsigned selectVectorizationFactor(bool OptForSize, unsigned UserVF); private: /// Returns the expected execution cost. 
The unit of the cost does diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp index 3fb36cadea..19eefd2f87 100644 --- a/lib/Transforms/Vectorize/Vectorize.cpp +++ b/lib/Transforms/Vectorize/Vectorize.cpp @@ -1,4 +1,4 @@ -//===-- Vectorize.cpp -----------------------------------------------------===// + //===-- Vectorize.cpp -----------------------------------------------------===// // // The LLVM Compiler Infrastructure // diff --git a/lib/VMCore/Attributes.cpp b/lib/VMCore/Attributes.cpp index 751ff85f21..02c8aaeb53 100644 --- a/lib/VMCore/Attributes.cpp +++ b/lib/VMCore/Attributes.cpp @@ -356,7 +356,7 @@ uint64_t AttributesImpl::getStackAlignment() const { //===----------------------------------------------------------------------===// AttributeSet AttributeSet::get(LLVMContext &C, - ArrayRef<AttributeWithIndex> Attrs) { + ArrayRef<AttributeWithIndex> Attrs) { // If there are no attributes then return a null AttributesList pointer. if (Attrs.empty()) return AttributeSet(); diff --git a/lib/VMCore/Function.cpp b/lib/VMCore/Function.cpp index 9b2046bb12..013299e76f 100644 --- a/lib/VMCore/Function.cpp +++ b/lib/VMCore/Function.cpp @@ -62,11 +62,11 @@ void Argument::setParent(Function *parent) { } /// getArgNo - Return the index of this formal argument in its containing -/// function. For example in "void foo(int a, float b)" a is 0 and b is 1. +/// function. For example in "void foo(int a, float b)" a is 0 and b is 1. unsigned Argument::getArgNo() const { const Function *F = getParent(); assert(F && "Argument is not in a function"); - + Function::const_arg_iterator AI = F->arg_begin(); unsigned ArgIdx = 0; for (; &*AI != this; ++AI) @@ -86,7 +86,7 @@ bool Argument::hasByValAttr() const { unsigned Argument::getParamAlignment() const { assert(getType()->isPointerTy() && "Only pointers have alignments"); return getParent()->getParamAlignment(getArgNo()+1); - + } /// hasNestAttr - Return true if this argument has the nest attribute on @@ -168,7 +168,7 @@ void Function::eraseFromParent() { Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name, Module *ParentModule) - : GlobalValue(PointerType::getUnqual(Ty), + : GlobalValue(PointerType::getUnqual(Ty), Value::FunctionVal, 0, 0, Linkage, name) { assert(FunctionType::isValidReturnType(getReturnType()) && "invalid return type"); @@ -177,7 +177,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, // If the function has arguments, mark them as lazily built. if (Ty->getNumParams()) setValueSubclassData(1); // Set the "has lazy arguments" bit. - + // Make sure that we get added to a function LeakDetector::addGarbageObject(this); @@ -209,7 +209,7 @@ void Function::BuildLazyArguments() const { "Cannot have void typed arguments!"); ArgumentList.push_back(new Argument(FT->getParamType(i))); } - + // Clear the lazy arguments bit. unsigned SDC = getSubclassDataFromValue(); const_cast<Function*>(this)->setValueSubclassData(SDC &= ~1); @@ -241,7 +241,7 @@ void Function::setParent(Module *parent) { void Function::dropAllReferences() { for (iterator I = begin(), E = end(); I != E; ++I) I->dropAllReferences(); - + // Delete all basic blocks. They are now unused, except possibly by // blockaddresses, but BasicBlock's destructor takes care of those. while (!BasicBlocks.empty()) @@ -330,7 +330,7 @@ unsigned Function::getIntrinsicID() const { return 0; unsigned Len = ValName->getKeyLength(); const char *Name = ValName->getKeyData(); - + if (Len < 5 || Name[4] != '.' 
|| Name[0] != 'l' || Name[1] != 'l' || Name[2] != 'v' || Name[3] != 'm') return 0; // All intrinsics start with 'llvm.' @@ -354,7 +354,7 @@ std::string Intrinsic::getName(ID id, ArrayRef<Type*> Tys) { std::string Result(Table[id]); for (unsigned i = 0; i < Tys.size(); ++i) { if (PointerType* PTyp = dyn_cast<PointerType>(Tys[i])) { - Result += ".p" + llvm::utostr(PTyp->getAddressSpace()) + + Result += ".p" + llvm::utostr(PTyp->getAddressSpace()) + EVT::getEVT(PTyp->getElementType()).getEVTString(); } else if (Tys[i]) @@ -386,7 +386,7 @@ enum IIT_Info { IIT_MMX = 13, IIT_PTR = 14, IIT_ARG = 15, - + // Values from 16+ are only encodable with the inefficient encoding. IIT_METADATA = 16, IIT_EMPTYSTRUCT = 17, @@ -405,7 +405,7 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, IIT_Info Info = IIT_Info(Infos[NextElt++]); unsigned StructElts = 2; using namespace Intrinsic; - + switch (Info) { case IIT_Done: OutputTable.push_back(IITDescriptor::get(IITDescriptor::Void, 0)); @@ -462,7 +462,7 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, DecodeIITType(NextElt, Infos, OutputTable); return; case IIT_ANYPTR: { // [ANYPTR addrspace, subtype] - OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, + OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, Infos[NextElt++])); DecodeIITType(NextElt, Infos, OutputTable); return; @@ -506,11 +506,11 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos, #include "llvm/Intrinsics.gen" #undef GET_INTRINSIC_GENERATOR_GLOBAL -void Intrinsic::getIntrinsicInfoTableEntries(ID id, +void Intrinsic::getIntrinsicInfoTableEntries(ID id, SmallVectorImpl<IITDescriptor> &T){ // Check to see if the intrinsic's type was expressible by the table. unsigned TableVal = IIT_Table[id-1]; - + // Decode the TableVal into an array of IITValues. SmallVector<unsigned char, 8> IITValues; ArrayRef<unsigned char> IITEntries; @@ -518,7 +518,7 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id, if ((TableVal >> 31) != 0) { // This is an offset into the IIT_LongEncodingTable. IITEntries = IIT_LongEncodingTable; - + // Strip sentinel bit. 
NextElt = (TableVal << 1) >> 1; } else { @@ -528,7 +528,7 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id, IITValues.push_back(TableVal & 0xF); TableVal >>= 4; } while (TableVal); - + IITEntries = IITValues; NextElt = 0; } @@ -545,14 +545,14 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, using namespace Intrinsic; IITDescriptor D = Infos.front(); Infos = Infos.slice(1); - + switch (D.Kind) { case IITDescriptor::Void: return Type::getVoidTy(Context); case IITDescriptor::MMX: return Type::getX86_MMXTy(Context); case IITDescriptor::Metadata: return Type::getMetadataTy(Context); case IITDescriptor::Float: return Type::getFloatTy(Context); case IITDescriptor::Double: return Type::getDoubleTy(Context); - + case IITDescriptor::Integer: return IntegerType::get(Context, D.Integer_Width); case IITDescriptor::Vector: @@ -573,7 +573,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos, case IITDescriptor::ExtendVecArgument: return VectorType::getExtendedElementVectorType(cast<VectorType>( Tys[D.getArgumentNumber()])); - + case IITDescriptor::TruncVecArgument: return VectorType::getTruncatedElementVectorType(cast<VectorType>( Tys[D.getArgumentNumber()])); @@ -587,15 +587,15 @@ FunctionType *Intrinsic::getType(LLVMContext &Context, ID id, ArrayRef<Type*> Tys) { SmallVector<IITDescriptor, 8> Table; getIntrinsicInfoTableEntries(id, Table); - + ArrayRef<IITDescriptor> TableRef = Table; Type *ResultTy = DecodeFixedType(TableRef, Tys, Context); - + SmallVector<Type*, 8> ArgTys; while (!TableRef.empty()) ArgTys.push_back(DecodeFixedType(TableRef, Tys, Context)); - return FunctionType::get(ResultTy, ArgTys, false); + return FunctionType::get(ResultTy, ArgTys, false); } bool Intrinsic::isOverloaded(ID id) { diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index ded95349d4..17ab34f4f8 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -15,6 +15,7 @@ #include "llvm/Instructions.h" #include "LLVMContextImpl.h" #include "llvm/Constants.h" +#include "llvm/DataLayout.h" #include "llvm/DerivedTypes.h" #include "llvm/Function.h" #include "llvm/Module.h" @@ -1423,6 +1424,12 @@ bool GetElementPtrInst::isInBounds() const { return cast<GEPOperator>(this)->isInBounds(); } +bool GetElementPtrInst::accumulateConstantOffset(const DataLayout &DL, + APInt &Offset) const { + // Delegate to the generic GEPOperator implementation. 
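+  // Illustrative use (hypothetical caller, not in this patch): given a
+  // DataLayout DL, a constant-index GEP folds to a byte offset roughly as:
+  //   APInt Offset(DL.getPointerSizeInBits(), 0);
+  //   if (GEP->accumulateConstantOffset(DL, Offset))
+  //     UseByteOffset(Offset);   // hypothetical consumer
+  // The APInt must be sized to the pointer width, and the call returns
+  // false when any index is not a constant integer.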
+ return cast<GEPOperator>(this)->accumulateConstantOffset(DL, Offset); +} + //===----------------------------------------------------------------------===// // ExtractElementInst Implementation //===----------------------------------------------------------------------===// diff --git a/test/CodeGen/Mips/2012-12-12-ExpandMemcpy.ll b/test/CodeGen/Mips/2012-12-12-ExpandMemcpy.ll new file mode 100644 index 0000000000..9d4daee696 --- /dev/null +++ b/test/CodeGen/Mips/2012-12-12-ExpandMemcpy.ll @@ -0,0 +1,11 @@ +; RUN: llc -march=mips64el -mcpu=mips64r2 < %s + +@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1 + +define void @t(i8* %ptr) { +entry: + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %ptr, i8* getelementptr inbounds ([7 x i8]* @.str, i64 0, i64 0), i64 7, i32 1, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/CodeGen/Mips/gpreg-lazy-binding.ll b/test/CodeGen/Mips/gpreg-lazy-binding.ll new file mode 100644 index 0000000000..85d8d4b1db --- /dev/null +++ b/test/CodeGen/Mips/gpreg-lazy-binding.ll @@ -0,0 +1,27 @@ +; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | FileCheck %s + +@g = external global i32 + +; CHECK: addu $gp +; CHECK: jalr $25 +; CHECK: nop +; CHECK-NOT: addu $gp +; CHECK: jalr $25 + +define void @f0() nounwind { +entry: + tail call void @externalFunc() nounwind + tail call fastcc void @internalFunc() + ret void +} + +declare void @externalFunc() + +define internal fastcc void @internalFunc() nounwind noinline { +entry: + %0 = load i32* @g, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* @g, align 4 + ret void +} + diff --git a/test/CodeGen/Mips/mips16ex.ll b/test/CodeGen/Mips/mips16ex.ll new file mode 100644 index 0000000000..ecb30b5c63 --- /dev/null +++ b/test/CodeGen/Mips/mips16ex.ll @@ -0,0 +1,87 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 < %s | FileCheck %s -check-prefix=16 + +;16: $eh_func_begin0=. 
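+; The CHECK above pins down only that the function's EH begin label is
+; emitted when compiling for mips16; the rest of the test exercises whether
+; the invoke/landingpad body below compiles at all at -mcpu=mips16.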
+@.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1 +@_ZTIi = external constant i8* +@.str1 = private unnamed_addr constant [15 x i8] c"exception %i \0A\00", align 1 + +define i32 @main() { +entry: + %retval = alloca i32, align 4 + %exn.slot = alloca i8* + %ehselector.slot = alloca i32 + %e = alloca i32, align 4 + store i32 0, i32* %retval + %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0)) + %exception = call i8* @__cxa_allocate_exception(i32 4) nounwind + %0 = bitcast i8* %exception to i32* + store i32 20, i32* %0 + invoke void @__cxa_throw(i8* %exception, i8* bitcast (i8** @_ZTIi to i8*), i8* null) noreturn + to label %unreachable unwind label %lpad + +lpad: ; preds = %entry + %1 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + catch i8* bitcast (i8** @_ZTIi to i8*) + %2 = extractvalue { i8*, i32 } %1, 0 + store i8* %2, i8** %exn.slot + %3 = extractvalue { i8*, i32 } %1, 1 + store i32 %3, i32* %ehselector.slot + br label %catch.dispatch + +catch.dispatch: ; preds = %lpad + %sel = load i32* %ehselector.slot + %4 = call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*)) nounwind + %matches = icmp eq i32 %sel, %4 + br i1 %matches, label %catch, label %eh.resume + +catch: ; preds = %catch.dispatch + %exn = load i8** %exn.slot + %5 = call i8* @__cxa_begin_catch(i8* %exn) nounwind + %6 = bitcast i8* %5 to i32* + %exn.scalar = load i32* %6 + store i32 %exn.scalar, i32* %e, align 4 + %7 = load i32* %e, align 4 + %call2 = invoke i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([15 x i8]* @.str1, i32 0, i32 0), i32 %7) + to label %invoke.cont unwind label %lpad1 + +invoke.cont: ; preds = %catch + call void @__cxa_end_catch() nounwind + br label %try.cont + +try.cont: ; preds = %invoke.cont + ret i32 0 + +lpad1: ; preds = %catch + %8 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + %9 = extractvalue { i8*, i32 } %8, 0 + store i8* %9, i8** %exn.slot + %10 = extractvalue { i8*, i32 } %8, 1 + store i32 %10, i32* %ehselector.slot + call void @__cxa_end_catch() nounwind + br label %eh.resume + +eh.resume: ; preds = %lpad1, %catch.dispatch + %exn3 = load i8** %exn.slot + %sel4 = load i32* %ehselector.slot + %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn3, 0 + %lpad.val5 = insertvalue { i8*, i32 } %lpad.val, i32 %sel4, 1 + resume { i8*, i32 } %lpad.val5 + +unreachable: ; preds = %entry + unreachable +} + +declare i32 @printf(i8*, ...) + +declare i8* @__cxa_allocate_exception(i32) + +declare i32 @__gxx_personality_v0(...) 
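+; The runtime entry points declared here and just below implement the
+; Itanium EH path the body exercises: __cxa_allocate_exception and
+; __cxa_throw raise the exception, the landing pad's selector is matched
+; against llvm.eh.typeid.for(@_ZTIi), and __cxa_begin_catch /
+; __cxa_end_catch bracket the handler.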
+ +declare void @__cxa_throw(i8*, i8*, i8*) + +declare i32 @llvm.eh.typeid.for(i8*) nounwind readnone + +declare i8* @__cxa_begin_catch(i8*) + +declare void @__cxa_end_catch() diff --git a/test/CodeGen/Mips/mips16fpe.ll b/test/CodeGen/Mips/mips16fpe.ll new file mode 100644 index 0000000000..4335436079 --- /dev/null +++ b/test/CodeGen/Mips/mips16fpe.ll @@ -0,0 +1,381 @@ +; RUN: llc -march=mipsel -mcpu=mips16 -relocation-model=pic -O3 -soft-float -mips16-hard-float < %s | FileCheck %s -check-prefix=16hf + +@x = global float 5.000000e+00, align 4 +@y = global float 1.500000e+01, align 4 +@xd = global double 6.000000e+00, align 8 +@yd = global double 1.800000e+01, align 8 +@two = global i32 2, align 4 +@addsf3_result = common global float 0.000000e+00, align 4 +@adddf3_result = common global double 0.000000e+00, align 8 +@subsf3_result = common global float 0.000000e+00, align 4 +@subdf3_result = common global double 0.000000e+00, align 8 +@mulsf3_result = common global float 0.000000e+00, align 4 +@muldf3_result = common global double 0.000000e+00, align 8 +@divsf3_result = common global float 0.000000e+00, align 4 +@divdf3_result = common global double 0.000000e+00, align 8 +@extendsfdf2_result = common global double 0.000000e+00, align 8 +@xd2 = global double 0x40147E6B74B4CF6A, align 8 +@truncdfsf2_result = common global float 0.000000e+00, align 4 +@fix_truncsfsi_result = common global i32 0, align 4 +@fix_truncdfsi_result = common global i32 0, align 4 +@si = global i32 -9, align 4 +@ui = global i32 9, align 4 +@floatsisf_result = common global float 0.000000e+00, align 4 +@floatsidf_result = common global double 0.000000e+00, align 8 +@floatunsisf_result = common global float 0.000000e+00, align 4 +@floatunsidf_result = common global double 0.000000e+00, align 8 +@xx = global float 5.000000e+00, align 4 +@eqsf2_result = common global i32 0, align 4 +@xxd = global double 6.000000e+00, align 8 +@eqdf2_result = common global i32 0, align 4 +@nesf2_result = common global i32 0, align 4 +@nedf2_result = common global i32 0, align 4 +@gesf2_result = common global i32 0, align 4 +@gedf2_result = common global i32 0, align 4 +@ltsf2_result = common global i32 0, align 4 +@ltdf2_result = common global i32 0, align 4 +@lesf2_result = common global i32 0, align 4 +@ledf2_result = common global i32 0, align 4 +@gtsf2_result = common global i32 0, align 4 +@gtdf2_result = common global i32 0, align 4 + +define void @test_addsf3() nounwind { +entry: +;16hf: test_addsf3: + %0 = load float* @x, align 4 + %1 = load float* @y, align 4 + %add = fadd float %0, %1 + store float %add, float* @addsf3_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_addsf3)(${{[0-9]+}}) + ret void +} + +define void @test_adddf3() nounwind { +entry: +;16hf: test_adddf3: + %0 = load double* @xd, align 8 + %1 = load double* @yd, align 8 + %add = fadd double %0, %1 + store double %add, double* @adddf3_result, align 8 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_adddf3)(${{[0-9]+}}) + ret void +} + +define void @test_subsf3() nounwind { +entry: +;16hf: test_subsf3: + %0 = load float* @x, align 4 + %1 = load float* @y, align 4 + %sub = fsub float %0, %1 + store float %sub, float* @subsf3_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_subsf3)(${{[0-9]+}}) + ret void +} + +define void @test_subdf3() nounwind { +entry: +;16hf: test_subdf3: + %0 = load double* @xd, align 8 + %1 = load double* @yd, align 8 + %sub = fsub double %0, %1 + store double %sub, double* @subdf3_result, align 8 +;16hf: lw ${{[0-9]+}}, 
%call16(__mips16_subdf3)(${{[0-9]+}}) + ret void +} + +define void @test_mulsf3() nounwind { +entry: +;16hf: test_mulsf3: + %0 = load float* @x, align 4 + %1 = load float* @y, align 4 + %mul = fmul float %0, %1 + store float %mul, float* @mulsf3_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_mulsf3)(${{[0-9]+}}) + ret void +} + +define void @test_muldf3() nounwind { +entry: +;16hf: test_muldf3: + %0 = load double* @xd, align 8 + %1 = load double* @yd, align 8 + %mul = fmul double %0, %1 + store double %mul, double* @muldf3_result, align 8 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_muldf3)(${{[0-9]+}}) + ret void +} + +define void @test_divsf3() nounwind { +entry: +;16hf: test_divsf3: + %0 = load float* @y, align 4 + %1 = load float* @x, align 4 + %div = fdiv float %0, %1 + store float %div, float* @divsf3_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_divsf3)(${{[0-9]+}}) + ret void +} + +define void @test_divdf3() nounwind { +entry: +;16hf: test_divdf3: + %0 = load double* @yd, align 8 + %mul = fmul double %0, 2.000000e+00 + %1 = load double* @xd, align 8 + %div = fdiv double %mul, %1 + store double %div, double* @divdf3_result, align 8 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_divdf3)(${{[0-9]+}}) + ret void +} + +define void @test_extendsfdf2() nounwind { +entry: +;16hf: test_extendsfdf2: + %0 = load float* @x, align 4 + %conv = fpext float %0 to double + store double %conv, double* @extendsfdf2_result, align 8 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_extendsfdf2)(${{[0-9]+}}) + ret void +} + +define void @test_truncdfsf2() nounwind { +entry: +;16hf: test_truncdfsf2: + %0 = load double* @xd2, align 8 + %conv = fptrunc double %0 to float + store float %conv, float* @truncdfsf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_truncdfsf2)(${{[0-9]+}}) + ret void +} + +define void @test_fix_truncsfsi() nounwind { +entry: +;16hf: test_fix_truncsfsi: + %0 = load float* @x, align 4 + %conv = fptosi float %0 to i32 + store i32 %conv, i32* @fix_truncsfsi_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_fix_truncsfsi)(${{[0-9]+}}) + ret void +} + +define void @test_fix_truncdfsi() nounwind { +entry: +;16hf: test_fix_truncdfsi: + %0 = load double* @xd, align 8 + %conv = fptosi double %0 to i32 + store i32 %conv, i32* @fix_truncdfsi_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_fix_truncdfsi)(${{[0-9]+}}) + ret void +} + +define void @test_floatsisf() nounwind { +entry: +;16hf: test_floatsisf: + %0 = load i32* @si, align 4 + %conv = sitofp i32 %0 to float + store float %conv, float* @floatsisf_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_floatsisf)(${{[0-9]+}}) + ret void +} + +define void @test_floatsidf() nounwind { +entry: +;16hf: test_floatsidf: + %0 = load i32* @si, align 4 + %conv = sitofp i32 %0 to double + store double %conv, double* @floatsidf_result, align 8 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_floatsidf)(${{[0-9]+}}) + ret void +} + +define void @test_floatunsisf() nounwind { +entry: +;16hf: test_floatunsisf: + %0 = load i32* @ui, align 4 + %conv = uitofp i32 %0 to float + store float %conv, float* @floatunsisf_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_floatunsisf)(${{[0-9]+}}) + ret void +} + +define void @test_floatunsidf() nounwind { +entry: +;16hf: test_floatunsidf: + %0 = load i32* @ui, align 4 + %conv = uitofp i32 %0 to double + store double %conv, double* @floatunsidf_result, align 8 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_floatunsidf)(${{[0-9]+}}) + ret void +} + +define void @test_eqsf2() nounwind { 
+entry: +;16hf: test_eqsf2: + %0 = load float* @x, align 4 + %1 = load float* @xx, align 4 + %cmp = fcmp oeq float %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @eqsf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_eqsf2)(${{[0-9]+}}) + ret void +} + +define void @test_eqdf2() nounwind { +entry: +;16hf: test_eqdf2: + %0 = load double* @xd, align 8 + %1 = load double* @xxd, align 8 + %cmp = fcmp oeq double %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @eqdf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_eqdf2)(${{[0-9]+}}) + ret void +} + +define void @test_nesf2() nounwind { +entry: +;16hf: test_nesf2: + %0 = load float* @x, align 4 + %1 = load float* @y, align 4 + %cmp = fcmp une float %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @nesf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_nesf2)(${{[0-9]+}}) + ret void +} + +define void @test_nedf2() nounwind { +entry: +;16hf: test_nedf2: + %0 = load double* @xd, align 8 + %1 = load double* @yd, align 8 + %cmp = fcmp une double %0, %1 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @nedf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_nedf2)(${{[0-9]+}}) + ret void +} + +define void @test_gesf2() nounwind { +entry: +;16hf: test_gesf2: + %0 = load float* @x, align 4 + %1 = load float* @xx, align 4 + %cmp = fcmp oge float %0, %1 + %2 = load float* @y, align 4 + %cmp1 = fcmp oge float %2, %0 + %and3 = and i1 %cmp, %cmp1 + %and = zext i1 %and3 to i32 + store i32 %and, i32* @gesf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_gesf2)(${{[0-9]+}}) + ret void +} + +define void @test_gedf2() nounwind { +entry: +;16hf: test_gedf2: + %0 = load double* @xd, align 8 + %1 = load double* @xxd, align 8 + %cmp = fcmp oge double %0, %1 + %2 = load double* @yd, align 8 + %cmp1 = fcmp oge double %2, %0 + %and3 = and i1 %cmp, %cmp1 + %and = zext i1 %and3 to i32 + store i32 %and, i32* @gedf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_gedf2)(${{[0-9]+}}) + ret void +} + +define void @test_ltsf2() nounwind { +entry: +;16hf: test_ltsf2: + %0 = load float* @x, align 4 + %1 = load float* @xx, align 4 + %lnot = fcmp uge float %0, %1 + %2 = load float* @y, align 4 + %cmp1 = fcmp olt float %0, %2 + %and2 = and i1 %lnot, %cmp1 + %and = zext i1 %and2 to i32 + store i32 %and, i32* @ltsf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_unordsf2)(${{[0-9]+}}) +;16hf: lw ${{[0-9]+}}, %call16(__mips16_ltsf2)(${{[0-9]+}}) + ret void +} + +define void @test_ltdf2() nounwind { +entry: +;16hf: test_ltdf2: + %0 = load double* @xd, align 8 + %1 = load double* @xxd, align 8 + %lnot = fcmp uge double %0, %1 + %2 = load double* @yd, align 8 + %cmp1 = fcmp olt double %0, %2 + %and2 = and i1 %lnot, %cmp1 + %and = zext i1 %and2 to i32 + store i32 %and, i32* @ltdf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_unorddf2)(${{[0-9]+}}) +;16hf: lw ${{[0-9]+}}, %call16(__mips16_ltdf2)(${{[0-9]+}}) + ret void +} + +define void @test_lesf2() nounwind { +entry: +;16hf: test_lesf2: + %0 = load float* @x, align 4 + %1 = load float* @xx, align 4 + %cmp = fcmp ole float %0, %1 + %2 = load float* @y, align 4 + %cmp1 = fcmp ole float %0, %2 + %and3 = and i1 %cmp, %cmp1 + %and = zext i1 %and3 to i32 + store i32 %and, i32* @lesf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_lesf2)(${{[0-9]+}}) + ret void +} + +define void @test_ledf2() nounwind { +entry: +;16hf: test_ledf2: + %0 = load double* @xd, align 8 + %1 = load double* @xxd, align 8 + %cmp = fcmp ole 
double %0, %1 + %2 = load double* @yd, align 8 + %cmp1 = fcmp ole double %0, %2 + %and3 = and i1 %cmp, %cmp1 + %and = zext i1 %and3 to i32 + store i32 %and, i32* @ledf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_ledf2)(${{[0-9]+}}) + ret void +} + +define void @test_gtsf2() nounwind { +entry: +;16hf: test_gtsf2: + %0 = load float* @x, align 4 + %1 = load float* @xx, align 4 + %lnot = fcmp ule float %0, %1 + %2 = load float* @y, align 4 + %cmp1 = fcmp ogt float %2, %0 + %and2 = and i1 %lnot, %cmp1 + %and = zext i1 %and2 to i32 + store i32 %and, i32* @gtsf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_gtsf2)(${{[0-9]+}}) + ret void +} + +define void @test_gtdf2() nounwind { +entry: +;16hf: test_gtdf2: + %0 = load double* @xd, align 8 + %1 = load double* @xxd, align 8 + %lnot = fcmp ule double %0, %1 + %2 = load double* @yd, align 8 + %cmp1 = fcmp ogt double %2, %0 + %and2 = and i1 %lnot, %cmp1 + %and = zext i1 %and2 to i32 + store i32 %and, i32* @gtdf2_result, align 4 +;16hf: lw ${{[0-9]+}}, %call16(__mips16_gtdf2)(${{[0-9]+}}) + ret void +} + + diff --git a/test/CodeGen/PowerPC/tls-gd-obj.ll b/test/CodeGen/PowerPC/tls-gd-obj.ll new file mode 100644 index 0000000000..00b537d532 --- /dev/null +++ b/test/CodeGen/PowerPC/tls-gd-obj.ll @@ -0,0 +1,41 @@ +; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \ +; RUN: elf-dump --dump-section-data | FileCheck %s + +; Test correct relocation generation for thread-local storage using +; the general dynamic model and integrated assembly. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@a = thread_local global i32 0, align 4 + +define signext i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval + %0 = load i32* @a, align 4 + ret i32 %0 +} + +; Verify generation of R_PPC64_GOT_TLSGD16_HA, R_PPC64_GOT_TLSGD16_LO, +; and R_PPC64_TLSGD for accessing external variable a, and R_PPC64_REL24 +; for the call to __tls_get_addr. +; +; CHECK: '.rela.text' +; CHECK: Relocation 0 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1:[0-9a-f]+]] +; CHECK-NEXT: 'r_type', 0x00000052 +; CHECK: Relocation 1 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x00000050 +; CHECK: Relocation 2 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x0000006b +; CHECK: Relocation 3 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x{{[0-9a-f]+}} +; CHECK-NEXT: 'r_type', 0x0000000a + diff --git a/test/CodeGen/PowerPC/tls-gd.ll b/test/CodeGen/PowerPC/tls-gd.ll new file mode 100644 index 0000000000..fb8dfaf04a --- /dev/null +++ b/test/CodeGen/PowerPC/tls-gd.ll @@ -0,0 +1,23 @@ +; RUN: llc -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck %s + +; Test correct assembly code generation for thread-local storage using +; the general dynamic model. 
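+; Under the general dynamic model the access becomes a call through the
+; TOC pointer (r2): addis/addi materialize the address of the GOT entry for
+; a's tlsgd descriptor into r3, and bl __tls_get_addr(a@tlsgd) returns the
+; address of a in r3, as the CHECK lines below spell out.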
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@a = thread_local global i32 0, align 4 + +define signext i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval + %0 = load i32* @a, align 4 + ret i32 %0 +} + +; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsgd@ha +; CHECK-NEXT: addi 3, [[REG]], a@got@tlsgd@l +; CHECK-NEXT: bl __tls_get_addr(a@tlsgd) +; CHECK-NEXT: nop + diff --git a/test/CodeGen/PowerPC/tls-ie-obj.ll b/test/CodeGen/PowerPC/tls-ie-obj.ll index 5cc0b187f6..3600cc52ba 100644 --- a/test/CodeGen/PowerPC/tls-ie-obj.ll +++ b/test/CodeGen/PowerPC/tls-ie-obj.ll @@ -24,9 +24,13 @@ entry: ; CHECK: Relocation 0 ; CHECK-NEXT: 'r_offset' ; CHECK-NEXT: 'r_sym', 0x[[SYM1:[0-9a-f]+]] -; CHECK-NEXT: 'r_type', 0x00000057 +; CHECK-NEXT: 'r_type', 0x0000005a ; CHECK: Relocation 1 ; CHECK-NEXT: 'r_offset' ; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x00000058 +; CHECK: Relocation 2 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] ; CHECK-NEXT: 'r_type', 0x00000043 diff --git a/test/CodeGen/PowerPC/tls-ie.ll b/test/CodeGen/PowerPC/tls-ie.ll index cc6f084efb..c5cfba7b3f 100644 --- a/test/CodeGen/PowerPC/tls-ie.ll +++ b/test/CodeGen/PowerPC/tls-ie.ll @@ -16,6 +16,7 @@ entry: ret i32 %0 } -; CHECK: ld [[REG:[0-9]+]], a@got@tprel(2) -; CHECK: add {{[0-9]+}}, [[REG]], a@tls +; CHECK: addis [[REG1:[0-9]+]], 2, a@got@tprel@ha +; CHECK: ld [[REG2:[0-9]+]], a@got@tprel@l([[REG1]]) +; CHECK: add {{[0-9]+}}, [[REG2]], a@tls diff --git a/test/CodeGen/PowerPC/tls-ld-obj.ll b/test/CodeGen/PowerPC/tls-ld-obj.ll new file mode 100644 index 0000000000..c521ae405f --- /dev/null +++ b/test/CodeGen/PowerPC/tls-ld-obj.ll @@ -0,0 +1,50 @@ +; RUN: llc -mcpu=pwr7 -O0 -filetype=obj -relocation-model=pic %s -o - | \ +; RUN: elf-dump --dump-section-data | FileCheck %s + +; Test correct relocation generation for thread-local storage using +; the local dynamic model. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@a = hidden thread_local global i32 0, align 4 + +define signext i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval + %0 = load i32* @a, align 4 + ret i32 %0 +} + +; Verify generation of R_PPC64_GOT_TLSLD16_HA, R_PPC64_GOT_TLSLD16_LO, +; R_PPC64_TLSLD, R_PPC64_DTPREL16_HA, and R_PPC64_DTPREL16_LO for +; accessing external variable a, and R_PPC64_REL24 for the call to +; __tls_get_addr. 
+; +; CHECK: '.rela.text' +; CHECK: Relocation 0 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1:[0-9a-f]+]] +; CHECK-NEXT: 'r_type', 0x00000056 +; CHECK: Relocation 1 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x00000054 +; CHECK: Relocation 2 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x0000006c +; CHECK: Relocation 3 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x{{[0-9a-f]+}} +; CHECK-NEXT: 'r_type', 0x0000000a +; CHECK: Relocation 4 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x0000004d +; CHECK: Relocation 5 +; CHECK-NEXT: 'r_offset' +; CHECK-NEXT: 'r_sym', 0x[[SYM1]] +; CHECK-NEXT: 'r_type', 0x0000004b + diff --git a/test/CodeGen/PowerPC/tls-ld.ll b/test/CodeGen/PowerPC/tls-ld.ll new file mode 100644 index 0000000000..1ebc6129e2 --- /dev/null +++ b/test/CodeGen/PowerPC/tls-ld.ll @@ -0,0 +1,24 @@ +; RUN: llc -mcpu=pwr7 -O0 -relocation-model=pic < %s | FileCheck %s + +; Test correct assembly code generation for thread-local storage using +; the local dynamic model. + +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +@a = hidden thread_local global i32 0, align 4 + +define signext i32 @main() nounwind { +entry: + %retval = alloca i32, align 4 + store i32 0, i32* %retval + %0 = load i32* @a, align 4 + ret i32 %0 +} + +; CHECK: addis [[REG:[0-9]+]], 2, a@got@tlsld@ha +; CHECK-NEXT: addi 3, [[REG]], a@got@tlsld@l +; CHECK-NEXT: bl __tls_get_addr(a@tlsld) +; CHECK-NEXT: nop +; CHECK-NEXT: addis [[REG2:[0-9]+]], 3, a@dtprel@ha +; CHECK-NEXT: addi {{[0-9]+}}, [[REG2]], a@dtprel@l diff --git a/test/CodeGen/R600/add.v4i32.ll b/test/CodeGen/R600/add.v4i32.ll new file mode 100644 index 0000000000..ac4a87417b --- /dev/null +++ b/test/CodeGen/R600/add.v4i32.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: ADD_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = add <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/and.v4i32.ll b/test/CodeGen/R600/and.v4i32.ll new file mode 100644 index 0000000000..662085e2d6 --- /dev/null +++ b/test/CodeGen/R600/and.v4i32.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: AND_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = and <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fabs.ll 
b/test/CodeGen/R600/fabs.ll new file mode 100644 index 0000000000..0407533eaa --- /dev/null +++ b/test/CodeGen/R600/fabs.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MOV T{{[0-9]+\.[XYZW], \|T[0-9]+\.[XYZW]\|}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @fabs( float %r0) + call void @llvm.AMDGPU.store.output(float %r1, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @fabs(float ) readnone diff --git a/test/CodeGen/R600/fadd.ll b/test/CodeGen/R600/fadd.ll new file mode 100644 index 0000000000..d7d1b6572c --- /dev/null +++ b/test/CodeGen/R600/fadd.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = fadd float %r0, %r1 + call void @llvm.AMDGPU.store.output(float %r2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + diff --git a/test/CodeGen/R600/fadd.v4f32.ll b/test/CodeGen/R600/fadd.v4f32.ll new file mode 100644 index 0000000000..85dbfd52cb --- /dev/null +++ b/test/CodeGen/R600/fadd.v4f32.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float> addrspace(1) * %in + %b = load <4 x float> addrspace(1) * %b_ptr + %result = fadd <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fcmp-cnd.ll b/test/CodeGen/R600/fcmp-cnd.ll new file mode 100644 index 0000000000..a94cfb5cf2 --- /dev/null +++ b/test/CodeGen/R600/fcmp-cnd.ll @@ -0,0 +1,14 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;Not checking arguments 2 and 3 to CNDE, because they may change between +;registers and literal.x depending on what the optimizer does. 
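+;Background (per the R600 ISA, stated here as an aid to reading the check):
+;CNDE selects its second source when the first compares equal to zero and
+;its third source otherwise, so the fcmp oeq plus select pair is expected
+;to collapse into the single instruction checked below.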
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+  %0 = load float addrspace(1)* %in
+  %cmp = fcmp oeq float %0, 0.000000e+00
+  %value = select i1 %cmp, i32 2, i32 3
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fcmp-cnde-int-args.ll b/test/CodeGen/R600/fcmp-cnde-int-args.ll
new file mode 100644
index 0000000000..5c981efa9d
--- /dev/null
+++ b/test/CodeGen/R600/fcmp-cnde-int-args.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; This test checks a bug in R600TargetLowering::LowerSELECT_CC where the
+; chance to optimize the fcmp + select instructions to CNDE was missed
+; because the operands to fcmp and select had different types.
+
+;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, 0.0, -1}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+  %0 = load float addrspace(1)* %in
+  %cmp = fcmp oeq float %0, 0.000000e+00
+  %value = select i1 %cmp, i32 -1, i32 0
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fcmp.ll b/test/CodeGen/R600/fcmp.ll
new file mode 100644
index 0000000000..1dcd07c0b3
--- /dev/null
+++ b/test/CodeGen/R600/fcmp.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: SETE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}}
+;CHECK: FLT_TO_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+entry:
+  %0 = load float addrspace(1)* %in
+  %arrayidx1 = getelementptr inbounds float addrspace(1)* %in, i32 1
+  %1 = load float addrspace(1)* %arrayidx1
+  %cmp = fcmp oeq float %0, %1
+  %sext = sext i1 %cmp to i32
+  store i32 %sext, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/fdiv.v4f32.ll b/test/CodeGen/R600/fdiv.v4f32.ll
new file mode 100644
index 0000000000..b013fd647c
--- /dev/null
+++ b/test/CodeGen/R600/fdiv.v4f32.ll
@@ -0,0 +1,19 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float> addrspace(1) * %in
+  %b = load <4 x float> addrspace(1) * %b_ptr
+  %result = fdiv <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/floor.ll b/test/CodeGen/R600/floor.ll
new file mode 100644
index 0000000000..845330f284
--- /dev/null
+++ b/test/CodeGen/R600/floor.ll
@@ -0,0 +1,16 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: FLOOR T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test() {
+  %r0 = call float @llvm.R600.load.input(i32 0)
+  %r1 = call float @floor(float %r0)
+  call void @llvm.AMDGPU.store.output(float %r1, i32 0)
+  ret void
+}
+
+declare float @llvm.R600.load.input(i32)
readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @floor(float) readonly diff --git a/test/CodeGen/R600/fmax.ll b/test/CodeGen/R600/fmax.ll new file mode 100644 index 0000000000..3708f0b9ee --- /dev/null +++ b/test/CodeGen/R600/fmax.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MAX T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r0, float %r1 + call void @llvm.AMDGPU.store.output(float %r3, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) diff --git a/test/CodeGen/R600/fmin.ll b/test/CodeGen/R600/fmin.ll new file mode 100644 index 0000000000..19d59ab306 --- /dev/null +++ b/test/CodeGen/R600/fmin.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MIN T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = fcmp uge float %r0, %r1 + %r3 = select i1 %r2, float %r1, float %r0 + call void @llvm.AMDGPU.store.output(float %r3, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) diff --git a/test/CodeGen/R600/fmul.ll b/test/CodeGen/R600/fmul.ll new file mode 100644 index 0000000000..eb1d523c0b --- /dev/null +++ b/test/CodeGen/R600/fmul.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = fmul float %r0, %r1 + call void @llvm.AMDGPU.store.output(float %r2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + diff --git a/test/CodeGen/R600/fmul.v4f32.ll b/test/CodeGen/R600/fmul.v4f32.ll new file mode 100644 index 0000000000..6d44a0c5c7 --- /dev/null +++ b/test/CodeGen/R600/fmul.v4f32.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1 + %a = load <4 x float> addrspace(1) * %in + %b = load <4 x float> addrspace(1) * %b_ptr + %result = fmul <4 x float> %a, %b + store <4 x float> %result, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/fsub.ll b/test/CodeGen/R600/fsub.ll new file mode 100644 index 0000000000..0ec1c376df --- /dev/null +++ b/test/CodeGen/R600/fsub.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: MOV T{{[0-9]+\.[XYZW], -T[0-9]+\.[XYZW]}} +; CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = fsub float %r0, %r1 + call void @llvm.AMDGPU.store.output(float %r2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) 
readnone
+
+declare void @llvm.AMDGPU.store.output(float, i32)
+
diff --git a/test/CodeGen/R600/fsub.v4f32.ll b/test/CodeGen/R600/fsub.v4f32.ll
new file mode 100644
index 0000000000..612a57e4b6
--- /dev/null
+++ b/test/CodeGen/R600/fsub.v4f32.ll
@@ -0,0 +1,15 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK: ADD T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+  %b_ptr = getelementptr <4 x float> addrspace(1)* %in, i32 1
+  %a = load <4 x float> addrspace(1) * %in
+  %b = load <4 x float> addrspace(1) * %b_ptr
+  %result = fsub <4 x float> %a, %b
+  store <4 x float> %result, <4 x float> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/i8_to_double_to_float.ll b/test/CodeGen/R600/i8_to_double_to_float.ll
new file mode 100644
index 0000000000..39f33227fa
--- /dev/null
+++ b/test/CodeGen/R600/i8_to_double_to_float.ll
@@ -0,0 +1,11 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;CHECK: UINT_TO_FLT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+
+define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
+  %1 = load i8 addrspace(1)* %in
+  %2 = uitofp i8 %1 to double
+  %3 = fptrunc double %2 to float
+  store float %3, float addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/icmp-select-sete-reverse-args.ll b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
new file mode 100644
index 0000000000..aad44d9edf
--- /dev/null
+++ b/test/CodeGen/R600/icmp-select-sete-reverse-args.ll
@@ -0,0 +1,18 @@
+;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+;Test that a select with reversed True/False values is correctly lowered
+;to a SETNE_INT. There should only be one SETNE_INT instruction.
+
+;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
+;CHECK-NOT: SETNE_INT
+
+define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %0 = load i32 addrspace(1)* %in
+  %arrayidx1 = getelementptr inbounds i32 addrspace(1)* %in, i32 1
+  %1 = load i32 addrspace(1)* %arrayidx1
+  %cmp = icmp eq i32 %0, %1
+  %value = select i1 %cmp, i32 0, i32 -1
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/R600/lit.local.cfg b/test/CodeGen/R600/lit.local.cfg
new file mode 100644
index 0000000000..36ee493e59
--- /dev/null
+++ b/test/CodeGen/R600/lit.local.cfg
@@ -0,0 +1,13 @@
+config.suffixes = ['.ll', '.c', '.cpp']
+
+def getRoot(config):
+  if not config.parent:
+    return config
+  return getRoot(config.parent)
+
+root = getRoot(config)
+
+targets = set(root.targets_to_build.split())
+if not 'R600' in targets:
+  config.unsupported = True
+
diff --git a/test/CodeGen/R600/literals.ll b/test/CodeGen/R600/literals.ll
new file mode 100644
index 0000000000..4c731b25ec
--- /dev/null
+++ b/test/CodeGen/R600/literals.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+
+; Test using an integer literal constant.
+; Generated ASM should be:
+; ADD_INT REG literal.x, 5
+; or
+; ADD_INT literal.x REG, 5
+
+; CHECK: ADD_INT {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} 5
+define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+entry:
+  %0 = add i32 5, %in
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
+
+; Test using a float literal constant.
+; Generated ASM should be: +; ADD REG literal.x, 5.0 +; or +; ADD literal.x REG, 5.0 + +; CHECK: ADD {{[A-Z0-9,. ]*}}literal.x,{{[A-Z0-9,. ]*}} {{[0-9]+}}(5.0 +define void @float_literal(float addrspace(1)* %out, float %in) { +entry: + %0 = fadd float 5.0, %in + store float %0, float addrspace(1)* %out + ret void +} + diff --git a/test/CodeGen/R600/llvm.AMDGPU.mul.ll b/test/CodeGen/R600/llvm.AMDGPU.mul.ll new file mode 100644 index 0000000000..693eb27457 --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.mul.ll @@ -0,0 +1,17 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: MUL NON-IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = call float @llvm.AMDGPU.mul( float %r0, float %r1) + call void @llvm.AMDGPU.store.output(float %r2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @llvm.AMDGPU.mul(float ,float ) readnone diff --git a/test/CodeGen/R600/llvm.AMDGPU.trunc.ll b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll new file mode 100644 index 0000000000..fac957f7ee --- /dev/null +++ b/test/CodeGen/R600/llvm.AMDGPU.trunc.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: TRUNC T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.AMDGPU.trunc( float %r0) + call void @llvm.AMDGPU.store.output(float %r1, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @llvm.AMDGPU.trunc(float ) readnone diff --git a/test/CodeGen/R600/llvm.cos.ll b/test/CodeGen/R600/llvm.cos.ll new file mode 100644 index 0000000000..dc120bfb00 --- /dev/null +++ b/test/CodeGen/R600/llvm.cos.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: COS T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.cos.f32(float %r0) + call void @llvm.AMDGPU.store.output(float %r1, i32 0) + ret void +} + +declare float @llvm.cos.f32(float) readnone + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) diff --git a/test/CodeGen/R600/llvm.pow.ll b/test/CodeGen/R600/llvm.pow.ll new file mode 100644 index 0000000000..0ae9172579 --- /dev/null +++ b/test/CodeGen/R600/llvm.pow.ll @@ -0,0 +1,19 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: LOG_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK-NEXT: MUL_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} +;CHECK-NEXT: EXP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.R600.load.input(i32 1) + %r2 = call float @llvm.pow.f32( float %r0, float %r1) + call void @llvm.AMDGPU.store.output(float %r2, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @llvm.pow.f32(float ,float ) readonly diff --git a/test/CodeGen/R600/llvm.sin.ll b/test/CodeGen/R600/llvm.sin.ll new file mode 100644 index 0000000000..5cd6998c93 --- /dev/null +++ b/test/CodeGen/R600/llvm.sin.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: SIN T{{[0-9]+\.[XYZW], 
T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = call float @llvm.sin.f32( float %r0) + call void @llvm.AMDGPU.store.output(float %r1, i32 0) + ret void +} + +declare float @llvm.sin.f32(float) readnone + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) diff --git a/test/CodeGen/R600/load.constant_addrspace.f32.ll b/test/CodeGen/R600/load.constant_addrspace.f32.ll new file mode 100644 index 0000000000..93627283bb --- /dev/null +++ b/test/CodeGen/R600/load.constant_addrspace.f32.ll @@ -0,0 +1,9 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: VTX_READ_32 T{{[0-9]+\.X, T[0-9]+\.X}} + +define void @test(float addrspace(1)* %out, float addrspace(2)* %in) { + %1 = load float addrspace(2)* %in + store float %1, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/load.i8.ll b/test/CodeGen/R600/load.i8.ll new file mode 100644 index 0000000000..b070dcd520 --- /dev/null +++ b/test/CodeGen/R600/load.i8.ll @@ -0,0 +1,10 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} + +define void @test(i32 addrspace(1)* %out, i8 addrspace(1)* %in) { + %1 = load i8 addrspace(1)* %in + %2 = zext i8 %1 to i32 + store i32 %2, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/reciprocal.ll b/test/CodeGen/R600/reciprocal.ll new file mode 100644 index 0000000000..6838c1ae36 --- /dev/null +++ b/test/CodeGen/R600/reciprocal.ll @@ -0,0 +1,16 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: RECIP_IEEE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test() { + %r0 = call float @llvm.R600.load.input(i32 0) + %r1 = fdiv float 1.0, %r0 + call void @llvm.AMDGPU.store.output(float %r1, i32 0) + ret void +} + +declare float @llvm.R600.load.input(i32) readnone + +declare void @llvm.AMDGPU.store.output(float, i32) + +declare float @llvm.AMDGPU.rcp(float ) readnone diff --git a/test/CodeGen/R600/sdiv.ll b/test/CodeGen/R600/sdiv.ll new file mode 100644 index 0000000000..3556facfba --- /dev/null +++ b/test/CodeGen/R600/sdiv.ll @@ -0,0 +1,21 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; The code generated by sdiv is long and complex and may frequently change. +; The goal of this test is to make sure the ISel doesn't fail. +; +; This program was previously failing to compile when one of the selectcc +; opcodes generated by the sdiv lowering was being legalized and optimized to: +; selectcc Remainder -1, 0, -1, SETGT +; This was fixed by adding an additional pattern in R600Instructions.td to +; match this pattern with a CNDGE_INT. + +; CHECK: RETURN + +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %den_ptr = getelementptr i32 addrspace(1)* %in, i32 1 + %num = load i32 addrspace(1) * %in + %den = load i32 addrspace(1) * %den_ptr + %result = sdiv i32 %num, %den + store i32 %result, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/selectcc-icmp-select-float.ll b/test/CodeGen/R600/selectcc-icmp-select-float.ll new file mode 100644 index 0000000000..f65a30086e --- /dev/null +++ b/test/CodeGen/R600/selectcc-icmp-select-float.ll @@ -0,0 +1,15 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; Note additional optimizations may cause this SGT to be replaced with a +; CND* instruction. 
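+; An icmp sge against 0 is expected to be emitted as a signed greater-than
+; against the literal -1 (x >= 0 exactly when x > -1), which is why the
+; pattern below looks for SGT_INT with literal.x, -1.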
+; CHECK: SGT_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], literal.x, -1}} +; Test a selectcc with i32 LHS/RHS and float True/False + +define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) { +entry: + %0 = load i32 addrspace(1)* %in + %1 = icmp sge i32 %0, 0 + %2 = select i1 %1, float 1.0, float 0.0 + store float %2, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/selectcc_cnde.ll b/test/CodeGen/R600/selectcc_cnde.ll new file mode 100644 index 0000000000..f0a0f512ba --- /dev/null +++ b/test/CodeGen/R600/selectcc_cnde.ll @@ -0,0 +1,11 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE +;CHECK: CNDE T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1.0, literal.x, [-0-9]+\(2.0}} +define void @test(float addrspace(1)* %out, float addrspace(1)* %in) { + %1 = load float addrspace(1)* %in + %2 = fcmp oeq float %1, 0.0 + %3 = select i1 %2, float 1.0, float 2.0 + store float %3, float addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/selectcc_cnde_int.ll b/test/CodeGen/R600/selectcc_cnde_int.ll new file mode 100644 index 0000000000..b38078e26d --- /dev/null +++ b/test/CodeGen/R600/selectcc_cnde_int.ll @@ -0,0 +1,11 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK-NOT: SETE_INT +;CHECK: CNDE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], 1, literal.x, 2}} +define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) { + %1 = load i32 addrspace(1)* %in + %2 = icmp eq i32 %1, 0 + %3 = select i1 %2, i32 1, i32 2 + store i32 %3, i32 addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/setcc.v4i32.ll b/test/CodeGen/R600/setcc.v4i32.ll new file mode 100644 index 0000000000..0752f2e63d --- /dev/null +++ b/test/CodeGen/R600/setcc.v4i32.ll @@ -0,0 +1,12 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s +;CHECK: SETE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = icmp eq <4 x i32> %a, %b + %sext = sext <4 x i1> %result to <4 x i32> + store <4 x i32> %sext, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/short-args.ll b/test/CodeGen/R600/short-args.ll new file mode 100644 index 0000000000..107025045c --- /dev/null +++ b/test/CodeGen/R600/short-args.ll @@ -0,0 +1,37 @@ +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} + +define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK: VTX_READ_8 T{{[0-9]+\.X, T[0-9]+\.X}} + +define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind { +entry: + %0 = zext i8 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} + +define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} + +; CHECK: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}} + +define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind { +entry: + %0 = zext i16 %in to i32 + store i32 %0, i32 addrspace(1)* %out, align 4 + ret void +} diff --git a/test/CodeGen/R600/store.v4f32.ll b/test/CodeGen/R600/store.v4f32.ll new file mode 100644 index 
0000000000..8b0d244459 --- /dev/null +++ b/test/CodeGen/R600/store.v4f32.ll @@ -0,0 +1,9 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 + +define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) { + %1 = load <4 x float> addrspace(1) * %in + store <4 x float> %1, <4 x float> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/store.v4i32.ll b/test/CodeGen/R600/store.v4i32.ll new file mode 100644 index 0000000000..a659815dde --- /dev/null +++ b/test/CodeGen/R600/store.v4i32.ll @@ -0,0 +1,9 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;CHECK: RAT_WRITE_CACHELESS_128 T{{[0-9]+\.XYZW, T[0-9]+\.X}}, 1 + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %1 = load <4 x i32> addrspace(1) * %in + store <4 x i32> %1, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/udiv.v4i32.ll b/test/CodeGen/R600/udiv.v4i32.ll new file mode 100644 index 0000000000..47657a6be7 --- /dev/null +++ b/test/CodeGen/R600/udiv.v4i32.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;The code generated by udiv is long and complex and may frequently change. +;The goal of this test is to make sure the ISel doesn't fail when it gets +;a v4i32 udiv +;CHECK: RETURN + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = udiv <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/R600/urem.v4i32.ll b/test/CodeGen/R600/urem.v4i32.ll new file mode 100644 index 0000000000..2e7388caa6 --- /dev/null +++ b/test/CodeGen/R600/urem.v4i32.ll @@ -0,0 +1,15 @@ +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s + +;The code generated by urem is long and complex and may frequently change. 
+;The goal of this test is to make sure the ISel doesn't fail when it gets +;a v4i32 urem +;CHECK: RETURN + +define void @test(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) { + %b_ptr = getelementptr <4 x i32> addrspace(1)* %in, i32 1 + %a = load <4 x i32> addrspace(1) * %in + %b = load <4 x i32> addrspace(1) * %b_ptr + %result = urem <4 x i32> %a, %b + store <4 x i32> %result, <4 x i32> addrspace(1)* %out + ret void +} diff --git a/test/CodeGen/SI/sanity.ll b/test/CodeGen/SI/sanity.ll new file mode 100644 index 0000000000..62cdcf5eca --- /dev/null +++ b/test/CodeGen/SI/sanity.ll @@ -0,0 +1,37 @@ +;RUN: llc < %s -march=r600 -mcpu=SI | FileCheck %s + +; CHECK: S_ENDPGM + +define void @main() { +main_body: + call void @llvm.AMDGPU.shader.type(i32 1) + %0 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*) + %1 = getelementptr <4 x i32> addrspace(2)* %0, i32 0 + %2 = load <4 x i32> addrspace(2)* %1 + %3 = call i32 @llvm.SI.vs.load.buffer.index() + %4 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %2, i32 0, i32 %3) + %5 = extractelement <4 x float> %4, i32 0 + %6 = extractelement <4 x float> %4, i32 1 + %7 = extractelement <4 x float> %4, i32 2 + %8 = extractelement <4 x float> %4, i32 3 + %9 = load <4 x i32> addrspace(2)* addrspace(8)* inttoptr (i32 6 to <4 x i32> addrspace(2)* addrspace(8)*) + %10 = getelementptr <4 x i32> addrspace(2)* %9, i32 1 + %11 = load <4 x i32> addrspace(2)* %10 + %12 = call i32 @llvm.SI.vs.load.buffer.index() + %13 = call <4 x float> @llvm.SI.vs.load.input(<4 x i32> %11, i32 0, i32 %12) + %14 = extractelement <4 x float> %13, i32 0 + %15 = extractelement <4 x float> %13, i32 1 + %16 = extractelement <4 x float> %13, i32 2 + %17 = extractelement <4 x float> %13, i32 3 + call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %14, float %15, float %16, float %17) + call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %5, float %6, float %7, float %8) + ret void +} + +declare void @llvm.AMDGPU.shader.type(i32) + +declare i32 @llvm.SI.vs.load.buffer.index() readnone + +declare <4 x float> @llvm.SI.vs.load.input(<4 x i32>, i32, i32) + +declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) diff --git a/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll b/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll new file mode 100644 index 0000000000..8cef2c8201 --- /dev/null +++ b/test/CodeGen/X86/2012-12-12-DAGCombineCrash.ll @@ -0,0 +1,46 @@ +; RUN: llc -march=x86 -mtriple=i686-apple-ios -mcpu=yonah < %s +; rdar://12868039 + +define void @t() nounwind ssp { + %1 = alloca i32 + %2 = ptrtoint i32* %1 to i32 + br label %3 + +; <label>:3 ; preds = %5, %3, %0 + switch i32 undef, label %3 [ + i32 611946160, label %5 + i32 954117870, label %4 + ] + +; <label>:4 ; preds = %3 + ret void + +; <label>:5 ; preds = %5, %3 + %6 = add i32 0, 148 + %7 = and i32 %6, 48 + %8 = add i32 %7, 0 + %9 = or i32 %2, %8 + %10 = xor i32 -1, %2 + %11 = or i32 %8, %10 + %12 = or i32 %9, %11 + %13 = xor i32 %9, %11 + %14 = sub i32 %12, %13 + %15 = xor i32 2044674005, %14 + %16 = xor i32 %15, 0 + %17 = shl nuw nsw i32 %16, 1 + %18 = sub i32 0, %17 + %19 = and i32 %18, 2051242402 + %20 = sub i32 0, %19 + %21 = xor i32 %20, 0 + %22 = xor i32 %21, 0 + %23 = add i32 0, %22 + %24 = shl i32 %23, 1 + %25 = or i32 1, %24 + %26 = add i32 0, %25 + %27 = trunc i32 %26 to i8 + %28 = xor i8 %27, 125 + %29 = add i8 %28, -16 + %30 = add i8 0, %29 + store i8 %30, i8* null + br i1 undef, label %5, label %3 +} 
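+; There is no FileCheck invocation on the RUN line above: this is a reduced
+; reproducer for rdar://12868039, and the only requirement is that llc
+; complete without crashing on it.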
diff --git a/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll b/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll new file mode 100644 index 0000000000..c465527bd8 --- /dev/null +++ b/test/CodeGen/X86/2012-12-14-v8fp80-crash.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 + +; Make sure we don't crash on this testcase. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.9.0" + +define void @_ZN6VectorIfE3equIeEEvfRKS_IT_E() nounwind uwtable ssp align 2 { +entry: + br i1 undef, label %while.end, label %while.body.lr.ph + +while.body.lr.ph: ; preds = %entry + br label %vector.body + +vector.body: ; preds = %vector.body, %while.body.lr.ph + %0 = fptrunc <8 x x86_fp80> undef to <8 x float> + store <8 x float> %0, <8 x float>* undef, align 4 + br label %vector.body + +while.end: ; preds = %entry + ret void +} diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll index e7c9605d3e..453e72672b 100644 --- a/test/CodeGen/X86/atom-bypass-slow-division.ll +++ b/test/CodeGen/X86/atom-bypass-slow-division.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s -define i32 @test_get_quotient(i32 %a, i32 %b) nounwind { -; CHECK: test_get_quotient +define i32 @Test_get_quotient(i32 %a, i32 %b) nounwind { +; CHECK: Test_get_quotient: ; CHECK: orl %ecx, %edx ; CHECK-NEXT: testl $-256, %edx ; CHECK-NEXT: je @@ -13,8 +13,8 @@ define i32 @test_get_quotient(i32 %a, i32 %b) nounwind { ret i32 %result } -define i32 @test_get_remainder(i32 %a, i32 %b) nounwind { -; CHECK: test_get_remainder +define i32 @Test_get_remainder(i32 %a, i32 %b) nounwind { +; CHECK: Test_get_remainder: ; CHECK: orl %ecx, %edx ; CHECK-NEXT: testl $-256, %edx ; CHECK-NEXT: je @@ -26,8 +26,8 @@ define i32 @test_get_remainder(i32 %a, i32 %b) nounwind { ret i32 %result } -define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { -; CHECK: test_get_quotient_and_remainder +define i32 @Test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { +; CHECK: Test_get_quotient_and_remainder: ; CHECK: orl %ecx, %edx ; CHECK-NEXT: testl $-256, %edx ; CHECK-NEXT: je @@ -35,7 +35,7 @@ define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { ; CHECK: divb ; CHECK: addl ; CHECK: ret -; CEECK-NOT: idivl +; CHECK-NOT: idivl ; CHECK-NOT: divb %resultdiv = sdiv i32 %a, %b %resultrem = srem i32 %a, %b @@ -43,8 +43,8 @@ define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind { ret i32 %result } -define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind { -; CHECK: test_use_div_and_idiv +define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind { +; CHECK: Test_use_div_and_idiv: ; CHECK: idivl ; CHECK: divb ; CHECK: divl @@ -57,34 +57,34 @@ define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind { ret i32 %result } -define i32 @test_use_div_imm_imm() nounwind { -; CHECK: test_use_div_imm_imm +define i32 @Test_use_div_imm_imm() nounwind { +; CHECK: Test_use_div_imm_imm: ; CHECK: movl $64 %resultdiv = sdiv i32 256, 4 ret i32 %resultdiv } -define i32 @test_use_div_reg_imm(i32 %a) nounwind { -; CHECK: test_use_div_reg_imm -; CEHCK-NOT: test +define i32 @Test_use_div_reg_imm(i32 %a) nounwind { +; CHECK: Test_use_div_reg_imm: +; CHECK-NOT: test ; CHECK-NOT: idiv ; CHECK-NOT: divb %resultdiv = sdiv i32 %a, 33 ret i32 %resultdiv } -define i32 @test_use_rem_reg_imm(i32 %a) 
nounwind { -; CHECK: test_use_rem_reg_imm -; CEHCK-NOT: test +define i32 @Test_use_rem_reg_imm(i32 %a) nounwind { +; CHECK: Test_use_rem_reg_imm: +; CHECK-NOT: test ; CHECK-NOT: idiv ; CHECK-NOT: divb %resultrem = srem i32 %a, 33 ret i32 %resultrem } -define i32 @test_use_divrem_reg_imm(i32 %a) nounwind { -; CHECK: test_use_divrem_reg_imm -; CEHCK-NOT: test +define i32 @Test_use_divrem_reg_imm(i32 %a) nounwind { +; CHECK: Test_use_divrem_reg_imm: +; CHECK-NOT: test ; CHECK-NOT: idiv ; CHECK-NOT: divb %resultdiv = sdiv i32 %a, 33 @@ -93,8 +93,8 @@ define i32 @test_use_divrem_reg_imm(i32 %a) nounwind { ret i32 %result } -define i32 @test_use_div_imm_reg(i32 %a) nounwind { -; CHECK: test_use_div_imm_reg +define i32 @Test_use_div_imm_reg(i32 %a) nounwind { +; CHECK: Test_use_div_imm_reg: ; CHECK: test ; CHECK: idiv ; CHECK: divb @@ -102,8 +102,8 @@ define i32 @test_use_div_imm_reg(i32 %a) nounwind { ret i32 %resultdiv } -define i32 @test_use_rem_imm_reg(i32 %a) nounwind { -; CHECK: test_use_rem_imm_reg +define i32 @Test_use_rem_imm_reg(i32 %a) nounwind { +; CHECK: Test_use_rem_imm_reg: ; CHECK: test ; CHECK: idiv ; CHECK: divb diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll index 43c47c0fa8..b89e648c52 100644 --- a/test/CodeGen/X86/bmi.ll +++ b/test/CodeGen/X86/bmi.ll @@ -26,6 +26,14 @@ define i32 @t3(i32 %x) nounwind { ; CHECK: tzcntl } +define i32 @tzcnt32_load(i32* %x) nounwind { + %x1 = load i32* %x + %tmp = tail call i32 @llvm.cttz.i32(i32 %x1, i1 false ) + ret i32 %tmp +; CHECK: tzcnt32_load: +; CHECK: tzcntl ({{.*}}) +} + define i64 @t4(i64 %x) nounwind { %tmp = tail call i64 @llvm.cttz.i64( i64 %x, i1 false ) ret i64 %tmp @@ -69,6 +77,15 @@ define i32 @andn32(i32 %x, i32 %y) nounwind readnone { ; CHECK: andnl } +define i32 @andn32_load(i32 %x, i32* %y) nounwind readnone { + %y1 = load i32* %y + %tmp1 = xor i32 %x, -1 + %tmp2 = and i32 %y1, %tmp1 + ret i32 %tmp2 +; CHECK: andn32_load: +; CHECK: andnl ({{.*}}) +} + define i64 @andn64(i64 %x, i64 %y) nounwind readnone { %tmp1 = xor i64 %x, -1 %tmp2 = and i64 %tmp1, %y @@ -84,6 +101,14 @@ define i32 @bextr32(i32 %x, i32 %y) nounwind readnone { ; CHECK: bextrl } +define i32 @bextr32_load(i32* %x, i32 %y) nounwind readnone { + %x1 = load i32* %x + %tmp = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x1, i32 %y) + ret i32 %tmp +; CHECK: bextr32_load: +; CHECK: bextrl {{.*}}, ({{.*}}), {{.*}} +} + declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone define i64 @bextr64(i64 %x, i64 %y) nounwind readnone { @@ -102,6 +127,14 @@ define i32 @bzhi32(i32 %x, i32 %y) nounwind readnone { ; CHECK: bzhil } +define i32 @bzhi32_load(i32* %x, i32 %y) nounwind readnone { + %x1 = load i32* %x + %tmp = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %x1, i32 %y) + ret i32 %tmp +; CHECK: bzhi32_load: +; CHECK: bzhil {{.*}}, ({{.*}}), {{.*}} +} + declare i32 @llvm.x86.bmi.bzhi.32(i32, i32) nounwind readnone define i64 @bzhi64(i64 %x, i64 %y) nounwind readnone { @@ -121,6 +154,15 @@ define i32 @blsi32(i32 %x) nounwind readnone { ; CHECK: blsil } +define i32 @blsi32_load(i32* %x) nounwind readnone { + %x1 = load i32* %x + %tmp = sub i32 0, %x1 + %tmp2 = and i32 %x1, %tmp + ret i32 %tmp2 +; CHECK: blsi32_load: +; CHECK: blsil ({{.*}}) +} + define i64 @blsi64(i64 %x) nounwind readnone { %tmp = sub i64 0, %x %tmp2 = and i64 %tmp, %x @@ -137,6 +179,15 @@ define i32 @blsmsk32(i32 %x) nounwind readnone { ; CHECK: blsmskl } +define i32 @blsmsk32_load(i32* %x) nounwind readnone { + %x1 = load i32* %x + %tmp = sub i32 %x1, 1 + %tmp2 = xor i32 %x1, %tmp + ret 
i32 %tmp2 +; CHECK: blsmsk32_load: +; CHECK: blsmskl ({{.*}}) +} + define i64 @blsmsk64(i64 %x) nounwind readnone { %tmp = sub i64 %x, 1 %tmp2 = xor i64 %tmp, %x @@ -153,6 +204,15 @@ define i32 @blsr32(i32 %x) nounwind readnone { ; CHECK: blsrl } +define i32 @blsr32_load(i32* %x) nounwind readnone { + %x1 = load i32* %x + %tmp = sub i32 %x1, 1 + %tmp2 = and i32 %x1, %tmp + ret i32 %tmp2 +; CHECK: blsr32_load: +; CHECK: blsrl ({{.*}}) +} + define i64 @blsr64(i64 %x) nounwind readnone { %tmp = sub i64 %x, 1 %tmp2 = and i64 %tmp, %x @@ -168,6 +228,14 @@ define i32 @pdep32(i32 %x, i32 %y) nounwind readnone { ; CHECK: pdepl } +define i32 @pdep32_load(i32 %x, i32* %y) nounwind readnone { + %y1 = load i32* %y + %tmp = tail call i32 @llvm.x86.bmi.pdep.32(i32 %x, i32 %y1) + ret i32 %tmp +; CHECK: pdep32_load: +; CHECK: pdepl ({{.*}}) +} + declare i32 @llvm.x86.bmi.pdep.32(i32, i32) nounwind readnone define i64 @pdep64(i64 %x, i64 %y) nounwind readnone { @@ -186,6 +254,14 @@ define i32 @pext32(i32 %x, i32 %y) nounwind readnone { ; CHECK: pextl } +define i32 @pext32_load(i32 %x, i32* %y) nounwind readnone { + %y1 = load i32* %y + %tmp = tail call i32 @llvm.x86.bmi.pext.32(i32 %x, i32 %y1) + ret i32 %tmp +; CHECK: pext32_load: +; CHECK: pextl ({{.*}}) +} + declare i32 @llvm.x86.bmi.pext.32(i32, i32) nounwind readnone define i64 @pext64(i64 %x, i64 %y) nounwind readnone { diff --git a/test/CodeGen/X86/memcpy-2.ll b/test/CodeGen/X86/memcpy-2.ll index dcc8f0d268..949d6a4293 100644 --- a/test/CodeGen/X86/memcpy-2.ll +++ b/test/CodeGen/X86/memcpy-2.ll @@ -17,11 +17,11 @@ entry: ; SSE2: movb $0, 24(%esp) ; SSE1: t1: -; SSE1: fldl _.str+16 -; SSE1: fstpl 16(%esp) ; SSE1: movaps _.str, %xmm0 ; SSE1: movaps %xmm0 ; SSE1: movb $0, 24(%esp) +; SSE1: movl $0, 20(%esp) +; SSE1: movl $0, 16(%esp) ; NOSSE: t1: ; NOSSE: movb $0 diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll new file mode 100644 index 0000000000..aff4afbd2e --- /dev/null +++ b/test/CodeGen/X86/psubus.ll @@ -0,0 +1,340 @@ +; RUN: llc -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE2 +; RUN: llc -mcpu=corei7-avx < %s | FileCheck %s -check-prefix=AVX1 +; RUN: llc -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX2 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define void @test1(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <8 x i16>* + %2 = load <8 x i16>* %1, align 2 + %3 = icmp slt <8 x i16> %2, zeroinitializer + %4 = xor <8 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> + %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer + store <8 x i16> %5, <8 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test1 +; SSE2: psubusw LCPI0_0(%rip), %xmm0 + +; AVX1: @test1 +; AVX1: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0 + +; AVX2: @test1 +; AVX2: vpsubusw LCPI0_0(%rip), %xmm0, %xmm0 +} + +define void @test2(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi 
i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <8 x i16>* + %2 = load <8 x i16>* %1, align 2 + %3 = icmp ugt <8 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> + %4 = add <8 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767> + %5 = select <8 x i1> %3, <8 x i16> %4, <8 x i16> zeroinitializer + store <8 x i16> %5, <8 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test2 +; SSE2: psubusw LCPI1_0(%rip), %xmm0 + +; AVX1: @test2 +; AVX1: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0 + +; AVX2: @test2 +; AVX2: vpsubusw LCPI1_0(%rip), %xmm0, %xmm0 +} + +define void @test3(i16* nocapture %head, i16 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <8 x i16> undef, i16 %w, i32 0 + %broadcast15 = shufflevector <8 x i16> %0, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i16* %head, i64 %index + %2 = bitcast i16* %1 to <8 x i16>* + %3 = load <8 x i16>* %2, align 2 + %4 = icmp ult <8 x i16> %3, %broadcast15 + %5 = sub <8 x i16> %3, %broadcast15 + %6 = select <8 x i1> %4, <8 x i16> zeroinitializer, <8 x i16> %5 + store <8 x i16> %6, <8 x i16>* %2, align 2 + %index.next = add i64 %index, 8 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test3 +; SSE2: psubusw %xmm0, %xmm1 + +; AVX1: @test3 +; AVX1: vpsubusw %xmm0, %xmm1, %xmm1 + +; AVX2: @test3 +; AVX2: vpsubusw %xmm0, %xmm1, %xmm1 +} + +define void @test4(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %2 = load <16 x i8>* %1, align 1 + %3 = icmp slt <16 x i8> %2, zeroinitializer + %4 = xor <16 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128> + %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer + store <16 x i8> %5, <16 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test4 +; SSE2: psubusb LCPI3_0(%rip), %xmm0 + +; AVX1: @test4 +; AVX1: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0 + +; AVX2: @test4 +; AVX2: vpsubusb LCPI3_0(%rip), %xmm0, %xmm0 +} + +define void @test5(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <16 x i8>* + %2 = load <16 x i8>* %1, align 1 + %3 = icmp ugt <16 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126> + %4 = add <16 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 
-127, i8 -127> + %5 = select <16 x i1> %3, <16 x i8> %4, <16 x i8> zeroinitializer + store <16 x i8> %5, <16 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test5 +; SSE2: psubusb LCPI4_0(%rip), %xmm0 + +; AVX1: @test5 +; AVX1: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0 + +; AVX2: @test5 +; AVX2: vpsubusb LCPI4_0(%rip), %xmm0, %xmm0 +} + +define void @test6(i8* nocapture %head, i8 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <16 x i8> undef, i8 %w, i32 0 + %broadcast15 = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i8* %head, i64 %index + %2 = bitcast i8* %1 to <16 x i8>* + %3 = load <16 x i8>* %2, align 1 + %4 = icmp ult <16 x i8> %3, %broadcast15 + %5 = sub <16 x i8> %3, %broadcast15 + %6 = select <16 x i1> %4, <16 x i8> zeroinitializer, <16 x i8> %5 + store <16 x i8> %6, <16 x i8>* %2, align 1 + %index.next = add i64 %index, 16 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; SSE2: @test6 +; SSE2: psubusb %xmm0, %xmm1 + +; AVX1: @test6 +; AVX1: vpsubusb %xmm0, %xmm1, %xmm1 + +; AVX2: @test6 +; AVX2: vpsubusb %xmm0, %xmm1, %xmm1 +} + +define void @test7(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <16 x i16>* + %2 = load <16 x i16>* %1, align 2 + %3 = icmp slt <16 x i16> %2, zeroinitializer + %4 = xor <16 x i16> %2, <i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768, i16 -32768> + %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer + store <16 x i16> %5, <16 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test7 +; AVX2: vpsubusw LCPI6_0(%rip), %ymm0, %ymm0 +} + +define void @test8(i16* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i16* %head, i64 %index + %1 = bitcast i16* %0 to <16 x i16>* + %2 = load <16 x i16>* %1, align 2 + %3 = icmp ugt <16 x i16> %2, <i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766, i16 32766> + %4 = add <16 x i16> %2, <i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767, i16 -32767> + %5 = select <16 x i1> %3, <16 x i16> %4, <16 x i16> zeroinitializer + store <16 x i16> %5, <16 x i16>* %1, align 2 + %index.next = add i64 %index, 8 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test8 +; AVX2: vpsubusw LCPI7_0(%rip), %ymm0, %ymm0 
+} + +define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <16 x i16> undef, i16 %w, i32 0 + %broadcast15 = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i16* %head, i64 %index + %2 = bitcast i16* %1 to <16 x i16>* + %3 = load <16 x i16>* %2, align 2 + %4 = icmp ult <16 x i16> %3, %broadcast15 + %5 = sub <16 x i16> %3, %broadcast15 + %6 = select <16 x i1> %4, <16 x i16> zeroinitializer, <16 x i16> %5 + store <16 x i16> %6, <16 x i16>* %2, align 2 + %index.next = add i64 %index, 8 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + + +; AVX2: @test9 +; AVX2: vpsubusw %ymm0, %ymm1, %ymm1 +} + +define void @test10(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %2 = load <32 x i8>* %1, align 1 + %3 = icmp slt <32 x i8> %2, zeroinitializer + %4 = xor <32 x i8> %2, <i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128, i8 -128> + %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer + store <32 x i8> %5, <32 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + + +; AVX2: @test10 +; AVX2: vpsubusb LCPI9_0(%rip), %ymm0, %ymm0 +} + +define void @test11(i8* nocapture %head) nounwind { +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr inbounds i8* %head, i64 %index + %1 = bitcast i8* %0 to <32 x i8>* + %2 = load <32 x i8>* %1, align 1 + %3 = icmp ugt <32 x i8> %2, <i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126, i8 126> + %4 = add <32 x i8> %2, <i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127, i8 -127> + %5 = select <32 x i1> %3, <32 x i8> %4, <32 x i8> zeroinitializer + store <32 x i8> %5, <32 x i8>* %1, align 1 + %index.next = add i64 %index, 16 + %6 = icmp eq i64 %index.next, 16384 + br i1 %6, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test11 +; AVX2: vpsubusb LCPI10_0(%rip), %ymm0, %ymm0 +} + +define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind { +vector.ph: + %0 = insertelement <32 x i8> undef, i8 %w, i32 0 + %broadcast15 = shufflevector <32 x i8> %0, <32 x i8> undef, <32 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + 
%index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %1 = getelementptr inbounds i8* %head, i64 %index + %2 = bitcast i8* %1 to <32 x i8>* + %3 = load <32 x i8>* %2, align 1 + %4 = icmp ult <32 x i8> %3, %broadcast15 + %5 = sub <32 x i8> %3, %broadcast15 + %6 = select <32 x i1> %4, <32 x i8> zeroinitializer, <32 x i8> %5 + store <32 x i8> %6, <32 x i8>* %2, align 1 + %index.next = add i64 %index, 16 + %7 = icmp eq i64 %index.next, 16384 + br i1 %7, label %for.end, label %vector.body + +for.end: ; preds = %vector.body + ret void + +; AVX2: @test12 +; AVX2: vpsubusb %ymm0, %ymm1, %ymm1 +} diff --git a/test/CodeGen/X86/store_op_load_fold.ll b/test/CodeGen/X86/store_op_load_fold.ll index 6e47eb397d..070cccdb87 100644 --- a/test/CodeGen/X86/store_op_load_fold.ll +++ b/test/CodeGen/X86/store_op_load_fold.ll @@ -1,13 +1,30 @@ -; RUN: llc < %s -march=x86 | not grep mov +; RUN: llc < %s -mtriple=i686-darwin | FileCheck %s ; ; Test the add and load are folded into the store instruction. @X = internal global i16 0 ; <i16*> [#uses=2] define void @foo() nounwind { +; CHECK: foo: +; CHECK-NOT: mov +; CHECK: add +; CHECK-NEXT: ret %tmp.0 = load i16* @X ; <i16> [#uses=1] %tmp.3 = add i16 %tmp.0, 329 ; <i16> [#uses=1] store i16 %tmp.3, i16* @X ret void } +; rdar://12838504 +%struct.S2 = type { i64, i16, [2 x i8], i8, [3 x i8], [7 x i8], i8, [8 x i8] } +@s2 = external global %struct.S2, align 16 +define void @test2() nounwind uwtable ssp { +; CHECK: test2: +; CHECK: mov +; CHECK-NEXT: and +; CHECK-NEXT: ret + %bf.load35 = load i56* bitcast ([7 x i8]* getelementptr inbounds (%struct.S2* @s2, i32 0, i32 5) to i56*), align 16 + %bf.clear36 = and i56 %bf.load35, -1125895611875329 + store i56 %bf.clear36, i56* bitcast ([7 x i8]* getelementptr inbounds (%struct.S2* @s2, i32 0, i32 5) to i56*), align 16 + ret void +} diff --git a/test/DebugInfo/X86/elf-names.ll b/test/DebugInfo/X86/elf-names.ll index b908bcefe4..6e7154cfe8 100644 --- a/test/DebugInfo/X86/elf-names.ll +++ b/test/DebugInfo/X86/elf-names.ll @@ -7,6 +7,7 @@ ; CHECK: 0x0000003d: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000006d] = "D") ; CHECK: 0x00000044: DW_TAG_member ; CHECK: 0x00000045: DW_AT_name [DW_FORM_strp] ( .debug_str[0x0000005d] = "c1") +; CHECK: 0x0000008d: DW_AT_artificial [DW_FORM_flag_present] (true) %class.D = type { i32, i32, i32, i32 } diff --git a/test/DebugInfo/X86/lit.local.cfg b/test/DebugInfo/X86/lit.local.cfg index 0d694da8df..60d66eae49 100644 --- a/test/DebugInfo/X86/lit.local.cfg +++ b/test/DebugInfo/X86/lit.local.cfg @@ -1,4 +1,4 @@ -config.suffixes = ['.ll'] +config.suffixes = ['.ll', '.s'] targets = set(config.root.targets_to_build.split()) if not 'X86' in targets: diff --git a/test/DebugInfo/X86/main-file-name.s b/test/DebugInfo/X86/main-file-name.s new file mode 100644 index 0000000000..6817c9e3a7 --- /dev/null +++ b/test/DebugInfo/X86/main-file-name.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple x86_64-unknown-linux-gnu -filetype obj -main-file-name foo.S -g -o %t %s +// RUN: llvm-dwarfdump %t | FileCheck %s + +// CHECK: DW_TAG_compile_unit [1] +// CHECK-NOT: DW_TAG_ +// CHECK: DW_AT_name [DW_FORM_string] ("foo.S") + + +# 1 "foo.S" +# 1 "<built-in>" 1 +# 1 "foo.S" 2 + +foo: + nop + nop + nop + diff --git a/test/Instrumentation/AddressSanitizer/debug_info.ll b/test/Instrumentation/AddressSanitizer/debug_info.ll new file mode 100644 index 0000000000..f686ac1c52 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/debug_info.ll @@ -0,0 +1,60 @@ +; RUN: opt < %s -asan -asan-module -S | FileCheck %s + +; 
Checks that llvm.dbg.declare instructions are updated +; accordingly as we merge allocas. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define i32 @_Z3zzzi(i32 %p) nounwind uwtable address_safety { +entry: + %p.addr = alloca i32, align 4 + %r = alloca i32, align 4 + store i32 %p, i32* %p.addr, align 4 + call void @llvm.dbg.declare(metadata !{i32* %p.addr}, metadata !10), !dbg !11 + call void @llvm.dbg.declare(metadata !{i32* %r}, metadata !12), !dbg !14 + %0 = load i32* %p.addr, align 4, !dbg !14 + %add = add nsw i32 %0, 1, !dbg !14 + store i32 %add, i32* %r, align 4, !dbg !14 + %1 = load i32* %r, align 4, !dbg !15 + ret i32 %1, !dbg !15 +} + +; CHECK: define i32 @_Z3zzzi +; CHECK: entry: +; Verify that llvm.dbg.declare calls are in the entry basic block. +; CHECK-NOT: %entry +; CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[ARG_ID:[0-9]+]]) +; CHECK-NOT: %entry +; CHECK: call void @llvm.dbg.declare(metadata {{.*}}, metadata ![[VAR_ID:[0-9]+]]) + +declare void @llvm.dbg.declare(metadata, metadata) nounwind readnone + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 786449, i32 0, i32 4, metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo", metadata !"clang version 3.3 (trunk 169314)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !1} ; [ DW_TAG_compile_unit ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] [DW_LANG_C_plus_plus] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 786478, i32 0, metadata !6, metadata !"zzz", metadata !"zzz", metadata !"_Z3zzzi", metadata !6, i32 1, metadata !7, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, i32 (i32)* @_Z3zzzi, null, null, metadata !1, i32 1} ; [ DW_TAG_subprogram ] [line 1] [def] [zzz] +!6 = metadata !{i32 786473, metadata !"a.cc", metadata !"/usr/local/google/llvm_cmake_clang/tmp/debuginfo", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 786453, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ] +!8 = metadata !{metadata !9, metadata !9} +!9 = metadata !{i32 786468, null, metadata !"int", null, i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ] [int] [line 0, size 32, align 32, offset 0, enc DW_ATE_signed] +!10 = metadata !{i32 786689, metadata !5, metadata !"p", metadata !6, i32 16777217, metadata !9, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [p] [line 1] +!11 = metadata !{i32 1, i32 0, metadata !5, null} +!12 = metadata !{i32 786688, metadata !13, metadata !"r", metadata !6, i32 2, metadata !9, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [r] [line 2] + +; Verify that debug descriptors for argument and local variable will be replaced +; with descriptors that end with OpDeref (encoded as 2). +; CHECK: ![[ARG_ID]] = metadata {{.*}} i64 2} ; [ DW_TAG_arg_variable ] [p] [line 1] +; CHECK: ![[VAR_ID]] = metadata {{.*}} i64 2} ; [ DW_TAG_auto_variable ] [r] [line 2] +; Verify that there are no more variable descriptors. 
+; CHECK-NOT: DW_TAG_arg_variable +; CHECK-NOT: DW_TAG_auto_variable + + +!13 = metadata !{i32 786443, metadata !5, i32 1, i32 0, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] [/usr/local/google/llvm_cmake_clang/tmp/debuginfo/a.cc] +!14 = metadata !{i32 2, i32 0, metadata !13, null} +!15 = metadata !{i32 3, i32 0, metadata !13, null} diff --git a/test/Instrumentation/MemorySanitizer/msan_basic.ll b/test/Instrumentation/MemorySanitizer/msan_basic.ll index 3228863193..b6dcd16662 100644 --- a/test/Instrumentation/MemorySanitizer/msan_basic.ll +++ b/test/Instrumentation/MemorySanitizer/msan_basic.ll @@ -8,7 +8,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 ; Check the presence and the linkage type of __msan_track_origins ; CHECK: @__msan_track_origins = weak_odr constant i32 0 + ; Check instrumentation of stores + define void @Store(i32* nocapture %p, i32 %x) nounwind uwtable { entry: store i32 %x, i32* %p, align 4 @@ -33,6 +35,34 @@ entry: ; CHECK-ORIGINS: ret void +; Check instrumentation of aligned stores +; Shadow store has the same alignment as the original store; origin store +; does not specify explicit alignment. + +define void @AlignedStore(i32* nocapture %p, i32 %x) nounwind uwtable { +entry: + store i32 %x, i32* %p, align 32 + ret void +} + +; CHECK: @AlignedStore +; CHECK: load {{.*}} @__msan_param_tls +; CHECK: store {{.*}} align 32 +; CHECK: store {{.*}} align 32 +; CHECK: ret void +; CHECK-ORIGINS: @AlignedStore +; CHECK-ORIGINS: load {{.*}} @__msan_param_tls +; CHECK-ORIGINS: store {{.*}} align 32 +; CHECK-ORIGINS: icmp +; CHECK-ORIGINS: br i1 +; CHECK-ORIGINS: <label> +; CHECK-ORIGINS-NOT: store {{.*}} align +; CHECK-ORIGINS: br label +; CHECK-ORIGINS: <label> +; CHECK-ORIGINS: store {{.*}} align 32 +; CHECK-ORIGINS: ret void + + ; load followed by cmp: check that we load the shadow and call __msan_warning. 
define void @LoadAndCmp(i32* nocapture %a) nounwind uwtable { entry: diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s index 23d9f5977a..d495c91c0e 100644 --- a/test/MC/ARM/basic-thumb2-instructions.s +++ b/test/MC/ARM/basic-thumb2-instructions.s @@ -3509,3 +3509,7 @@ _func: @ CHECK: ldrh.w r11, [pc, #-22] @ encoding: [0x3f,0xf8,0x16,0xb0] @ CHECK: ldrsb.w r11, [pc, #-22] @ encoding: [0x1f,0xf9,0x16,0xb0] @ CHECK: ldrsh.w r11, [pc, #-22] @ encoding: [0x3f,0xf9,0x16,0xb0] + +@ rdar://12596361 + ldr r1, [pc, #12] +@ CHECK: ldr.n r1, [pc, #12] @ encoding: [0x03,0x49] diff --git a/test/MC/Disassembler/XCore/lit.local.cfg b/test/MC/Disassembler/XCore/lit.local.cfg new file mode 100644 index 0000000000..15b65836e7 --- /dev/null +++ b/test/MC/Disassembler/XCore/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.txt'] + +targets = set(config.root.targets_to_build.split()) +if not 'XCore' in targets: + config.unsupported = True diff --git a/test/MC/Disassembler/XCore/xcore.txt b/test/MC/Disassembler/XCore/xcore.txt new file mode 100644 index 0000000000..f6b9c90da0 --- /dev/null +++ b/test/MC/Disassembler/XCore/xcore.txt @@ -0,0 +1,198 @@ +# RUN: llvm-mc --disassemble %s -triple=xcore-xmos-elf | FileCheck %s +# CHECK: .section __TEXT,__text,regular,pure_instructions + +# 0r instructions + +# CHECK: clre +0xed 0x07 + +# CHECK: get r11, id +0xee 0x17 + +# CHECK: get r11, ed +0xfe 0x0f + +# CHECK: get r11, et +0xff 0x0f + +# CHECK: ssync +0xee 0x07 + +# CHECK: waiteu +0xec 0x07 + +# 1r instructions + +# CHECK: msync res[r0] +0xf0 0x1f + +# CHECK: mjoin res[r1] +0xf1 0x17 + +# CHECK: bau r2 +0xf2 0x27 + +# CHECK: set sp, r3 +0xf3 0x2f + +# CHECK: ecallt r4 +0xf4 0x4f + +# CHECK: ecallf r5 +0xe5 0x4f + +# CHECK: bla r6 +0xe6 0x27 + +# CHECK: syncr res[r7] +0xf7 0x87 + +# CHECK: freer res[r8] +0xe8 0x17 + +# CHECK: setv res[r9], r11 +0xf9 0x47 + +# CHECK: setev res[r10], r11 +0xfa 0x3f + +# CHECK: eeu res[r11] +0xfb 0x07 + +# 2r instructions + +# CHECK: not r1, r8 +0x24 0x8f + +# CHECK: neg r7, r6 +0xce 0x97 + +# CHECK: andnot r10, r11 +0xab 0x2f + +# CHECK: mkmsk r11, r0 +0x4c 0xa7 + +# CHECK: getts r8, res[r1] +0x41 0x3f + +# CHECK: setpt res[r2], r3 +0xde 0x3e + +# CHECK: outct res[r1], r2 +0xc6 0x4e + +# CHECK: outt res[r5], r4 +0xd1 0x0f + +# CHECK: out res[r9], r10 +0xa9 0xaf + +# CHECK: outshr res[r0], r2 +0xd8 0xae + +# CHECK: inct r7, res[r4] +0xdc 0x87 + +# CHECK: int r8, res[r3] +0x53 0x8f + +# CHECK: in r10, res[r0] +0x48 0xb7 + +# CHECK: inshr r4, res[r2] +0x12 0xb7 + +# CHECK: chkct res[r6], r0 +0x08 0xcf + +# CHECK: testct r8, res[r3] +0x53 0xbf + +# CHECK: testwct r2, res[r9] +0x39 0xc7 + +# CHECK: setd res[r3], r4 +0x13 0x17 + +# CHECK: getst r7, res[r1] +0x1d 0x07 + +# CHECK: init t[r1]:sp, r2 +0xc9 0x16 + +# CHECK: init t[r10]:pc, r1 +0x26 0x07 + +# CHECK: init t[r2]:cp, r10 +0x4a 0x1f + +# CHECK: init t[r2]:dp, r3 +0xce 0x0e + +# CHECK: setpsc res[r8], r2 +0x28 0xc7 + +# CHECK: zext r3, r8 +0x2c 0x47 + +# CHECK: sext r9, r1 +0x45 0x37 + +# rus instructions + +# CHECK: chkct res[r1], 8 +0x34 0xcf + +# CHECK: getr r11, 2 +0x4e 0x87 + +# CHECK: mkmsk r4, 24 +0x72 0xa7 + +# CHECK: outct res[r3], r0 +0xcc 0x4e + +# CHECK: sext r8, 16 +0xb1 0x37 + +# CHECK: zext r2, 32 +0xd8 0x46 + +# CHECK: peek r0, res[r5] +0x81 0xbf + +# CHECK: endin r10, res[r1] +0x59 0x97 + +# l2r instructions + +# CHECK: bitrev r1, r10 +0x26 0xff 0xec 0x07 + +# CHECK: byterev r4, r1 +0x11 0xff 0xec 0x07 + +# CHECK: clz r11, r10 +0xae 0xff 0xec 0x0f + +# CHECK: get r3, 
ps[r6] +0x9e 0xff 0xec 0x17 + +# CHECK: setc res[r5], r9 +0x75 0xff 0xec 0x2f + +# CHECK: init t[r2]:lr, r1 +0xc6 0xfe 0xec 0x17 + +# CHECK: setclk res[r2], r1 +0xd6 0xfe 0xec 0x0f + +# CHECK: set ps[r9], r10 +0xa9 0xff 0xec 0x1f + +# CHECK: setrdy res[r3], r1 +0xc7 0xfe 0xec 0x2f + +# CHECK: settw res[r7], r2 +0x9b 0xff 0xec 0x27 diff --git a/test/MC/ELF/comp-dir.s b/test/MC/ELF/comp-dir.s new file mode 100644 index 0000000000..7570fa9187 --- /dev/null +++ b/test/MC/ELF/comp-dir.s @@ -0,0 +1,7 @@ +// RUN: llvm-mc -triple=x86_64-linux-unknown -g -fdebug-compilation-dir=/test/comp/dir %s -filetype=obj -o %t.o +// RUN: llvm-dwarfdump %t.o | FileCheck %s + +// CHECK: DW_AT_comp_dir [DW_FORM_string] ("/test/comp/dir") + +f: + nop diff --git a/test/MC/X86/lit.local.cfg b/test/MC/X86/lit.local.cfg index eee568e8fd..ad280c7cf7 100644 --- a/test/MC/X86/lit.local.cfg +++ b/test/MC/X86/lit.local.cfg @@ -1,12 +1,5 @@ config.suffixes = ['.ll', '.c', '.cpp', '.s'] -def getRoot(config): - if not config.parent: - return config - return getRoot(config.parent) - -root = getRoot(config) - -targets = set(root.targets_to_build.split()) +targets = set(config.root.targets_to_build.split()) if not 'X86' in targets: config.unsupported = True diff --git a/test/Object/Inputs/macho-text-sections.macho-x86_64 b/test/Object/Inputs/macho-text-sections.macho-x86_64 Binary files differnew file mode 100644 index 0000000000..cce203ba0d --- /dev/null +++ b/test/Object/Inputs/macho-text-sections.macho-x86_64 diff --git a/test/Object/X86/macho-text-sections.test b/test/Object/X86/macho-text-sections.test new file mode 100644 index 0000000000..1b697dcada --- /dev/null +++ b/test/Object/X86/macho-text-sections.test @@ -0,0 +1,3 @@ +RUN: llvm-objdump -disassemble %p/../Inputs/macho-text-sections.macho-x86_64 | FileCheck %s + +CHECK: Disassembly of section __notext,__notext diff --git a/test/TableGen/2006-09-18-LargeInt.td b/test/TableGen/2006-09-18-LargeInt.td index f7ae4eecce..94cd1ec307 100644 --- a/test/TableGen/2006-09-18-LargeInt.td +++ b/test/TableGen/2006-09-18-LargeInt.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep -- 4294901760 -// XFAIL: vg_leak def X { int Y = 0xFFFF0000; diff --git a/test/TableGen/2010-03-24-PrematureDefaults.td b/test/TableGen/2010-03-24-PrematureDefaults.td index 24f6c93b3e..716a1d5900 100644 --- a/test/TableGen/2010-03-24-PrematureDefaults.td +++ b/test/TableGen/2010-03-24-PrematureDefaults.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class A<int k, bits<2> x = 1> { int K = k; diff --git a/test/TableGen/Dag.td b/test/TableGen/Dag.td index 7ceb4e74b2..40399a48ee 100644 --- a/test/TableGen/Dag.td +++ b/test/TableGen/Dag.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak //===----------------------------------------------------------------------===// // Substitution of an int. 
diff --git a/test/TableGen/DefmInherit.td b/test/TableGen/DefmInherit.td index 46d3f62c6d..b52a709731 100644 --- a/test/TableGen/DefmInherit.td +++ b/test/TableGen/DefmInherit.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "zing = 4" | count 4 -// XFAIL: vg_leak class C1<int A, string B> { int bar = A; diff --git a/test/TableGen/DefmInsideMultiClass.td b/test/TableGen/DefmInsideMultiClass.td index e6fc019b1e..0aea21280d 100644 --- a/test/TableGen/DefmInsideMultiClass.td +++ b/test/TableGen/DefmInsideMultiClass.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep ADDPSrr | count 1 -// XFAIL: vg_leak class Instruction<bits<4> opc, string Name> { bits<4> opcode = opc; diff --git a/test/TableGen/ForeachList.td b/test/TableGen/ForeachList.td index 99b7e14c2d..9bc76e0f0c 100644 --- a/test/TableGen/ForeachList.td +++ b/test/TableGen/ForeachList.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Register<string name, int idx> { string Name = name; diff --git a/test/TableGen/ForeachLoop.td b/test/TableGen/ForeachLoop.td index 4aacc74d8a..a49a60bf26 100644 --- a/test/TableGen/ForeachLoop.td +++ b/test/TableGen/ForeachLoop.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Register<string name, int idx> { string Name = name; diff --git a/test/TableGen/LazyChange.td b/test/TableGen/LazyChange.td index 306959ebb6..919a1a7e9a 100644 --- a/test/TableGen/LazyChange.td +++ b/test/TableGen/LazyChange.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "int Y = 3" -// XFAIL: vg_leak class C { int X = 4; diff --git a/test/TableGen/LetInsideMultiClasses.td b/test/TableGen/LetInsideMultiClasses.td index cb13508e51..72f48b6d80 100644 --- a/test/TableGen/LetInsideMultiClasses.td +++ b/test/TableGen/LetInsideMultiClasses.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "bit IsDouble = 1;" | count 3 -// XFAIL: vg_leak class Instruction<bits<4> opc, string Name> { bits<4> opcode = opc; diff --git a/test/TableGen/ListOfList.td b/test/TableGen/ListOfList.td index 864401ec3c..adf9fe483e 100644 --- a/test/TableGen/ListOfList.td +++ b/test/TableGen/ListOfList.td @@ -1,7 +1,6 @@ // RUN llvm-tblgen %s | FileCheck %s // RUN: llvm-tblgen %s | grep "foo" | count 1 -// XFAIL: vg_leak class Base<string t> { string text = t; diff --git a/test/TableGen/LoLoL.td b/test/TableGen/LoLoL.td index 778c9609d1..f758e1b604 100644 --- a/test/TableGen/LoLoL.td +++ b/test/TableGen/LoLoL.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Base<list<int> v> { list<int> values = v; diff --git a/test/TableGen/MultiClass.td b/test/TableGen/MultiClass.td index 449c5d6c04..ef320cf79f 100644 --- a/test/TableGen/MultiClass.td +++ b/test/TableGen/MultiClass.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "zing = 4" | count 2 -// XFAIL: vg_leak class C1<int A, string B> { int bar = A; diff --git a/test/TableGen/MultiClassDefName.td b/test/TableGen/MultiClassDefName.td index 296e30c7c7..75d6af5b42 100644 --- a/test/TableGen/MultiClassDefName.td +++ b/test/TableGen/MultiClassDefName.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep WorldHelloCC | count 1 -// XFAIL: vg_leak class C<string n> { string name = n; diff --git a/test/TableGen/MultiClassInherit.td b/test/TableGen/MultiClassInherit.td index c768fff0b6..9d1470a661 100644 --- a/test/TableGen/MultiClassInherit.td +++ b/test/TableGen/MultiClassInherit.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "zing = 4" | count 28 -// XFAIL: vg_leak class C1<int A, string B> { int bar = A; diff --git 
a/test/TableGen/MultiPat.td b/test/TableGen/MultiPat.td index b3792777b6..b49b06c24c 100644 --- a/test/TableGen/MultiPat.td +++ b/test/TableGen/MultiPat.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class ValueType<int size, int value> { int Size = size; diff --git a/test/TableGen/NestedForeach.td b/test/TableGen/NestedForeach.td index e8c16f720d..5b63175b19 100644 --- a/test/TableGen/NestedForeach.td +++ b/test/TableGen/NestedForeach.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Droid<string series, int release, string model, int patchlevel> { string Series = series; diff --git a/test/TableGen/Paste.td b/test/TableGen/Paste.td index a7e2a5b318..33d61ccde1 100644 --- a/test/TableGen/Paste.td +++ b/test/TableGen/Paste.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Instr<int i> { int index = i; diff --git a/test/TableGen/SetTheory.td b/test/TableGen/SetTheory.td index 761332312b..f26b9e65ac 100644 --- a/test/TableGen/SetTheory.td +++ b/test/TableGen/SetTheory.td @@ -1,6 +1,5 @@ // Test evaluation of set operations in dags. // RUN: llvm-tblgen -print-sets %s | FileCheck %s -// XFAIL: vg_leak // // The -print-sets driver configures a primitive SetTheory instance that // understands these sets: diff --git a/test/TableGen/SiblingForeach.td b/test/TableGen/SiblingForeach.td index a11f6f87b4..e4c4704a5e 100644 --- a/test/TableGen/SiblingForeach.td +++ b/test/TableGen/SiblingForeach.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Set<int i = 0, int j = 0, int k = 0> { int I = i; diff --git a/test/TableGen/Slice.td b/test/TableGen/Slice.td index 6d051d77c8..cec9fb65ca 100644 --- a/test/TableGen/Slice.td +++ b/test/TableGen/Slice.td @@ -1,6 +1,5 @@ // RUN: llvm-tblgen %s | grep "\[(set" | count 2 // RUN: llvm-tblgen %s | grep "\[\]" | count 2 -// XFAIL: vg_leak class ValueType<int size, int value> { int Size = size; diff --git a/test/TableGen/TargetInstrSpec.td b/test/TableGen/TargetInstrSpec.td index 64b706dc6a..bf2d257c5d 100644 --- a/test/TableGen/TargetInstrSpec.td +++ b/test/TableGen/TargetInstrSpec.td @@ -1,6 +1,5 @@ // RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_pd VR128:$src1, VR128:$src2))\]' | count 1 // RUN: llvm-tblgen %s | grep '\[(set VR128:$dst, (int_x86_sse2_add_ps VR128:$src1, VR128:$src2))\]' | count 1 -// XFAIL: vg_leak class ValueType<int size, int value> { int Size = size; diff --git a/test/TableGen/TwoLevelName.td b/test/TableGen/TwoLevelName.td index 9c502f4755..e88696217f 100644 --- a/test/TableGen/TwoLevelName.td +++ b/test/TableGen/TwoLevelName.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Type<string name, int length, int width> { string Name = name; diff --git a/test/TableGen/cast.td b/test/TableGen/cast.td index 7948aff795..b9e4b37535 100644 --- a/test/TableGen/cast.td +++ b/test/TableGen/cast.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "add_ps" | count 3 -// XFAIL: vg_leak class ValueType<int size, int value> { int Size = size; diff --git a/test/TableGen/defmclass.td b/test/TableGen/defmclass.td index 80f03b3194..6198c000fd 100644 --- a/test/TableGen/defmclass.td +++ b/test/TableGen/defmclass.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class XD { bits<4> Prefix = 11; } // CHECK: Prefix = { 1, 1, 0, 0 }; diff --git a/test/TableGen/eq.td b/test/TableGen/eq.td index f8daf880b9..fc3ad424e2 100644 --- a/test/TableGen/eq.td +++ b/test/TableGen/eq.td @@ 
-1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak // CHECK: Value = 0 // CHECK: Value = 1 diff --git a/test/TableGen/eqbit.td b/test/TableGen/eqbit.td index 1d58fa0c19..b77b1a26df 100644 --- a/test/TableGen/eqbit.td +++ b/test/TableGen/eqbit.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak // CHECK: a = 6 // CHECK: a = 5 diff --git a/test/TableGen/foreach.td b/test/TableGen/foreach.td index 902af25237..7b7c199728 100644 --- a/test/TableGen/foreach.td +++ b/test/TableGen/foreach.td @@ -1,7 +1,6 @@ // RUN: llvm-tblgen %s | grep 'Jr' | count 2 // RUN: llvm-tblgen %s | grep 'Sr' | count 2 // RUN: llvm-tblgen %s | grep '"NAME"' | count 1 -// XFAIL: vg_leak // Variables for foreach class decls { diff --git a/test/TableGen/if.td b/test/TableGen/if.td index 1d8d62329a..e4df74f368 100644 --- a/test/TableGen/if.td +++ b/test/TableGen/if.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak // Support for an `!if' operator as part of a `let' statement. // CHECK: class C diff --git a/test/TableGen/ifbit.td b/test/TableGen/ifbit.td index 88f575e9ac..e3341219ff 100644 --- a/test/TableGen/ifbit.td +++ b/test/TableGen/ifbit.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak // CHECK: a = 6 // CHECK: a = 5 diff --git a/test/TableGen/lisp.td b/test/TableGen/lisp.td index dd85ddc67c..efe00022f5 100644 --- a/test/TableGen/lisp.td +++ b/test/TableGen/lisp.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep "" -// XFAIL: vg_leak class List<list<string> n> { list<string> names = n; diff --git a/test/TableGen/list-element-bitref.td b/test/TableGen/list-element-bitref.td index 5f3e3dabf4..7db3d31167 100644 --- a/test/TableGen/list-element-bitref.td +++ b/test/TableGen/list-element-bitref.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class C<list<bits<8>> L> { bits<2> V0 = L[0]{1-0}; @@ -10,6 +9,6 @@ class C<list<bits<8>> L> { def c0 : C<[0b0101, 0b1010]>; // CHECK: def c0 -// CHECk-NEXT: bits<2> V0 = { 0, 1 }; -// CHECk-NEXT: bits<2> V1 = { 1, 0 }; -// CHECk-NEXT: string V2 = "Odd"; +// CHECK-NEXT: bits<2> V0 = { 0, 1 }; +// CHECK-NEXT: bits<2> V1 = { 1, 0 }; +// CHECK-NEXT: string V2 = "Odd"; diff --git a/test/TableGen/pr8330.td b/test/TableGen/pr8330.td index 7779b635e3..e672014789 100644 --- a/test/TableGen/pr8330.td +++ b/test/TableGen/pr8330.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Or4<bits<8> Val> { bits<8> V = {Val{7}, Val{6}, Val{5}, Val{4}, Val{3}, 1, Val{1}, Val{0} }; diff --git a/test/TableGen/strconcat.td b/test/TableGen/strconcat.td index 85ee831b4d..0173c49365 100644 --- a/test/TableGen/strconcat.td +++ b/test/TableGen/strconcat.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | grep fufoo -// XFAIL: vg_leak class Y<string S> { string T = !strconcat(S, "foo"); diff --git a/test/TableGen/subst.td b/test/TableGen/subst.td index 850ac38465..e265b44cf3 100644 --- a/test/TableGen/subst.td +++ b/test/TableGen/subst.td @@ -4,7 +4,6 @@ // RUN: llvm-tblgen %s | grep "LAST" | count 1 // RUN: llvm-tblgen %s | grep "TVAR" | count 2 // RUN: llvm-tblgen %s | grep "Bogus" | count 1 -// XFAIL: vg_leak class Honorific<string t> { string honorific = t; diff --git a/test/TableGen/subst2.td b/test/TableGen/subst2.td index 7c007f7db1..ce7307703d 100644 --- a/test/TableGen/subst2.td +++ b/test/TableGen/subst2.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak // CHECK: No subst // CHECK: No foo // CHECK: RECURSE foo diff --git 
a/test/TableGen/usevalname.td b/test/TableGen/usevalname.td index d85b98ac33..a80ba12869 100644 --- a/test/TableGen/usevalname.td +++ b/test/TableGen/usevalname.td @@ -1,5 +1,4 @@ // RUN: llvm-tblgen %s | FileCheck %s -// XFAIL: vg_leak class Instr<list<dag> pat> { list<dag> Pattern = pat; diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll index 72fa819d1c..f470ed88bb 100644 --- a/test/Transforms/GVN/rle.ll +++ b/test/Transforms/GVN/rle.ll @@ -254,14 +254,11 @@ Cont: %A = load i8* %P3 ret i8 %A -;; FIXME: This is disabled because this caused a miscompile in the llvm-gcc -;; bootstrap, see r82411 -; -; HECK: @coerce_mustalias_nonlocal1 -; HECK: Cont: -; HECK: %A = phi i8 [ -; HECK-NOT: load -; HECK: ret i8 %A +; CHECK: @coerce_mustalias_nonlocal1 +; CHECK: Cont: +; CHECK: %A = phi i8 [ +; CHECK-NOT: load +; CHECK: ret i8 %A } diff --git a/test/Transforms/Inline/inline_minisize.ll b/test/Transforms/Inline/inline_minisize.ll new file mode 100644 index 0000000000..ae0b08be32 --- /dev/null +++ b/test/Transforms/Inline/inline_minisize.ll @@ -0,0 +1,232 @@ +; RUN: opt %s -O2 -S -o - | FileCheck %s + +@data = common global i32* null, align 8 + +define i32 @fct1(i32 %a) nounwind uwtable ssp { +entry: + %a.addr = alloca i32, align 4 + %res = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %tmp = load i32* %a.addr, align 4 + %idxprom = sext i32 %tmp to i64 + %tmp1 = load i32** @data, align 8 + %arrayidx = getelementptr inbounds i32* %tmp1, i64 %idxprom + %tmp2 = load i32* %arrayidx, align 4 + %tmp3 = load i32* %a.addr, align 4 + %add = add nsw i32 %tmp3, 1 + %idxprom1 = sext i32 %add to i64 + %tmp4 = load i32** @data, align 8 + %arrayidx2 = getelementptr inbounds i32* %tmp4, i64 %idxprom1 + %tmp5 = load i32* %arrayidx2, align 4 + %mul = mul nsw i32 %tmp2, %tmp5 + store i32 %mul, i32* %res, align 4 + store i32 0, i32* %i, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %tmp6 = load i32* %i, align 4 + %tmp7 = load i32* %res, align 4 + %cmp = icmp slt i32 %tmp6, %tmp7 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp8 = load i32* %i, align 4 + %idxprom3 = sext i32 %tmp8 to i64 + %tmp9 = load i32** @data, align 8 + %arrayidx4 = getelementptr inbounds i32* %tmp9, i64 %idxprom3 + call void @fct0(i32* %arrayidx4) + br label %for.inc + +for.inc: ; preds = %for.body + %tmp10 = load i32* %i, align 4 + %inc = add nsw i32 %tmp10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + store i32 0, i32* %i, align 4 + br label %for.cond5 + +for.cond5: ; preds = %for.inc10, %for.end + %tmp11 = load i32* %i, align 4 + %tmp12 = load i32* %res, align 4 + %cmp6 = icmp slt i32 %tmp11, %tmp12 + br i1 %cmp6, label %for.body7, label %for.end12 + +for.body7: ; preds = %for.cond5 + %tmp13 = load i32* %i, align 4 + %idxprom8 = sext i32 %tmp13 to i64 + %tmp14 = load i32** @data, align 8 + %arrayidx9 = getelementptr inbounds i32* %tmp14, i64 %idxprom8 + call void @fct0(i32* %arrayidx9) + br label %for.inc10 + +for.inc10: ; preds = %for.body7 + %tmp15 = load i32* %i, align 4 + %inc11 = add nsw i32 %tmp15, 1 + store i32 %inc11, i32* %i, align 4 + br label %for.cond5 + +for.end12: ; preds = %for.cond5 + store i32 0, i32* %i, align 4 + br label %for.cond13 + +for.cond13: ; preds = %for.inc18, %for.end12 + %tmp16 = load i32* %i, align 4 + %tmp17 = load i32* %res, align 4 + %cmp14 = icmp slt i32 %tmp16, %tmp17 + br i1 %cmp14, label %for.body15, label 
%for.end20 + +for.body15: ; preds = %for.cond13 + %tmp18 = load i32* %i, align 4 + %idxprom16 = sext i32 %tmp18 to i64 + %tmp19 = load i32** @data, align 8 + %arrayidx17 = getelementptr inbounds i32* %tmp19, i64 %idxprom16 + call void @fct0(i32* %arrayidx17) + br label %for.inc18 + +for.inc18: ; preds = %for.body15 + %tmp20 = load i32* %i, align 4 + %inc19 = add nsw i32 %tmp20, 1 + store i32 %inc19, i32* %i, align 4 + br label %for.cond13 + +for.end20: ; preds = %for.cond13 + %tmp21 = load i32* %res, align 4 + ret i32 %tmp21 +} + +declare void @fct0(i32*) + +define i32 @fct2(i32 %a) nounwind uwtable inlinehint ssp { +entry: + %a.addr = alloca i32, align 4 + %res = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 %a, i32* %a.addr, align 4 + %tmp = load i32* %a.addr, align 4 + %shl = shl i32 %tmp, 1 + %idxprom = sext i32 %shl to i64 + %tmp1 = load i32** @data, align 8 + %arrayidx = getelementptr inbounds i32* %tmp1, i64 %idxprom + %tmp2 = load i32* %arrayidx, align 4 + %tmp3 = load i32* %a.addr, align 4 + %shl1 = shl i32 %tmp3, 1 + %add = add nsw i32 %shl1, 13 + %idxprom2 = sext i32 %add to i64 + %tmp4 = load i32** @data, align 8 + %arrayidx3 = getelementptr inbounds i32* %tmp4, i64 %idxprom2 + %tmp5 = load i32* %arrayidx3, align 4 + %mul = mul nsw i32 %tmp2, %tmp5 + store i32 %mul, i32* %res, align 4 + store i32 0, i32* %i, align 4 + store i32 0, i32* %i, align 4 + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %tmp6 = load i32* %i, align 4 + %tmp7 = load i32* %res, align 4 + %cmp = icmp slt i32 %tmp6, %tmp7 + br i1 %cmp, label %for.body, label %for.end + +for.body: ; preds = %for.cond + %tmp8 = load i32* %i, align 4 + %idxprom4 = sext i32 %tmp8 to i64 + %tmp9 = load i32** @data, align 8 + %arrayidx5 = getelementptr inbounds i32* %tmp9, i64 %idxprom4 + call void @fct0(i32* %arrayidx5) + br label %for.inc + +for.inc: ; preds = %for.body + %tmp10 = load i32* %i, align 4 + %inc = add nsw i32 %tmp10, 1 + store i32 %inc, i32* %i, align 4 + br label %for.cond + +for.end: ; preds = %for.cond + store i32 0, i32* %i, align 4 + br label %for.cond6 + +for.cond6: ; preds = %for.inc11, %for.end + %tmp11 = load i32* %i, align 4 + %tmp12 = load i32* %res, align 4 + %cmp7 = icmp slt i32 %tmp11, %tmp12 + br i1 %cmp7, label %for.body8, label %for.end13 + +for.body8: ; preds = %for.cond6 + %tmp13 = load i32* %i, align 4 + %idxprom9 = sext i32 %tmp13 to i64 + %tmp14 = load i32** @data, align 8 + %arrayidx10 = getelementptr inbounds i32* %tmp14, i64 %idxprom9 + call void @fct0(i32* %arrayidx10) + br label %for.inc11 + +for.inc11: ; preds = %for.body8 + %tmp15 = load i32* %i, align 4 + %inc12 = add nsw i32 %tmp15, 1 + store i32 %inc12, i32* %i, align 4 + br label %for.cond6 + +for.end13: ; preds = %for.cond6 + store i32 0, i32* %i, align 4 + br label %for.cond14 + +for.cond14: ; preds = %for.inc19, %for.end13 + %tmp16 = load i32* %i, align 4 + %tmp17 = load i32* %res, align 4 + %cmp15 = icmp slt i32 %tmp16, %tmp17 + br i1 %cmp15, label %for.body16, label %for.end21 + +for.body16: ; preds = %for.cond14 + %tmp18 = load i32* %i, align 4 + %idxprom17 = sext i32 %tmp18 to i64 + %tmp19 = load i32** @data, align 8 + %arrayidx18 = getelementptr inbounds i32* %tmp19, i64 %idxprom17 + call void @fct0(i32* %arrayidx18) + br label %for.inc19 + +for.inc19: ; preds = %for.body16 + %tmp20 = load i32* %i, align 4 + %inc20 = add nsw i32 %tmp20, 1 + store i32 %inc20, i32* %i, align 4 + br label %for.cond14 + +for.end21: ; preds = %for.cond14 + %tmp21 = load i32* %res, align 4 + ret i32 %tmp21 +} + +define 
i32 @fct3(i32 %c) nounwind uwtable ssp { +entry: + ;CHECK: @fct3 + ;CHECK: call i32 @fct1 + ; The inline keyword gives a sufficient benefit to inline fct2 + ;CHECK-NOT: call i32 @fct2 + %c.addr = alloca i32, align 4 + store i32 %c, i32* %c.addr, align 4 + %tmp = load i32* %c.addr, align 4 + %call = call i32 @fct1(i32 %tmp) + %tmp1 = load i32* %c.addr, align 4 + %call1 = call i32 @fct2(i32 %tmp1) + %add = add nsw i32 %call, %call1 + ret i32 %add +} + +define i32 @fct4(i32 %c) minsize nounwind uwtable ssp { +entry: + ;CHECK: @fct4 + ;CHECK: call i32 @fct1 + ; With Oz (minsize attribute), the benefit of inlining fct2 + ; is the same as for fct1, thus fct2 is not inlined + ;CHECK: call i32 @fct2 + %c.addr = alloca i32, align 4 + store i32 %c, i32* %c.addr, align 4 + %tmp = load i32* %c.addr, align 4 + %call = call i32 @fct1(i32 %tmp) + %tmp1 = load i32* %c.addr, align 4 + %call1 = call i32 @fct2(i32 %tmp1) + %add = add nsw i32 %call, %call1 + ret i32 %add +}
\ No newline at end of file diff --git a/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll b/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll new file mode 100644 index 0000000000..fc29b095e5 --- /dev/null +++ b/test/Transforms/InstCombine/2012-12-14-simp-vgep.ll @@ -0,0 +1,10 @@ +; RUN: opt < %s -instcombine -S + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" + +define <4 x i32> @foo(<4 x i32*>* %in) { + %t17 = load <4 x i32*>* %in, align 8 + %t18 = icmp eq <4 x i32*> %t17, zeroinitializer + %t19 = zext <4 x i1> %t18 to <4 x i32> + ret <4 x i32> %t19 +} diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll index b4eb69d436..de738bb7c0 100644 --- a/test/Transforms/InstCombine/cast.ll +++ b/test/Transforms/InstCombine/cast.ll @@ -473,14 +473,12 @@ define i64 @test51(i64 %A, i1 %cond) { %F = sext i32 %E to i64 ret i64 %F ; CHECK: @test51 - -; FIXME: disabled, see PR5997 -; HECK-NEXT: %C = and i64 %A, 4294967294 -; HECK-NEXT: %D = or i64 %A, 1 -; HECK-NEXT: %E = select i1 %cond, i64 %C, i64 %D -; HECK-NEXT: %sext = shl i64 %E, 32 -; HECK-NEXT: %F = ashr i64 %sext, 32 -; HECK-NEXT: ret i64 %F +; CHECK-NEXT: %C = and i64 %A, 4294967294 +; CHECK-NEXT: %D = or i64 %A, 1 +; CHECK-NEXT: %E = select i1 %cond, i64 %C, i64 %D +; CHECK-NEXT: %sext = shl i64 %E, 32 +; CHECK-NEXT: %F = ashr exact i64 %sext, 32 +; CHECK-NEXT: ret i64 %F } define i32 @test52(i64 %A) { diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll index b6a15677bb..0b87cd95d9 100644 --- a/test/Transforms/InstCombine/fast-math.ll +++ b/test/Transforms/InstCombine/fast-math.ll @@ -3,19 +3,17 @@ ; testing-case "float fold(float a) { return 1.2f * a * 2.3f; }" ; 1.2f and 2.3f is supposed to be fold. define float @fold(float %a) { -fold: %mul = fmul fast float %a, 0x3FF3333340000000 %mul1 = fmul fast float %mul, 0x4002666660000000 ret float %mul1 -; CHECK: fold +; CHECK: @fold ; CHECK: fmul float %a, 0x4006147AE0000000 } ; Same testing-case as the one used in fold() except that the operators have ; fixed FP mode. define float @notfold(float %a) { -notfold: -; CHECK: notfold +; CHECK: @notfold ; CHECK: %mul = fmul fast float %a, 0x3FF3333340000000 %mul = fmul fast float %a, 0x3FF3333340000000 %mul1 = fmul float %mul, 0x4002666660000000 @@ -23,10 +21,40 @@ notfold: } define float @fold2(float %a) { -fold2: -; CHECK: fold2 +; CHECK: @fold2 ; CHECK: fmul float %a, 0x4006147AE0000000 %mul = fmul float %a, 0x3FF3333340000000 %mul1 = fmul fast float %mul, 0x4002666660000000 ret float %mul1 } + +; rdar://12753946: x * cond ? 1.0 : 0.0 => cond ? 
x : 0.0 +define double @select1(i32 %cond, double %x, double %y) { + %tobool = icmp ne i32 %cond, 0 + %cond1 = select i1 %tobool, double 1.000000e+00, double 0.000000e+00 + %mul = fmul nnan nsz double %cond1, %x + %add = fadd double %mul, %y + ret double %add +; CHECK: @select1 +; CHECK: select i1 %tobool, double %x, double 0.000000e+00 +} + +define double @select2(i32 %cond, double %x, double %y) { + %tobool = icmp ne i32 %cond, 0 + %cond1 = select i1 %tobool, double 0.000000e+00, double 1.000000e+00 + %mul = fmul nnan nsz double %cond1, %x + %add = fadd double %mul, %y + ret double %add +; CHECK: @select2 +; CHECK: select i1 %tobool, double 0.000000e+00, double %x +} + +define double @select3(i32 %cond, double %x, double %y) { + %tobool = icmp ne i32 %cond, 0 + %cond1 = select i1 %tobool, double 0.000000e+00, double 2.000000e+00 + %mul = fmul nnan nsz double %cond1, %x + %add = fadd double %mul, %y + ret double %add +; CHECK: @select3 +; CHECK: fmul nnan nsz double %cond1, %x +} diff --git a/test/Transforms/InstCombine/fold-phi.ll b/test/Transforms/InstCombine/fold-phi.ll new file mode 100644 index 0000000000..bd01d58aa5 --- /dev/null +++ b/test/Transforms/InstCombine/fold-phi.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -instcombine -S | FileCheck %s + +; CHECK: no_crash +define float @no_crash(float %a) nounwind { +entry: + br label %for.body + +for.body: + %sum.057 = phi float [ 0.000000e+00, %entry ], [ %add5, %bb0 ] + %add5 = fadd float %sum.057, %a ; PR14592 + br i1 undef, label %bb0, label %end + +bb0: + br label %for.body + +end: + ret float %add5 +} + +; CHECK: fold_phi +define float @fold_phi(float %a) nounwind { +entry: + br label %for.body + +for.body: +; CHECK: phi float +; CHECK-NEXT: br i1 undef + %sum.057 = phi float [ 0.000000e+00, %entry ], [ %add5, %bb0 ] + %add5 = fadd float %sum.057, 1.0 ;; Should be moved to the latch! 
+  br i1 undef, label %bb0, label %end
+
+; CHECK: bb0:
+bb0:
+; CHECK: fadd float
+  br label %for.body
+
+end:
+  ret float %add5
+}
diff --git a/test/Transforms/InstCombine/shift.ll b/test/Transforms/InstCombine/shift.ll
index 32867761a3..41f8aa9ee8 100644
--- a/test/Transforms/InstCombine/shift.ll
+++ b/test/Transforms/InstCombine/shift.ll
@@ -735,3 +735,13 @@ define i32 @test61(i32 %x) {
 ; CHECK: @test61
 ; CHECK: ashr i32 %x, 4
 }
+
+; propagate "exact" trait
+define i32 @test62(i32 %x) {
+  %shr = ashr exact i32 %x, 4
+  %shl = shl i32 %shr, 1
+  %or = or i32 %shl, 1
+  ret i32 %or
+; CHECK: @test62
+; CHECK: ashr exact i32 %x, 3
+}
diff --git a/test/Transforms/InstSimplify/fast-math.ll b/test/Transforms/InstSimplify/fast-math.ll
index e4b3ea306a..154b967397 100644
--- a/test/Transforms/InstSimplify/fast-math.ll
+++ b/test/Transforms/InstSimplify/fast-math.ll
@@ -33,3 +33,75 @@ define float @no_mul_zero_3(float %a) {
 ; CHECK: ret float %b
   ret float %b
 }
+
+; fadd [nnan ninf] X, (fsub [nnan ninf] 0, X) ==> 0
+; where nnan and ninf have to occur at least once somewhere in this
+; expression
+; CHECK: fadd_fsub_0
+define float @fadd_fsub_0(float %a) {
+; X + -X ==> 0
+  %t1 = fsub nnan ninf float 0.0, %a
+  %zero1 = fadd nnan ninf float %t1, %a
+
+  %t2 = fsub nnan float 0.0, %a
+  %zero2 = fadd ninf float %t2, %a
+
+  %t3 = fsub nnan ninf float 0.0, %a
+  %zero3 = fadd float %t3, %a
+
+  %t4 = fsub float 0.0, %a
+  %zero4 = fadd nnan ninf float %t4, %a
+
+; Don't fold this
+; CHECK: %nofold = fsub float 0.0
+  %nofold = fsub float 0.0, %a
+; CHECK: %no_zero = fadd nnan float %nofold, %a
+  %no_zero = fadd nnan float %nofold, %a
+
+; Coalesce the folded zeros
+  %zero5 = fadd float %zero1, %zero2
+  %zero6 = fadd float %zero3, %zero4
+  %zero7 = fadd float %zero5, %zero6
+
+; Should get folded
+  %ret = fadd nsz float %no_zero, %zero7
+
+; CHECK: ret float %no_zero
+  ret float %ret
+}
+
+; fsub nnan ninf x, x ==> 0.0
+; CHECK: @fsub_x_x
+define float @fsub_x_x(float %a) {
+; X - X ==> 0
+  %zero1 = fsub nnan ninf float %a, %a
+
+; Don't fold
+; CHECK: %no_zero1 = fsub
+  %no_zero1 = fsub ninf float %a, %a
+; CHECK: %no_zero2 = fsub
+  %no_zero2 = fsub nnan float %a, %a
+; CHECK: %no_zero = fadd
+  %no_zero = fadd float %no_zero1, %no_zero2
+
+; Should get folded
+  %ret = fadd nsz float %no_zero, %zero1
+
+; CHECK: ret float %no_zero
+  ret float %ret
+}
+
+; fadd nsz X, 0 ==> X
+; CHECK: @nofold_fadd_x_0
+define float @nofold_fadd_x_0(float %a) {
+; Don't fold
+; CHECK: %no_zero1 = fadd
+  %no_zero1 = fadd ninf float %a, 0.0
+; CHECK: %no_zero2 = fadd
+  %no_zero2 = fadd nnan float %a, 0.0
+; CHECK: %no_zero = fadd
+  %no_zero = fadd float %no_zero1, %no_zero2
+
+; CHECK: ret float %no_zero
+  ret float %no_zero
+}
diff --git a/test/Transforms/InstSimplify/floating-point-arithmetic.ll b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
new file mode 100644
index 0000000000..f9c364cade
--- /dev/null
+++ b/test/Transforms/InstSimplify/floating-point-arithmetic.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+; fsub 0, (fsub 0, X) ==> X
+; CHECK: @fsub_0_0_x
+define float @fsub_0_0_x(float %a) {
+  %t1 = fsub float -0.0, %a
+  %ret = fsub float -0.0, %t1
+
+; CHECK: ret float %a
+  ret float %ret
+}
+
+; fsub X, 0 ==> X
+; CHECK: @fsub_x_0
+define float @fsub_x_0(float %a) {
+  %ret = fsub float %a, 0.0
+; CHECK: ret float %a
+  ret float %ret
+}
+
+; fadd X, -0 ==> X
+; CHECK: @fadd_x_n0
+define float @fadd_x_n0(float %a) {
+  %ret = fadd float %a, -0.0
+; CHECK: ret float %a
+  ret float %ret
+}
+
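+; A note on the zero constants used above: per IEEE-754, fsub -0.0, X is an
+; exact negation, so fsub -0.0, (fsub -0.0, X) returns X for every input,
+; including X = -0.0. Likewise fsub X, 0.0 and fadd X, -0.0 are exact
+; identities. The mirrored forms fadd X, 0.0 and fsub X, -0.0 both produce
+; +0.0 when X is -0.0, so they may only be simplified under the nsz flag
+; (see nofold_fadd_x_0 in fast-math.ll above).
+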
+; fmul X, 1.0 ==> X +; CHECK: @fmul_X_1 +define double @fmul_X_1(double %a) { + %b = fmul double 1.000000e+00, %a ; <double> [#uses=1] + ; CHECK: ret double %a + ret double %b +} diff --git a/test/Transforms/LoopVectorize/12-12-11-if-conv.ll b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll new file mode 100644 index 0000000000..7919012147 --- /dev/null +++ b/test/Transforms/LoopVectorize/12-12-11-if-conv.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -enable-if-conversion -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @foo +;CHECK: icmp eq <4 x i32> +;CHECK: select <4 x i1> +;CHECK: ret i32 +define i32 @foo(i32 %x, i32 %t, i32* nocapture %A) nounwind uwtable ssp { +entry: + %cmp10 = icmp sgt i32 %x, 0 + br i1 %cmp10, label %for.body, label %for.end + +for.body: ; preds = %entry, %if.end + %indvars.iv = phi i64 [ %indvars.iv.next, %if.end ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4, !tbaa !0 + %tobool = icmp eq i32 %0, 0 + br i1 %tobool, label %if.end, label %if.then + +if.then: ; preds = %for.body + %1 = add nsw i64 %indvars.iv, 45 + %2 = trunc i64 %indvars.iv to i32 + %mul = mul nsw i32 %2, %t + %3 = trunc i64 %1 to i32 + %add1 = add nsw i32 %3, %mul + br label %if.end + +if.end: ; preds = %for.body, %if.then + %z.0 = phi i32 [ %add1, %if.then ], [ 9, %for.body ] + store i32 %z.0, i32* %arrayidx, align 4, !tbaa !0 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %x + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %if.end, %entry + ret i32 undef +} + +!0 = metadata !{metadata !"int", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA"} diff --git a/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/test/Transforms/LoopVectorize/X86/conversion-cost.ll index 8f1bb545fa..60c742e220 100644 --- a/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ;CHECK: @conversion_cost1 -;CHECK: store <2 x i8> +;CHECK: store <8 x i8> ;CHECK: ret define i32 @conversion_cost1(i32 %n, i8* nocapture %A, float* nocapture %B) nounwind uwtable ssp { %1 = icmp sgt i32 %n, 3 diff --git a/test/Transforms/LoopVectorize/cast-induction.ll b/test/Transforms/LoopVectorize/cast-induction.ll new file mode 100644 index 0000000000..fc8281ba8a --- /dev/null +++ b/test/Transforms/LoopVectorize/cast-induction.ll @@ -0,0 +1,30 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +; rdar://problem/12848162 + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@a = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: @example12 +;CHECK: trunc i64 +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example12() nounwind uwtable ssp { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ 
%indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + %3 = trunc i64 %indvars.iv to i32 + store i32 %3, i32* %2, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 1024 + br i1 %exitcond, label %4, label %1 + +; <label>:4 ; preds = %1 + ret void +} + diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll index 26902eba9e..af59963899 100644 --- a/test/Transforms/LoopVectorize/cpp-new-array.ll +++ b/test/Transforms/LoopVectorize/cpp-new-array.ll @@ -4,7 +4,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3 target triple = "x86_64-apple-macosx10.8.0" ;CHECK: @cpp_new_arrays -;CHECK: insertelement <4 x i32> +;CHECK: sext i32 ;CHECK: load <4 x float> ;CHECK: fadd <4 x float> ;CHECK: ret i32 diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll index f1bf6cb6d8..4fabc8e2d3 100644 --- a/test/Transforms/LoopVectorize/gcc-examples.ll +++ b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -329,7 +329,7 @@ define void @example11() nounwind uwtable ssp { } ;CHECK: @example12 -;CHECK: trunc <4 x i64> +;CHECK: trunc i64 ;CHECK: store <4 x i32> ;CHECK: ret void define void @example12() nounwind uwtable ssp { diff --git a/test/Transforms/LoopVectorize/if-conv-crash.ll b/test/Transforms/LoopVectorize/if-conv-crash.ll new file mode 100644 index 0000000000..c82df5a9be --- /dev/null +++ b/test/Transforms/LoopVectorize/if-conv-crash.ll @@ -0,0 +1,39 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -enable-if-conversion + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +define fastcc void @DD_dump() nounwind uwtable ssp { +entry: + br i1 undef, label %lor.lhs.false, label %if.end25 + +lor.lhs.false: ; preds = %entry + br i1 undef, label %if.end21, label %if.else + +if.else: ; preds = %lor.lhs.false + br i1 undef, label %num_q.exit, label %while.body.i.preheader + +while.body.i.preheader: ; preds = %if.else + br label %while.body.i + +while.body.i: ; preds = %if.end.i, %while.body.i.preheader + switch i8 undef, label %if.end.i [ + i8 39, label %if.then.i + i8 92, label %if.then.i + ] + +if.then.i: ; preds = %while.body.i, %while.body.i + br label %if.end.i + +if.end.i: ; preds = %if.then.i, %while.body.i + br i1 undef, label %num_q.exit, label %while.body.i + +num_q.exit: ; preds = %if.end.i, %if.else + unreachable + +if.end21: ; preds = %lor.lhs.false + unreachable + +if.end25: ; preds = %entry + ret void +} diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll index b31bceb50d..a25845f1c1 100644 --- a/test/Transforms/LoopVectorize/induction_plus.ll +++ b/test/Transforms/LoopVectorize/induction_plus.ll @@ -6,8 +6,7 @@ target triple = "x86_64-apple-macosx10.8.0" @array = common global [1024 x i32] zeroinitializer, align 16 ;CHECK: @array_at_plus_one -;CHECK: add <4 x i64> -;CHECK: trunc <4 x i64> +;CHECK: trunc i64 ;CHECK: add i64 %index, 12 ;CHECK: ret i32 define i32 @array_at_plus_one(i32 %n) nounwind uwtable ssp { diff --git a/test/Transforms/LoopVectorize/small-size.ll b/test/Transforms/LoopVectorize/small-size.ll new file mode 100644 index 0000000000..deb0bb2f87 --- /dev/null +++ 
b/test/Transforms/LoopVectorize/small-size.ll @@ -0,0 +1,170 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@b = common global [2048 x i32] zeroinitializer, align 16 +@c = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 +@G = common global [32 x [1024 x i32]] zeroinitializer, align 16 +@ub = common global [1024 x i32] zeroinitializer, align 16 +@uc = common global [1024 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@fa = common global [1024 x float] zeroinitializer, align 16 +@fb = common global [1024 x float] zeroinitializer, align 16 +@ic = common global [1024 x i32] zeroinitializer, align 16 +@da = common global [1024 x float] zeroinitializer, align 16 +@db = common global [1024 x float] zeroinitializer, align 16 +@dc = common global [1024 x float] zeroinitializer, align 16 +@dd = common global [1024 x float] zeroinitializer, align 16 +@dj = common global [1024 x i32] zeroinitializer, align 16 + +; We can optimize this test without a tail. +;CHECK: @example1 +;CHECK: load <4 x i32> +;CHECK: add nsw <4 x i32> +;CHECK: store <4 x i32> +;CHECK: ret void +define void @example1() optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ] + %2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %3 = load i32* %2, align 4 + %4 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %5 = load i32* %4, align 4 + %6 = add nsw i32 %5, %3 + %7 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %6, i32* %7, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + +; Can't vectorize in 'optsize' mode because we need a tail. 
+;CHECK: @example2 +;CHECK-NOT: store <4 x i32> +;CHECK: ret void +define void @example2(i32 %n, i32 %x) optsize { + %1 = icmp sgt i32 %n, 0 + br i1 %1, label %.lr.ph5, label %.preheader + +..preheader_crit_edge: ; preds = %.lr.ph5 + %phitmp = sext i32 %n to i64 + br label %.preheader + +.preheader: ; preds = %..preheader_crit_edge, %0 + %i.0.lcssa = phi i64 [ %phitmp, %..preheader_crit_edge ], [ 0, %0 ] + %2 = icmp eq i32 %n, 0 + br i1 %2, label %._crit_edge, label %.lr.ph + +.lr.ph5: ; preds = %0, %.lr.ph5 + %indvars.iv6 = phi i64 [ %indvars.iv.next7, %.lr.ph5 ], [ 0, %0 ] + %3 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv6 + store i32 %x, i32* %3, align 4 + %indvars.iv.next7 = add i64 %indvars.iv6, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next7 to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %..preheader_crit_edge, label %.lr.ph5 + +.lr.ph: ; preds = %.preheader, %.lr.ph + %indvars.iv = phi i64 [ %indvars.iv.next, %.lr.ph ], [ %i.0.lcssa, %.preheader ] + %.02 = phi i32 [ %4, %.lr.ph ], [ %n, %.preheader ] + %4 = add nsw i32 %.02, -1 + %5 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %indvars.iv + %6 = load i32* %5, align 4 + %7 = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %indvars.iv + %8 = load i32* %7, align 4 + %9 = and i32 %8, %6 + %10 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %indvars.iv + store i32 %9, i32* %10, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %11 = icmp eq i32 %4, 0 + br i1 %11, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %.preheader + ret void +} + +; N is unknown, we need a tail. Can't vectorize. +;CHECK: @example3 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example3(i32 %n, i32* noalias nocapture %p, i32* noalias nocapture %q) optsize { + %1 = icmp eq i32 %n, 0 + br i1 %1, label %._crit_edge, label %.lr.ph + +.lr.ph: ; preds = %0, %.lr.ph + %.05 = phi i32 [ %2, %.lr.ph ], [ %n, %0 ] + %.014 = phi i32* [ %5, %.lr.ph ], [ %p, %0 ] + %.023 = phi i32* [ %3, %.lr.ph ], [ %q, %0 ] + %2 = add nsw i32 %.05, -1 + %3 = getelementptr inbounds i32* %.023, i64 1 + %4 = load i32* %.023, align 16 + %5 = getelementptr inbounds i32* %.014, i64 1 + store i32 %4, i32* %.014, align 16 + %6 = icmp eq i32 %2, 0 + br i1 %6, label %._crit_edge, label %.lr.ph + +._crit_edge: ; preds = %.lr.ph, %0 + ret void +} + + +; We can't vectorize this one because we need a runtime ptr check. +;CHECK: @example23 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @example23(i16* nocapture %src, i32* nocapture %dst) optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + + +; We CAN vectorize this example because the pointers are marked as noalias. 
+;CHECK: @example23b +;CHECK: <4 x i32> +;CHECK: ret void +define void @example23b(i16* noalias nocapture %src, i32* noalias nocapture %dst) optsize { + br label %1 + +; <label>:1 ; preds = %1, %0 + %.04 = phi i16* [ %src, %0 ], [ %2, %1 ] + %.013 = phi i32* [ %dst, %0 ], [ %6, %1 ] + %i.02 = phi i32 [ 0, %0 ], [ %7, %1 ] + %2 = getelementptr inbounds i16* %.04, i64 1 + %3 = load i16* %.04, align 2 + %4 = zext i16 %3 to i32 + %5 = shl nuw nsw i32 %4, 7 + %6 = getelementptr inbounds i32* %.013, i64 1 + store i32 %5, i32* %.013, align 4 + %7 = add nsw i32 %i.02, 1 + %exitcond = icmp eq i32 %7, 256 + br i1 %exitcond, label %8, label %1 + +; <label>:8 ; preds = %1 + ret void +} + + diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index 9fe926ee2c..efc01acd59 100644 --- a/test/Transforms/SROA/basictest.ll +++ b/test/Transforms/SROA/basictest.ll @@ -575,8 +575,8 @@ entry: store i8 0, i8* %a2ptr %aiptr = bitcast [3 x i8]* %a to i24* %ai = load i24* %aiptr -; CHCEK-NOT: store -; CHCEK-NOT: load +; CHECK-NOT: store +; CHECK-NOT: load ; CHECK: %[[ext2:.*]] = zext i8 0 to i24 ; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16 ; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535 @@ -597,8 +597,8 @@ entry: %b1 = load i8* %b1ptr %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2 %b2 = load i8* %b2ptr -; CHCEK-NOT: store -; CHCEK-NOT: load +; CHECK-NOT: store +; CHECK-NOT: load ; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8 ; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 @@ -1176,3 +1176,50 @@ entry: %baz = load i1* %a.i1, align 1 ret void } + +define <3 x i8> @PR14572.1(i32 %x) { +; Ensure that a split integer store which is wider than the type size of the +; alloca (relying on the alloc size padding) doesn't trigger an assert. +; CHECK: @PR14572.1 + +entry: + %a = alloca <3 x i8>, align 4 +; CHECK-NOT: alloca + + %cast = bitcast <3 x i8>* %a to i32* + store i32 %x, i32* %cast, align 1 + %y = load <3 x i8>* %a, align 4 + ret <3 x i8> %y +; CHECK: ret <3 x i8> +} + +define i32 @PR14572.2(<3 x i8> %x) { +; Ensure that a split integer load which is wider than the type size of the +; alloca (relying on the alloc size padding) doesn't trigger an assert. +; CHECK: @PR14572.2 + +entry: + %a = alloca <3 x i8>, align 4 +; CHECK-NOT: alloca + + store <3 x i8> %x, <3 x i8>* %a, align 1 + %cast = bitcast <3 x i8>* %a to i32* + %y = load i32* %cast, align 4 + ret i32 %y +; CHECK: ret i32 +} + +define i32 @PR14601(i32 %x) { +; Don't try to form a promotable integer alloca when there is a variable length +; memory intrinsic. 
+; CHECK: @PR14601 + +entry: + %a = alloca i32 +; CHECK: alloca + + %a.i8 = bitcast i32* %a to i8* + call void @llvm.memset.p0i8.i32(i8* %a.i8, i8 0, i32 %x, i32 1, i1 false) + %v = load i32* %a + ret i32 %v +} diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll index 1ac6d25d63..64a0cc7439 100644 --- a/test/Transforms/SROA/big-endian.ll +++ b/test/Transforms/SROA/big-endian.ll @@ -24,8 +24,8 @@ entry: store i8 0, i8* %a2ptr %aiptr = bitcast [3 x i8]* %a to i24* %ai = load i24* %aiptr -; CHCEK-NOT: store -; CHCEK-NOT: load +; CHECK-NOT: store +; CHECK-NOT: load ; CHECK: %[[ext2:.*]] = zext i8 0 to i24 ; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256 ; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]] @@ -46,8 +46,8 @@ entry: %b1 = load i8* %b1ptr %b2ptr = getelementptr [3 x i8]* %b, i64 0, i32 2 %b2 = load i8* %b2ptr -; CHCEK-NOT: store -; CHCEK-NOT: load +; CHECK-NOT: store +; CHECK-NOT: load ; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16 ; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8 ; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 @@ -77,8 +77,8 @@ entry: %a2ptr = getelementptr [7 x i8]* %a, i64 0, i32 2 %a3ptr = getelementptr [7 x i8]* %a, i64 0, i32 3 -; CHCEK-NOT: store -; CHCEK-NOT: load +; CHECK-NOT: store +; CHECK-NOT: load %a0i16ptr = bitcast i8* %a0ptr to i16* store i16 1, i16* %a0i16ptr @@ -98,8 +98,8 @@ entry: ; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776 ; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]] -; CHCEK-NOT: store -; CHCEK-NOT: load +; CHECK-NOT: store +; CHECK-NOT: load %aiptr = bitcast [7 x i8]* %a to i56* %ai = load i56* %aiptr diff --git a/test/Transforms/SROA/vector-promotion.ll b/test/Transforms/SROA/vector-promotion.ll index bb34e3f084..02f6d040cc 100644 --- a/test/Transforms/SROA/vector-promotion.ll +++ b/test/Transforms/SROA/vector-promotion.ll @@ -279,6 +279,89 @@ entry: ; CHECK-NEXT: ret <4 x i32> %[[ret]] } +declare void @llvm.memset.p0i32.i32(i32* nocapture, i32, i32, i32, i1) nounwind + +define <4 x float> @test_subvec_memset() { +; CHECK: @test_subvec_memset +entry: + %a = alloca <4 x float> +; CHECK-NOT: alloca + + %a.gep0 = getelementptr <4 x float>* %a, i32 0, i32 0 + %a.cast0 = bitcast float* %a.gep0 to i8* + call void @llvm.memset.p0i8.i32(i8* %a.cast0, i8 0, i32 8, i32 0, i1 false) +; CHECK-NOT: store +; CHECK: %[[insert1:.*]] = shufflevector <4 x float> <float 0.000000e+00, float 0.000000e+00, float undef, float undef>, <4 x float> undef, <4 x i32> <i32 0, i32 1, {{.*}}> + + %a.gep1 = getelementptr <4 x float>* %a, i32 0, i32 1 + %a.cast1 = bitcast float* %a.gep1 to i8* + call void @llvm.memset.p0i8.i32(i8* %a.cast1, i8 1, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[insert2:.*]] = shufflevector <4 x float> <float undef, float 0x3820202020000000, float 0x3820202020000000, float undef>, <4 x float> %[[insert1]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}> + + %a.gep2 = getelementptr <4 x float>* %a, i32 0, i32 2 + %a.cast2 = bitcast float* %a.gep2 to i8* + call void @llvm.memset.p0i8.i32(i8* %a.cast2, i8 3, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[insert3:.*]] = shufflevector <4 x float> <float undef, float undef, float 0x3860606060000000, float 0x3860606060000000>, <4 x float> %[[insert2]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> + + %a.gep3 = getelementptr <4 x float>* %a, i32 0, i32 3 + %a.cast3 = bitcast float* %a.gep3 to i8* + call void @llvm.memset.p0i8.i32(i8* %a.cast3, i8 7, i32 4, i32 0, i1 false) +; CHECK-NEXT: %[[insert4:.*]] = insertelement <4 x 
float> %[[insert3]], float 0x38E0E0E0E0000000, i32 3 + + %ret = load <4 x float>* %a + + ret <4 x float> %ret +; CHECK-NEXT: ret <4 x float> %[[insert4]] +} + +define <4 x float> @test_subvec_memcpy(i8* %x, i8* %y, i8* %z, i8* %f, i8* %out) { +; CHECK: @test_subvec_memcpy +entry: + %a = alloca <4 x float> +; CHECK-NOT: alloca + + %a.gep0 = getelementptr <4 x float>* %a, i32 0, i32 0 + %a.cast0 = bitcast float* %a.gep0 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast0, i8* %x, i32 8, i32 0, i1 false) +; CHECK: %[[xptr:.*]] = bitcast i8* %x to <2 x float>* +; CHECK-NEXT: %[[x:.*]] = load <2 x float>* %[[xptr]] +; CHECK-NEXT: %[[expand_x:.*]] = shufflevector <2 x float> %[[x]], <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> +; CHECK-NEXT: %[[insert_x:.*]] = shufflevector <4 x float> %[[expand_x]], <4 x float> undef, <4 x i32> <i32 0, i32 1, {{.*}}> + + %a.gep1 = getelementptr <4 x float>* %a, i32 0, i32 1 + %a.cast1 = bitcast float* %a.gep1 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast1, i8* %y, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[yptr:.*]] = bitcast i8* %y to <2 x float>* +; CHECK-NEXT: %[[y:.*]] = load <2 x float>* %[[yptr]] +; CHECK-NEXT: %[[expand_y:.*]] = shufflevector <2 x float> %[[y]], <2 x float> undef, <4 x i32> <i32 undef, i32 0, i32 1, i32 undef> +; CHECK-NEXT: %[[insert_y:.*]] = shufflevector <4 x float> %[[expand_y]], <4 x float> %[[insert_x]], <4 x i32> <i32 4, i32 1, i32 2, {{.*}}> + + %a.gep2 = getelementptr <4 x float>* %a, i32 0, i32 2 + %a.cast2 = bitcast float* %a.gep2 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast2, i8* %z, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[zptr:.*]] = bitcast i8* %z to <2 x float>* +; CHECK-NEXT: %[[z:.*]] = load <2 x float>* %[[zptr]] +; CHECK-NEXT: %[[expand_z:.*]] = shufflevector <2 x float> %[[z]], <2 x float> undef, <4 x i32> <i32 undef, i32 undef, i32 0, i32 1> +; CHECK-NEXT: %[[insert_z:.*]] = shufflevector <4 x float> %[[expand_z]], <4 x float> %[[insert_y]], <4 x i32> <i32 4, i32 5, i32 2, i32 3> + + %a.gep3 = getelementptr <4 x float>* %a, i32 0, i32 3 + %a.cast3 = bitcast float* %a.gep3 to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a.cast3, i8* %f, i32 4, i32 0, i1 false) +; CHECK-NEXT: %[[fptr:.*]] = bitcast i8* %f to float* +; CHECK-NEXT: %[[f:.*]] = load float* %[[fptr]] +; CHECK-NEXT: %[[insert_f:.*]] = insertelement <4 x float> %[[insert_z]], float %[[f]], i32 3 + + call void @llvm.memcpy.p0i8.p0i8.i32(i8* %out, i8* %a.cast2, i32 8, i32 0, i1 false) +; CHECK-NEXT: %[[outptr:.*]] = bitcast i8* %out to <2 x float>* +; CHECK-NEXT: %[[extract_out:.*]] = shufflevector <4 x float> %[[insert_f]], <4 x float> undef, <2 x i32> <i32 2, i32 3> +; CHECK-NEXT: store <2 x float> %[[extract_out]], <2 x float>* %[[outptr]] + + %ret = load <4 x float>* %a + + ret <4 x float> %ret +; CHECK-NEXT: ret <4 x float> %[[insert_f]] +} + define i32 @PR14212() { ; CHECK: @PR14212 ; This caused a crash when "splitting" the load of the i32 in order to promote diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index 4e9c9c0683..74fbf57d26 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -24,6 +24,7 @@ #include "llvm/Module.h" #include "llvm/Pass.h" #include "llvm/PassManager.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp index 15cacfabeb..930f528668 100644 --- a/tools/llvm-mc/llvm-mc.cpp +++ b/tools/llvm-mc/llvm-mc.cpp 
@@ -157,6 +157,14 @@ static cl::opt<bool> GenDwarfForAssembly("g", cl::desc("Generate dwarf debugging info for assembly " "source files")); +static cl::opt<std::string> +DebugCompilationDir("fdebug-compilation-dir", + cl::desc("Specifies the debug info's compilation dir")); + +static cl::opt<std::string> +MainFileName("main-file-name", + cl::desc("Specifies the name we should consider the input file")); + enum ActionType { AC_AsLex, AC_Assemble, @@ -389,8 +397,12 @@ int main(int argc, char **argv) { Ctx.setAllowTemporaryLabels(false); Ctx.setGenDwarfForAssembly(GenDwarfForAssembly); - if (!DwarfDebugFlags.empty()) + if (!DwarfDebugFlags.empty()) Ctx.setDwarfDebugFlags(StringRef(DwarfDebugFlags)); + if (!DebugCompilationDir.empty()) + Ctx.setCompilationDir(DebugCompilationDir); + if (!MainFileName.empty()) + Ctx.setMainFileName(MainFileName); // Package up features to be passed to target/subtarget std::string FeaturesStr; diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp index cfaaf863b2..ebb377abf9 100644 --- a/tools/lto/LTOCodeGenerator.cpp +++ b/tools/lto/LTOCodeGenerator.cpp @@ -29,6 +29,7 @@ #include "llvm/MC/SubtargetFeature.h" #include "llvm/Module.h" #include "llvm/PassManager.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/Host.h" diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp index 5a388cbad6..9d2f9c7848 100644 --- a/tools/lto/LTOModule.cpp +++ b/tools/lto/LTOModule.cpp @@ -320,8 +320,9 @@ MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) { } /// objcClassNameFromExpression - Get string that the data pointer points to. -bool LTOModule::objcClassNameFromExpression(Constant *c, std::string &name) { - if (ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) { +bool +LTOModule::objcClassNameFromExpression(const Constant *c, std::string &name) { + if (const ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) { Constant *op = ce->getOperand(0); if (GlobalVariable *gvn = dyn_cast<GlobalVariable>(op)) { Constant *cn = gvn->getInitializer(); @@ -337,8 +338,8 @@ bool LTOModule::objcClassNameFromExpression(Constant *c, std::string &name) { } /// addObjCClass - Parse i386/ppc ObjC class data structure. -void LTOModule::addObjCClass(GlobalVariable *clgv) { - ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer()); +void LTOModule::addObjCClass(const GlobalVariable *clgv) { + const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer()); if (!c) return; // second slot in __OBJC,__class is pointer to superclass name @@ -374,8 +375,8 @@ void LTOModule::addObjCClass(GlobalVariable *clgv) { } /// addObjCCategory - Parse i386/ppc ObjC category data structure. -void LTOModule::addObjCCategory(GlobalVariable *clgv) { - ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer()); +void LTOModule::addObjCCategory(const GlobalVariable *clgv) { + const ConstantStruct *c = dyn_cast<ConstantStruct>(clgv->getInitializer()); if (!c) return; // second slot in __OBJC,__category is pointer to target class name @@ -399,7 +400,7 @@ void LTOModule::addObjCCategory(GlobalVariable *clgv) { } /// addObjCClassRef - Parse i386/ppc ObjC class list data structure. 
-void LTOModule::addObjCClassRef(GlobalVariable *clgv) { +void LTOModule::addObjCClassRef(const GlobalVariable *clgv) { std::string targetclassName; if (!objcClassNameFromExpression(clgv->getInitializer(), targetclassName)) return; @@ -419,7 +420,7 @@ void LTOModule::addObjCClassRef(GlobalVariable *clgv) { } /// addDefinedDataSymbol - Add a data symbol as defined to the list. -void LTOModule::addDefinedDataSymbol(GlobalValue *v) { +void LTOModule::addDefinedDataSymbol(const GlobalValue *v) { // Add to list of defined symbols. addDefinedSymbol(v, false); @@ -448,34 +449,34 @@ void LTOModule::addDefinedDataSymbol(GlobalValue *v) { // special case if this data blob is an ObjC class definition if (v->getSection().compare(0, 15, "__OBJC,__class,") == 0) { - if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) { + if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) { addObjCClass(gv); } } // special case if this data blob is an ObjC category definition else if (v->getSection().compare(0, 18, "__OBJC,__category,") == 0) { - if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) { + if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) { addObjCCategory(gv); } } // special case if this data blob is the list of referenced classes else if (v->getSection().compare(0, 18, "__OBJC,__cls_refs,") == 0) { - if (GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) { + if (const GlobalVariable *gv = dyn_cast<GlobalVariable>(v)) { addObjCClassRef(gv); } } } /// addDefinedFunctionSymbol - Add a function symbol as defined to the list. -void LTOModule::addDefinedFunctionSymbol(Function *f) { +void LTOModule::addDefinedFunctionSymbol(const Function *f) { // add to list of defined symbols addDefinedSymbol(f, true); } /// addDefinedSymbol - Add a defined symbol to the list. -void LTOModule::addDefinedSymbol(GlobalValue *def, bool isFunction) { +void LTOModule::addDefinedSymbol(const GlobalValue *def, bool isFunction) { // ignore all llvm.* symbols if (def->getName().startswith("llvm.")) return; @@ -492,7 +493,7 @@ void LTOModule::addDefinedSymbol(GlobalValue *def, bool isFunction) { if (isFunction) { attr |= LTO_SYMBOL_PERMISSIONS_CODE; } else { - GlobalVariable *gv = dyn_cast<GlobalVariable>(def); + const GlobalVariable *gv = dyn_cast<GlobalVariable>(def); if (gv && gv->isConstant()) attr |= LTO_SYMBOL_PERMISSIONS_RODATA; else @@ -607,7 +608,8 @@ void LTOModule::addAsmGlobalSymbolUndef(const char *name) { /// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet to a /// list to be resolved later. -void LTOModule::addPotentialUndefinedSymbol(GlobalValue *decl, bool isFunc) { +void +LTOModule::addPotentialUndefinedSymbol(const GlobalValue *decl, bool isFunc) { // ignore all llvm.* symbols if (decl->getName().startswith("llvm.")) return; @@ -743,6 +745,9 @@ namespace { Symbol->setSection(*getCurrentSection()); markDefined(*Symbol); } + virtual void EmitDebugLabel(MCSymbol *Symbol) { + EmitLabel(Symbol); + } virtual void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) { // FIXME: should we handle aliases? 
markDefined(*Symbol); diff --git a/tools/lto/LTOModule.h b/tools/lto/LTOModule.h index eca97c43c6..b194ed0287 100644 --- a/tools/lto/LTOModule.h +++ b/tools/lto/LTOModule.h @@ -44,7 +44,7 @@ private: const char *name; uint32_t attributes; bool isFunction; - llvm::GlobalValue *symbol; + const llvm::GlobalValue *symbol; }; llvm::OwningPtr<llvm::Module> _module; @@ -138,16 +138,16 @@ private: /// addPotentialUndefinedSymbol - Add a symbol which isn't defined just yet /// to a list to be resolved later. - void addPotentialUndefinedSymbol(llvm::GlobalValue *dcl, bool isFunc); + void addPotentialUndefinedSymbol(const llvm::GlobalValue *dcl, bool isFunc); /// addDefinedSymbol - Add a defined symbol to the list. - void addDefinedSymbol(llvm::GlobalValue *def, bool isFunction); + void addDefinedSymbol(const llvm::GlobalValue *def, bool isFunction); /// addDefinedFunctionSymbol - Add a function symbol as defined to the list. - void addDefinedFunctionSymbol(llvm::Function *f); + void addDefinedFunctionSymbol(const llvm::Function *f); /// addDefinedDataSymbol - Add a data symbol as defined to the list. - void addDefinedDataSymbol(llvm::GlobalValue *v); + void addDefinedDataSymbol(const llvm::GlobalValue *v); /// addAsmGlobalSymbols - Add global symbols from module-level ASM to the /// defined or undefined lists. @@ -162,17 +162,17 @@ private: void addAsmGlobalSymbolUndef(const char *); /// addObjCClass - Parse i386/ppc ObjC class data structure. - void addObjCClass(llvm::GlobalVariable *clgv); + void addObjCClass(const llvm::GlobalVariable *clgv); /// addObjCCategory - Parse i386/ppc ObjC category data structure. - void addObjCCategory(llvm::GlobalVariable *clgv); + void addObjCCategory(const llvm::GlobalVariable *clgv); /// addObjCClassRef - Parse i386/ppc ObjC class list data structure. - void addObjCClassRef(llvm::GlobalVariable *clgv); + void addObjCClassRef(const llvm::GlobalVariable *clgv); /// objcClassNameFromExpression - Get string that the data pointer points /// to. - bool objcClassNameFromExpression(llvm::Constant* c, std::string &name); + bool objcClassNameFromExpression(const llvm::Constant* c, std::string &name); /// isTargetMatch - Returns 'true' if the memory buffer is for the specified /// target triple. diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 8e88e3aee2..1bbe4e3623 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -30,6 +30,7 @@ #include "llvm/MC/SubtargetFeature.h" #include "llvm/Module.h" #include "llvm/PassManager.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/IRReader.h" #include "llvm/Support/ManagedStatic.h" diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt index 09a0ea50d7..f6a5949cdb 100644 --- a/unittests/Support/CMakeLists.txt +++ b/unittests/Support/CMakeLists.txt @@ -24,6 +24,7 @@ add_llvm_unittest(SupportTests SwapByteOrderTest.cpp TimeValue.cpp ValueHandleTest.cpp + YAMLIOTest.cpp YAMLParserTest.cpp formatted_raw_ostream_test.cpp raw_ostream_test.cpp diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp new file mode 100644 index 0000000000..fab2d5b666 --- /dev/null +++ b/unittests/Support/YAMLIOTest.cpp @@ -0,0 +1,1287 @@ +//===- unittest/Support/YAMLIOTest.cpp ------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/Twine.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Format.h" +#include "llvm/Support/YAMLTraits.h" +#include "gtest/gtest.h" + + +using llvm::yaml::Input; +using llvm::yaml::Output; +using llvm::yaml::IO; +using llvm::yaml::MappingTraits; +using llvm::yaml::MappingNormalization; +using llvm::yaml::ScalarTraits; +using llvm::yaml::Hex8; +using llvm::yaml::Hex16; +using llvm::yaml::Hex32; +using llvm::yaml::Hex64; + + +//===----------------------------------------------------------------------===// +// Test MappingTraits +//===----------------------------------------------------------------------===// + +struct FooBar { + int foo; + int bar; +}; +typedef std::vector<FooBar> FooBarSequence; + +LLVM_YAML_IS_SEQUENCE_VECTOR(FooBar) + + +namespace llvm { +namespace yaml { + template <> + struct MappingTraits<FooBar> { + static void mapping(IO &io, FooBar& fb) { + io.mapRequired("foo", fb.foo); + io.mapRequired("bar", fb.bar); + } + }; +} +} + + +// +// Test the reading of a yaml mapping +// +TEST(YAMLIO, TestMapRead) { + FooBar doc; + Input yin("---\nfoo: 3\nbar: 5\n...\n"); + yin >> doc; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(doc.foo, 3); + EXPECT_EQ(doc.bar,5); +} + + +// +// Test the reading of a yaml sequence of mappings +// +TEST(YAMLIO, TestSequenceMapRead) { + FooBarSequence seq; + Input yin("---\n - foo: 3\n bar: 5\n - foo: 7\n bar: 9\n...\n"); + yin >> seq; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(seq.size(), 2UL); + FooBar& map1 = seq[0]; + FooBar& map2 = seq[1]; + EXPECT_EQ(map1.foo, 3); + EXPECT_EQ(map1.bar, 5); + EXPECT_EQ(map2.foo, 7); + EXPECT_EQ(map2.bar, 9); +} + + +// +// Test writing then reading back a sequence of mappings +// +TEST(YAMLIO, TestSequenceMapWriteAndRead) { + std::string intermediate; + { + FooBar entry1; + entry1.foo = 10; + entry1.bar = -3; + FooBar entry2; + entry2.foo = 257; + entry2.bar = 0; + FooBarSequence seq; + seq.push_back(entry1); + seq.push_back(entry2); + + llvm::raw_string_ostream ostr(intermediate); + Output yout(ostr); + yout << seq; + } + + { + Input yin(intermediate); + FooBarSequence seq2; + yin >> seq2; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(seq2.size(), 2UL); + FooBar& map1 = seq2[0]; + FooBar& map2 = seq2[1]; + EXPECT_EQ(map1.foo, 10); + EXPECT_EQ(map1.bar, -3); + EXPECT_EQ(map2.foo, 257); + EXPECT_EQ(map2.bar, 0); + } +} + + +//===----------------------------------------------------------------------===// +// Test built-in types +//===----------------------------------------------------------------------===// + +struct BuiltInTypes { + llvm::StringRef str; + uint64_t u64; + uint32_t u32; + uint16_t u16; + uint8_t u8; + bool b; + int64_t s64; + int32_t s32; + int16_t s16; + int8_t s8; + float f; + double d; + Hex8 h8; + Hex16 h16; + Hex32 h32; + Hex64 h64; +}; + +namespace llvm { +namespace yaml { + template <> + struct MappingTraits<BuiltInTypes> { + static void mapping(IO &io, BuiltInTypes& bt) { + io.mapRequired("str", bt.str); + io.mapRequired("u64", bt.u64); + io.mapRequired("u32", bt.u32); + io.mapRequired("u16", bt.u16); + io.mapRequired("u8", bt.u8); + io.mapRequired("b", bt.b); + io.mapRequired("s64", bt.s64); + io.mapRequired("s32", bt.s32); + io.mapRequired("s16", bt.s16); + io.mapRequired("s8", bt.s8); + io.mapRequired("f", bt.f); + io.mapRequired("d", bt.d); + io.mapRequired("h8", bt.h8); + io.mapRequired("h16", bt.h16); + io.mapRequired("h32", bt.h32); + 
io.mapRequired("h64", bt.h64); + } + }; +} +} + + +// +// Test the reading of all built-in scalar conversions +// +TEST(YAMLIO, TestReadBuiltInTypes) { + BuiltInTypes map; + Input yin("---\n" + "str: hello there\n" + "u64: 5000000000\n" + "u32: 4000000000\n" + "u16: 65000\n" + "u8: 255\n" + "b: false\n" + "s64: -5000000000\n" + "s32: -2000000000\n" + "s16: -32000\n" + "s8: -127\n" + "f: 137.125\n" + "d: -2.8625\n" + "h8: 0xFF\n" + "h16: 0x8765\n" + "h32: 0xFEDCBA98\n" + "h64: 0xFEDCBA9876543210\n" + "...\n"); + yin >> map; + + EXPECT_FALSE(yin.error()); + EXPECT_TRUE(map.str.equals("hello there")); + EXPECT_EQ(map.u64, 5000000000ULL); + EXPECT_EQ(map.u32, 4000000000U); + EXPECT_EQ(map.u16, 65000); + EXPECT_EQ(map.u8, 255); + EXPECT_EQ(map.b, false); + EXPECT_EQ(map.s64, -5000000000LL); + EXPECT_EQ(map.s32, -2000000000L); + EXPECT_EQ(map.s16, -32000); + EXPECT_EQ(map.s8, -127); + EXPECT_EQ(map.f, 137.125); + EXPECT_EQ(map.d, -2.8625); + EXPECT_EQ(map.h8, Hex8(255)); + EXPECT_EQ(map.h16, Hex16(0x8765)); + EXPECT_EQ(map.h32, Hex32(0xFEDCBA98)); + EXPECT_EQ(map.h64, Hex64(0xFEDCBA9876543210LL)); +} + + +// +// Test writing then reading back all built-in scalar types +// +TEST(YAMLIO, TestReadWriteBuiltInTypes) { + std::string intermediate; + { + BuiltInTypes map; + map.str = "one two"; + map.u64 = 6000000000ULL; + map.u32 = 3000000000U; + map.u16 = 50000; + map.u8 = 254; + map.b = true; + map.s64 = -6000000000LL; + map.s32 = -2000000000; + map.s16 = -32000; + map.s8 = -128; + map.f = 3.25; + map.d = -2.8625; + map.h8 = 254; + map.h16 = 50000; + map.h32 = 3000000000U; + map.h64 = 6000000000LL; + + llvm::raw_string_ostream ostr(intermediate); + Output yout(ostr); + yout << map; + } + + { + Input yin(intermediate); + BuiltInTypes map; + yin >> map; + + EXPECT_FALSE(yin.error()); + EXPECT_TRUE(map.str.equals("one two")); + EXPECT_EQ(map.u64, 6000000000ULL); + EXPECT_EQ(map.u32, 3000000000U); + EXPECT_EQ(map.u16, 50000); + EXPECT_EQ(map.u8, 254); + EXPECT_EQ(map.b, true); + EXPECT_EQ(map.s64, -6000000000LL); + EXPECT_EQ(map.s32, -2000000000L); + EXPECT_EQ(map.s16, -32000); + EXPECT_EQ(map.s8, -128); + EXPECT_EQ(map.f, 3.25); + EXPECT_EQ(map.d, -2.8625); + EXPECT_EQ(map.h8, Hex8(254)); + EXPECT_EQ(map.h16, Hex16(50000)); + EXPECT_EQ(map.h32, Hex32(3000000000U)); + EXPECT_EQ(map.h64, Hex64(6000000000LL)); + } +} + + + +//===----------------------------------------------------------------------===// +// Test ScalarEnumerationTraits +//===----------------------------------------------------------------------===// + +enum Colors { + cRed, + cBlue, + cGreen, + cYellow +}; + +struct ColorMap { + Colors c1; + Colors c2; + Colors c3; + Colors c4; + Colors c5; + Colors c6; +}; + +namespace llvm { +namespace yaml { + template <> + struct ScalarEnumerationTraits<Colors> { + static void enumeration(IO &io, Colors &value) { + io.enumCase(value, "red", cRed); + io.enumCase(value, "blue", cBlue); + io.enumCase(value, "green", cGreen); + io.enumCase(value, "yellow",cYellow); + } + }; + template <> + struct MappingTraits<ColorMap> { + static void mapping(IO &io, ColorMap& c) { + io.mapRequired("c1", c.c1); + io.mapRequired("c2", c.c2); + io.mapRequired("c3", c.c3); + io.mapOptional("c4", c.c4, cBlue); // supplies default + io.mapOptional("c5", c.c5, cYellow); // supplies default + io.mapOptional("c6", c.c6, cRed); // supplies default + } + }; +} +} + + +// +// Test reading enumerated scalars +// +TEST(YAMLIO, TestEnumRead) { + ColorMap map; + Input yin("---\n" + "c1: blue\n" + "c2: red\n" + "c3: green\n" + "c5: 
yellow\n" + "...\n"); + yin >> map; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(cBlue, map.c1); + EXPECT_EQ(cRed, map.c2); + EXPECT_EQ(cGreen, map.c3); + EXPECT_EQ(cBlue, map.c4); // tests default + EXPECT_EQ(cYellow,map.c5); // tests overridden + EXPECT_EQ(cRed, map.c6); // tests default +} + + + +//===----------------------------------------------------------------------===// +// Test ScalarBitSetTraits +//===----------------------------------------------------------------------===// + +enum MyFlags { + flagNone = 0, + flagBig = 1 << 0, + flagFlat = 1 << 1, + flagRound = 1 << 2, + flagPointy = 1 << 3 +}; +inline MyFlags operator|(MyFlags a, MyFlags b) { + return static_cast<MyFlags>( + static_cast<uint32_t>(a) | static_cast<uint32_t>(b)); +} + +struct FlagsMap { + MyFlags f1; + MyFlags f2; + MyFlags f3; + MyFlags f4; +}; + + +namespace llvm { +namespace yaml { + template <> + struct ScalarBitSetTraits<MyFlags> { + static void bitset(IO &io, MyFlags &value) { + io.bitSetCase(value, "big", flagBig); + io.bitSetCase(value, "flat", flagFlat); + io.bitSetCase(value, "round", flagRound); + io.bitSetCase(value, "pointy",flagPointy); + } + }; + template <> + struct MappingTraits<FlagsMap> { + static void mapping(IO &io, FlagsMap& c) { + io.mapRequired("f1", c.f1); + io.mapRequired("f2", c.f2); + io.mapRequired("f3", c.f3); + io.mapOptional("f4", c.f4, MyFlags(flagRound)); + } + }; +} +} + + +// +// Test reading flow sequence representing bit-mask values +// +TEST(YAMLIO, TestFlagsRead) { + FlagsMap map; + Input yin("---\n" + "f1: [ big ]\n" + "f2: [ round, flat ]\n" + "f3: []\n" + "...\n"); + yin >> map; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(flagBig, map.f1); + EXPECT_EQ(flagRound|flagFlat, map.f2); + EXPECT_EQ(flagNone, map.f3); // check empty set + EXPECT_EQ(flagRound, map.f4); // check optional key +} + + +// +// Test writing then reading back bit-mask values +// +TEST(YAMLIO, TestReadWriteFlags) { + std::string intermediate; + { + FlagsMap map; + map.f1 = flagBig; + map.f2 = flagRound | flagFlat; + map.f3 = flagNone; + map.f4 = flagNone; + + llvm::raw_string_ostream ostr(intermediate); + Output yout(ostr); + yout << map; + } + + { + Input yin(intermediate); + FlagsMap map2; + yin >> map2; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(flagBig, map2.f1); + EXPECT_EQ(flagRound|flagFlat, map2.f2); + EXPECT_EQ(flagNone, map2.f3); + //EXPECT_EQ(flagRound, map2.f4); // check optional key + } +} + + + +//===----------------------------------------------------------------------===// +// Test ScalarTraits +//===----------------------------------------------------------------------===// + +struct MyCustomType { + int length; + int width; +}; + +struct MyCustomTypeMap { + MyCustomType f1; + MyCustomType f2; + int f3; +}; + + +namespace llvm { +namespace yaml { + template <> + struct MappingTraits<MyCustomTypeMap> { + static void mapping(IO &io, MyCustomTypeMap& s) { + io.mapRequired("f1", s.f1); + io.mapRequired("f2", s.f2); + io.mapRequired("f3", s.f3); + } + }; + // MyCustomType is formatted as a yaml scalar. A value of + // {length=3, width=4} would be represented in yaml as "3 by 4". 
+ template<> + struct ScalarTraits<MyCustomType> { + static void output(const MyCustomType &value, void* ctxt, llvm::raw_ostream &out) { + out << llvm::format("%d by %d", value.length, value.width); + } + static StringRef input(StringRef scalar, void* ctxt, MyCustomType &value) { + size_t byStart = scalar.find("by"); + if ( byStart != StringRef::npos ) { + StringRef lenStr = scalar.slice(0, byStart); + lenStr = lenStr.rtrim(); + if ( lenStr.getAsInteger(0, value.length) ) { + return "malformed length"; + } + StringRef widthStr = scalar.drop_front(byStart+2); + widthStr = widthStr.ltrim(); + if ( widthStr.getAsInteger(0, value.width) ) { + return "malformed width"; + } + return StringRef(); + } + else { + return "malformed by"; + } + } + }; +} +} + + +// +// Test writing then reading back custom values +// +TEST(YAMLIO, TestReadWriteMyCustomType) { + std::string intermediate; + { + MyCustomTypeMap map; + map.f1.length = 1; + map.f1.width = 4; + map.f2.length = 100; + map.f2.width = 400; + map.f3 = 10; + + llvm::raw_string_ostream ostr(intermediate); + Output yout(ostr); + yout << map; + } + + { + Input yin(intermediate); + MyCustomTypeMap map2; + yin >> map2; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(1, map2.f1.length); + EXPECT_EQ(4, map2.f1.width); + EXPECT_EQ(100, map2.f2.length); + EXPECT_EQ(400, map2.f2.width); + EXPECT_EQ(10, map2.f3); + } +} + + +//===----------------------------------------------------------------------===// +// Test flow sequences +//===----------------------------------------------------------------------===// + +LLVM_YAML_STRONG_TYPEDEF(int, MyNumber) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(MyNumber) +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::StringRef) + +namespace llvm { +namespace yaml { + template<> + struct ScalarTraits<MyNumber> { + static void output(const MyNumber &value, void *, llvm::raw_ostream &out) { + out << value; + } + + static StringRef input(StringRef scalar, void *, MyNumber &value) { + long long n; + if ( getAsSignedInteger(scalar, 0, n) ) + return "invalid number"; + value = n; + return StringRef(); + } + }; +} +} + +struct NameAndNumbers { + llvm::StringRef name; + std::vector<llvm::StringRef> strings; + std::vector<MyNumber> single; + std::vector<MyNumber> numbers; +}; + +namespace llvm { +namespace yaml { + template <> + struct MappingTraits<NameAndNumbers> { + static void mapping(IO &io, NameAndNumbers& nn) { + io.mapRequired("name", nn.name); + io.mapRequired("strings", nn.strings); + io.mapRequired("single", nn.single); + io.mapRequired("numbers", nn.numbers); + } + }; +} +} + + +// +// Test writing then reading back custom values +// +TEST(YAMLIO, TestReadWriteMyFlowSequence) { + std::string intermediate; + { + NameAndNumbers map; + map.name = "hello"; + map.strings.push_back(llvm::StringRef("one")); + map.strings.push_back(llvm::StringRef("two")); + map.single.push_back(1); + map.numbers.push_back(10); + map.numbers.push_back(-30); + map.numbers.push_back(1024); + + llvm::raw_string_ostream ostr(intermediate); + Output yout(ostr); + yout << map; + } + + { + Input yin(intermediate); + NameAndNumbers map2; + yin >> map2; + + EXPECT_FALSE(yin.error()); + EXPECT_TRUE(map2.name.equals("hello")); + EXPECT_EQ(map2.strings.size(), 2UL); + EXPECT_TRUE(map2.strings[0].equals("one")); + EXPECT_TRUE(map2.strings[1].equals("two")); + EXPECT_EQ(map2.single.size(), 1UL); + EXPECT_EQ(1, map2.single[0]); + EXPECT_EQ(map2.numbers.size(), 3UL); + EXPECT_EQ(10, map2.numbers[0]); + EXPECT_EQ(-30, map2.numbers[1]); + EXPECT_EQ(1024, map2.numbers[2]); + } +} + + 
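+// A usage sketch (hypothetical helper, not part of this patch): client code
+// drives the traits above directly through Output. With the flow-sequence
+// typedefs in effect, the vectors should be emitted inline, roughly:
+//   name: hello
+//   strings: [ one, two ]
+//   single: [ 1 ]
+//   numbers: [ 10, -30, 1024 ]
+static std::string dumpNameAndNumbers(NameAndNumbers &nn) {
+  std::string buffer;
+  llvm::raw_string_ostream ostr(buffer);
+  Output yout(ostr);
+  yout << nn;        // invokes MappingTraits<NameAndNumbers> and ScalarTraits<MyNumber>
+  return ostr.str(); // flush the stream and return the YAML text
+}
+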
+//===----------------------------------------------------------------------===// +// Test normalizing/denormalizing +//===----------------------------------------------------------------------===// + +LLVM_YAML_STRONG_TYPEDEF(uint32_t, TotalSeconds) + +typedef std::vector<TotalSeconds> SecondsSequence; + +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(TotalSeconds) + + +namespace llvm { +namespace yaml { + template <> + struct MappingTraits<TotalSeconds> { + + class NormalizedSeconds { + public: + NormalizedSeconds(IO &io) + : hours(0), minutes(0), seconds(0) { + } + NormalizedSeconds(IO &, TotalSeconds &secs) + : hours(secs/3600), + minutes((secs - (hours*3600))/60), + seconds(secs % 60) { + } + TotalSeconds denormalize(IO &) { + return TotalSeconds(hours*3600 + minutes*60 + seconds); + } + + uint32_t hours; + uint8_t minutes; + uint8_t seconds; + }; + + static void mapping(IO &io, TotalSeconds &secs) { + MappingNormalization<NormalizedSeconds, TotalSeconds> keys(io, secs); + + io.mapOptional("hours", keys->hours, (uint32_t)0); + io.mapOptional("minutes", keys->minutes, (uint8_t)0); + io.mapRequired("seconds", keys->seconds); + } + }; +} +} + + +// +// Test the reading of a yaml sequence of mappings +// +TEST(YAMLIO, TestReadMySecondsSequence) { + SecondsSequence seq; + Input yin("---\n - hours: 1\n seconds: 5\n - seconds: 59\n...\n"); + yin >> seq; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(seq.size(), 2UL); + EXPECT_EQ(seq[0], 3605U); + EXPECT_EQ(seq[1], 59U); +} + + +// +// Test writing then reading back custom values +// +TEST(YAMLIO, TestReadWriteMySecondsSequence) { + std::string intermediate; + { + SecondsSequence seq; + seq.push_back(4000); + seq.push_back(500); + seq.push_back(59); + + llvm::raw_string_ostream ostr(intermediate); + Output yout(ostr); + yout << seq; + } + { + Input yin(intermediate); + SecondsSequence seq2; + yin >> seq2; + + EXPECT_FALSE(yin.error()); + EXPECT_EQ(seq2.size(), 3UL); + EXPECT_EQ(seq2[0], 4000U); + EXPECT_EQ(seq2[1], 500U); + EXPECT_EQ(seq2[2], 59U); + } +} + + +//===----------------------------------------------------------------------===// +// Test dynamic typing +//===----------------------------------------------------------------------===// + +enum AFlags { + a1, + a2, + a3 +}; + +enum BFlags { + b1, + b2, + b3 +}; + +enum Kind { + kindA, + kindB +}; + +struct KindAndFlags { + KindAndFlags() : kind(kindA), flags(0) { } + KindAndFlags(Kind k, uint32_t f) : kind(k), flags(f) { } + Kind kind; + uint32_t flags; +}; + +typedef std::vector<KindAndFlags> KindAndFlagsSequence; + +LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(KindAndFlags) + +namespace llvm { +namespace yaml { + template <> + struct ScalarEnumerationTraits<AFlags> { + static void enumeration(IO &io, AFlags &value) { + io.enumCase(value, "a1", a1); + io.enumCase(value, "a2", a2); + io.enumCase(value, "a3", a3); + } + }; + template <> + struct ScalarEnumerationTraits<BFlags> { + static void enumeration(IO &io, BFlags &value) { + io.enumCase(value, "b1", b1); + io.enumCase(value, "b2", b2); + io.enumCase(value, "b3", b3); + } + }; + template <> + struct ScalarEnumerationTraits<Kind> { + static void enumeration(IO &io, Kind &value) { + io.enumCase(value, "A", kindA); + io.enumCase(value, "B", kindB); + } + }; + template <> + struct MappingTraits<KindAndFlags> { + static void mapping(IO &io, KindAndFlags& kf) { + io.mapRequired("kind", kf.kind); + // type of flags field varies depending on kind field + if ( kf.kind == kindA ) + io.mapRequired("flags", *((AFlags*)&kf.flags)); + else + io.mapRequired("flags", 
*((BFlags*)&kf.flags));
+    }
+  };
+}
+}
+
+
+//
+// Test the reading of a yaml sequence of dynamic types
+//
+TEST(YAMLIO, TestReadKindAndFlagsSequence) {
+  KindAndFlagsSequence seq;
+  Input yin("---\n - kind: A\n   flags: a2\n - kind: B\n   flags: b1\n...\n");
+  yin >> seq;
+
+  EXPECT_FALSE(yin.error());
+  EXPECT_EQ(seq.size(), 2UL);
+  EXPECT_EQ(seq[0].kind, kindA);
+  EXPECT_EQ(seq[0].flags, (uint32_t)a2);
+  EXPECT_EQ(seq[1].kind, kindB);
+  EXPECT_EQ(seq[1].flags, (uint32_t)b1);
+}
+
+//
+// Test writing then reading back dynamic types
+//
+TEST(YAMLIO, TestReadWriteKindAndFlagsSequence) {
+  std::string intermediate;
+  {
+    KindAndFlagsSequence seq;
+    seq.push_back(KindAndFlags(kindA,a1));
+    seq.push_back(KindAndFlags(kindB,b1));
+    seq.push_back(KindAndFlags(kindA,a2));
+    seq.push_back(KindAndFlags(kindB,b2));
+    seq.push_back(KindAndFlags(kindA,a3));
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << seq;
+  }
+  {
+    Input yin(intermediate);
+    KindAndFlagsSequence seq2;
+    yin >> seq2;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(seq2.size(), 5UL);
+    EXPECT_EQ(seq2[0].kind, kindA);
+    EXPECT_EQ(seq2[0].flags, (uint32_t)a1);
+    EXPECT_EQ(seq2[1].kind, kindB);
+    EXPECT_EQ(seq2[1].flags, (uint32_t)b1);
+    EXPECT_EQ(seq2[2].kind, kindA);
+    EXPECT_EQ(seq2[2].flags, (uint32_t)a2);
+    EXPECT_EQ(seq2[3].kind, kindB);
+    EXPECT_EQ(seq2[3].flags, (uint32_t)b2);
+    EXPECT_EQ(seq2[4].kind, kindA);
+    EXPECT_EQ(seq2[4].flags, (uint32_t)a3);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Test document list
+//===----------------------------------------------------------------------===//
+
+struct FooBarMap {
+  int foo;
+  int bar;
+};
+typedef std::vector<FooBarMap> FooBarMapDocumentList;
+
+LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(FooBarMap)
+
+
+namespace llvm {
+namespace yaml {
+  template <>
+  struct MappingTraits<FooBarMap> {
+    static void mapping(IO &io, FooBarMap& fb) {
+      io.mapRequired("foo", fb.foo);
+      io.mapRequired("bar", fb.bar);
+    }
+  };
+}
+}
+
+
+//
+// Test the reading of a yaml mapping
+//
+TEST(YAMLIO, TestDocRead) {
+  FooBarMap doc;
+  Input yin("---\nfoo: 3\nbar: 5\n...\n");
+  yin >> doc;
+
+  EXPECT_FALSE(yin.error());
+  EXPECT_EQ(doc.foo, 3);
+  EXPECT_EQ(doc.bar,5);
+}
+
+
+
+//
+// Test writing then reading back a sequence of mappings
+//
+TEST(YAMLIO, TestSequenceDocListWriteAndRead) {
+  std::string intermediate;
+  {
+    FooBarMap doc1;
+    doc1.foo = 10;
+    doc1.bar = -3;
+    FooBarMap doc2;
+    doc2.foo = 257;
+    doc2.bar = 0;
+    std::vector<FooBarMap> docList;
+    docList.push_back(doc1);
+    docList.push_back(doc2);
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << docList;
+  }
+
+
+  {
+    Input yin(intermediate);
+    std::vector<FooBarMap> docList2;
+    yin >> docList2;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(docList2.size(), 2UL);
+    FooBarMap& map1 = docList2[0];
+    FooBarMap& map2 = docList2[1];
+    EXPECT_EQ(map1.foo, 10);
+    EXPECT_EQ(map1.bar, -3);
+    EXPECT_EQ(map2.foo, 257);
+    EXPECT_EQ(map2.bar, 0);
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+// Test error handling
+//===----------------------------------------------------------------------===//
+
+
+
+static void suppressErrorMessages(const llvm::SMDiagnostic &, void *) {
+}
+
+
+//
+// Test error handling of unknown enumerated scalar
+//
+TEST(YAMLIO, TestColorsReadError) {
+  ColorMap map;
+  Input yin("---\n"
+            "c1: blue\n"
+            "c2: purple\n"
+            "c3: green\n"
+            "...\n");
+ 
+//===----------------------------------------------------------------------===//
+// Test document list
+//===----------------------------------------------------------------------===//
+
+struct FooBarMap {
+  int foo;
+  int bar;
+};
+typedef std::vector<FooBarMap> FooBarMapDocumentList;
+
+LLVM_YAML_IS_DOCUMENT_LIST_VECTOR(FooBarMap)
+
+
+namespace llvm {
+namespace yaml {
+  template <>
+  struct MappingTraits<FooBarMap> {
+    static void mapping(IO &io, FooBarMap& fb) {
+      io.mapRequired("foo", fb.foo);
+      io.mapRequired("bar", fb.bar);
+    }
+  };
+}
+}
+
+
+//
+// Test the reading of a yaml mapping
+//
+TEST(YAMLIO, TestDocRead) {
+  FooBarMap doc;
+  Input yin("---\nfoo: 3\nbar: 5\n...\n");
+  yin >> doc;
+
+  EXPECT_FALSE(yin.error());
+  EXPECT_EQ(doc.foo, 3);
+  EXPECT_EQ(doc.bar, 5);
+}
+
+
+
+//
+// Test writing then reading back a document list of mappings
+//
+TEST(YAMLIO, TestSequenceDocListWriteAndRead) {
+  std::string intermediate;
+  {
+    FooBarMap doc1;
+    doc1.foo = 10;
+    doc1.bar = -3;
+    FooBarMap doc2;
+    doc2.foo = 257;
+    doc2.bar = 0;
+    std::vector<FooBarMap> docList;
+    docList.push_back(doc1);
+    docList.push_back(doc2);
+
+    llvm::raw_string_ostream ostr(intermediate);
+    Output yout(ostr);
+    yout << docList;
+  }
+
+
+  {
+    Input yin(intermediate);
+    std::vector<FooBarMap> docList2;
+    yin >> docList2;
+
+    EXPECT_FALSE(yin.error());
+    EXPECT_EQ(docList2.size(), 2UL);
+    FooBarMap& map1 = docList2[0];
+    FooBarMap& map2 = docList2[1];
+    EXPECT_EQ(map1.foo, 10);
+    EXPECT_EQ(map1.bar, -3);
+    EXPECT_EQ(map2.foo, 257);
+    EXPECT_EQ(map2.bar, 0);
+  }
+}
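Because FooBarMap is registered with LLVM_YAML_IS_DOCUMENT_LIST_VECTOR, each vector element corresponds to a whole YAML document rather than an entry in a sequence. A reading sketch (illustrative, not part of the patch; the inline multi-document YAML is an assumption about the accepted form, and <cassert> is assumed):

// Sketch: one vector element is filled per "---" document in the stream.
static void docListReadSketch() {
  std::vector<FooBarMap> docs;
  Input yin("---\nfoo: 10\nbar: -3\n"
            "---\nfoo: 257\nbar: 0\n"
            "...\n");
  yin >> docs;
  assert(!yin.error() && docs.size() == 2 && docs[1].foo == 257);
}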
+//===----------------------------------------------------------------------===//
+// Test error handling
+//===----------------------------------------------------------------------===//
+
+
+
+static void suppressErrorMessages(const llvm::SMDiagnostic &, void *) {
+}
+
+
+//
+// Test error handling of unknown enumerated scalar
+//
+TEST(YAMLIO, TestColorsReadError) {
+  ColorMap map;
+  Input yin("---\n"
+            "c1: blue\n"
+            "c2: purple\n"
+            "c3: green\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> map;
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling of flow sequence with unknown value
+//
+TEST(YAMLIO, TestFlagsReadError) {
+  FlagsMap map;
+  Input yin("---\n"
+            "f1: [ big ]\n"
+            "f2: [ round, hollow ]\n"
+            "f3: []\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> map;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in uint8_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(uint8_t)
+TEST(YAMLIO, TestReadBuiltInTypesUint8Error) {
+  std::vector<uint8_t> seq;
+  Input yin("---\n"
+            "- 255\n"
+            "- 0\n"
+            "- 257\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in uint16_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(uint16_t)
+TEST(YAMLIO, TestReadBuiltInTypesUint16Error) {
+  std::vector<uint16_t> seq;
+  Input yin("---\n"
+            "- 65535\n"
+            "- 0\n"
+            "- 66000\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in uint32_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(uint32_t)
+TEST(YAMLIO, TestReadBuiltInTypesUint32Error) {
+  std::vector<uint32_t> seq;
+  Input yin("---\n"
+            "- 4000000000\n"
+            "- 0\n"
+            "- 5000000000\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in uint64_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(uint64_t)
+TEST(YAMLIO, TestReadBuiltInTypesUint64Error) {
+  std::vector<uint64_t> seq;
+  Input yin("---\n"
+            "- 18446744073709551615\n"
+            "- 0\n"
+            "- 19446744073709551615\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in int8_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(int8_t)
+TEST(YAMLIO, TestReadBuiltInTypesint8OverError) {
+  std::vector<int8_t> seq;
+  Input yin("---\n"
+            "- -128\n"
+            "- 0\n"
+            "- 127\n"
+            "- 128\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in int8_t type
+//
+TEST(YAMLIO, TestReadBuiltInTypesint8UnderError) {
+  std::vector<int8_t> seq;
+  Input yin("---\n"
+            "- -128\n"
+            "- 0\n"
+            "- 127\n"
+            "- -129\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in int16_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(int16_t)
+TEST(YAMLIO, TestReadBuiltInTypesint16UnderError) {
+  std::vector<int16_t> seq;
+  Input yin("---\n"
+            "- 32767\n"
+            "- 0\n"
+            "- -32768\n"
+            "- -32769\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in int16_t type
+//
+TEST(YAMLIO, TestReadBuiltInTypesint16OverError) {
+  std::vector<int16_t> seq;
+  Input yin("---\n"
+            "- 32767\n"
+            "- 0\n"
+            "- -32768\n"
+            "- 32768\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in int32_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(int32_t)
+TEST(YAMLIO, TestReadBuiltInTypesint32UnderError) {
+  std::vector<int32_t> seq;
+  Input yin("---\n"
+            "- 2147483647\n"
+            "- 0\n"
+            "- -2147483648\n"
+            "- -2147483649\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in int32_t type
+//
+TEST(YAMLIO, TestReadBuiltInTypesint32OverError) {
+  std::vector<int32_t> seq;
+  Input yin("---\n"
+            "- 2147483647\n"
+            "- 0\n"
+            "- -2147483648\n"
+            "- 2147483649\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in int64_t type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(int64_t)
+TEST(YAMLIO, TestReadBuiltInTypesint64UnderError) {
+  std::vector<int64_t> seq;
+  Input yin("---\n"
+            "- -9223372036854775808\n"
+            "- 0\n"
+            "- 9223372036854775807\n"
+            "- -9223372036854775809\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in int64_t type
+//
+TEST(YAMLIO, TestReadBuiltInTypesint64OverError) {
+  std::vector<int64_t> seq;
+  Input yin("---\n"
+            "- -9223372036854775808\n"
+            "- 0\n"
+            "- 9223372036854775807\n"
+            "- 9223372036854775809\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in float type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(float)
+TEST(YAMLIO, TestReadBuiltInTypesFloatError) {
+  std::vector<float> seq;
+  Input yin("---\n"
+            "- 0.0\n"
+            "- 1000.1\n"
+            "- -123.456\n"
+            "- 1.2.3\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in double type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(double)
+TEST(YAMLIO, TestReadBuiltInTypesDoubleError) {
+  std::vector<double> seq;
+  Input yin("---\n"
+            "- 0.0\n"
+            "- 1000.1\n"
+            "- -123.456\n"
+            "- 1.2.3\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in Hex8 type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(Hex8)
+TEST(YAMLIO, TestReadBuiltInTypesHex8Error) {
+  std::vector<Hex8> seq;
+  Input yin("---\n"
+            "- 0x12\n"
+            "- 0xFE\n"
+            "- 0x123\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+
+//
+// Test error handling reading built-in Hex16 type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(Hex16)
+TEST(YAMLIO, TestReadBuiltInTypesHex16Error) {
+  std::vector<Hex16> seq;
+  Input yin("---\n"
+            "- 0x0012\n"
+            "- 0xFEFF\n"
+            "- 0x12345\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in Hex32 type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(Hex32)
+TEST(YAMLIO, TestReadBuiltInTypesHex32Error) {
+  std::vector<Hex32> seq;
+  Input yin("---\n"
+            "- 0x0012\n"
+            "- 0xFEFF0000\n"
+            "- 0x1234556789\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
+
+//
+// Test error handling reading built-in Hex64 type
+//
+LLVM_YAML_IS_SEQUENCE_VECTOR(Hex64)
+TEST(YAMLIO, TestReadBuiltInTypesHex64Error) {
+  std::vector<Hex64> seq;
+  Input yin("---\n"
+            "- 0x0012\n"
+            "- 0xFFEEDDCCBBAA9988\n"
+            "- 0x12345567890ABCDEF0\n"
+            "...\n");
+  yin.setDiagHandler(suppressErrorMessages);
+  yin >> seq;
+
+  EXPECT_TRUE(yin.error());
+}
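All of these error tests install suppressErrorMessages so that expected parse failures stay quiet. The same hook can capture the diagnostic for inspection instead; a minimal sketch (not part of the patch; the handler and function names are hypothetical, and it assumes setDiagHandler forwards its optional void* context to the handler as SourceMgr-style handlers do):

// Sketch: route the parser diagnostic into a std::string through the
// handler's context pointer instead of discarding it.
static void captureErrorMessage(const llvm::SMDiagnostic &diag, void *ctxt) {
  *static_cast<std::string *>(ctxt) = diag.getMessage().str();
}

static void capturedDiagnosticSketch() {
  std::string lastError;
  std::vector<uint8_t> seq;
  Input yin("---\n- 999\n...\n");          // 999 does not fit in uint8_t
  yin.setDiagHandler(captureErrorMessage, &lastError);
  yin >> seq;
  // yin.error() is now true and lastError holds the out-of-range message.
}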