diff options
177 files changed, 4951 insertions, 1339 deletions
@@ -73,10 +73,13 @@ endif ifeq ($(MAKECMDGOALS),install-clang) DIRS := tools/clang/tools/driver tools/clang/lib/Headers \ - tools/clang/tools/libclang tools/clang/tools/c-index-test \ + tools/clang/tools/libclang \ tools/clang/include/clang-c \ tools/clang/runtime tools/clang/docs \ tools/lto runtime + ifneq ($(BUILD_CLANG_ONLY),YES) + DIRS += tools/clang/tools/c-index-test + endif OPTIONAL_DIRS := NO_INSTALL = 1 endif diff --git a/cmake/modules/LLVMProcessSources.cmake b/cmake/modules/LLVMProcessSources.cmake index 0e410edc15..2cef6cfc3a 100644 --- a/cmake/modules/LLVMProcessSources.cmake +++ b/cmake/modules/LLVMProcessSources.cmake @@ -48,7 +48,7 @@ function(llvm_process_sources OUT_VAR) set( f ${CMAKE_CURRENT_SOURCE_DIR}/${s} ) add_file_dependencies( ${f} ${TABLEGEN_OUTPUT} ) endforeach(s) - if( MSVC_IDE ) + if( MSVC_IDE OR XCODE ) # This adds .td and .h files to the Visual Studio solution: # FIXME: Shall we handle *.def here? add_td_sources(sources) diff --git a/docs/HowToUseInstrMappings.rst b/docs/HowToUseInstrMappings.rst new file mode 100755 index 0000000000..b51e74e23c --- /dev/null +++ b/docs/HowToUseInstrMappings.rst @@ -0,0 +1,179 @@ +.. _how_to_use_instruction_mappings: + +=============================== +How To Use Instruction Mappings +=============================== + +.. sectionauthor:: Jyotsna Verma <jverma@codeaurora.org> + +.. contents:: + :local: + +Introduction +============ + +This document contains information about adding instruction mapping support +for a target. The motivation behind this feature comes from the need to switch +between different instruction formats during various optimizations. One approach +could be to use switch cases which list all the instructions along with formats +they can transition to. However, it has large maintenance overhead +because of the hardcoded instruction names. Also, whenever a new instruction is +added in the .td files, all the relevant switch cases should be modified +accordingly. 
Instead, the same functionality could be achieved with TableGen and +some support from the .td files for a fraction of maintenance cost. + +``InstrMapping`` Class Overview +=============================== + +TableGen uses relationship models to map instructions with each other. These +models are described using ``InstrMapping`` class as a base. Each model sets +various fields of the ``InstrMapping`` class such that they can uniquely +describe all the instructions using that model. TableGen parses all the relation +models and uses the information to construct relation tables which relate +instructions with each other. These tables are emitted in the +``XXXInstrInfo.inc`` file along with the functions to query them. Following +is the definition of ``InstrMapping`` class definied in Target.td file: + +.. code-block:: llvm + + class InstrMapping { + // Used to reduce search space only to the instructions using this + // relation model. + string FilterClass; + + // List of fields/attributes that should be same for all the instructions in + // a row of the relation table. Think of this as a set of properties shared + // by all the instructions related by this relationship. + list<string> RowFields = []; + + // List of fields/attributes that are same for all the instructions + // in a column of the relation table. + list<string> ColFields = []; + + // Values for the fields/attributes listed in 'ColFields' corresponding to + // the key instruction. This is the instruction that will be transformed + // using this relation model. + list<string> KeyCol = []; + + // List of values for the fields/attributes listed in 'ColFields', one for + // each column in the relation table. These are the instructions a key + // instruction will be transformed into. 
+ list<list<string> > ValueCols = []; + } + +Sample Example +-------------- + +Let's say that we want to have a function +``int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense)`` which +takes a non-predicated instruction and returns its predicated true or false form +depending on some input flag, ``inPredSense``. The first step in the process is +to define a relationship model that relates predicated instructions to their +non-predicated form by assigning appropriate values to the ``InstrMapping`` +fields. For this relationship, non-predicated instructions are treated as key +instruction since they are the one used to query the interface function. + +.. code-block:: llvm + + def getPredOpcode : InstrMapping { + // Choose a FilterClass that is used as a base class for all the + // instructions modeling this relationship. This is done to reduce the + // search space only to these set of instructions. + let FilterClass = "PredRel"; + + // Instructions with same values for all the fields in RowFields form a + // row in the resulting relation table. + // For example, if we want to relate 'ADD' (non-predicated) with 'Add_pt' + // (predicated true) and 'Add_pf' (predicated false), then all 3 + // instructions need to have same value for BaseOpcode field. It can be any + // unique value (Ex: XYZ) and should not be shared with any other + // instruction not related to 'add'. + let RowFields = ["BaseOpcode"]; + + // List of attributes that can be used to define key and column instructions + // for a relation. Key instruction is passed as an argument + // to the function used for querying relation tables. Column instructions + // are the instructions they (key) can transform into. + // + // Here, we choose 'PredSense' as ColFields since this is the unique + // attribute of the key (non-predicated) and column (true/false) + // instructions involved in this relationship model. + let ColFields = ["PredSense"]; + + // The key column contains non-predicated instructions. 
+ let KeyCol = ["none"]; + + // Two value columns - first column contains instructions with + // PredSense=true while second column has instructions with PredSense=false. + let ValueCols = [["true"], ["false"]]; + } + +TableGen uses the above relationship model to emit relation table that maps +non-predicated instructions with their predicated forms. It also outputs the +interface function +``int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense)`` to query +the table. Here, Function ``getPredOpcode`` takes two arguments, opcode of the +current instruction and PredSense of the desired instruction, and returns +predicated form of the instruction, if found in the relation table. +In order for an instruction to be added into the relation table, it needs +to include relevant information in its definition. For example, consider +following to be the current definitions of ADD, ADD_pt (true) and ADD_pf (false) +instructions: + +.. code-block::llvm + + def ADD : ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b), + "$dst = add($a, $b)", + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$a), + (i32 IntRegs:$b)))]>; + + def ADD_Pt : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$p, IntRegs:$a, IntRegs:$b), + "if ($p) $dst = add($a, $b)", + []>; + + def ADD_Pf : ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$p, IntRegs:$a, IntRegs:$b), + "if (!$p) $dst = add($a, $b)", + []>; + +In this step, we modify these instructions to include the information +required by the relationship model, <tt>getPredOpcode</tt>, so that they can +be related. + +.. 
code-block::llvm + + def ADD : PredRel, ALU32_rr<(outs IntRegs:$dst), (ins IntRegs:$a, IntRegs:$b), + "$dst = add($a, $b)", + [(set (i32 IntRegs:$dst), (add (i32 IntRegs:$a), + (i32 IntRegs:$b)))]> { + let BaseOpcode = "ADD"; + let PredSense = "none"; + } + + def ADD_Pt : PredRel, ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$p, IntRegs:$a, IntRegs:$b), + "if ($p) $dst = add($a, $b)", + []> { + let BaseOpcode = "ADD"; + let PredSense = "true"; + } + + def ADD_Pf : PredRel, ALU32_rr<(outs IntRegs:$dst), + (ins PredRegs:$p, IntRegs:$a, IntRegs:$b), + "if (!$p) $dst = add($a, $b)", + []> { + let BaseOpcode = "ADD"; + let PredSense = "false"; + } + +Please note that all the above instructions use ``PredRel`` as a base class. +This is extremely important since TableGen uses it as a filter for selecting +instructions for ``getPredOpcode`` model. Any instruction not derived from +``PredRel`` is excluded from the analysis. ``BaseOpcode`` is another important +field. Since it's selected as a ``RowFields`` of the model, it is required +to have the same value for all 3 instructions in order to be related. Next, +``PredSense`` is used to determine their column positions by comparing its value +with ``KeyCol`` and ``ValueCols``. If an instruction sets its ``PredSense`` +value to something not used in the relation model, it will not be assigned +a column in the relation table. diff --git a/docs/MarkedUpDisassembly.rst b/docs/MarkedUpDisassembly.rst new file mode 100644 index 0000000000..e1282e102e --- /dev/null +++ b/docs/MarkedUpDisassembly.rst @@ -0,0 +1,88 @@ +.. _marked_up_disassembly: + +======================================= +LLVM's Optional Rich Disassembly Output +======================================= + +.. contents:: + :local: + +Introduction +============ + +LLVM's default disassembly output is raw text. 
To allow consumers more ability +to introspect the instructions' textual representation or to reformat for a more +user friendly display there is an optional rich disassembly output. + +This optional output is sufficient to reference into individual portions of the +instruction text. This is intended for clients like disassemblers, list file +generators, and pretty-printers, which need more than the raw instructions and +the ability to print them. + +To provide this functionality the assembly text is marked up with annotations. +The markup is simple enough in syntax to be robust even in the case of version +mismatches between consumers and producers. That is, the syntax generally does +not carry semantics beyond "this text has an annotation," so consumers can +simply ignore annotations they do not understand or do not care about. + +After calling ``LLVMCreateDisasm()`` to create a disassembler context the +optional output is enable with this call: + +.. code-block:: c + + LLVMSetDisasmOptions(DC, LLVMDisassembler_Option_UseMarkup); + +Then subsequent calls to ``LLVMDisasmInstruction()`` will return output strings +with the marked up annotations. + +Instruction Annotations +======================= + +.. _contextual markups: + +Contextual markups +------------------ + +Annoated assembly display will supply contextual markup to help clients more +efficiently implement things like pretty printers. Most markup will be target +independent, so clients can effectively provide good display without any target +specific knowledge. + +Annotated assembly goes through the normal instruction printer, but optionally +includes contextual tags on portions of the instruction string. An annotation +is any '<' '>' delimited section of text(1). + +.. code-block:: bat + + annotation: '<' tag-name tag-modifier-list ':' annotated-text '>' + tag-name: identifier + tag-modifier-list: comma delimited identifier list + +The tag-name is an identifier which gives the type of the annotation. 
For the +first pass, this will be very simple, with memory references, registers, and +immediates having the tag names "mem", "reg", and "imm", respectively. + +The tag-modifier-list is typically additional target-specific context, such as +register class. + +Clients should accept and ignore any tag-names or tag-modifiers they do not +understand, allowing the annotations to grow in richness without breaking older +clients. + +For example, a possible annotation of an ARM load of a stack-relative location +might be annotated as: + +.. code-block:: nasm + + ldr <reg gpr:r0>, <mem regoffset:[<reg gpr:sp>, <imm:#4>]> + + +1: For assembly dialects in which '<' and/or '>' are legal tokens, a literal token is escaped by following immediately with a repeat of the character. For example, a literal '<' character is output as '<<' in an annotated assembly string. + +C API Details +------------- + +The intended consumers of this information use the C API, therefore the new C +API function for the disassembler will be added to provide an option to produce +disassembled instructions with annotations, ``LLVMSetDisasmOptions()`` and the +``LLVMDisassembler_Option_UseMarkup`` option (see above). diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html index 0ef8f3d1f3..9a1b547b4a 100644 --- a/docs/ReleaseNotes.html +++ b/docs/ReleaseNotes.html @@ -468,7 +468,10 @@ Release Notes</a>.</h1> <p> Loop Vectorizer - We've added a loop vectorizer and we are now able to vectorize small loops. The loop vectorizer is disabled by default and - can be enabled using the <b>-mllvm -vectorize</b> flag. <br/> + can be enabled using the <b>-mllvm -vectorize</b> flag. + The SIMD vector width can be specified using the flag + <b>-mllvm -force-vector-width=4</b>. 
+ <br/> We can now vectorize this code: <pre class="doc_code"> diff --git a/docs/WritingAnLLVMBackend.html b/docs/WritingAnLLVMBackend.html index 7576d490d7..0ad472cb92 100644 --- a/docs/WritingAnLLVMBackend.html +++ b/docs/WritingAnLLVMBackend.html @@ -32,6 +32,7 @@ <li><a href="#InstructionSet">Instruction Set</a> <ul> <li><a href="#operandMapping">Instruction Operand Mapping</a></li> + <li><a href="#relationMapping">Instruction Relation Mapping</a></li> <li><a href="#implementInstr">Implement a subclass of TargetInstrInfo</a></li> <li><a href="#branchFolding">Branch Folding and If Conversion</a></li> </ul></li> @@ -1259,6 +1260,29 @@ the <tt>rd</tt>, <tt>rs1</tt>, and <tt>rs2</tt> fields respectively. <!-- ======================================================================= --> <h3> + <a name="relationMapping">Instruction Relation Mapping</a> +</h3> + +<div> + +<p> +This TableGen feature is used to relate instructions with each other. It is +particularly useful when you have multiple instruction formats and need to +switch between them after instruction selection. This entire feature is driven +by relation models which can be defined in <tt>XXXInstrInfo.td</tt> files +according to the target-specific instruction set. Relation models are defined +using <tt>InstrMapping</tt> class as a base. TableGen parses all the models +and generates instruction relation maps using the specified information. +Relation maps are emitted as tables in the <tt>XXXGenInstrInfo.inc</tt> file +along with the functions to query them. For the detailed information on how to +use this feature, please refer to +<a href="HowToUseInstrMappings.html">How to add Instruction Mappings</a> +document. 
+</p> +</div> + +<!-- ======================================================================= --> +<h3> <a name="implementInstr">Implement a subclass of </a> <a href="CodeGenerator.html#targetinstrinfo">TargetInstrInfo</a> </h3> diff --git a/docs/subsystems.rst b/docs/subsystems.rst index 6f77b79fbe..80d0eed663 100644 --- a/docs/subsystems.rst +++ b/docs/subsystems.rst @@ -17,6 +17,7 @@ Subsystem Documentation TableGenFundamentals DebuggingJITedCode GoldPlugin + MarkedUpDisassembly * `Writing an LLVM Pass <WritingAnLLVMPass.html>`_ @@ -98,3 +99,8 @@ Subsystem Documentation architecture. .. _`Howto: Implementing LLVM Integrated Assembler`: http://www.embecosm.com/download/ean10.html + +* :ref:`marked_up_disassembly` + + This document describes the optional rich disassembly output syntax. + diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h index cbcf7892c9..ac4bdbd126 100644 --- a/include/llvm/ADT/DenseMap.h +++ b/include/llvm/ADT/DenseMap.h @@ -420,7 +420,7 @@ private: NumBuckets = getNumBuckets(); } if (NumBuckets-(NewNumEntries+getNumTombstones()) <= NumBuckets/8) { - this->grow(NumBuckets); + this->grow(NumBuckets * 2); LookupBucketFor(Key, TheBucket); } assert(TheBucket); @@ -600,7 +600,7 @@ public: unsigned OldNumBuckets = NumBuckets; BucketT *OldBuckets = Buckets; - allocateBuckets(std::max<unsigned>(64, NextPowerOf2(AtLeast))); + allocateBuckets(std::max<unsigned>(64, NextPowerOf2(AtLeast-1))); assert(Buckets); if (!OldBuckets) { this->BaseT::initEmpty(); @@ -826,11 +826,11 @@ public: } void grow(unsigned AtLeast) { - if (AtLeast > InlineBuckets) - AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast)); + if (AtLeast >= InlineBuckets) + AtLeast = std::max<unsigned>(64, NextPowerOf2(AtLeast-1)); if (Small) { - if (AtLeast <= InlineBuckets) + if (AtLeast < InlineBuckets) return; // Nothing to do. // First move the inline buckets into a temporary storage. 
diff --git a/include/llvm/ADT/StringSet.h b/include/llvm/ADT/StringSet.h index 9c55f6b70e..b69a964a23 100644 --- a/include/llvm/ADT/StringSet.h +++ b/include/llvm/ADT/StringSet.h @@ -29,8 +29,13 @@ namespace llvm { assert(!InLang.empty()); const char *KeyStart = InLang.data(); const char *KeyEnd = KeyStart + InLang.size(); - return base::insert(llvm::StringMapEntry<char>:: - Create(KeyStart, KeyEnd, base::getAllocator(), '+')); + llvm::StringMapEntry<char> *Entry = llvm::StringMapEntry<char>:: + Create(KeyStart, KeyEnd, base::getAllocator(), '+'); + if (!base::insert(Entry)) { + Entry->Destroy(base::getAllocator()); + return false; + } + return true; } }; } diff --git a/include/llvm/Analysis/DependenceAnalysis.h b/include/llvm/Analysis/DependenceAnalysis.h index 9b6a6bbd3e..3818428a5e 100644 --- a/include/llvm/Analysis/DependenceAnalysis.h +++ b/include/llvm/Analysis/DependenceAnalysis.h @@ -30,23 +30,17 @@ #ifndef LLVM_ANALYSIS_DEPENDENCEANALYSIS_H #define LLVM_ANALYSIS_DEPENDENCEANALYSIS_H -#include "llvm/BasicBlock.h" -#include "llvm/Function.h" -#include "llvm/Instruction.h" +#include "llvm/Instructions.h" #include "llvm/Pass.h" #include "llvm/ADT/SmallBitVector.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/AliasAnalysis.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Support/raw_ostream.h" - namespace llvm { class AliasAnalysis; + class Loop; + class LoopInfo; class ScalarEvolution; class SCEV; - class Value; + class SCEVConstant; class raw_ostream; /// Dependence - This class represents a dependence between two memory diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h index a842898e41..9e5d97dd7f 100644 --- a/include/llvm/Analysis/MemoryBuiltins.h +++ b/include/llvm/Analysis/MemoryBuiltins.h @@ -168,7 +168,8 @@ class ObjectSizeOffsetVisitor public: ObjectSizeOffsetVisitor(const DataLayout *TD, const TargetLibraryInfo *TLI, - 
LLVMContext &Context, bool RoundToAlign = false); + LLVMContext &Context, bool RoundToAlign = false, + unsigned AS = 0); SizeOffsetType compute(Value *V); @@ -229,7 +230,7 @@ class ObjectSizeOffsetEvaluator public: ObjectSizeOffsetEvaluator(const DataLayout *TD, const TargetLibraryInfo *TLI, - LLVMContext &Context); + LLVMContext &Context, unsigned AS = 0); SizeOffsetEvalType compute(Value *V); bool knownSize(SizeOffsetEvalType SizeOffset) { diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 67c9a4d14f..d2df67080c 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -628,7 +628,7 @@ namespace llvm { /// getSizeOfExpr - Return an expression for sizeof on the given type. /// - const SCEV *getSizeOfExpr(Type *AllocTy); + const SCEV *getSizeOfExpr(Type *AllocTy, Type *IntPtrTy); /// getAlignOfExpr - Return an expression for alignof on the given type. /// @@ -636,7 +636,8 @@ namespace llvm { /// getOffsetOfExpr - Return an expression for offsetof on the given field. /// - const SCEV *getOffsetOfExpr(StructType *STy, unsigned FieldNo); + const SCEV *getOffsetOfExpr(StructType *STy, Type *IntPtrTy, + unsigned FieldNo); /// getOffsetOfExpr - Return an expression for offsetof on the given field. /// diff --git a/include/llvm/CallingConv.h b/include/llvm/CallingConv.h index 86e4eebb82..053f4eb326 100644 --- a/include/llvm/CallingConv.h +++ b/include/llvm/CallingConv.h @@ -112,7 +112,11 @@ namespace CallingConv { /// Cannot have variable arguments. /// Can also be called by the host. /// Is externally visible. 
- SPIR_KERNEL = 76 + SPIR_KERNEL = 76, + + /// Intel_OCL_BI - Calling conventions for Intel OpenCL built-ins + Intel_OCL_BI = 77 + }; } // End CallingConv namespace diff --git a/include/llvm/CodeGen/PBQP/Graph.h b/include/llvm/CodeGen/PBQP/Graph.h index a5d8b0dbd6..83c379b48c 100644 --- a/include/llvm/CodeGen/PBQP/Graph.h +++ b/include/llvm/CodeGen/PBQP/Graph.h @@ -19,6 +19,7 @@ #include <list> #include <map> +#include <llvm/ADT/ilist.h> namespace PBQP { @@ -31,16 +32,16 @@ namespace PBQP { class NodeEntry; class EdgeEntry; - typedef std::list<NodeEntry> NodeList; - typedef std::list<EdgeEntry> EdgeList; + typedef llvm::ilist<NodeEntry> NodeList; + typedef llvm::ilist<EdgeEntry> EdgeList; public: - typedef NodeList::iterator NodeItr; - typedef NodeList::const_iterator ConstNodeItr; + typedef NodeEntry* NodeItr; + typedef const NodeEntry* ConstNodeItr; - typedef EdgeList::iterator EdgeItr; - typedef EdgeList::const_iterator ConstEdgeItr; + typedef EdgeEntry* EdgeItr; + typedef const EdgeEntry* ConstEdgeItr; private: @@ -52,12 +53,14 @@ namespace PBQP { private: - class NodeEntry { + class NodeEntry : public llvm::ilist_node<NodeEntry> { + friend struct llvm::ilist_sentinel_traits<NodeEntry>; private: Vector costs; AdjEdgeList adjEdges; unsigned degree; void *data; + NodeEntry() : costs(0, 0) {} public: NodeEntry(const Vector &costs) : costs(costs), degree(0) {} Vector& getCosts() { return costs; } @@ -77,12 +80,14 @@ namespace PBQP { void* getData() { return data; } }; - class EdgeEntry { + class EdgeEntry : public llvm::ilist_node<EdgeEntry> { + friend struct llvm::ilist_sentinel_traits<EdgeEntry>; private: NodeItr node1, node2; Matrix costs; AdjEdgeItr node1AEItr, node2AEItr; void *data; + EdgeEntry() : costs(0, 0, 0) {} public: EdgeEntry(NodeItr node1, NodeItr node2, const Matrix &costs) : node1(node1), node2(node2), costs(costs) {} diff --git a/include/llvm/DataLayout.h b/include/llvm/DataLayout.h index c9ac0b7fea..d778556684 100644 --- 
a/include/llvm/DataLayout.h +++ b/include/llvm/DataLayout.h @@ -262,6 +262,14 @@ public: } return 8*val->second.TypeBitWidth; } + /// Layout pointer size, in bits, based on the type. + /// If this function is called with a pointer type, then + /// the type size of the pointer is returned. + /// If this function is called with a vector of pointers, + /// then the type size of the pointer is returned. + /// Otherwise the type sizeo f a default pointer is returned. + unsigned getPointerTypeSizeInBits(Type* Ty) const; + /// Size examples: /// /// Type SizeInBits StoreSizeInBits AllocSizeInBits[*] @@ -337,11 +345,13 @@ public: /// unsigned getPreferredTypeAlignmentShift(Type *Ty) const; - /// getIntPtrType - Return an unsigned integer type that is the same size or - /// greater to the host pointer size. - /// FIXME: Need to remove the default argument when the rest of the LLVM code - /// base has been updated. - IntegerType *getIntPtrType(LLVMContext &C, unsigned AddressSpace = 0) const; + /// getIntPtrType - Return an integer type that is the same size or + /// greater to the pointer size based on the address space. + IntegerType *getIntPtrType(LLVMContext &C, unsigned AddressSpace) const; + + /// getIntPtrType - Return an integer type that is the same size or + /// greater to the pointer size based on the Type. + IntegerType *getIntPtrType(Type *) const; /// getIndexedOffset - return the offset from the beginning of the type for /// the specified indices. This is used to implement getelementptr. 
diff --git a/include/llvm/InstrTypes.h b/include/llvm/InstrTypes.h index cfc79394b2..b661372f53 100644 --- a/include/llvm/InstrTypes.h +++ b/include/llvm/InstrTypes.h @@ -17,6 +17,7 @@ #define LLVM_INSTRUCTION_TYPES_H #include "llvm/Instruction.h" +#include "llvm/DataLayout.h" #include "llvm/OperandTraits.h" #include "llvm/DerivedTypes.h" #include "llvm/ADT/Twine.h" @@ -576,6 +577,11 @@ public: Type *IntPtrTy ///< Integer type corresponding to pointer ) const; + /// @brief Determine if this cast is a no-op cast. + bool isNoopCast( + const DataLayout &DL ///< DataLayout to get the Int Ptr type from. + ) const; + /// Determine how a pair of casts can be eliminated, if they can be at all. /// This is a helper function for both CastInst and ConstantExpr. /// @returns 0 if the CastInst pair can't be eliminated, otherwise diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h index 688c8a9575..59a2501b8e 100644 --- a/include/llvm/MC/MCELFObjectWriter.h +++ b/include/llvm/MC/MCELFObjectWriter.h @@ -91,6 +91,9 @@ public: const MCFragment &F, const MCFixup &Fixup, bool IsPCRel) const; + virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const; virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset); diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h index f17c347050..3b9420a403 100644 --- a/include/llvm/MC/MCInstPrinter.h +++ b/include/llvm/MC/MCInstPrinter.h @@ -66,6 +66,10 @@ public: bool getUseMarkup() const { return UseMarkup; } void setUseMarkup(bool Value) { UseMarkup = Value; } + + /// Utility functions to make adding mark ups simpler. 
+ StringRef markup(StringRef s) const; + StringRef markup(StringRef a, StringRef b) const; }; } // namespace llvm diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h index fe7bda08db..8a5f37cb0c 100644 --- a/include/llvm/MC/MCParser/MCAsmParser.h +++ b/include/llvm/MC/MCParser/MCAsmParser.h @@ -90,7 +90,7 @@ public: /// ParseMSInlineAsm - Parse ms-style inline assembly. virtual bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, - SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::pair<void *, bool> > &OpDecls, SmallVectorImpl<std::string> &Constraints, SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII, diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h index 35f47c0b9c..89b0a1f47b 100644 --- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h +++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h @@ -69,6 +69,9 @@ public: /// inline assembly. virtual bool isOffsetOf() const { return false; } + /// getOffsetOfLoc - Get the location of the offset operator. + virtual SMLoc getOffsetOfLoc() const { return SMLoc(); } + /// needSizeDirective - Do we need to emit a sizing directive for this /// operand? Only valid when parsing MS-style inline assembly. virtual bool needSizeDirective() const { return false; } diff --git a/include/llvm/Operator.h b/include/llvm/Operator.h index 462324a669..b326c11352 100644 --- a/include/llvm/Operator.h +++ b/include/llvm/Operator.h @@ -36,8 +36,11 @@ private: void *operator new(size_t, unsigned) LLVM_DELETED_FUNCTION; void *operator new(size_t s) LLVM_DELETED_FUNCTION; Operator() LLVM_DELETED_FUNCTION; - // NOTE: cannot use LLVM_DELETED_FUNCTION because it's not legal to delete - // an overridden method that's not deleted in the base class. 
+ +protected: + // NOTE: Cannot use LLVM_DELETED_FUNCTION because it's not legal to delete + // an overridden method that's not deleted in the base class. Cannot leave + // this unimplemented because that leads to an ODR-violation. ~Operator(); public: @@ -79,8 +82,6 @@ public: }; private: - ~OverflowingBinaryOperator(); // DO NOT IMPLEMENT - friend class BinaryOperator; friend class ConstantExpr; void setHasNoUnsignedWrap(bool B) { @@ -132,8 +133,6 @@ public: }; private: - ~PossiblyExactOperator(); // DO NOT IMPLEMENT - friend class BinaryOperator; friend class ConstantExpr; void setIsExact(bool B) { @@ -168,9 +167,6 @@ public: /// FPMathOperator - Utility class for floating point operations which can have /// information about relaxed accuracy requirements attached to them. class FPMathOperator : public Operator { -private: - ~FPMathOperator(); // DO NOT IMPLEMENT - public: /// \brief Get the maximum error permitted by this operation in ULPs. An @@ -191,7 +187,6 @@ public: /// opcodes. 
template<typename SuperClass, unsigned Opc> class ConcreteOperator : public SuperClass { - ~ConcreteOperator(); // DO NOT IMPLEMENT public: static inline bool classof(const Instruction *I) { return I->getOpcode() == Opc; @@ -207,45 +202,35 @@ public: class AddOperator : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Add> { - ~AddOperator(); // DO NOT IMPLEMENT }; class SubOperator : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Sub> { - ~SubOperator(); // DO NOT IMPLEMENT }; class MulOperator : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Mul> { - ~MulOperator(); // DO NOT IMPLEMENT }; class ShlOperator : public ConcreteOperator<OverflowingBinaryOperator, Instruction::Shl> { - ~ShlOperator(); // DO NOT IMPLEMENT }; class SDivOperator : public ConcreteOperator<PossiblyExactOperator, Instruction::SDiv> { - ~SDivOperator(); // DO NOT IMPLEMENT }; class UDivOperator : public ConcreteOperator<PossiblyExactOperator, Instruction::UDiv> { - ~UDivOperator(); // DO NOT IMPLEMENT }; class AShrOperator : public ConcreteOperator<PossiblyExactOperator, Instruction::AShr> { - ~AShrOperator(); // DO NOT IMPLEMENT }; class LShrOperator : public ConcreteOperator<PossiblyExactOperator, Instruction::LShr> { - ~LShrOperator(); // DO NOT IMPLEMENT }; class GEPOperator : public ConcreteOperator<Operator, Instruction::GetElementPtr> { - ~GEPOperator(); // DO NOT IMPLEMENT - enum { IsInBounds = (1 << 0) }; diff --git a/include/llvm/TableGen/Error.h b/include/llvm/TableGen/Error.h index 5c1c3adf7e..3f7b7f4e8c 100644 --- a/include/llvm/TableGen/Error.h +++ b/include/llvm/TableGen/Error.h @@ -40,6 +40,9 @@ void PrintError(const char *Loc, const Twine &Msg); void PrintError(const Twine &Msg); void PrintError(const TGError &Error); +LLVM_ATTRIBUTE_NORETURN void PrintFatalError(const std::string &Msg); +LLVM_ATTRIBUTE_NORETURN void PrintFatalError(ArrayRef<SMLoc> ErrorLoc, + const std::string &Msg); extern SourceMgr SrcMgr; diff --git 
a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index ed3db69a4e..5fb12f503e 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -1032,6 +1032,55 @@ class ProcessorModel<string n, SchedMachineModel m, list<SubtargetFeature> f> } //===----------------------------------------------------------------------===// +// InstrMapping - This class is used to create mapping tables to relate +// instructions with each other based on the values specified in RowFields, +// ColFields, KeyCol and ValueCols. +// +class InstrMapping { + // FilterClass - Used to limit search space only to the instructions that + // define the relationship modeled by this InstrMapping record. + string FilterClass; + + // RowFields - List of fields/attributes that should be same for all the + // instructions in a row of the relation table. Think of this as a set of + // properties shared by all the instructions related by this relationship + // model and is used to categorize instructions into subgroups. For instance, + // if we want to define a relation that maps 'Add' instruction to its + // predicated forms, we can define RowFields like this: + // + // let RowFields = BaseOp + // All add instruction predicated/non-predicated will have to set their BaseOp + // to the same value. + // + // def Add: { let BaseOp = 'ADD'; let predSense = 'nopred' } + // def Add_predtrue: { let BaseOp = 'ADD'; let predSense = 'true' } + // def Add_predfalse: { let BaseOp = 'ADD'; let predSense = 'false' } + list<string> RowFields = []; + + // List of fields/attributes that are same for all the instructions + // in a column of the relation table. + // Ex: let ColFields = 'predSense' -- It means that the columns are arranged + // based on the 'predSense' values. All the instruction in a specific + // column have the same value and it is fixed for the column according + // to the values set in 'ValueCols'. 
+ list<string> ColFields = []; + + // Values for the fields/attributes listed in 'ColFields'. + // Ex: let KeyCol = 'nopred' -- It means that the key instruction (instruction + // that models this relation) should be non-predicated. + // In the example above, 'Add' is the key instruction. + list<string> KeyCol = []; + + // List of values for the fields/attributes listed in 'ColFields', one for + // each column in the relation table. + // + // Ex: let ValueCols = [['true'],['false']] -- It adds two columns in the + // table. First column requires all the instructions to have predSense + // set to 'true' and second column requires it to be 'false'. + list<list<string> > ValueCols = []; +} + +//===----------------------------------------------------------------------===// // Pull in the common support for calling conventions. // include "llvm/Target/TargetCallingConv.td" diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index f34a7727cf..830e2d645a 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -103,6 +103,10 @@ public: TypeWidenVector // This vector should be widened into a larger vector. }; + /// LegalizeKind holds the legalization kind that needs to happen to EVT + /// in order to type-legalize it. + typedef std::pair<LegalizeTypeAction, EVT> LegalizeKind; + enum BooleanContent { // How the target represents true/false values. UndefinedBooleanContent, // Only bit 0 counts, the rest can hold garbage. ZeroOrOneBooleanContent, // All bits zero except for bit 0. @@ -1966,8 +1970,7 @@ private: ValueTypeActionImpl ValueTypeActions; - typedef std::pair<LegalizeTypeAction, EVT> LegalizeKind; - +public: LegalizeKind getTypeConversion(LLVMContext &Context, EVT VT) const { // If this is a simple type, use the ComputeRegisterProp mechanism. 
@@ -2084,6 +2087,7 @@ private: return LegalizeKind(TypeSplitVector, NVT); } +private: std::vector<std::pair<EVT, const TargetRegisterClass*> > AvailableRegClasses; /// TargetDAGCombineArray - Targets can specify ISD nodes that they would diff --git a/include/llvm/Target/TargetTransformImpl.h b/include/llvm/Target/TargetTransformImpl.h index 7648f4f935..25a7edeb01 100644 --- a/include/llvm/Target/TargetTransformImpl.h +++ b/include/llvm/Target/TargetTransformImpl.h @@ -16,6 +16,7 @@ #define LLVM_TARGET_TARGET_TRANSFORMATION_IMPL_H #include "llvm/TargetTransformInfo.h" +#include "llvm/CodeGen/ValueTypes.h" namespace llvm { @@ -47,7 +48,27 @@ public: virtual unsigned getJumpBufSize() const; }; -class VectorTargetTransformImpl : public VectorTargetTransformInfo { }; +class VectorTargetTransformImpl : public VectorTargetTransformInfo { +private: + const TargetLowering *TLI; + + /// Estimate the cost of type-legalization and the legalized type. + std::pair<unsigned, EVT> + getTypeLegalizationCost(LLVMContext &C, EVT Ty) const; + +public: + explicit VectorTargetTransformImpl(const TargetLowering *TL) : TLI(TL) {} + + virtual ~VectorTargetTransformImpl() {} + + virtual unsigned getInstrCost(unsigned Opcode, Type *Ty1, Type *Ty2) const; + + virtual unsigned getBroadcastCost(Type *Tp) const; + + virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const; +}; } // end llvm namespace diff --git a/include/llvm/TargetTransformInfo.h b/include/llvm/TargetTransformInfo.h index 82fc14dbd7..96470c30ca 100644 --- a/include/llvm/TargetTransformInfo.h +++ b/include/llvm/TargetTransformInfo.h @@ -54,10 +54,10 @@ public: TargetTransformInfo(const TargetTransformInfo &T) : ImmutablePass(ID), STTI(T.STTI), VTTI(T.VTTI) { } - const ScalarTargetTransformInfo* getScalarTargetTransformInfo() { + const ScalarTargetTransformInfo* getScalarTargetTransformInfo() const { return STTI; } - const VectorTargetTransformInfo* 
getVectorTargetTransformInfo() { + const VectorTargetTransformInfo* getVectorTargetTransformInfo() const { return VTTI; } @@ -119,8 +119,43 @@ public: } }; +/// VectorTargetTransformInfo - This interface is used by the vectorizers +/// to estimate the profitability of vectorization for different instructions. class VectorTargetTransformInfo { - // TODO: define an interface for VectorTargetTransformInfo. +public: + virtual ~VectorTargetTransformInfo() {} + + /// Returns the expected cost of the instruction opcode. The opcode is one of + /// the enums like Instruction::Add. The type arguments are the type of the + /// operation. + /// Most instructions only use the first type and in that case the second + /// operand is ignored. + /// + /// Exceptions: + /// * Br instructions do not use any of the types. + /// * Select instructions pass the return type as Ty1 and the selector as Ty2. + /// * Cast instructions pass the destination as Ty1 and the source as Ty2. + /// * Insert/Extract element pass only the vector type as Ty1. + /// * ShuffleVector, Load, Store do not use this call. + virtual unsigned getInstrCost(unsigned Opcode, + Type *Ty1 = 0, + Type *Ty2 = 0) const { + return 1; + } + + /// Returns the cost of a vector broadcast of a scalar at place zero to a + /// vector of type 'Tp'. + virtual unsigned getBroadcastCost(Type *Tp) const { + return 1; + } + + /// Returns the cost of Load and Store instructions. 
+ virtual unsigned getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + return 1; + } + }; } // End llvm namespace diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index fd1b5556ef..49eeb57622 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -177,8 +177,9 @@ static inline unsigned getKnownAlignment(Value *V, const DataLayout *TD = 0) { template<typename IRBuilderTy> Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &TD, User *GEP, bool NoAssumptions = false) { + unsigned AS = cast<GEPOperator>(GEP)->getPointerAddressSpace(); gep_type_iterator GTI = gep_type_begin(GEP); - Type *IntPtrTy = TD.getIntPtrType(GEP->getContext()); + Type *IntPtrTy = TD.getIntPtrType(GEP->getContext(), AS); Value *Result = Constant::getNullValue(IntPtrTy); // If the GEP is inbounds, we know that none of the addressing operations will @@ -186,7 +187,6 @@ Value *EmitGEPOffset(IRBuilderTy *Builder, const DataLayout &TD, User *GEP, bool isInBounds = cast<GEPOperator>(GEP)->isInBounds() && !NoAssumptions; // Build a mask for high order bits. - unsigned AS = cast<GEPOperator>(GEP)->getPointerAddressSpace(); unsigned IntPtrWidth = TD.getPointerSizeInBits(AS); uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth); diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 146897ad67..de6d61d78b 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -41,7 +41,7 @@ using namespace llvm; // Constant Folding internal helper functions //===----------------------------------------------------------------------===// -/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with +/// FoldBitCast - Constant fold bitcast, symbolically evaluating it with /// DataLayout. This always returns a non-null constant, but it may be a /// ConstantExpr if unfoldable. 
static Constant *FoldBitCast(Constant *C, Type *DestTy, @@ -59,9 +59,9 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, return ConstantExpr::getBitCast(C, DestTy); unsigned NumSrcElts = CDV->getType()->getNumElements(); - + Type *SrcEltTy = CDV->getType()->getElementType(); - + // If the vector is a vector of floating point, convert it to vector of int // to simplify things. if (SrcEltTy->isFloatingPointTy()) { @@ -72,7 +72,7 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, C = ConstantExpr::getBitCast(C, SrcIVTy); CDV = cast<ConstantDataVector>(C); } - + // Now that we know that the input value is a vector of integers, just shift // and insert them into our result. unsigned BitShift = TD.getTypeAllocSizeInBits(SrcEltTy); @@ -84,43 +84,43 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, else Result |= CDV->getElementAsInteger(i); } - + return ConstantInt::get(IT, Result); } - + // The code below only handles casts to vectors currently. VectorType *DestVTy = dyn_cast<VectorType>(DestTy); if (DestVTy == 0) return ConstantExpr::getBitCast(C, DestTy); - + // If this is a scalar -> vector cast, convert the input into a <1 x scalar> // vector so the code below can handle it uniformly. if (isa<ConstantFP>(C) || isa<ConstantInt>(C)) { Constant *Ops = C; // don't take the address of C! return FoldBitCast(ConstantVector::get(Ops), DestTy, TD); } - + // If this is a bitcast from constant vector -> vector, fold it. if (!isa<ConstantDataVector>(C) && !isa<ConstantVector>(C)) return ConstantExpr::getBitCast(C, DestTy); - + // If the element types match, VMCore can fold it. 
unsigned NumDstElt = DestVTy->getNumElements(); unsigned NumSrcElt = C->getType()->getVectorNumElements(); if (NumDstElt == NumSrcElt) return ConstantExpr::getBitCast(C, DestTy); - + Type *SrcEltTy = C->getType()->getVectorElementType(); Type *DstEltTy = DestVTy->getElementType(); - - // Otherwise, we're changing the number of elements in a vector, which + + // Otherwise, we're changing the number of elements in a vector, which // requires endianness information to do the right thing. For example, // bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>) // folds to (little endian): // <4 x i32> <i32 0, i32 0, i32 1, i32 0> // and to (big endian): // <4 x i32> <i32 0, i32 0, i32 0, i32 1> - + // First thing is first. We only want to think about integer here, so if // we have something in FP form, recast it as integer. if (DstEltTy->isFloatingPointTy()) { @@ -130,11 +130,11 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumDstElt); // Recursively handle this integer conversion, if possible. C = FoldBitCast(C, DestIVTy, TD); - + // Finally, VMCore can handle this now that #elts line up. return ConstantExpr::getBitCast(C, DestTy); } - + // Okay, we know the destination is integer, if the input is FP, convert // it to integer first. if (SrcEltTy->isFloatingPointTy()) { @@ -148,13 +148,13 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, !isa<ConstantDataVector>(C)) return C; } - + // Now we know that the input and output vectors are both integer vectors // of the same size, and that their #elements is not the same. Do the // conversion here, which depends on whether the input or output has // more elements. 
bool isLittleEndian = TD.isLittleEndian(); - + SmallVector<Constant*, 32> Result; if (NumDstElt < NumSrcElt) { // Handle: bitcast (<4 x i32> <i32 0, i32 1, i32 2, i32 3> to <2 x i64>) @@ -170,15 +170,15 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, Constant *Src =dyn_cast<ConstantInt>(C->getAggregateElement(SrcElt++)); if (!Src) // Reject constantexpr elements. return ConstantExpr::getBitCast(C, DestTy); - + // Zero extend the element to the right size. Src = ConstantExpr::getZExt(Src, Elt->getType()); - + // Shift it to the right place, depending on endianness. - Src = ConstantExpr::getShl(Src, + Src = ConstantExpr::getShl(Src, ConstantInt::get(Src->getType(), ShiftAmt)); ShiftAmt += isLittleEndian ? SrcBitSize : -SrcBitSize; - + // Mix it in. Elt = ConstantExpr::getOr(Elt, Src); } @@ -186,30 +186,30 @@ static Constant *FoldBitCast(Constant *C, Type *DestTy, } return ConstantVector::get(Result); } - + // Handle: bitcast (<2 x i64> <i64 0, i64 1> to <4 x i32>) unsigned Ratio = NumDstElt/NumSrcElt; unsigned DstBitSize = DstEltTy->getPrimitiveSizeInBits(); - + // Loop over each source value, expanding into multiple results. for (unsigned i = 0; i != NumSrcElt; ++i) { Constant *Src = dyn_cast<ConstantInt>(C->getAggregateElement(i)); if (!Src) // Reject constantexpr elements. return ConstantExpr::getBitCast(C, DestTy); - + unsigned ShiftAmt = isLittleEndian ? 0 : DstBitSize*(Ratio-1); for (unsigned j = 0; j != Ratio; ++j) { // Shift the piece of the value into the right place, depending on // endianness. - Constant *Elt = ConstantExpr::getLShr(Src, + Constant *Elt = ConstantExpr::getLShr(Src, ConstantInt::get(Src->getType(), ShiftAmt)); ShiftAmt += isLittleEndian ? DstBitSize : -DstBitSize; - + // Truncate and remember this piece. 
Result.push_back(ConstantExpr::getTrunc(Elt, DstEltTy)); } } - + return ConstantVector::get(Result); } @@ -224,28 +224,28 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, Offset = 0; return true; } - + // Otherwise, if this isn't a constant expr, bail out. ConstantExpr *CE = dyn_cast<ConstantExpr>(C); if (!CE) return false; - + // Look through ptr->int and ptr->ptr casts. if (CE->getOpcode() == Instruction::PtrToInt || CE->getOpcode() == Instruction::BitCast) return IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD); - - // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5) + + // i32* getelementptr ([5 x i32]* @a, i32 0, i32 5) if (CE->getOpcode() == Instruction::GetElementPtr) { // Cannot compute this if the element type of the pointer is missing size // info. if (!cast<PointerType>(CE->getOperand(0)->getType()) ->getElementType()->isSized()) return false; - + // If the base isn't a global+constant, we aren't either. if (!IsConstantOffsetFromGlobal(CE->getOperand(0), GV, Offset, TD)) return false; - + // Otherwise, add any offset that our operands provide. gep_type_iterator GTI = gep_type_begin(CE); for (User::const_op_iterator i = CE->op_begin() + 1, e = CE->op_end(); @@ -253,7 +253,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, ConstantInt *CI = dyn_cast<ConstantInt>(*i); if (!CI) return false; // Index isn't a simple constant? if (CI->isZero()) continue; // Not adding anything. 
- + if (StructType *ST = dyn_cast<StructType>(*GTI)) { // N = N + Offset Offset += TD.getStructLayout(ST)->getElementOffset(CI->getZExtValue()); @@ -264,7 +264,7 @@ static bool IsConstantOffsetFromGlobal(Constant *C, GlobalValue *&GV, } return true; } - + return false; } @@ -277,27 +277,27 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, const DataLayout &TD) { assert(ByteOffset <= TD.getTypeAllocSize(C->getType()) && "Out of range access"); - + // If this element is zero or undefined, we can just return since *CurPtr is // zero initialized. if (isa<ConstantAggregateZero>(C) || isa<UndefValue>(C)) return true; - + if (ConstantInt *CI = dyn_cast<ConstantInt>(C)) { if (CI->getBitWidth() > 64 || (CI->getBitWidth() & 7) != 0) return false; - + uint64_t Val = CI->getZExtValue(); unsigned IntBytes = unsigned(CI->getBitWidth()/8); - + for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) { CurPtr[i] = (unsigned char)(Val >> (ByteOffset * 8)); ++ByteOffset; } return true; } - + if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) { if (CFP->getType()->isDoubleTy()) { C = FoldBitCast(C, Type::getInt64Ty(C->getContext()), TD); @@ -309,13 +309,13 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, } return false; } - + if (ConstantStruct *CS = dyn_cast<ConstantStruct>(C)) { const StructLayout *SL = TD.getStructLayout(CS->getType()); unsigned Index = SL->getElementContainingOffset(ByteOffset); uint64_t CurEltOffset = SL->getElementOffset(Index); ByteOffset -= CurEltOffset; - + while (1) { // If the element access is to the element itself and not to tail padding, // read the bytes from the element. @@ -325,9 +325,9 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, !ReadDataFromGlobal(CS->getOperand(Index), ByteOffset, CurPtr, BytesLeft, TD)) return false; - + ++Index; - + // Check to see if we read from the last struct element, if so we're done. 
if (Index == CS->getType()->getNumElements()) return true; @@ -375,11 +375,11 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, } return true; } - + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) { if (CE->getOpcode() == Instruction::IntToPtr && - CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getContext())) - return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr, + CE->getOperand(0)->getType() == TD.getIntPtrType(CE->getType())) + return ReadDataFromGlobal(CE->getOperand(0), ByteOffset, CurPtr, BytesLeft, TD); } @@ -391,7 +391,7 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, const DataLayout &TD) { Type *LoadTy = cast<PointerType>(C->getType())->getElementType(); IntegerType *IntType = dyn_cast<IntegerType>(LoadTy); - + // If this isn't an integer load we can't fold it directly. if (!IntType) { // If this is a float/double load, we can try folding it as an int32/64 load @@ -415,15 +415,15 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, return FoldBitCast(Res, LoadTy, TD); return 0; } - + unsigned BytesLoaded = (IntType->getBitWidth() + 7) / 8; if (BytesLoaded > 32 || BytesLoaded == 0) return 0; - + GlobalValue *GVal; int64_t Offset; if (!IsConstantOffsetFromGlobal(C, GVal, Offset, TD)) return 0; - + GlobalVariable *GV = dyn_cast<GlobalVariable>(GVal); if (!GV || !GV->isConstant() || !GV->hasDefinitiveInitializer() || !GV->getInitializer()->getType()->isSized()) @@ -432,11 +432,11 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C, // If we're loading off the beginning of the global, some bytes may be valid, // but we don't try to handle this. if (Offset < 0) return 0; - + // If we're not accessing anything in this constant, the result is undefined. 
if (uint64_t(Offset) >= TD.getTypeAllocSize(GV->getInitializer()->getType())) return UndefValue::get(IntType); - + unsigned char RawBytes[32] = {0}; if (!ReadDataFromGlobal(GV->getInitializer(), Offset, RawBytes, BytesLoaded, TD)) @@ -464,15 +464,15 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, // If the loaded value isn't a constant expr, we can't handle it. ConstantExpr *CE = dyn_cast<ConstantExpr>(C); if (!CE) return 0; - + if (CE->getOpcode() == Instruction::GetElementPtr) { if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0))) if (GV->isConstant() && GV->hasDefinitiveInitializer()) - if (Constant *V = + if (Constant *V = ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE)) return V; } - + // Instead of loading constant c string, use corresponding integer value // directly if string length is small enough. StringRef Str; @@ -500,14 +500,14 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, SingleChar = 0; StrVal = (StrVal << 8) | SingleChar; } - + Constant *Res = ConstantInt::get(CE->getContext(), StrVal); if (Ty->isFloatingPointTy()) Res = ConstantExpr::getBitCast(Res, Ty); return Res; } } - + // If this load comes from anywhere in a constant global, and if the global // is all undef or zero, we know what it loads. if (GlobalVariable *GV = @@ -520,7 +520,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, return UndefValue::get(ResTy); } } - + // Try hard to fold loads from bitcasted strange and non-type-safe things. We // currently don't do any of this for big endian systems. It can be // generalized in the future if someone is interested. 
@@ -531,7 +531,7 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C, static Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout *TD){ if (LI->isVolatile()) return 0; - + if (Constant *C = dyn_cast<Constant>(LI->getOperand(0))) return ConstantFoldLoadFromConstPtr(C, TD); @@ -540,23 +540,23 @@ static Constant *ConstantFoldLoadInst(const LoadInst *LI, const DataLayout *TD){ /// SymbolicallyEvaluateBinop - One of Op0/Op1 is a constant expression. /// Attempt to symbolically evaluate the result of a binary operator merging -/// these together. If target data info is available, it is provided as TD, +/// these together. If target data info is available, it is provided as TD, /// otherwise TD is null. static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1, const DataLayout *TD){ // SROA - + // Fold (and 0xffffffff00000000, (shl x, 32)) -> shl. // Fold (lshr (or X, Y), 32) -> (lshr [X/Y], 32) if one doesn't contribute // bits. - - + + // If the constant expr is something like &A[123] - &A[4].f, fold this into a // constant. This happens frequently when iterating over a global array. 
if (Opc == Instruction::Sub && TD) { GlobalValue *GV1, *GV2; int64_t Offs1, Offs2; - + if (IsConstantOffsetFromGlobal(Op0, GV1, Offs1, *TD)) if (IsConstantOffsetFromGlobal(Op1, GV2, Offs2, *TD) && GV1 == GV2) { @@ -564,7 +564,7 @@ static Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, return ConstantInt::get(Op0->getType(), Offs1-Offs2); } } - + return 0; } @@ -575,7 +575,7 @@ static Constant *CastGEPIndices(ArrayRef<Constant *> Ops, Type *ResultTy, const DataLayout *TD, const TargetLibraryInfo *TLI) { if (!TD) return 0; - Type *IntPtrTy = TD->getIntPtrType(ResultTy->getContext()); + Type *IntPtrTy = TD->getIntPtrType(ResultTy); bool Any = false; SmallVector<Constant*, 32> NewIdxs; @@ -628,14 +628,15 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, if (!TD || !cast<PointerType>(Ptr->getType())->getElementType()->isSized() || !Ptr->getType()->isPointerTy()) return 0; - - Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext()); + + unsigned AS = cast<PointerType>(Ptr->getType())->getAddressSpace(); + Type *IntPtrTy = TD->getIntPtrType(Ptr->getContext(), AS); // If this is a constant expr gep that is effectively computing an // "offsetof", fold it into 'cast int Size to T*' instead of 'gep 0, 0, 12' for (unsigned i = 1, e = Ops.size(); i != e; ++i) if (!isa<ConstantInt>(Ops[i])) { - + // If this is "gep i8* Ptr, (sub 0, V)", fold this as: // "inttoptr (sub (ptrtoint Ptr), V)" if (Ops.size() == 2 && @@ -702,6 +703,8 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, // Also, this helps GlobalOpt do SROA on GlobalVariables. 
Type *Ty = Ptr->getType(); assert(Ty->isPointerTy() && "Forming regular GEP of non-pointer type"); + assert(Ty->getPointerAddressSpace() == AS + && "Operand and result of GEP should be in the same address space."); SmallVector<Constant*, 32> NewIdxs; do { if (SequentialType *ATy = dyn_cast<SequentialType>(Ty)) { @@ -709,15 +712,15 @@ static Constant *SymbolicallyEvaluateGEP(ArrayRef<Constant *> Ops, // The only pointer indexing we'll do is on the first index of the GEP. if (!NewIdxs.empty()) break; - + // Only handle pointers to sized types, not pointers to functions. if (!ATy->getElementType()->isSized()) return 0; } - + // Determine which element of the array the offset points into. APInt ElemSize(BitWidth, TD->getTypeAllocSize(ATy->getElementType())); - IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext()); + IntegerType *IntPtrTy = TD->getIntPtrType(Ty->getContext(), AS); if (ElemSize == 0) // The element size is 0. This may be [0 x Ty]*, so just use a zero // index for this level and proceed to the next level to see if it can @@ -837,7 +840,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, if (const CmpInst *CI = dyn_cast<CmpInst>(I)) return ConstantFoldCompareInstOperands(CI->getPredicate(), Ops[0], Ops[1], TD, TLI); - + if (const LoadInst *LI = dyn_cast<LoadInst>(I)) return ConstantFoldLoadInst(LI, TD); @@ -887,19 +890,19 @@ Constant *llvm::ConstantFoldConstantExpression(const ConstantExpr *CE, /// information, due to only being passed an opcode and operands. Constant /// folding using this function strips this information. /// -Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, +Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, ArrayRef<Constant *> Ops, const DataLayout *TD, - const TargetLibraryInfo *TLI) { + const TargetLibraryInfo *TLI) { // Handle easy binops first. 
if (Instruction::isBinaryOp(Opcode)) { if (isa<ConstantExpr>(Ops[0]) || isa<ConstantExpr>(Ops[1])) if (Constant *C = SymbolicallyEvaluateBinop(Opcode, Ops[0], Ops[1], TD)) return C; - + return ConstantExpr::get(Opcode, Ops[0], Ops[1]); } - + switch (Opcode) { default: return 0; case Instruction::ICmp: @@ -918,7 +921,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, unsigned InWidth = Input->getType()->getScalarSizeInBits(); unsigned AS = cast<PointerType>(CE->getType())->getAddressSpace(); if (TD->getPointerSizeInBits(AS) < InWidth) { - Constant *Mask = + Constant *Mask = ConstantInt::get(CE->getContext(), APInt::getLowBitsSet(InWidth, TD->getPointerSizeInBits(AS))); Input = ConstantExpr::getAnd(Input, Mask); @@ -934,8 +937,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, // pointer, so it can't be done in ConstantExpr::getCast. if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ops[0])) if (TD && CE->getOpcode() == Instruction::PtrToInt && - TD->getPointerSizeInBits( - cast<PointerType>(CE->getOperand(0)->getType())->getAddressSpace()) + TD->getTypeSizeInBits(CE->getOperand(0)->getType()) <= CE->getType()->getScalarSizeInBits()) return FoldBitCast(CE->getOperand(0), DestTy, *TD); @@ -967,7 +969,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, return C; if (Constant *C = SymbolicallyEvaluateGEP(Ops, DestTy, TD, TLI)) return C; - + return ConstantExpr::getGetElementPtr(Ops[0], Ops.slice(1)); } } @@ -977,7 +979,7 @@ Constant *llvm::ConstantFoldInstOperands(unsigned Opcode, Type *DestTy, /// returns a constant expression of the specified operands. 
/// Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, - Constant *Ops0, Constant *Ops1, + Constant *Ops0, Constant *Ops1, const DataLayout *TD, const TargetLibraryInfo *TLI) { // fold: icmp (inttoptr x), null -> icmp x, 0 @@ -988,9 +990,10 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, // ConstantExpr::getCompare cannot do this, because it doesn't have TD // around to know if bit truncation is happening. if (ConstantExpr *CE0 = dyn_cast<ConstantExpr>(Ops0)) { + Type *IntPtrTy = NULL; if (TD && Ops1->isNullValue()) { - Type *IntPtrTy = TD->getIntPtrType(CE0->getContext()); if (CE0->getOpcode() == Instruction::IntToPtr) { + IntPtrTy = TD->getIntPtrType(CE0->getType()); // Convert the integer value to the right size to ensure we get the // proper extension or truncation. Constant *C = ConstantExpr::getIntegerCast(CE0->getOperand(0), @@ -998,22 +1001,24 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, Constant *Null = Constant::getNullValue(C->getType()); return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI); } - + // Only do this transformation if the int is intptrty in size, otherwise // there is a truncation or extension that we aren't modeling. 
- if (CE0->getOpcode() == Instruction::PtrToInt && - CE0->getType() == IntPtrTy) { - Constant *C = CE0->getOperand(0); - Constant *Null = Constant::getNullValue(C->getType()); - return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI); + if (CE0->getOpcode() == Instruction::PtrToInt) { + IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType()); + if (CE0->getType() == IntPtrTy) { + Constant *C = CE0->getOperand(0); + Constant *Null = Constant::getNullValue(C->getType()); + return ConstantFoldCompareInstOperands(Predicate, C, Null, TD, TLI); + } } } - + if (ConstantExpr *CE1 = dyn_cast<ConstantExpr>(Ops1)) { if (TD && CE0->getOpcode() == CE1->getOpcode()) { - Type *IntPtrTy = TD->getIntPtrType(CE0->getContext()); if (CE0->getOpcode() == Instruction::IntToPtr) { + Type *IntPtrTy = TD->getIntPtrType(CE0->getType()); // Convert the integer value to the right size to ensure we get the // proper extension or truncation. Constant *C0 = ConstantExpr::getIntegerCast(CE0->getOperand(0), @@ -1022,34 +1027,36 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, IntPtrTy, false); return ConstantFoldCompareInstOperands(Predicate, C0, C1, TD, TLI); } + } - // Only do this transformation if the int is intptrty in size, otherwise - // there is a truncation or extension that we aren't modeling. - if ((CE0->getOpcode() == Instruction::PtrToInt && - CE0->getType() == IntPtrTy && - CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType())) + // Only do this transformation if the int is intptrty in size, otherwise + // there is a truncation or extension that we aren't modeling. 
+ if (CE0->getOpcode() == Instruction::PtrToInt) { + IntPtrTy = TD->getIntPtrType(CE0->getOperand(0)->getType()); + if (CE0->getType() == IntPtrTy && + CE0->getOperand(0)->getType() == CE1->getOperand(0)->getType()) return ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0), - CE1->getOperand(0), TD, TLI); + CE1->getOperand(0), TD, TLI); } } - + // icmp eq (or x, y), 0 -> (icmp eq x, 0) & (icmp eq y, 0) // icmp ne (or x, y), 0 -> (icmp ne x, 0) | (icmp ne y, 0) if ((Predicate == ICmpInst::ICMP_EQ || Predicate == ICmpInst::ICMP_NE) && CE0->getOpcode() == Instruction::Or && Ops1->isNullValue()) { - Constant *LHS = + Constant *LHS = ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(0), Ops1, TD, TLI); - Constant *RHS = + Constant *RHS = ConstantFoldCompareInstOperands(Predicate, CE0->getOperand(1), Ops1, TD, TLI); - unsigned OpC = + unsigned OpC = Predicate == ICmpInst::ICMP_EQ ? Instruction::And : Instruction::Or; Constant *Ops[] = { LHS, RHS }; return ConstantFoldInstOperands(OpC, LHS->getType(), Ops, TD, TLI); } } - + return ConstantExpr::getCompare(Predicate, Ops0, Ops1); } @@ -1057,7 +1064,7 @@ Constant *llvm::ConstantFoldCompareInstOperands(unsigned Predicate, /// ConstantFoldLoadThroughGEPConstantExpr - Given a constant and a /// getelementptr constantexpr, return the constant value being addressed by the /// constant expression, or null if something is funny and we can't decide. -Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C, +Constant *llvm::ConstantFoldLoadThroughGEPConstantExpr(Constant *C, ConstantExpr *CE) { if (!CE->getOperand(1)->isNullValue()) return 0; // Do not allow stepping over the value! @@ -1127,14 +1134,14 @@ llvm::canConstantFoldCallTo(const Function *F) { if (!F->hasName()) return false; StringRef Name = F->getName(); - + // In these cases, the check of the length is required. We don't want to // return true for a name like "cos\0blah" which strcmp would return equal to // "cos", but has length 8. 
switch (Name[0]) { default: return false; case 'a': - return Name == "acos" || Name == "asin" || + return Name == "acos" || Name == "asin" || Name == "atan" || Name == "atan2"; case 'c': return Name == "cos" || Name == "ceil" || Name == "cosf" || Name == "cosh"; @@ -1154,7 +1161,7 @@ llvm::canConstantFoldCallTo(const Function *F) { } } -static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, +static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, Type *Ty) { sys::llvm_fenv_clearexcept(); V = NativeFP(V); @@ -1162,7 +1169,7 @@ static Constant *ConstantFoldFP(double (*NativeFP)(double), double V, sys::llvm_fenv_clearexcept(); return 0; } - + if (Ty->isFloatTy()) return ConstantFP::get(Ty->getContext(), APFloat((float)V)); if (Ty->isDoubleTy()) @@ -1178,7 +1185,7 @@ static Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), sys::llvm_fenv_clearexcept(); return 0; } - + if (Ty->isFloatTy()) return ConstantFP::get(Ty->getContext(), APFloat((float)V)); if (Ty->isDoubleTy()) @@ -1272,7 +1279,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, case 'e': if (Name == "exp" && TLI->has(LibFunc::exp)) return ConstantFoldFP(exp, V, Ty); - + if (Name == "exp2" && TLI->has(LibFunc::exp2)) { // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a // C99 library. @@ -1348,7 +1355,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, } // Support ConstantVector in case we have an Undef in the top. 
- if (isa<ConstantVector>(Operands[0]) || + if (isa<ConstantVector>(Operands[0]) || isa<ConstantDataVector>(Operands[0])) { Constant *Op = cast<Constant>(Operands[0]); switch (F->getIntrinsicID()) { @@ -1367,11 +1374,11 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, case Intrinsic::x86_sse2_cvttsd2si64: if (ConstantFP *FPOp = dyn_cast_or_null<ConstantFP>(Op->getAggregateElement(0U))) - return ConstantFoldConvertToInt(FPOp->getValueAPF(), + return ConstantFoldConvertToInt(FPOp->getValueAPF(), /*roundTowardZero=*/true, Ty); } } - + if (isa<UndefValue>(Operands[0])) { if (F->getIntrinsicID() == Intrinsic::bswap) return Operands[0]; @@ -1385,14 +1392,14 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, if (ConstantFP *Op1 = dyn_cast<ConstantFP>(Operands[0])) { if (!Ty->isFloatTy() && !Ty->isDoubleTy()) return 0; - double Op1V = Ty->isFloatTy() ? + double Op1V = Ty->isFloatTy() ? (double)Op1->getValueAPF().convertToFloat() : Op1->getValueAPF().convertToDouble(); if (ConstantFP *Op2 = dyn_cast<ConstantFP>(Operands[1])) { if (Op2->getType() != Op1->getType()) return 0; - double Op2V = Ty->isFloatTy() ? + double Op2V = Ty->isFloatTy() ? 
(double)Op2->getValueAPF().convertToFloat(): Op2->getValueAPF().convertToDouble(); @@ -1419,7 +1426,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, } return 0; } - + if (ConstantInt *Op1 = dyn_cast<ConstantInt>(Operands[0])) { if (ConstantInt *Op2 = dyn_cast<ConstantInt>(Operands[1])) { switch (F->getIntrinsicID()) { @@ -1469,7 +1476,7 @@ llvm::ConstantFoldCall(Function *F, ArrayRef<Constant *> Operands, return ConstantInt::get(Ty, Op1->getValue().countLeadingZeros()); } } - + return 0; } return 0; diff --git a/lib/Analysis/DependenceAnalysis.cpp b/lib/Analysis/DependenceAnalysis.cpp index 016fe396e7..f97f0f2de6 100644 --- a/lib/Analysis/DependenceAnalysis.cpp +++ b/lib/Analysis/DependenceAnalysis.cpp @@ -55,12 +55,16 @@ #include "llvm/Analysis/DependenceAnalysis.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Instructions.h" #include "llvm/Operator.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/InstIterator.h" +#include "llvm/Support/raw_ostream.h" using namespace llvm; diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 95e58022ca..64e183d60c 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -788,7 +788,7 @@ ConstantInt *CallAnalyzer::stripAndComputeInBoundsConstantOffsets(Value *&V) { assert(V->getType()->isPointerTy() && "Unexpected operand type!"); } while (Visited.insert(V)); - Type *IntPtrTy = TD->getIntPtrType(V->getContext()); + Type *IntPtrTy = TD->getIntPtrType(V->getType()); return cast<ConstantInt>(ConstantInt::get(IntPtrTy, Offset)); } @@ -828,8 +828,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) { // size of the byval type by the target's pointer size. 
PointerType *PTy = cast<PointerType>(CS.getArgument(I)->getType()); unsigned TypeSize = TD->getTypeSizeInBits(PTy->getElementType()); - unsigned AS = PTy->getAddressSpace(); - unsigned PointerSize = TD->getPointerSizeInBits(AS); + unsigned PointerSize = TD->getTypeSizeInBits(PTy); // Ceiling division. unsigned NumStores = (TypeSize + PointerSize - 1) / PointerSize; diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 8e326122fa..7ef74f67ce 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -728,7 +728,7 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &TD, assert(V->getType()->isPointerTy() && "Unexpected operand type!"); } while (Visited.insert(V)); - Type *IntPtrTy = TD.getIntPtrType(V->getContext()); + Type *IntPtrTy = TD.getIntPtrType(V->getContext(), AS); return ConstantInt::get(IntPtrTy, Offset); } @@ -1880,9 +1880,7 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS, // Turn icmp (ptrtoint x), (ptrtoint/constant) into a compare of the input // if the integer type is the same size as the pointer type. if (MaxRecurse && Q.TD && isa<PtrToIntInst>(LI) && - Q.TD->getPointerSizeInBits( - cast<PtrToIntInst>(LI)->getPointerAddressSpace()) == - DstTy->getPrimitiveSizeInBits()) { + Q.TD->getTypeSizeInBits(SrcTy) == DstTy->getPrimitiveSizeInBits()) { if (Constant *RHSC = dyn_cast<Constant>(RHS)) { // Transfer the cast to the constant. if (Value *V = SimplifyICmpInst(Pred, SrcOp, diff --git a/lib/Analysis/Lint.cpp b/lib/Analysis/Lint.cpp index 6d6d580ed1..d62808e9cd 100644 --- a/lib/Analysis/Lint.cpp +++ b/lib/Analysis/Lint.cpp @@ -626,8 +626,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, if (W != V) return findValueImpl(W, OffsetOk, Visited); } else if (CastInst *CI = dyn_cast<CastInst>(V)) { - if (CI->isNoopCast(TD ? 
TD->getIntPtrType(V->getContext()) : - Type::getInt64Ty(V->getContext()))) + if (CI->isNoopCast(*TD)) return findValueImpl(CI->getOperand(0), OffsetOk, Visited); } else if (ExtractValueInst *Ex = dyn_cast<ExtractValueInst>(V)) { if (Value *W = FindInsertedValue(Ex->getAggregateOperand(), @@ -640,7 +639,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, if (CastInst::isNoopCast(Instruction::CastOps(CE->getOpcode()), CE->getOperand(0)->getType(), CE->getType(), - TD ? TD->getIntPtrType(V->getContext()) : + TD ? TD->getIntPtrType(CE->getType()) : Type::getInt64Ty(V->getContext()))) return findValueImpl(CE->getOperand(0), OffsetOk, Visited); } else if (CE->getOpcode() == Instruction::ExtractValue) { diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 0a539fe758..8d903c63af 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -376,9 +376,10 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) { ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout *TD, const TargetLibraryInfo *TLI, LLVMContext &Context, - bool RoundToAlign) + bool RoundToAlign, + unsigned AS) : TD(TD), TLI(TLI), RoundToAlign(RoundToAlign) { - IntegerType *IntTy = TD->getIntPtrType(Context); + IntegerType *IntTy = TD->getIntPtrType(Context, AS); IntTyBits = IntTy->getBitWidth(); Zero = APInt::getNullValue(IntTyBits); } @@ -561,9 +562,10 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitInstruction(Instruction &I) { ObjectSizeOffsetEvaluator::ObjectSizeOffsetEvaluator(const DataLayout *TD, const TargetLibraryInfo *TLI, - LLVMContext &Context) + LLVMContext &Context, + unsigned AS) : TD(TD), TLI(TLI), Context(Context), Builder(Context, TargetFolder(TD)) { - IntTy = TD->getIntPtrType(Context); + IntTy = TD->getIntPtrType(Context, AS); Zero = ConstantInt::get(IntTy, 0); } diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 9316df6fbf..9872890494 100644 --- 
a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -983,7 +983,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, for (NonLocalDepInfo::iterator I = Cache->begin(), E = Cache->end(); I != E; ++I) { Visited.insert(std::make_pair(I->getBB(), Addr)); - if (!I->getResult().isNonLocal()) + if (!I->getResult().isNonLocal() && DT->isReachableFromEntry(I->getBB())) Result.push_back(NonLocalDepResult(I->getBB(), I->getResult(), Addr)); } ++NumCacheCompleteNonLocalPtr; @@ -1029,7 +1029,7 @@ getNonLocalPointerDepFromBB(const PHITransAddr &Pointer, NumSortedEntries); // If we got a Def or Clobber, add this to the list of results. - if (!Dep.isNonLocal()) { + if (!Dep.isNonLocal() && DT->isReachableFromEntry(BB)) { Result.push_back(NonLocalDepResult(BB, Dep, Pointer.getAddr())); continue; } diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 5400646be1..148912b766 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -2581,13 +2581,12 @@ const SCEV *ScalarEvolution::getUMinExpr(const SCEV *LHS, return getNotSCEV(getUMaxExpr(getNotSCEV(LHS), getNotSCEV(RHS))); } -const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy) { +const SCEV *ScalarEvolution::getSizeOfExpr(Type *AllocTy, Type *IntPtrTy) { // If we have DataLayout, we can bypass creating a target-independent // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. 
if (TD) - return getConstant(TD->getIntPtrType(getContext()), - TD->getTypeAllocSize(AllocTy)); + return getConstant(IntPtrTy, TD->getTypeAllocSize(AllocTy)); Constant *C = ConstantExpr::getSizeOf(AllocTy); if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) @@ -2606,13 +2605,13 @@ const SCEV *ScalarEvolution::getAlignOfExpr(Type *AllocTy) { return getTruncateOrZeroExtend(getSCEV(C), Ty); } -const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy, +const SCEV *ScalarEvolution::getOffsetOfExpr(StructType *STy, Type *IntPtrTy, unsigned FieldNo) { // If we have DataLayout, we can bypass creating a target-independent // constant expression and then folding it back into a ConstantInt. // This is just a compile-time optimization. if (TD) - return getConstant(TD->getIntPtrType(getContext()), + return getConstant(IntPtrTy, TD->getStructLayout(STy)->getElementOffset(FieldNo)); Constant *C = ConstantExpr::getOffsetOf(STy, FieldNo); @@ -2699,7 +2698,7 @@ Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const { // The only other support type is pointer. assert(Ty->isPointerTy() && "Unexpected non-pointer non-integer type!"); - if (TD) return TD->getIntPtrType(getContext()); + if (TD) return TD->getIntPtrType(Ty); // Without DataLayout, conservatively assume pointers are 64-bit. return Type::getInt64Ty(getContext()); @@ -3152,13 +3151,13 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) { if (StructType *STy = dyn_cast<StructType>(*GTI++)) { // For a struct, add the member offset. unsigned FieldNo = cast<ConstantInt>(Index)->getZExtValue(); - const SCEV *FieldOffset = getOffsetOfExpr(STy, FieldNo); + const SCEV *FieldOffset = getOffsetOfExpr(STy, IntPtrTy, FieldNo); // Add the field offset to the running total offset. TotalOffset = getAddExpr(TotalOffset, FieldOffset); } else { // For an array, add the element offset, explicitly scaled. 
- const SCEV *ElementSize = getSizeOfExpr(*GTI); + const SCEV *ElementSize = getSizeOfExpr(*GTI, IntPtrTy); const SCEV *IndexS = getSCEV(Index); // Getelementptr indices are signed. IndexS = getTruncateOrSignExtend(IndexS, IntPtrTy); @@ -3980,8 +3979,11 @@ getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock) { ConstantInt *Result = MulC->getValue(); - // Guard against huge trip counts. - if (!Result || Result->getValue().getActiveBits() > 32) + // Guard against huge trip counts (this requires checking + // for zero to handle the case where the trip count == -1 and the + // addition wraps). + if (!Result || Result->getValue().getActiveBits() > 32 || + Result->getValue().getActiveBits() == 0) return 1; return (unsigned)Result->getZExtValue(); diff --git a/lib/Analysis/ScalarEvolutionExpander.cpp b/lib/Analysis/ScalarEvolutionExpander.cpp index 111bfb4a6a..0295da5e4a 100644 --- a/lib/Analysis/ScalarEvolutionExpander.cpp +++ b/lib/Analysis/ScalarEvolutionExpander.cpp @@ -417,7 +417,9 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin, // array indexing. SmallVector<const SCEV *, 8> ScaledOps; if (ElTy->isSized()) { - const SCEV *ElSize = SE.getSizeOfExpr(ElTy); + Type *IntPtrTy = SE.TD ? 
SE.TD->getIntPtrType(PTy) : + IntegerType::getInt64Ty(PTy->getContext()); + const SCEV *ElSize = SE.getSizeOfExpr(ElTy, IntPtrTy); if (!ElSize->isZero()) { SmallVector<const SCEV *, 8> NewOps; for (unsigned i = 0, e = Ops.size(); i != e; ++i) { diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp index 464dfd51d6..91f973d8d3 100644 --- a/lib/AsmParser/LLLexer.cpp +++ b/lib/AsmParser/LLLexer.cpp @@ -527,6 +527,7 @@ lltok::Kind LLLexer::LexIdentifier() { KEYWORD(ptx_device); KEYWORD(spir_kernel); KEYWORD(spir_func); + KEYWORD(intel_ocl_bicc); KEYWORD(cc); KEYWORD(c); diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index cc7cc31246..75fc16cd95 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -1094,6 +1094,7 @@ bool LLParser::ParseOptionalVisibility(unsigned &Res) { /// ::= /*empty*/ /// ::= 'ccc' /// ::= 'fastcc' +/// ::= 'kw_intel_ocl_bicc' /// ::= 'coldcc' /// ::= 'x86_stdcallcc' /// ::= 'x86_fastcallcc' @@ -1125,6 +1126,7 @@ bool LLParser::ParseOptionalCallingConv(CallingConv::ID &CC) { case lltok::kw_ptx_device: CC = CallingConv::PTX_Device; break; case lltok::kw_spir_kernel: CC = CallingConv::SPIR_KERNEL; break; case lltok::kw_spir_func: CC = CallingConv::SPIR_FUNC; break; + case lltok::kw_intel_ocl_bicc: CC = CallingConv::Intel_OCL_BI; break; case lltok::kw_cc: { unsigned ArbitraryCC; Lex.Lex(); diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h index ff6d68f0da..6cffc52d17 100644 --- a/lib/AsmParser/LLToken.h +++ b/lib/AsmParser/LLToken.h @@ -77,6 +77,7 @@ namespace lltok { kw_c, kw_cc, kw_ccc, kw_fastcc, kw_coldcc, + kw_intel_ocl_bicc, kw_x86_stdcallcc, kw_x86_fastcallcc, kw_x86_thiscallcc, kw_arm_apcscc, kw_arm_aapcscc, kw_arm_aapcs_vfpcc, kw_msp430_intrcc, diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 788a89bf13..81eec3c9ac 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -401,8 +401,8 
@@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { // - __tlv_bootstrap - used to make sure support exists // - spare pointer, used when mapped by the runtime // - pointer to mangled symbol above with initializer - unsigned AS = GV->getType()->getAddressSpace(); - unsigned PtrSize = TD->getPointerSizeInBits(AS)/8; + assert(GV->getType()->isPointerTy() && "GV must be a pointer type!"); + unsigned PtrSize = TD->getTypeSizeInBits(GV->getType())/8; OutStreamer.EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"), PtrSize, 0); OutStreamer.EmitIntValue(0, PtrSize, 0); @@ -1538,9 +1538,9 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) { if (Offset == 0) return Base; - unsigned AS = cast<PointerType>(CE->getType())->getAddressSpace(); + assert(CE->getType()->isPointerTy() && "We must have a pointer type!"); // Truncate/sext the offset to the pointer size. - unsigned Width = TD.getPointerSizeInBits(AS); + unsigned Width = TD.getTypeSizeInBits(CE->getType()); if (Width < 64) Offset = SignExtend64(Offset, Width); @@ -1562,7 +1562,7 @@ static const MCExpr *lowerConstant(const Constant *CV, AsmPrinter &AP) { // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. 
Constant *Op = CE->getOperand(0); - Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()), + Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CE->getType()), false/*ZExt*/); return lowerConstant(Op, AP); } diff --git a/lib/CodeGen/EarlyIfConversion.cpp b/lib/CodeGen/EarlyIfConversion.cpp index 7f28828a5d..d5d84041b6 100644 --- a/lib/CodeGen/EarlyIfConversion.cpp +++ b/lib/CodeGen/EarlyIfConversion.cpp @@ -797,6 +797,5 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { if (tryConvertIf(I->getBlock())) Changed = true; - MF.verify(this, "After early if-conversion"); return Changed; } diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index eab10f2fd4..35f9e270dd 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -155,21 +155,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) { Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - TD.getIntPtrType(Context), (Type *)0); + TD.getIntPtrType(Context, 0), (Type *)0); break; case Intrinsic::memmove: M.getOrInsertFunction("memmove", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), - TD.getIntPtrType(Context), (Type *)0); + TD.getIntPtrType(Context, 0), (Type *)0); break; case Intrinsic::memset: M.getOrInsertFunction("memset", Type::getInt8PtrTy(Context), Type::getInt8PtrTy(Context), Type::getInt32Ty(M.getContext()), - TD.getIntPtrType(Context), (Type *)0); + TD.getIntPtrType(Context, 0), (Type *)0); break; case Intrinsic::sqrt: EnsureFPIntrinsicsExist(M, I, "sqrtf", "sqrt", "sqrtl"); @@ -497,7 +497,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; // Strip out annotate intrinsic case Intrinsic::memcpy: { - IntegerType *IntPtr = TD.getIntPtrType(Context); + IntegerType *IntPtr = TD.getIntPtrType(CI->getArgOperand(0)->getType()); Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); 
Value *Ops[3]; @@ -508,7 +508,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } case Intrinsic::memmove: { - IntegerType *IntPtr = TD.getIntPtrType(Context); + IntegerType *IntPtr = TD.getIntPtrType(CI->getArgOperand(0)->getType()); Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; @@ -519,7 +519,7 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) { break; } case Intrinsic::memset: { - IntegerType *IntPtr = TD.getIntPtrType(Context); + IntegerType *IntPtr = TD.getIntPtrType(CI->getArgOperand(0)->getType()); Value *Size = Builder.CreateIntCast(CI->getArgOperand(2), IntPtr, /* isSigned */ false); Value *Ops[3]; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ce5f414597..0a85179293 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -270,6 +270,8 @@ namespace { SDValue ReduceLoadWidth(SDNode *N); SDValue ReduceLoadOpStoreWidth(SDNode *N); SDValue TransformFPLoadStorePair(SDNode *N); + SDValue reduceBuildVecExtToExtBuildVec(SDNode *N); + SDValue reduceBuildVecConvertToConvertBuildVec(SDNode *N); SDValue GetDemandedBits(SDValue V, const APInt &Mask); @@ -8356,15 +8358,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { return SDValue(); } -SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { +// Simplify (build_vec (ext )) to (bitcast (build_vec )) +SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) { + // We perform this optimization post type-legalization because + // the type-legalizer often scalarizes integer-promoted vectors. + // Performing this optimization before may create bit-casts which + // will be type-legalized to complex code sequences. + // We perform this optimization only before the operation legalizer because we + // may introduce illegal operations. 
+ if (Level != AfterLegalizeVectorOps && Level != AfterLegalizeTypes) + return SDValue(); + unsigned NumInScalars = N->getNumOperands(); DebugLoc dl = N->getDebugLoc(); EVT VT = N->getValueType(0); - // A vector built entirely of undefs is undef. - if (ISD::allOperandsUndef(N)) - return DAG.getUNDEF(VT); - // Check to see if this is a BUILD_VECTOR of a bunch of values // which come from any_extend or zero_extend nodes. If so, we can create // a new BUILD_VECTOR using bit-casts which may enable other BUILD_VECTOR @@ -8407,64 +8415,141 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { // In order to have valid types, all of the inputs must be extended from the // same source type and all of the inputs must be any or zero extend. // Scalar sizes must be a power of two. - EVT OutScalarTy = N->getValueType(0).getScalarType(); + EVT OutScalarTy = VT.getScalarType(); bool ValidTypes = SourceType != MVT::Other && isPowerOf2_32(OutScalarTy.getSizeInBits()) && isPowerOf2_32(SourceType.getSizeInBits()); - // We perform this optimization post type-legalization because - // the type-legalizer often scalarizes integer-promoted vectors. - // Performing this optimization before may create bit-casts which - // will be type-legalized to complex code sequences. - // We perform this optimization only before the operation legalizer because we - // may introduce illegal operations. // Create a new simpler BUILD_VECTOR sequence which other optimizations can // turn into a single shuffle instruction. - if ((Level == AfterLegalizeVectorOps || Level == AfterLegalizeTypes) && - ValidTypes) { - bool isLE = TLI.isLittleEndian(); - unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits(); - assert(ElemRatio > 1 && "Invalid element size ratio"); - SDValue Filler = AllAnyExt ? 
DAG.getUNDEF(SourceType): - DAG.getConstant(0, SourceType); - - unsigned NewBVElems = ElemRatio * N->getValueType(0).getVectorNumElements(); - SmallVector<SDValue, 8> Ops(NewBVElems, Filler); - - // Populate the new build_vector - for (unsigned i=0; i < N->getNumOperands(); ++i) { - SDValue Cast = N->getOperand(i); - assert((Cast.getOpcode() == ISD::ANY_EXTEND || - Cast.getOpcode() == ISD::ZERO_EXTEND || - Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode"); - SDValue In; - if (Cast.getOpcode() == ISD::UNDEF) - In = DAG.getUNDEF(SourceType); - else - In = Cast->getOperand(0); - unsigned Index = isLE ? (i * ElemRatio) : - (i * ElemRatio + (ElemRatio - 1)); + if (!ValidTypes) + return SDValue(); + + bool isLE = TLI.isLittleEndian(); + unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits(); + assert(ElemRatio > 1 && "Invalid element size ratio"); + SDValue Filler = AllAnyExt ? DAG.getUNDEF(SourceType): + DAG.getConstant(0, SourceType); + + unsigned NewBVElems = ElemRatio * VT.getVectorNumElements(); + SmallVector<SDValue, 8> Ops(NewBVElems, Filler); + + // Populate the new build_vector + for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) { + SDValue Cast = N->getOperand(i); + assert((Cast.getOpcode() == ISD::ANY_EXTEND || + Cast.getOpcode() == ISD::ZERO_EXTEND || + Cast.getOpcode() == ISD::UNDEF) && "Invalid cast opcode"); + SDValue In; + if (Cast.getOpcode() == ISD::UNDEF) + In = DAG.getUNDEF(SourceType); + else + In = Cast->getOperand(0); + unsigned Index = isLE ? (i * ElemRatio) : + (i * ElemRatio + (ElemRatio - 1)); + + assert(Index < Ops.size() && "Invalid index"); + Ops[Index] = In; + } + + // The type of the new BUILD_VECTOR node. + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems); + assert(VecVT.getSizeInBits() == VT.getSizeInBits() && + "Invalid vector size"); + // Check if the new vector type is legal. + if (!isTypeLegal(VecVT)) return SDValue(); + + // Make the new BUILD_VECTOR. 
+ SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, VecVT, &Ops[0], Ops.size()); + + // The new BUILD_VECTOR node has the potential to be further optimized. + AddToWorkList(BV.getNode()); + // Bitcast to the desired type. + return DAG.getNode(ISD::BITCAST, dl, VT, BV); +} + +SDValue DAGCombiner::reduceBuildVecConvertToConvertBuildVec(SDNode *N) { + EVT VT = N->getValueType(0); - assert(Index < Ops.size() && "Invalid index"); - Ops[Index] = In; + unsigned NumInScalars = N->getNumOperands(); + DebugLoc dl = N->getDebugLoc(); + + EVT SrcVT = MVT::Other; + unsigned Opcode = ISD::DELETED_NODE; + unsigned NumDefs = 0; + + for (unsigned i = 0; i != NumInScalars; ++i) { + SDValue In = N->getOperand(i); + unsigned Opc = In.getOpcode(); + + if (Opc == ISD::UNDEF) + continue; + + // If all scalar values are floats and converted from integers. + if (Opcode == ISD::DELETED_NODE && + (Opc == ISD::UINT_TO_FP || Opc == ISD::SINT_TO_FP)) { + Opcode = Opc; + // If not supported by target, bail out. + if (TLI.getOperationAction(Opcode, VT) != TargetLowering::Legal && + TLI.getOperationAction(Opcode, VT) != TargetLowering::Custom) + return SDValue(); } + if (Opc != Opcode) + return SDValue(); + + EVT InVT = In.getOperand(0).getValueType(); + + // If all scalar values are typed differently, bail out. It's chosen to + // simplify BUILD_VECTOR of integer types. + if (SrcVT == MVT::Other) + SrcVT = InVT; + if (SrcVT != InVT) + return SDValue(); + NumDefs++; + } + + // If the vector has just one element defined, it's not worth to fold it into + // a vectorized one. + if (NumDefs < 2) + return SDValue(); - // The type of the new BUILD_VECTOR node. - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), SourceType, NewBVElems); - assert(VecVT.getSizeInBits() == N->getValueType(0).getSizeInBits() && - "Invalid vector size"); - // Check if the new vector type is legal. 
- if (!isTypeLegal(VecVT)) return SDValue(); + assert((Opcode == ISD::UINT_TO_FP || Opcode == ISD::SINT_TO_FP) + && "Should only handle conversion from integer to float."); + assert(SrcVT != MVT::Other && "Cannot determine source type!"); - // Make the new BUILD_VECTOR. - SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, N->getDebugLoc(), - VecVT, &Ops[0], Ops.size()); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumInScalars); + SmallVector<SDValue, 8> Opnds; + for (unsigned i = 0; i != NumInScalars; ++i) { + SDValue In = N->getOperand(i); - // The new BUILD_VECTOR node has the potential to be further optimized. - AddToWorkList(BV.getNode()); - // Bitcast to the desired type. - return DAG.getNode(ISD::BITCAST, dl, N->getValueType(0), BV); + if (In.getOpcode() == ISD::UNDEF) + Opnds.push_back(DAG.getUNDEF(SrcVT)); + else + Opnds.push_back(In.getOperand(0)); } + SDValue BV = DAG.getNode(ISD::BUILD_VECTOR, dl, NVT, + &Opnds[0], Opnds.size()); + AddToWorkList(BV.getNode()); + + return DAG.getNode(Opcode, dl, VT, BV); +} + +SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { + unsigned NumInScalars = N->getNumOperands(); + DebugLoc dl = N->getDebugLoc(); + EVT VT = N->getValueType(0); + + // A vector built entirely of undefs is undef. + if (ISD::allOperandsUndef(N)) + return DAG.getUNDEF(VT); + + SDValue V = reduceBuildVecExtToExtBuildVec(N); + if (V.getNode()) + return V; + + V = reduceBuildVecConvertToConvertBuildVec(N); + if (V.getNode()) + return V; // Check to see if this is a BUILD_VECTOR of a bunch of EXTRACT_VECTOR_ELT // operations. If so, and if the EXTRACT_VECTOR_ELT vector inputs come from @@ -8549,7 +8634,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { return SDValue(); // Widen the input vector by adding undef values. 
- VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, N->getDebugLoc(), VT, + VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, VecIn1, DAG.getUNDEF(VecIn1.getValueType())); } @@ -8570,7 +8655,7 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) { SDValue Ops[2]; Ops[0] = VecIn1; Ops[1] = VecIn2; - return DAG.getVectorShuffle(VT, N->getDebugLoc(), Ops[0], Ops[1], &Mask[0]); + return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], &Mask[0]); } return SDValue(); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 4854cf7b26..2ddc07cc63 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -101,8 +101,7 @@ bool FastISel::hasTrivialKill(const Value *V) const { // No-op casts are trivially coalesced by fast-isel. if (const CastInst *Cast = dyn_cast<CastInst>(I)) - if (Cast->isNoopCast(TD.getIntPtrType(Cast->getContext())) && - !hasTrivialKill(Cast->getOperand(0))) + if (Cast->isNoopCast(TD) && !hasTrivialKill(Cast->getOperand(0))) return false; // GEPs with all zero indices are trivially coalesced by fast-isel. @@ -175,7 +174,7 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) { // Translate this as an integer zero so that it can be // local-CSE'd with actual integer zeros. Reg = - getRegForValue(Constant::getNullValue(TD.getIntPtrType(V->getContext()))); + getRegForValue(Constant::getNullValue(TD.getIntPtrType(V->getType()))); } else if (const ConstantFP *CF = dyn_cast<ConstantFP>(V)) { if (CF->isNullValue()) { Reg = TargetMaterializeFloatZero(CF); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 183416f3fd..d661971bb8 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -3804,7 +3804,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, DebugLoc dl, SDValue Dst, // Emit a library call. 
TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext()); + unsigned AS = SrcPtrInfo.getAddrSpace(); + Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext(), AS); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); @@ -3859,7 +3860,8 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, DebugLoc dl, SDValue Dst, // Emit a library call. TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; - Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext()); + unsigned AS = SrcPtrInfo.getAddrSpace(); + Entry.Ty = TLI.getDataLayout()->getIntPtrType(*getContext(), AS); Entry.Node = Dst; Args.push_back(Entry); Entry.Node = Src; Args.push_back(Entry); Entry.Node = Size; Args.push_back(Entry); @@ -3908,7 +3910,8 @@ SDValue SelectionDAG::getMemset(SDValue Chain, DebugLoc dl, SDValue Dst, return Result; // Emit a library call. - Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*getContext()); + unsigned AS = DstPtrInfo.getAddrSpace(); + Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*getContext(), AS); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; Entry.Ty = IntPtrTy; diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 30a3fc2ca5..6c9d001a1f 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -2604,14 +2604,14 @@ void SelectionDAGBuilder::visitIndirectBr(const IndirectBrInst &I) { MachineBasicBlock *IndirectBrMBB = FuncInfo.MBB; // Update machine-CFG edges with unique successors. 
- SmallVector<BasicBlock*, 32> succs; - succs.reserve(I.getNumSuccessors()); - for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) - succs.push_back(I.getSuccessor(i)); - array_pod_sort(succs.begin(), succs.end()); - succs.erase(std::unique(succs.begin(), succs.end()), succs.end()); - for (unsigned i = 0, e = succs.size(); i != e; ++i) { - MachineBasicBlock *Succ = FuncInfo.MBBMap[succs[i]]; + SmallSet<BasicBlock*, 32> Done; + for (unsigned i = 0, e = I.getNumSuccessors(); i != e; ++i) { + BasicBlock *BB = I.getSuccessor(i); + bool Inserted = Done.insert(BB); + if (!Inserted) + continue; + + MachineBasicBlock *Succ = FuncInfo.MBBMap[BB]; addSuccessorWithWeight(IndirectBrMBB, Succ); } diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp index 94a2542e7a..99f6ec691a 100644 --- a/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/lib/ExecutionEngine/ExecutionEngine.cpp @@ -645,16 +645,17 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) { } case Instruction::PtrToInt: { GenericValue GV = getConstantValue(Op0); - unsigned AS = cast<PointerType>(CE->getOperand(1)->getType()) - ->getAddressSpace(); - uint32_t PtrWidth = TD->getPointerSizeInBits(AS); + assert(CE->getOperand(1)->getType()->isPointerTy() && + "Must be a pointer type!"); + uint32_t PtrWidth = TD->getTypeSizeInBits(CE->getOperand(1)->getType()); GV.IntVal = APInt(PtrWidth, uintptr_t(GV.PointerVal)); return GV; } case Instruction::IntToPtr: { GenericValue GV = getConstantValue(Op0); - unsigned AS = cast<PointerType>(CE->getType())->getAddressSpace(); - uint32_t PtrWidth = TD->getPointerSizeInBits(AS); + assert(CE->getOperand(1)->getType()->isPointerTy() && + "Must be a pointer type!"); + uint32_t PtrWidth = TD->getTypeSizeInBits(CE->getType()); if (PtrWidth != GV.IntVal.getBitWidth()) GV.IntVal = GV.IntVal.zextOrTrunc(PtrWidth); assert(GV.IntVal.getBitWidth() <= 64 && "Bad pointer width"); diff --git 
a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index c1f8baed1a..ff05c82aec 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -190,7 +190,7 @@ void RuntimeDyldImpl::emitCommonSymbols(ObjectImage &Obj, if (!Addr) report_fatal_error("Unable to allocate memory for common symbols!"); uint64_t Offset = 0; - Sections.push_back(SectionEntry(Addr, TotalSize, TotalSize, 0)); + Sections.push_back(SectionEntry(StringRef(), Addr, TotalSize, TotalSize, 0)); memset(Addr, 0, TotalSize); DEBUG(dbgs() << "emitCommonSection SectionID: " << SectionID @@ -233,10 +233,12 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, bool IsVirtual; bool IsZeroInit; uint64_t DataSize; + StringRef Name; Check(Section.isRequiredForExecution(IsRequired)); Check(Section.isVirtual(IsVirtual)); Check(Section.isZeroInit(IsZeroInit)); Check(Section.getSize(DataSize)); + Check(Section.getName(Name)); unsigned Allocate; unsigned SectionID = Sections.size(); @@ -264,6 +266,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, memcpy(Addr, pData, DataSize); DEBUG(dbgs() << "emitSection SectionID: " << SectionID + << " Name: " << Name << " obj addr: " << format("%p", pData) << " new addr: " << format("%p", Addr) << " DataSize: " << DataSize @@ -279,6 +282,7 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, Allocate = 0; Addr = 0; DEBUG(dbgs() << "emitSection SectionID: " << SectionID + << " Name: " << Name << " obj addr: " << format("%p", data.data()) << " new addr: 0" << " DataSize: " << DataSize @@ -287,7 +291,8 @@ unsigned RuntimeDyldImpl::emitSection(ObjectImage &Obj, << "\n"); } - Sections.push_back(SectionEntry(Addr, Allocate, DataSize,(uintptr_t)pData)); + Sections.push_back(SectionEntry(Name, Addr, Allocate, DataSize, + (uintptr_t)pData)); return SectionID; } @@ -353,6 +358,24 @@ uint8_t *RuntimeDyldImpl::createStubFunction(uint8_t *Addr) { StubAddr++; 
*StubAddr = NopInstr; return Addr; + } else if (Arch == Triple::ppc64) { + // PowerPC64 stub: the address points to a function descriptor + // instead of the function itself. Load the function address + // on r11 and sets it to control register. Also loads the function + // TOC in r2 and environment pointer to r11. + writeInt32BE(Addr, 0x3D800000); // lis r12, highest(addr) + writeInt32BE(Addr+4, 0x618C0000); // ori r12, higher(addr) + writeInt32BE(Addr+8, 0x798C07C6); // sldi r12, r12, 32 + writeInt32BE(Addr+12, 0x658C0000); // oris r12, r12, h(addr) + writeInt32BE(Addr+16, 0x618C0000); // ori r12, r12, l(addr) + writeInt32BE(Addr+20, 0xF8410028); // std r2, 40(r1) + writeInt32BE(Addr+24, 0xE96C0000); // ld r11, 0(r12) + writeInt32BE(Addr+28, 0xE84C0008); // ld r2, 0(r12) + writeInt32BE(Addr+32, 0x7D6903A6); // mtctr r11 + writeInt32BE(Addr+36, 0xE96C0010); // ld r11, 16(r2) + writeInt32BE(Addr+40, 0x4E800420); // bctr + + return Addr; } return Addr; } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 08aba64e46..1073c6fc52 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -30,6 +30,14 @@ using namespace llvm::object; namespace { +static inline +error_code check(error_code Err) { + if (Err) { + report_fatal_error(Err.message()); + } + return Err; +} + template<support::endianness target_endianness, bool is64Bits> class DyldELFObject : public ELFObjectFile<target_endianness, is64Bits> { LLVM_ELF_IMPORT_TYPES(target_endianness, is64Bits) @@ -340,6 +348,179 @@ void RuntimeDyldELF::resolveMIPSRelocation(uint8_t *LocalAddress, } } +// Return the .TOC. section address to R_PPC64_TOC relocations. +uint64_t RuntimeDyldELF::findPPC64TOC() const { + // The TOC consists of sections .got, .toc, .tocbss, .plt in that + // order. The TOC starts where the first of these sections starts. 
+ SectionList::const_iterator it = Sections.begin(); + SectionList::const_iterator ite = Sections.end(); + for (; it != ite; ++it) { + if (it->Name == ".got" || + it->Name == ".toc" || + it->Name == ".tocbss" || + it->Name == ".plt") + break; + } + if (it == ite) { + // This may happen for + // * references to TOC base base (sym@toc, .odp relocation) without + // a .toc directive. + // In this case just use the first section (which is usually + // the .odp) since the code won't reference the .toc base + // directly. + it = Sections.begin(); + } + assert (it != ite); + // Per the ppc64-elf-linux ABI, The TOC base is TOC value plus 0x8000 + // thus permitting a full 64 Kbytes segment. + return it->LoadAddress + 0x8000; +} + +// Returns the sections and offset associated with the ODP entry referenced +// by Symbol. +void RuntimeDyldELF::findOPDEntrySection(ObjectImage &Obj, + ObjSectionToIDMap &LocalSections, + RelocationValueRef &Rel) { + // Get the ELF symbol value (st_value) to compare with Relocation offset in + // .opd entries + + error_code err; + for (section_iterator si = Obj.begin_sections(), + se = Obj.end_sections(); si != se; si.increment(err)) { + StringRef SectionName; + check(si->getName(SectionName)); + if (SectionName != ".opd") + continue; + + for (relocation_iterator i = si->begin_relocations(), + e = si->end_relocations(); i != e;) { + check(err); + + // The R_PPC64_ADDR64 relocation indicates the first field + // of a .opd entry + uint64_t TypeFunc; + check(i->getType(TypeFunc)); + if (TypeFunc != ELF::R_PPC64_ADDR64) { + i.increment(err); + continue; + } + + SymbolRef TargetSymbol; + uint64_t TargetSymbolOffset; + int64_t TargetAdditionalInfo; + check(i->getSymbol(TargetSymbol)); + check(i->getOffset(TargetSymbolOffset)); + check(i->getAdditionalInfo(TargetAdditionalInfo)); + + i = i.increment(err); + if (i == e) + break; + check(err); + + // Just check if following relocation is a R_PPC64_TOC + uint64_t TypeTOC; + check(i->getType(TypeTOC)); + 
if (TypeTOC != ELF::R_PPC64_TOC) + continue; + + // Finally compares the Symbol value and the target symbol offset + // to check if this .opd entry refers to the symbol the relocation + // points to. + if (Rel.Addend != (intptr_t)TargetSymbolOffset) + continue; + + section_iterator tsi(Obj.end_sections()); + check(TargetSymbol.getSection(tsi)); + Rel.SectionID = findOrEmitSection(Obj, (*tsi), true, LocalSections); + Rel.Addend = (intptr_t)TargetAdditionalInfo; + return; + } + } + llvm_unreachable("Attempting to get address of ODP entry!"); +} + +// Relocation masks following the #lo(value), #hi(value), #higher(value), +// and #highest(value) macros defined in section 4.5.1. Relocation Types +// in PPC-elf64abi document. +// +static inline +uint16_t applyPPClo (uint64_t value) +{ + return value & 0xffff; +} + +static inline +uint16_t applyPPChi (uint64_t value) +{ + return (value >> 16) & 0xffff; +} + +static inline +uint16_t applyPPChigher (uint64_t value) +{ + return (value >> 32) & 0xffff; +} + +static inline +uint16_t applyPPChighest (uint64_t value) +{ + return (value >> 48) & 0xffff; +} + +void RuntimeDyldELF::resolvePPC64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend) { + switch (Type) { + default: + llvm_unreachable("Relocation type not implemented yet!"); + break; + case ELF::R_PPC64_ADDR16_LO : + writeInt16BE(LocalAddress, applyPPClo (Value + Addend)); + break; + case ELF::R_PPC64_ADDR16_HI : + writeInt16BE(LocalAddress, applyPPChi (Value + Addend)); + break; + case ELF::R_PPC64_ADDR16_HIGHER : + writeInt16BE(LocalAddress, applyPPChigher (Value + Addend)); + break; + case ELF::R_PPC64_ADDR16_HIGHEST : + writeInt16BE(LocalAddress, applyPPChighest (Value + Addend)); + break; + case ELF::R_PPC64_ADDR14 : { + assert(((Value + Addend) & 3) == 0); + // Preserve the AA/LK bits in the branch instruction + uint8_t aalk = *(LocalAddress+3); + writeInt16BE(LocalAddress + 2, (aalk & 3) | ((Value + Addend) 
& 0xfffc)); + } break; + case ELF::R_PPC64_REL24 : { + int32_t delta = static_cast<int32_t>(Value - FinalAddress + Addend); + if (SignExtend32<24>(delta) != delta) + llvm_unreachable("Relocation R_PPC64_REL24 overflow"); + // Generates a 'bl <address>' instruction + writeInt32BE(LocalAddress, 0x48000001 | (delta & 0x03FFFFFC)); + } break; + case ELF::R_PPC64_ADDR64 : + writeInt64BE(LocalAddress, Value + Addend); + break; + case ELF::R_PPC64_TOC : + writeInt64BE(LocalAddress, findPPC64TOC()); + break; + case ELF::R_PPC64_TOC16 : { + uint64_t TOCStart = findPPC64TOC(); + Value = applyPPClo((Value + Addend) - TOCStart); + writeInt16BE(LocalAddress, applyPPClo(Value)); + } break; + case ELF::R_PPC64_TOC16_DS : { + uint64_t TOCStart = findPPC64TOC(); + Value = ((Value + Addend) - TOCStart); + writeInt16BE(LocalAddress, applyPPClo(Value)); + } break; + } +} + + void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress, uint64_t FinalAddress, uint64_t Value, @@ -366,6 +547,9 @@ void RuntimeDyldELF::resolveRelocation(uint8_t *LocalAddress, (uint32_t)(Value & 0xffffffffL), Type, (uint32_t)(Addend & 0xffffffffL)); break; + case Triple::ppc64: + resolvePPC64Relocation(LocalAddress, FinalAddress, Value, Type, Addend); + break; default: llvm_unreachable("Unsupported CPU type!"); } } @@ -390,6 +574,8 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, RelocationValueRef Value; // First search for the symbol in the local symbol table SymbolTableMap::const_iterator lsi = Symbols.find(TargetName.data()); + SymbolRef::Type SymType; + Symbol.getType(SymType); if (lsi != Symbols.end()) { Value.SectionID = lsi->second.first; Value.Addend = lsi->second.second; @@ -401,8 +587,6 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, Value.SectionID = gsi->second.first; Value.Addend = gsi->second.second; } else { - SymbolRef::Type SymType; - Symbol.getType(SymType); switch (SymType) { case SymbolRef::ST_Debug: { // TODO: Now ELF 
SymbolRef::ST_Debug = STT_SECTION, it's not obviously @@ -446,7 +630,7 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, SectionEntry &Section = Sections[Rel.SectionID]; uint8_t *Target = Section.Address + Rel.Offset; - // Look up for existing stub. + // Look for an existing stub. StubMap::const_iterator i = Stubs.find(Value); if (i != Stubs.end()) { resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + @@ -516,6 +700,93 @@ void RuntimeDyldELF::processRelocationRef(const ObjRelocationInfo &Rel, Section.StubOffset, RelType, 0); Section.StubOffset += getMaxStubSize(); } + } else if (Arch == Triple::ppc64) { + if (RelType == ELF::R_PPC64_REL24) { + // A PPC branch relocation will need a stub function if the target is + // an external symbol (Symbol::ST_Unknown) or if the target address + // is not within the signed 24-bits branch address. + SectionEntry &Section = Sections[Rel.SectionID]; + uint8_t *Target = Section.Address + Rel.Offset; + bool RangeOverflow = false; + if (SymType != SymbolRef::ST_Unknown) { + // A function call may points to the .opd entry, so the final symbol value + // in calculated based in the relocation values in .opd section. + findOPDEntrySection(Obj, ObjSectionToID, Value); + uint8_t *RelocTarget = Sections[Value.SectionID].Address + Value.Addend; + int32_t delta = static_cast<int32_t>(Target - RelocTarget); + // If it is within 24-bits branch range, just set the branch target + if (SignExtend32<24>(delta) == delta) { + RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend); + if (Value.SymbolName) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } else { + RangeOverflow = true; + } + } + if (SymType == SymbolRef::ST_Unknown || RangeOverflow == true) { + // It is an external symbol (SymbolRef::ST_Unknown) or within a range + // larger than 24-bits. 
+ StubMap::const_iterator i = Stubs.find(Value); + if (i != Stubs.end()) { + // Symbol function stub already created, just relocate to it + resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + + i->second, RelType, 0); + DEBUG(dbgs() << " Stub function found\n"); + } else { + // Create a new stub function. + DEBUG(dbgs() << " Create a new stub function\n"); + Stubs[Value] = Section.StubOffset; + uint8_t *StubTargetAddr = createStubFunction(Section.Address + + Section.StubOffset); + RelocationEntry RE(Rel.SectionID, StubTargetAddr - Section.Address, + ELF::R_PPC64_ADDR64, Value.Addend); + + // Generates the 64-bits address loads as exemplified in section + // 4.5.1 in PPC64 ELF ABI. + RelocationEntry REhst(Rel.SectionID, + StubTargetAddr - Section.Address + 2, + ELF::R_PPC64_ADDR16_HIGHEST, Value.Addend); + RelocationEntry REhr(Rel.SectionID, + StubTargetAddr - Section.Address + 6, + ELF::R_PPC64_ADDR16_HIGHER, Value.Addend); + RelocationEntry REh(Rel.SectionID, + StubTargetAddr - Section.Address + 14, + ELF::R_PPC64_ADDR16_HI, Value.Addend); + RelocationEntry REl(Rel.SectionID, + StubTargetAddr - Section.Address + 18, + ELF::R_PPC64_ADDR16_LO, Value.Addend); + + if (Value.SymbolName) { + addRelocationForSymbol(REhst, Value.SymbolName); + addRelocationForSymbol(REhr, Value.SymbolName); + addRelocationForSymbol(REh, Value.SymbolName); + addRelocationForSymbol(REl, Value.SymbolName); + } else { + addRelocationForSection(REhst, Value.SectionID); + addRelocationForSection(REhr, Value.SectionID); + addRelocationForSection(REh, Value.SectionID); + addRelocationForSection(REl, Value.SectionID); + } + + resolveRelocation(Target, (uint64_t)Target, (uint64_t)Section.Address + + Section.StubOffset, RelType, 0); + if (SymType == SymbolRef::ST_Unknown) + // Restore the TOC for external calls + writeInt32BE(Target+4, 0xE8410028); // ld r2,40(r1) + Section.StubOffset += getMaxStubSize(); + } + } + } else { + RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, 
Value.Addend); + // Extra check to avoid relocation againt empty symbols (usually + // the R_PPC64_TOC). + if (Value.SymbolName && !TargetName.empty()) + addRelocationForSymbol(RE, Value.SymbolName); + else + addRelocationForSection(RE, Value.SectionID); + } } else { RelocationEntry RE(Rel.SectionID, Rel.Offset, RelType, Value.Addend); if (Value.SymbolName) diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 3011a06537..6c31f0dc12 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -46,6 +46,12 @@ protected: uint32_t Type, int32_t Addend); + void resolvePPC64Relocation(uint8_t *LocalAddress, + uint64_t FinalAddress, + uint64_t Value, + uint32_t Type, + int64_t Addend); + virtual void resolveRelocation(uint8_t *LocalAddress, uint64_t FinalAddress, uint64_t Value, @@ -60,6 +66,11 @@ protected: virtual ObjectImage *createObjectImage(ObjectBuffer *InputBuffer); + uint64_t findPPC64TOC() const; + void findOPDEntrySection(ObjectImage &Obj, + ObjSectionToIDMap &LocalSections, + RelocationValueRef &Rel); + public: RuntimeDyldELF(RTDyldMemoryManager *mm) : RuntimeDyldImpl(mm) {} diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index a9733407ec..45633e735c 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -24,6 +24,8 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Format.h" +#include "llvm/Support/Host.h" +#include "llvm/Support/SwapByteOrder.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/system_error.h" #include <map> @@ -41,6 +43,9 @@ class Twine; /// linker. class SectionEntry { public: + /// Name - section name. + StringRef Name; + /// Address - address in the linker's memory where the section resides. 
uint8_t *Address; @@ -61,9 +66,9 @@ public: /// for calculating relocations in some object formats (like MachO). uintptr_t ObjAddress; - SectionEntry(uint8_t *address, size_t size, uintptr_t stubOffset, - uintptr_t objAddress) - : Address(address), Size(size), LoadAddress((uintptr_t)address), + SectionEntry(StringRef name, uint8_t *address, size_t size, + uintptr_t stubOffset, uintptr_t objAddress) + : Name(name), Address(address), Size(size), LoadAddress((uintptr_t)address), StubOffset(stubOffset), ObjAddress(objAddress) {} }; @@ -163,6 +168,8 @@ protected: return 8; // 32-bit instruction and 32-bit address else if (Arch == Triple::mipsel) return 16; + else if (Arch == Triple::ppc64) + return 44; else return 0; } @@ -185,6 +192,35 @@ protected: return (uint8_t*)Sections[SectionID].Address; } + void writeInt16BE(uint8_t *Addr, uint16_t Value) { + if (sys::isLittleEndianHost()) + Value = sys::SwapByteOrder(Value); + *Addr = (Value >> 8) & 0xFF; + *(Addr+1) = Value & 0xFF; + } + + void writeInt32BE(uint8_t *Addr, uint32_t Value) { + if (sys::isLittleEndianHost()) + Value = sys::SwapByteOrder(Value); + *Addr = (Value >> 24) & 0xFF; + *(Addr+1) = (Value >> 16) & 0xFF; + *(Addr+2) = (Value >> 8) & 0xFF; + *(Addr+3) = Value & 0xFF; + } + + void writeInt64BE(uint8_t *Addr, uint64_t Value) { + if (sys::isLittleEndianHost()) + Value = sys::SwapByteOrder(Value); + *Addr = (Value >> 56) & 0xFF; + *(Addr+1) = (Value >> 48) & 0xFF; + *(Addr+2) = (Value >> 40) & 0xFF; + *(Addr+3) = (Value >> 32) & 0xFF; + *(Addr+4) = (Value >> 24) & 0xFF; + *(Addr+5) = (Value >> 16) & 0xFF; + *(Addr+6) = (Value >> 8) & 0xFF; + *(Addr+7) = Value & 0xFF; + } + /// \brief Given the common symbols discovered in the object file, emit a /// new section for them and update the symbol mappings in the object and /// symbol table. 
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index 02b4c3c4c1..a94d51bb74 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -133,6 +133,11 @@ class ELFObjectWriter : public MCObjectWriter { bool IsPCRel) const { return TargetObjectWriter->ExplicitRelSym(Asm, Target, F, Fixup, IsPCRel); } + const MCSymbol *undefinedExplicitRelSym(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + return TargetObjectWriter->undefinedExplicitRelSym(Target, Fixup, IsPCRel); + } bool is64Bit() const { return TargetObjectWriter->is64Bit(); } bool hasRelocationAddend() const { @@ -639,7 +644,7 @@ const MCSymbol *ELFObjectWriter::SymbolToReloc(const MCAssembler &Asm, if (ASymbol.isUndefined()) { if (Renamed) return Renamed; - return &ASymbol; + return undefinedExplicitRelSym(Target, Fixup, IsPCRel); } if (SD.isExternal()) { @@ -721,10 +726,13 @@ void ELFObjectWriter::RecordRelocation(const MCAssembler &Asm, MCSymbolData &SD = Asm.getSymbolData(ASymbol); MCFragment *F = SD.getFragment(); - Index = F->getParent()->getOrdinal() + 1; - - // Offset of the symbol in the section - Value += Layout.getSymbolOffset(&SD); + if (F) { + Index = F->getParent()->getOrdinal() + 1; + // Offset of the symbol in the section + Value += Layout.getSymbolOffset(&SD); + } else { + Index = 0; + } } else { if (Asm.getSymbolData(Symbol).getFlags() & ELF_Other_Weakref) WeakrefUsedInReloc.insert(RelocSymbol); diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp index 6eb6914f4b..74cd042a0f 100644 --- a/lib/MC/MCELFObjectTargetWriter.cpp +++ b/lib/MC/MCELFObjectTargetWriter.cpp @@ -9,6 +9,8 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" using namespace llvm; @@ -35,6 +37,12 @@ const MCSymbol *MCELFObjectTargetWriter::ExplicitRelSym(const MCAssembler &Asm, return NULL; } +const MCSymbol 
*MCELFObjectTargetWriter::undefinedExplicitRelSym(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + const MCSymbol &Symbol = Target.getSymA()->getSymbol(); + return &Symbol.AliasedSymbol(); +} void MCELFObjectTargetWriter::adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp index 847bcc0a16..41d90abeeb 100644 --- a/lib/MC/MCInstPrinter.cpp +++ b/lib/MC/MCInstPrinter.cpp @@ -36,3 +36,17 @@ void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) { OS << " " << MAI.getCommentString() << " " << Annot; } } + +/// Utility functions to make adding mark ups simpler. +StringRef MCInstPrinter::markup(StringRef s) const { + if (getUseMarkup()) + return s; + else + return ""; +} +StringRef MCInstPrinter::markup(StringRef a, StringRef b) const { + if (getUseMarkup()) + return a; + else + return b; +} diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index b1cc08d56e..a1f0a2a885 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -207,7 +207,7 @@ public: bool ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, - SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::pair<void *,bool> > &OpDecls, SmallVectorImpl<std::string> &Constraints, SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII, @@ -3658,7 +3658,8 @@ enum AsmRewriteKind { AOK_Input, AOK_Output, AOK_SizeDirective, - AOK_Emit + AOK_Emit, + AOK_Skip }; struct AsmRewrite { @@ -3690,14 +3691,16 @@ bool AsmParser::ParseDirectiveEmit(SMLoc IDLoc, ParseStatementInfo &Info) { bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned &NumOutputs, unsigned &NumInputs, - SmallVectorImpl<void *> &OpDecls, + SmallVectorImpl<std::pair<void *, bool> > &OpDecls, SmallVectorImpl<std::string> &Constraints, SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII, const MCInstPrinter 
*IP, MCAsmParserSemaCallback &SI) { - SmallVector<void*, 4> InputDecls; - SmallVector<void*, 4> OutputDecls; + SmallVector<void *, 4> InputDecls; + SmallVector<void *, 4> OutputDecls; + SmallVector<bool, 4> InputDeclsOffsetOf; + SmallVector<bool, 4> OutputDeclsOffsetOf; SmallVector<std::string, 4> InputConstraints; SmallVector<std::string, 4> OutputConstraints; std::set<std::string> ClobberRegs; @@ -3725,13 +3728,13 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, // Immediate. if (Operand->isImm()) { AsmStrRewrites.push_back(AsmRewrite(AOK_Imm, - Operand->getStartLoc(), - Operand->getNameLen())); + Operand->getStartLoc(), + Operand->getNameLen())); continue; } // Register operand. - if (Operand->isReg()) { + if (Operand->isReg() && !Operand->isOffsetOf()) { unsigned NumDefs = Desc.getNumDefs(); // Clobber. if (NumDefs && Operand->getMCOperandNum() < NumDefs) { @@ -3749,26 +3752,33 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, Size); if (OpDecl) { bool isOutput = (i == 1) && Desc.mayStore(); - if (Operand->needSizeDirective()) + if (!Operand->isOffsetOf() && Operand->needSizeDirective()) AsmStrRewrites.push_back(AsmRewrite(AOK_SizeDirective, - Operand->getStartLoc(), 0, - Operand->getMemSize())); - + Operand->getStartLoc(), 0, + Operand->getMemSize())); + + // Don't emit the offset directive. 
+ if (Operand->isOffsetOf()) + AsmStrRewrites.push_back(AsmRewrite(AOK_Skip, + Operand->getOffsetOfLoc(), 7)); + if (isOutput) { std::string Constraint = "="; ++InputIdx; OutputDecls.push_back(OpDecl); + OutputDeclsOffsetOf.push_back(Operand->isOffsetOf()); Constraint += Operand->getConstraint().str(); OutputConstraints.push_back(Constraint); AsmStrRewrites.push_back(AsmRewrite(AOK_Output, - Operand->getStartLoc(), - Operand->getNameLen())); + Operand->getStartLoc(), + Operand->getNameLen())); } else { InputDecls.push_back(OpDecl); + InputDeclsOffsetOf.push_back(Operand->isOffsetOf()); InputConstraints.push_back(Operand->getConstraint().str()); AsmStrRewrites.push_back(AsmRewrite(AOK_Input, - Operand->getStartLoc(), - Operand->getNameLen())); + Operand->getStartLoc(), + Operand->getNameLen())); } } } @@ -3789,13 +3799,15 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, unsigned NumExprs = NumOutputs + NumInputs; OpDecls.resize(NumExprs); Constraints.resize(NumExprs); + // FIXME: Constraints are hard coded to 'm', but we need an 'r' + // constraint for offsetof. This needs to be cleaned up! for (unsigned i = 0; i < NumOutputs; ++i) { - OpDecls[i] = OutputDecls[i]; - Constraints[i] = OutputConstraints[i]; + OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsOffsetOf[i]); + Constraints[i] = OutputDeclsOffsetOf[i] ? "=r" : OutputConstraints[i]; } for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) { - OpDecls[j] = InputDecls[i]; - Constraints[j] = InputConstraints[i]; + OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsOffsetOf[i]); + Constraints[j] = InputDeclsOffsetOf[i] ? "r" : InputConstraints[i]; } } @@ -3816,8 +3828,15 @@ bool AsmParser::ParseMSInlineAsm(void *AsmLoc, std::string &AsmString, OS << StringRef(Start, Loc - Start); PrevKind = Kind; + // Skip the original expression. + if (Kind == AOK_Skip) { + Start = Loc + (*I).Len; + continue; + } + // Rewrite expressions in $N notation. 
switch (Kind) { + default: break; case AOK_Imm: OS << Twine("$$") + StringRef(Loc, (*I).Len); break; diff --git a/lib/TableGen/Error.cpp b/lib/TableGen/Error.cpp index 5dd688cb67..ad98fba9ba 100644 --- a/lib/TableGen/Error.cpp +++ b/lib/TableGen/Error.cpp @@ -16,6 +16,8 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/raw_ostream.h" +#include <cstdlib> + namespace llvm { SourceMgr SrcMgr; @@ -63,4 +65,14 @@ void PrintError(const TGError &Error) { PrintError(Error.getLoc(), Error.getMessage()); } +void PrintFatalError(const std::string &Msg) { + PrintError(Twine(Msg)); + std::exit(1); +} + +void PrintFatalError(ArrayRef<SMLoc> ErrorLoc, const std::string &Msg) { + PrintError(ErrorLoc, Msg); + std::exit(1); +} + } // end namespace llvm diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 874ef43b90..bf50081cc7 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -296,15 +296,13 @@ class RegConstraint<string C> { // ARM specific transformation functions and pattern fragments. // -// imm_neg_XFORM - Return a imm value packed into the format described for -// imm_neg defs below. +// imm_neg_XFORM - Return the negation of an i32 immediate value. def imm_neg_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(-(int)N->getZExtValue(), MVT::i32); }]>; -// so_imm_not_XFORM - Return a so_imm value packed into the format described for -// so_imm_not def below. -def so_imm_not_XFORM : SDNodeXForm<imm, [{ +// imm_not_XFORM - Return the complement of a i32 immediate value. 
+def imm_not_XFORM : SDNodeXForm<imm, [{ return CurDAG->getTargetConstant(~(int)N->getZExtValue(), MVT::i32); }]>; @@ -327,7 +325,7 @@ def so_imm_neg : Operand<i32>, PatLeaf<(imm), [{ def so_imm_not_asmoperand : AsmOperandClass { let Name = "ARMSOImmNot"; } def so_imm_not : Operand<i32>, PatLeaf<(imm), [{ return ARM_AM::getSOImmVal(~(uint32_t)N->getZExtValue()) != -1; - }], so_imm_not_XFORM> { + }], imm_not_XFORM> { let ParserMatchClass = so_imm_not_asmoperand; } @@ -3269,6 +3267,8 @@ def : ARMPat<(ARMaddc GPR:$src, imm0_65535_neg:$imm), // for part of the negation. def : ARMPat<(ARMadde GPR:$src, so_imm_not:$imm, CPSR), (SBCri GPR:$src, so_imm_not:$imm)>; +def : ARMPat<(ARMadde GPR:$src, imm0_65535_neg:$imm, CPSR), + (SBCrr GPR:$src, (MOVi16 (imm_not_XFORM imm:$imm)))>; // Note: These are implemented in C++ code, because they have to generate // ADD/SUBrs instructions, which use a complex pattern that a xform function diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 37b280f447..e10f4a865e 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -1953,7 +1953,7 @@ def : T2Pat<(ARMadde rGPR:$src, imm0_255_not:$imm, CPSR), def : T2Pat<(ARMadde rGPR:$src, t2_so_imm_not:$imm, CPSR), (t2SBCri rGPR:$src, t2_so_imm_not:$imm)>; def : T2Pat<(ARMadde rGPR:$src, imm0_65535_neg:$imm, CPSR), - (t2SBCrr rGPR:$src, (t2MOVi16 (imm_neg_XFORM imm:$imm)))>; + (t2SBCrr rGPR:$src, (t2MOVi16 (imm_not_XFORM imm:$imm)))>; // Select Bytes -- for disassembly only diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index 4c44f69f4d..cb3ac4d1f6 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -155,7 +155,8 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, TargetLowering::ArgListEntry Entry; // First argument: data pointer - Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*DAG.getContext()); + unsigned AS = 
DstPtrInfo.getAddrSpace(); + Type *IntPtrTy = TLI.getDataLayout()->getIntPtrType(*DAG.getContext(), AS); Entry.Node = Dst; Entry.Ty = IntPtrTy; Args.push_back(Entry); diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index 30fa6bc2c7..740548adbc 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -79,7 +79,7 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), FrameLowering(Subtarget), - STTI(&TLInfo) { + STTI(&TLInfo), VTTI(&TLInfo) { if (!Subtarget.hasARMOps()) report_fatal_error("CPU: '" + Subtarget.getCPUString() + "' does not " "support ARM mode execution!"); @@ -113,7 +113,7 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, FrameLowering(Subtarget.hasThumb2() ? new ARMFrameLowering(Subtarget) : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)), - STTI(&TLInfo){ + STTI(&TLInfo), VTTI(&TLInfo) { } namespace { diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index 227689f897..ddb38687d0 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -47,7 +47,7 @@ static void printRegImmShift(raw_ostream &O, ARM_AM::ShiftOpc ShOpc, assert (!(ShOpc == ARM_AM::ror && !ShImm) && "Cannot have ror #0"); O << getShiftOpcStr(ShOpc); - if (ShOpc != ARM_AM::rrx){ + if (ShOpc != ARM_AM::rrx) { O << " "; if (UseMarkup) O << "<imm:"; @@ -67,11 +67,9 @@ ARMInstPrinter::ARMInstPrinter(const MCAsmInfo &MAI, } void ARMInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - if (UseMarkup) - OS << "<reg:"; - OS << getRegisterName(RegNo); - if (UseMarkup) - OS << ">"; + OS << markup("<reg:") + << getRegisterName(RegNo) + << markup(">"); } void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, @@ -143,12 +141,10 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - O << ", "; - 
if (UseMarkup) - O << "<imm:"; - O << "#" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())); - if (UseMarkup) - O << ">"; + O << ", " + << markup("<imm:") + << "#" << translateShiftImm(ARM_AM::getSORegOffset(MO2.getImm())) + << markup(">"); printAnnotation(O, Annot); return; } @@ -332,11 +328,9 @@ void ARMInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, unsigned Reg = Op.getReg(); printRegName(O, Reg); } else if (Op.isImm()) { - if (UseMarkup) - O << "<imm:"; - O << '#' << Op.getImm(); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '#' << Op.getImm() + << markup(">"); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); // If a symbolic branch target was added as a constant expression then print @@ -360,18 +354,9 @@ void ARMInstPrinter::printT2LdrLabelOperand(const MCInst *MI, unsigned OpNum, if (MO1.isExpr()) O << *MO1.getExpr(); else if (MO1.isImm()) { - if (UseMarkup) - O << "<mem:"; - O << "[pc, "; - if (UseMarkup) - O << "<imm:"; - O << "#"; - O << MO1.getImm(); - if (UseMarkup) - O << ">"; - O << "]"; - if (UseMarkup) - O << ">"; + O << markup("<mem:") << "[pc, " + << markup("<imm:") << "#" << MO1.getImm() + << markup(">]>", "]"); } else llvm_unreachable("Unknown LDR label operand?"); @@ -424,25 +409,19 @@ void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); if (!MO2.getReg()) { if (ARM_AM::getAM2Offset(MO3.getImm())) { // Don't print +0. 
- O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#"; - O << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())); - O << ARM_AM::getAM2Offset(MO3.getImm()); - if (UseMarkup) - O << ">"; + O << ", " + << markup("<imm:") + << "#" + << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO3.getImm())) + << ARM_AM::getAM2Offset(MO3.getImm()) + << markup(">"); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); return; } @@ -452,45 +431,29 @@ void ARMInstPrinter::printAM2PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, printRegImmShift(O, ARM_AM::getAM2ShiftOpc(MO3.getImm()), ARM_AM::getAM2Offset(MO3.getImm()), UseMarkup); - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printAddrModeTBB(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(Op); const MCOperand &MO2 = MI->getOperand(Op+1); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); O << ", "; printRegName(O, MO2.getReg()); - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printAddrModeTBH(const MCInst *MI, unsigned Op, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(Op); const MCOperand &MO2 = MI->getOperand(Op+1); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); O << ", "; printRegName(O, MO2.getReg()); - O << ", lsl "; - if (UseMarkup) - O << "<imm:"; - O << "#1"; - if (UseMarkup) - O << ">"; - O << "]"; - if (UseMarkup) - O << ">"; + O << ", lsl " << markup("<imm:") << "#1" << markup(">") << "]" << markup(">"); } void ARMInstPrinter::printAddrMode2Operand(const MCInst *MI, unsigned Op, @@ -520,13 +483,10 @@ void ARMInstPrinter::printAddrMode2OffsetOperand(const MCInst *MI, if (!MO1.getReg()) { unsigned ImmOffs = ARM_AM::getAM2Offset(MO2.getImm()); - if (UseMarkup) - O << "<imm:"; - O << '#' - << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) - << ImmOffs; - 
if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM2Op(MO2.getImm())) + << ImmOffs + << markup(">"); return; } @@ -547,13 +507,9 @@ void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op, const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); - O << "], "; - if (UseMarkup) - O << ">"; + O << "], " << markup(">"); if (MO2.getReg()) { O << (char)ARM_AM::getAM3Op(MO3.getImm()); @@ -562,13 +518,11 @@ void ARMInstPrinter::printAM3PostIndexOp(const MCInst *MI, unsigned Op, } unsigned ImmOffs = ARM_AM::getAM3Offset(MO3.getImm()); - if (UseMarkup) - O << "<imm:"; - O << '#' + O << markup("<imm:") + << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())) - << ImmOffs; - if (UseMarkup) - O << ">"; + << ImmOffs + << markup(">"); } void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, @@ -577,18 +531,13 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, const MCOperand &MO2 = MI->getOperand(Op+1); const MCOperand &MO3 = MI->getOperand(Op+2); - if (UseMarkup) - O << "<mem:"; - O << '['; + O << markup("<mem:") << '['; printRegName(O, MO1.getReg()); if (MO2.getReg()) { - O << ", "; - O << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())); + O << ", " << getAddrOpcStr(ARM_AM::getAM3Op(MO3.getImm())); printRegName(O, MO2.getReg()); - O << ']'; - if (UseMarkup) - O << ">"; + O << ']' << markup(">"); return; } @@ -597,18 +546,14 @@ void ARMInstPrinter::printAM3PreOrOffsetIndexOp(const MCInst *MI, unsigned Op, ARM_AM::AddrOpc op = ARM_AM::getAM3Op(MO3.getImm()); if (ImmOffs || (op == ARM_AM::sub)) { - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#" + O << ", " + << markup("<imm:") + << "#" << ARM_AM::getAddrOpcStr(op) - << ImmOffs; - if (UseMarkup) - O << ">"; + << ImmOffs + << markup(">"); } - O << ']'; - if 
(UseMarkup) - O << ">"; + O << ']' << markup(">"); } void ARMInstPrinter::printAddrMode3Operand(const MCInst *MI, unsigned Op, @@ -642,13 +587,9 @@ void ARMInstPrinter::printAddrMode3OffsetOperand(const MCInst *MI, } unsigned ImmOffs = ARM_AM::getAM3Offset(MO2.getImm()); - if (UseMarkup) - O << "<imm:"; - O << '#' - << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) - << ImmOffs; - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '#' << ARM_AM::getAddrOpcStr(ARM_AM::getAM3Op(MO2.getImm())) << ImmOffs + << markup(">"); } void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, @@ -656,11 +597,9 @@ void ARMInstPrinter::printPostIdxImm8Operand(const MCInst *MI, raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); unsigned Imm = MO.getImm(); - if (UseMarkup) - O << "<imm:"; - O << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '#' << ((Imm & 256) ? "" : "-") << (Imm & 0xff) + << markup(">"); } void ARMInstPrinter::printPostIdxRegOperand(const MCInst *MI, unsigned OpNum, @@ -677,11 +616,9 @@ void ARMInstPrinter::printPostIdxImm8s4Operand(const MCInst *MI, raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); unsigned Imm = MO.getImm(); - if (UseMarkup) - O << "<imm:"; - O << '#' << ((Imm & 256) ? "" : "-") << ((Imm & 0xff) << 2); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '#' << ((Imm & 256) ? 
"" : "-") << ((Imm & 0xff) << 2) + << markup(">"); } @@ -702,26 +639,20 @@ void ARMInstPrinter::printAddrMode5Operand(const MCInst *MI, unsigned OpNum, return; } - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); unsigned ImmOffs = ARM_AM::getAM5Offset(MO2.getImm()); unsigned Op = ARM_AM::getAM5Op(MO2.getImm()); if (ImmOffs || Op == ARM_AM::sub) { - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#" + O << ", " + << markup("<imm:") + << "#" << ARM_AM::getAddrOpcStr(ARM_AM::getAM5Op(MO2.getImm())) - << ImmOffs * 4; - if (UseMarkup) - O << ">"; + << ImmOffs * 4 + << markup(">"); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, @@ -729,29 +660,21 @@ void ARMInstPrinter::printAddrMode6Operand(const MCInst *MI, unsigned OpNum, const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); if (MO2.getImm()) { // FIXME: Both darwin as and GNU as violate ARM docs here. 
O << ", :" << (MO2.getImm() << 3); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printAddrMode7Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printAddrMode6OffsetOperand(const MCInst *MI, @@ -774,17 +697,9 @@ void ARMInstPrinter::printBitfieldInvMaskImmOperand(const MCInst *MI, int32_t lsb = CountTrailingZeros_32(v); int32_t width = (32 - CountLeadingZeros_32 (v)) - lsb; assert(MO.isImm() && "Not a valid bf_inv_mask_imm value!"); - if (UseMarkup) - O << "<imm:"; - O << '#' << lsb; - if (UseMarkup) - O << ">"; - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << '#' << width; - if (UseMarkup) - O << ">"; + O << markup("<imm:") << '#' << lsb << markup(">") + << ", " + << markup("<imm:") << '#' << width << markup(">"); } void ARMInstPrinter::printMemBOption(const MCInst *MI, unsigned OpNum, @@ -799,20 +714,16 @@ void ARMInstPrinter::printShiftImmOperand(const MCInst *MI, unsigned OpNum, bool isASR = (ShiftOp & (1 << 5)) != 0; unsigned Amt = ShiftOp & 0x1f; if (isASR) { - O << ", asr "; - if (UseMarkup) - O << "<imm:"; - O << "#" << (Amt == 0 ? 32 : Amt); - if (UseMarkup) - O << ">"; + O << ", asr " + << markup("<imm:") + << "#" << (Amt == 0 ? 
32 : Amt) + << markup(">"); } else if (Amt) { - O << ", lsl "; - if (UseMarkup) - O << "<imm:"; - O << "#" << Amt; - if (UseMarkup) - O << ">"; + O << ", lsl " + << markup("<imm:") + << "#" << Amt + << markup(">"); } } @@ -822,12 +733,7 @@ void ARMInstPrinter::printPKHLSLShiftImm(const MCInst *MI, unsigned OpNum, if (Imm == 0) return; assert(Imm > 0 && Imm < 32 && "Invalid PKH shift immediate value!"); - O << ", lsl "; - if (UseMarkup) - O << "<imm:"; - O << "#" << Imm; - if (UseMarkup) - O << ">"; + O << ", lsl " << markup("<imm:") << "#" << Imm << markup(">"); } void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, @@ -837,12 +743,7 @@ void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, if (Imm == 0) Imm = 32; assert(Imm > 0 && Imm <= 32 && "Invalid PKH shift immediate value!"); - O << ", asr "; - if (UseMarkup) - O << "<imm:"; - O << "#" << Imm; - if (UseMarkup) - O << ">"; + O << ", asr " << markup("<imm:") << "#" << Imm << markup(">"); } void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, @@ -1024,35 +925,29 @@ void ARMInstPrinter::printAdrLabelOperand(const MCInst *MI, unsigned OpNum, int32_t OffImm = (int32_t)MO.getImm(); - if (UseMarkup) - O << "<imm:"; + O << markup("<imm:"); if (OffImm == INT32_MIN) O << "#-0"; else if (OffImm < 0) O << "#-" << -OffImm; else O << "#" << OffImm; - if (UseMarkup) - O << ">"; + O << markup(">"); } void ARMInstPrinter::printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - if (UseMarkup) - O << "<imm:"; - O << "#" << MI->getOperand(OpNum).getImm() * 4; - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << "#" << MI->getOperand(OpNum).getImm() * 4 + << markup(">"); } void ARMInstPrinter::printThumbSRImm(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); - if (UseMarkup) - O << "<imm:"; - O << "#" << (Imm == 0 ? 
32 : Imm); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << "#" << (Imm == 0 ? 32 : Imm) + << markup(">"); } void ARMInstPrinter::printThumbITMask(const MCInst *MI, unsigned OpNum, @@ -1082,17 +977,13 @@ void ARMInstPrinter::printThumbAddrModeRROperand(const MCInst *MI, unsigned Op, return; } - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); if (unsigned RegNum = MO2.getReg()) { O << ", "; printRegName(O, RegNum); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, @@ -1107,21 +998,15 @@ void ARMInstPrinter::printThumbAddrModeImm5SOperand(const MCInst *MI, return; } - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); if (unsigned ImmOffs = MO2.getImm()) { - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#" << ImmOffs * Scale; - if (UseMarkup) - O << ">"; + O << ", " + << markup("<imm:") + << "#" << ImmOffs * Scale + << markup(">"); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printThumbAddrModeImm5S1Operand(const MCInst *MI, @@ -1175,9 +1060,7 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, return; } - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); int32_t OffImm = (int32_t)MO2.getImm(); @@ -1186,24 +1069,18 @@ void ARMInstPrinter::printAddrModeImm12Operand(const MCInst *MI, unsigned OpNum, if (OffImm == INT32_MIN) OffImm = 0; if (isSub) { - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#-" << -OffImm; - if (UseMarkup) - O << ">"; + O << ", " + << markup("<imm:") + << "#-" << -OffImm + << markup(">"); } else if (OffImm > 0) { - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#" << OffImm; - if (UseMarkup) - O << ">"; + O << ", " + << markup("<imm:") + << "#" << OffImm + << markup(">"); } - O << "]"; - if 
(UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, @@ -1212,9 +1089,7 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); int32_t OffImm = (int32_t)MO2.getImm(); @@ -1231,9 +1106,7 @@ void ARMInstPrinter::printT2AddrModeImm8Operand(const MCInst *MI, O << "#" << OffImm; if (OffImm != 0 && UseMarkup) O << ">"; - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, @@ -1247,9 +1120,7 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, return; } - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); int32_t OffImm = (int32_t)MO2.getImm(); @@ -1269,9 +1140,7 @@ void ARMInstPrinter::printT2AddrModeImm8s4Operand(const MCInst *MI, O << "#" << OffImm; if (OffImm != 0 && UseMarkup) O << ">"; - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI, @@ -1280,21 +1149,15 @@ void ARMInstPrinter::printT2AddrModeImm0_1020s4Operand(const MCInst *MI, const MCOperand &MO1 = MI->getOperand(OpNum); const MCOperand &MO2 = MI->getOperand(OpNum+1); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); if (MO2.getImm()) { - O << ", "; - if (UseMarkup) - O << "<imm:"; - O << "#" << MO2.getImm() * 4; - if (UseMarkup) - O << ">"; + O << ", " + << markup("<imm:") + << "#" << MO2.getImm() * 4 + << markup(">"); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI, @@ -1302,15 +1165,12 @@ void 
ARMInstPrinter::printT2AddrModeImm8OffsetOperand(const MCInst *MI, raw_ostream &O) { const MCOperand &MO1 = MI->getOperand(OpNum); int32_t OffImm = (int32_t)MO1.getImm(); - O << ", "; - if (UseMarkup) - O << "<imm:"; + O << ", " << markup("<imm:"); if (OffImm < 0) O << "#-" << -OffImm; else O << "#" << OffImm; - if (UseMarkup) - O << ">"; + O << markup(">"); } void ARMInstPrinter::printT2AddrModeImm8s4OffsetOperand(const MCInst *MI, @@ -1343,9 +1203,7 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, const MCOperand &MO2 = MI->getOperand(OpNum+1); const MCOperand &MO3 = MI->getOperand(OpNum+2); - if (UseMarkup) - O << "<mem:"; - O << "["; + O << markup("<mem:") << "["; printRegName(O, MO1.getReg()); assert(MO2.getReg() && "Invalid so_reg load / store address!"); @@ -1355,26 +1213,20 @@ void ARMInstPrinter::printT2AddrModeSoRegOperand(const MCInst *MI, unsigned ShAmt = MO3.getImm(); if (ShAmt) { assert(ShAmt <= 3 && "Not a valid Thumb2 addressing mode!"); - O << ", lsl "; - if (UseMarkup) - O << "<imm:"; - O << "#" << ShAmt; - if (UseMarkup) - O << ">"; + O << ", lsl " + << markup("<imm:") + << "#" << ShAmt + << markup(">"); } - O << "]"; - if (UseMarkup) - O << ">"; + O << "]" << markup(">"); } void ARMInstPrinter::printFPImmOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { const MCOperand &MO = MI->getOperand(OpNum); - if (UseMarkup) - O << "<imm:"; - O << '#' << ARM_AM::getFPImmFloat(MO.getImm()); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '#' << ARM_AM::getFPImmFloat(MO.getImm()) + << markup(">"); } void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, @@ -1382,22 +1234,18 @@ void ARMInstPrinter::printNEONModImmOperand(const MCInst *MI, unsigned OpNum, unsigned EncodedImm = MI->getOperand(OpNum).getImm(); unsigned EltBits; uint64_t Val = ARM_AM::decodeNEONModImm(EncodedImm, EltBits); - if (UseMarkup) - O << "<imm:"; - O << "#0x"; + O << markup("<imm:") + << "#0x"; O.write_hex(Val); - if 
(UseMarkup) - O << ">"; + O << markup(">"); } void ARMInstPrinter::printImmPlusOneOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { unsigned Imm = MI->getOperand(OpNum).getImm(); - if (UseMarkup) - O << "<imm:"; - O << "#" << Imm + 1; - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << "#" << Imm + 1 + << markup(">"); } void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum, @@ -1405,45 +1253,35 @@ void ARMInstPrinter::printRotImmOperand(const MCInst *MI, unsigned OpNum, unsigned Imm = MI->getOperand(OpNum).getImm(); if (Imm == 0) return; - O << ", ror "; - if (UseMarkup) - O << "<imm:"; - O << "#"; + O << ", ror " + << markup("<imm:") + << "#"; switch (Imm) { default: assert (0 && "illegal ror immediate!"); case 1: O << "8"; break; case 2: O << "16"; break; case 3: O << "24"; break; } - if (UseMarkup) - O << ">"; + O << markup(">"); } void ARMInstPrinter::printFBits16(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - if (UseMarkup) - O << "<imm:"; - O << "#" << 16 - MI->getOperand(OpNum).getImm(); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << "#" << 16 - MI->getOperand(OpNum).getImm() + << markup(">"); } void ARMInstPrinter::printFBits32(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - if (UseMarkup) - O << "<imm:"; - O << "#" << 32 - MI->getOperand(OpNum).getImm(); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << "#" << 32 - MI->getOperand(OpNum).getImm() + << markup(">"); } void ARMInstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum, raw_ostream &O) { - if (UseMarkup) - O << "<mem:"; O << "[" << MI->getOperand(OpNum).getImm() << "]"; - if (UseMarkup) - O << ">"; } void ARMInstPrinter::printVectorListOne(const MCInst *MI, unsigned OpNum, diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index e92ad01e1d..918316572a 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -44,7 +44,7 @@ 
SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT, TLInfo(*this), TSInfo(*this), InstrItins(Subtarget.getInstrItineraryData()), - STTI(&TLInfo){ + STTI(&TLInfo), VTTI(&TLInfo) { } //===----------------------------------------------------------------------===// diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 353542a809..30866e9eeb 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -75,7 +75,7 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, StringRef TT, TSInfo(*this), FrameLowering(Subtarget), InstrItins(&Subtarget.getInstrItineraryData()), - STTI(&TLInfo) { + STTI(&TLInfo), VTTI(&TLInfo) { setMCUseCFI(false); } diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index cb5f46062d..1ae2baa198 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -42,7 +42,8 @@ MBlazeTargetMachine(const Target &T, StringRef TT, InstrInfo(*this), FrameLowering(Subtarget), TLInfo(*this), TSInfo(*this), ELFWriterInfo(*this), - InstrItins(Subtarget.getInstrItineraryData()), STTI(&TLInfo) { + InstrItins(Subtarget.getInstrItineraryData()), + STTI(&TLInfo), VTTI(&TLInfo) { } namespace { diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 29ea681216..13e37b3735 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -36,7 +36,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, // FIXME: Check DataLayout string. DL("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"), InstrInfo(*this), TLInfo(*this), TSInfo(*this), - FrameLowering(Subtarget), STTI(&TLInfo) { } + FrameLowering(Subtarget), STTI(&TLInfo), VTTI(&TLInfo) { } namespace { /// MSP430 Code Generator Pass Configuration Options. 
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index da6a8d2a67..ae89cdd693 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -3601,10 +3601,11 @@ MipsTargetLowering::LowerReturn(SDValue Chain, if (!Reg) llvm_unreachable("sret virtual register not created in the entry block"); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); + unsigned V0 = IsN64 ? Mips::V0_64 : Mips::V0; - Chain = DAG.getCopyToReg(Chain, dl, IsN64 ? Mips::V0_64 : Mips::V0, Val, - Flag); + Chain = DAG.getCopyToReg(Chain, dl, V0, Val, Flag); Flag = Chain.getValue(1); + MF.getRegInfo().addLiveOut(V0); } // Return on Mips is always a "jr $ra" diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 5e8062373f..f610253f49 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -53,7 +53,7 @@ MipsTargetMachine(const Target &T, StringRef TT, InstrInfo(MipsInstrInfo::create(*this)), FrameLowering(MipsFrameLowering::create(*this, Subtarget)), TLInfo(*this), TSInfo(*this), JITInfo(), - ELFWriterInfo(false, isLittle), STTI(&TLInfo) { + ELFWriterInfo(false, isLittle), STTI(&TLInfo), VTTI(&TLInfo) { } void MipsebTargetMachine::anchor() { } diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 60822d0c05..a62db327e5 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -40,7 +40,7 @@ class MipsTargetMachine : public LLVMTargetMachine { MipsJITInfo JITInfo; MipsELFWriterInfo ELFWriterInfo; ScalarTargetTransformImpl STTI; - VectorTargetTransformInfo VTTI; + VectorTargetTransformImpl VTTI; public: MipsTargetMachine(const Target &T, StringRef TT, diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index c46094569e..971d1b89a8 100644 --- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp 
@@ -126,10 +126,9 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { return Base; // Truncate/sext the offset to the pointer size. - unsigned AS = PtrVal->getType()->isPointerTy() ? - cast<PointerType>(PtrVal->getType())->getAddressSpace() : 0; - if (TD.getPointerSizeInBits(AS) != 64) { - int SExtAmount = 64-TD.getPointerSizeInBits(AS); + unsigned PtrSize = TD.getPointerTypeSizeInBits(PtrVal->getType()); + if (PtrSize != 64) { + int SExtAmount = 64-PtrSize; Offset = (Offset << SExtAmount) >> SExtAmount; } @@ -151,7 +150,7 @@ const MCExpr *nvptx::LowerConstant(const Constant *CV, AsmPrinter &AP) { // Handle casts to pointers by changing them into casts to the appropriate // integer type. This promotes constant folding and simplifies this code. Constant *Op = CE->getOperand(0); - Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CV->getContext()), + Op = ConstantExpr::getIntegerCast(Op, TD.getIntPtrType(CE->getType()), false/*ZExt*/); return LowerConstant(Op, AP); } diff --git a/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/lib/Target/NVPTX/NVPTXTargetMachine.cpp index 7519b4a083..cbb490003d 100644 --- a/lib/Target/NVPTX/NVPTXTargetMachine.cpp +++ b/lib/Target/NVPTX/NVPTXTargetMachine.cpp @@ -73,7 +73,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, Subtarget(TT, CPU, FS, is64bit), DL(Subtarget.getDataLayout()), InstrInfo(*this), TLInfo(*this), TSInfo(*this), FrameLowering(*this,is64bit), - STTI(&TLInfo) + STTI(&TLInfo), VTTI(&TLInfo) /*FrameInfo(TargetFrameInfo::StackGrowsUp, 8, 0)*/ { } diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 1744738622..87ecb13a4c 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -29,9 +29,14 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case FK_Data_1: case FK_Data_2: case FK_Data_4: + case FK_Data_8: + case PPC::fixup_ppc_toc: return 
Value; + case PPC::fixup_ppc_lo14: + case PPC::fixup_ppc_toc16_ds: + return (Value & 0xffff) << 2; case PPC::fixup_ppc_brcond14: - return Value & 0x3ffc; + return Value & 0xfffc; case PPC::fixup_ppc_br24: return Value & 0x3fffffc; #if 0 @@ -41,6 +46,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case PPC::fixup_ppc_ha16: return ((Value >> 16) + ((Value & 0x8000) ? 1 : 0)) & 0xffff; case PPC::fixup_ppc_lo16: + case PPC::fixup_ppc_toc16: return Value & 0xffff; } } @@ -72,7 +78,10 @@ public: { "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_ppc_lo16", 16, 16, 0 }, { "fixup_ppc_ha16", 16, 16, 0 }, - { "fixup_ppc_lo14", 16, 14, 0 } + { "fixup_ppc_lo14", 16, 14, 0 }, + { "fixup_ppc_toc", 0, 64, 0 }, + { "fixup_ppc_toc16", 16, 16, 0 }, + { "fixup_ppc_toc16_ds", 16, 14, 0 } }; if (Kind < FirstTargetFixupKind) diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp index a19798157b..1518a60db8 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp @@ -11,6 +11,8 @@ #include "MCTargetDesc/PPCMCTargetDesc.h" #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCValue.h" using namespace llvm; @@ -21,9 +23,15 @@ namespace { virtual ~PPCELFObjectWriter(); protected: + virtual unsigned getRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const; virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, int64_t Addend) const; + virtual const MCSymbol *undefinedExplicitRelSym(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const; virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset); }; } @@ -36,11 +44,13 @@ PPCELFObjectWriter::PPCELFObjectWriter(bool Is64Bit, uint8_t OSABI) 
PPCELFObjectWriter::~PPCELFObjectWriter() { } -unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, - const MCFixup &Fixup, - bool IsPCRel, - bool IsRelocWithSymbol, - int64_t Addend) const { +unsigned PPCELFObjectWriter::getRelocTypeInner(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const +{ + MCSymbolRefExpr::VariantKind Modifier = Target.isAbsolute() ? + MCSymbolRefExpr::VK_None : Target.getSymA()->getKind(); + // determine the type of the relocation unsigned Type; if (IsPCRel) { @@ -61,7 +71,7 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, Type = ELF::R_PPC_ADDR24; break; case PPC::fixup_ppc_brcond14: - Type = ELF::R_PPC_ADDR14_BRTAKEN; // XXX: or BRNTAKEN?_ + Type = ELF::R_PPC_ADDR14; // XXX: or BRNTAKEN?_ break; case PPC::fixup_ppc_ha16: Type = ELF::R_PPC_ADDR16_HA; @@ -72,6 +82,26 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, case PPC::fixup_ppc_lo14: Type = ELF::R_PPC_ADDR14; break; + case PPC::fixup_ppc_toc: + Type = ELF::R_PPC64_TOC; + break; + case PPC::fixup_ppc_toc16: + Type = ELF::R_PPC64_TOC16; + break; + case PPC::fixup_ppc_toc16_ds: + Type = ELF::R_PPC64_TOC16_DS; + break; + case FK_Data_8: + switch (Modifier) { + default: llvm_unreachable("Unsupported Modifier"); + case MCSymbolRefExpr::VK_PPC_TOC: + Type = ELF::R_PPC64_TOC; + break; + case MCSymbolRefExpr::VK_None: + Type = ELF::R_PPC64_ADDR64; + break; + } + break; case FK_Data_4: Type = ELF::R_PPC_ADDR32; break; @@ -83,11 +113,41 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, return Type; } +unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel, + bool IsRelocWithSymbol, + int64_t Addend) const { + return getRelocTypeInner(Target, Fixup, IsPCRel); +} + +const MCSymbol *PPCELFObjectWriter::undefinedExplicitRelSym(const MCValue &Target, + const MCFixup &Fixup, + bool IsPCRel) const { + assert(Target.getSymA() && "SymA cannot be 0"); + const MCSymbol 
&Symbol = Target.getSymA()->getSymbol().AliasedSymbol(); + + unsigned RelocType = getRelocTypeInner(Target, Fixup, IsPCRel); + + // The .odp creation emits a relocation against the symbol ".TOC." which + // create a R_PPC64_TOC relocation. However the relocation symbol name + // in final object creation should be NULL, since the symbol does not + // really exist, it is just the reference to TOC base for the current + // object file. + bool EmitThisSym = RelocType != ELF::R_PPC64_TOC; + + if (EmitThisSym && !Symbol.isTemporary()) + return &Symbol; + return NULL; +} + void PPCELFObjectWriter:: adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { switch ((unsigned)Fixup.getKind()) { case PPC::fixup_ppc_ha16: case PPC::fixup_ppc_lo16: + case PPC::fixup_ppc_toc16: + case PPC::fixup_ppc_toc16_ds: RelocOffset += 2; break; default: diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h index b3c889e3f8..37b265e7fd 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h +++ b/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h @@ -34,6 +34,16 @@ enum Fixups { /// fixup_ppc_lo14 - A 14-bit fixup corresponding to lo16(_foo) for instrs /// like 'std'. fixup_ppc_lo14, + + /// fixup_ppc_toc - Insert value of TOC base (.TOC.). + fixup_ppc_toc, + + /// fixup_ppc_toc16 - A 16-bit signed fixup relative to the TOC base. 
+ fixup_ppc_toc16, + + /// fixup_ppc_toc16_ds - A 14-bit signed fixup relative to the TOC base with + /// implied 2 zero bits + fixup_ppc_toc16_ds, // Marker LastTargetFixupKind, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index 1fba5b8dc3..21183024a5 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -15,7 +15,9 @@ #include "MCTargetDesc/PPCBaseInfo.h" #include "MCTargetDesc/PPCFixupKinds.h" #include "llvm/MC/MCCodeEmitter.h" +#include "llvm/MC/MCSubtargetInfo.h" #include "llvm/MC/MCInst.h" +#include "llvm/MC/MCInstrInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Support/ErrorHandling.h" @@ -28,13 +30,25 @@ class PPCMCCodeEmitter : public MCCodeEmitter { PPCMCCodeEmitter(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION; void operator=(const PPCMCCodeEmitter &) LLVM_DELETED_FUNCTION; + const MCSubtargetInfo &STI; + Triple TT; + public: PPCMCCodeEmitter(const MCInstrInfo &mcii, const MCSubtargetInfo &sti, - MCContext &ctx) { + MCContext &ctx) + : STI(sti), TT(STI.getTargetTriple()) { } ~PPCMCCodeEmitter() {} + bool is64BitMode() const { + return (STI.getFeatureBits() & PPC::Feature64Bit) != 0; + } + + bool isSVR4ABI() const { + return TT.isMacOSX() == 0; + } + unsigned getDirectBrEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const; unsigned getCondBrEncoding(const MCInst &MI, unsigned OpNo, @@ -61,11 +75,19 @@ public: SmallVectorImpl<MCFixup> &Fixups) const; void EncodeInstruction(const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups) const { - unsigned Bits = getBinaryCodeForInstr(MI, Fixups); + uint64_t Bits = getBinaryCodeForInstr(MI, Fixups); + + // BL8_NOP_ELF and BLA8_NOP_ELF are both of size 8 because of the + // following 'nop'. + unsigned Size = 4; // FIXME: Have Desc.getSize() return the correct value! 
+ unsigned Opcode = MI.getOpcode(); + if (Opcode == PPC::BL8_NOP_ELF || Opcode == PPC::BLA8_NOP_ELF) + Size = 8; // Output the constant in big endian byte order. - for (unsigned i = 0; i != 4; ++i) { - OS << (char)(Bits >> 24); + int ShiftValue = (Size * 8) - 8; + for (unsigned i = 0; i != Size; ++i) { + OS << (char)(Bits >> ShiftValue); Bits <<= 8; } @@ -140,8 +162,12 @@ unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo, return (getMachineOpValue(MI, MO, Fixups) & 0xFFFF) | RegBits; // Add a fixup for the displacement field. - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_lo16)); + if (isSVR4ABI() && is64BitMode()) + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_toc16)); + else + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_lo16)); return RegBits; } @@ -158,8 +184,12 @@ unsigned PPCMCCodeEmitter::getMemRIXEncoding(const MCInst &MI, unsigned OpNo, return (getMachineOpValue(MI, MO, Fixups) & 0x3FFF) | RegBits; // Add a fixup for the branch target. - Fixups.push_back(MCFixup::Create(0, MO.getExpr(), - (MCFixupKind)PPC::fixup_ppc_lo14)); + if (isSVR4ABI() && is64BitMode()) + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_toc16_ds)); + else + Fixups.push_back(MCFixup::Create(0, MO.getExpr(), + (MCFixupKind)PPC::fixup_ppc_lo14)); return RegBits; } diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp index d8abd9fba0..6941413ed4 100644 --- a/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -420,10 +420,14 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() { OutStreamer.EmitValueToAlignment(8); MCSymbol *Symbol1 = OutContext.GetOrCreateSymbol(".L." 
+ Twine(CurrentFnSym->getName())); - MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.@tocbase")); + // Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function + // entry point. OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol1, OutContext), 8/*size*/, 0/*addrspace*/); - OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, OutContext), + MCSymbol *Symbol2 = OutContext.GetOrCreateSymbol(StringRef(".TOC.")); + // Generates a R_PPC64_TOC relocation for TOC base insertion. + OutStreamer.EmitValue(MCSymbolRefExpr::Create(Symbol2, + MCSymbolRefExpr::VK_PPC_TOC, OutContext), 8/*size*/, 0/*addrspace*/); // Emit a null environment pointer. OutStreamer.EmitIntValue(0, 8 /* size */, 0 /* addrspace */); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index b93d50326a..de0d66124b 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -1498,9 +1498,10 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); bool isPPC64 = (PtrVT == MVT::i64); + unsigned AS = 0; Type *IntPtrTy = DAG.getTargetLoweringInfo().getDataLayout()->getIntPtrType( - *DAG.getContext()); + *DAG.getContext(), AS); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -2083,25 +2084,42 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( int FI = MFI->CreateFixedObject(ObjSize, CurArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); InVals.push_back(FIN); - if (ObjSize==1 || ObjSize==2 || ObjSize==4) { + + if (ObjSize < 8) { if (GPR_idx != Num_GPR_Regs) { - unsigned VReg; - VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); + unsigned VReg = MF.addLiveIn(GPR[GPR_idx], &PPC::G8RCRegClass); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - EVT ObjType = (ObjSize == 1 ? MVT::i8 : - (ObjSize == 2 ? 
MVT::i16 : MVT::i32)); - SDValue Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, - MachinePointerInfo(FuncArg, - CurArgOffset), - ObjType, false, false, 0); + SDValue Store; + + if (ObjSize==1 || ObjSize==2 || ObjSize==4) { + EVT ObjType = (ObjSize == 1 ? MVT::i8 : + (ObjSize == 2 ? MVT::i16 : MVT::i32)); + Store = DAG.getTruncStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg, CurArgOffset), + ObjType, false, false, 0); + } else { + // For sizes that don't fit a truncating store (3, 5, 6, 7), + // store the whole register as-is to the parameter save area + // slot. The address of the parameter was already calculated + // above (InVals.push_back(FIN)) to be the right-justified + // offset within the slot. For this store, we need a new + // frame index that points at the beginning of the slot. + int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); + SDValue FIN = DAG.getFrameIndex(FI, PtrVT); + Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, + MachinePointerInfo(FuncArg, ArgOffset), + false, false, 0); + } + MemOps.push_back(Store); ++GPR_idx; } - + // Whether we copied from a register or not, advance the offset + // into the parameter save area by a full doubleword. ArgOffset += PtrByteSize; - continue; } + for (unsigned j = 0; j < ArgSize; j += PtrByteSize) { // Store whatever pieces of the object are in registers // to memory. ArgOffset will be the address of the beginning @@ -2112,16 +2130,7 @@ PPCTargetLowering::LowerFormalArguments_64SVR4( int FI = MFI->CreateFixedObject(PtrByteSize, ArgOffset, true); SDValue FIN = DAG.getFrameIndex(FI, PtrVT); SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT); - SDValue Shifted = Val; - - // For 64-bit SVR4, small structs come in right-adjusted. - // Shift them left so the following logic works as expected. 
- if (ObjSize < 8) { - SDValue ShiftAmt = DAG.getConstant(64 - 8 * ObjSize, PtrVT); - Shifted = DAG.getNode(ISD::SHL, dl, PtrVT, Val, ShiftAmt); - } - - SDValue Store = DAG.getStore(Val.getValue(1), dl, Shifted, FIN, + SDValue Store = DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo(FuncArg, ArgOffset), false, false, 0); MemOps.push_back(Store); diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index cb0ea01fcf..5a78e8ac6b 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -639,13 +639,13 @@ def LDtocCPT: Pseudo<(outs G8RC:$rD), (ins tocentry:$disp, G8RC:$reg), (PPCtoc_entry tconstpool:$disp, G8RC:$reg))]>, isPPC64; let hasSideEffects = 1 in { -let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo. -def LDinto_toc: DSForm_1<58, 0, (outs), (ins G8RC:$reg), +let RST = 2, DS = 2 in +def LDinto_toc: DSForm_1a<58, 0, (outs), (ins G8RC:$reg), "ld 2, 8($reg)", LdStLD, [(PPCload_toc G8RC:$reg)]>, isPPC64; -let RST = 2, DS_RA = 0 in // FIXME: Should be a pseudo. -def LDtoc_restore : DSForm_1<58, 0, (outs), (ins), +let RST = 2, DS = 10, RA = 1 in +def LDtoc_restore : DSForm_1a<58, 0, (outs), (ins), "ld 2, 40(1)", LdStLD, [(PPCtoc_restore)]>, isPPC64; } diff --git a/lib/Target/PowerPC/PPCSubtarget.cpp b/lib/Target/PowerPC/PPCSubtarget.cpp index 14b8534d1c..9c8cb92cc7 100644 --- a/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/lib/Target/PowerPC/PPCSubtarget.cpp @@ -54,19 +54,26 @@ PPCSubtarget::PPCSubtarget(const std::string &TT, const std::string &CPU, CPUName = sys::getHostCPUName(); #endif - // Parse features string. - ParseSubtargetFeatures(CPUName, FS); - // Initialize scheduling itinerary for the specified CPU. InstrItins = getInstrItineraryForCPU(CPUName); + // Make sure 64-bit features are available when CPUname is generic + std::string FullFS = FS; + // If we are generating code for ppc64, verify that options make sense. 
if (is64Bit) { Has64BitSupport = true; // Silently force 64-bit register use on ppc64. Use64BitRegs = true; + if (!FullFS.empty()) + FullFS = "+64bit," + FullFS; + else + FullFS = "+64bit"; } + // Parse features string. + ParseSubtargetFeatures(CPUName, FullFS); + // If the user requested use of 64-bit regs, but the cpu selected doesn't // support it, ignore. if (use64BitRegs() && !has64BitSupport()) diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index b861383475..3fc977ee2b 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -44,7 +44,7 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, FrameLowering(Subtarget), JITInfo(*this, is64Bit), TLInfo(*this), TSInfo(*this), InstrItins(Subtarget.getInstrItineraryData()), - STTI(&TLInfo){ + STTI(&TLInfo), VTTI(&TLInfo) { // The binutils for the BG/P are too old for CFI. if (Subtarget.isBGP()) diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 1d8cc771dd..45c962471d 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -36,7 +36,7 @@ SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, DL(Subtarget.getDataLayout()), InstrInfo(Subtarget), TLInfo(*this), TSInfo(*this), - FrameLowering(Subtarget),STTI(&TLInfo) { + FrameLowering(Subtarget), STTI(&TLInfo), VTTI(&TLInfo) { } namespace { diff --git a/lib/Target/Target.cpp b/lib/Target/Target.cpp index 393178a469..7d3dd8f015 100644 --- a/lib/Target/Target.cpp +++ b/lib/Target/Target.cpp @@ -64,7 +64,7 @@ unsigned LLVMPointerSizeForAS(LLVMTargetDataRef TD, unsigned AS) { } LLVMTypeRef LLVMIntPtrType(LLVMTargetDataRef TD) { - return wrap(unwrap(TD)->getIntPtrType(getGlobalContext())); + return wrap(unwrap(TD)->getIntPtrType(getGlobalContext(), 0)); } LLVMTypeRef LLVMIntPtrTypeForAS(LLVMTargetDataRef TD, unsigned AS) { diff --git 
a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp index 1cb5edab9d..382eecb766 100644 --- a/lib/Target/TargetTransformImpl.cpp +++ b/lib/Target/TargetTransformImpl.cpp @@ -9,9 +9,16 @@ #include "llvm/Target/TargetTransformImpl.h" #include "llvm/Target/TargetLowering.h" +#include <utility> using namespace llvm; +//===----------------------------------------------------------------------===// +// +// Calls used by scalar transformations. +// +//===----------------------------------------------------------------------===// + bool ScalarTargetTransformImpl::isLegalAddImmediate(int64_t imm) const { return TLI->isLegalAddImmediate(imm); } @@ -41,3 +48,151 @@ unsigned ScalarTargetTransformImpl::getJumpBufAlignment() const { unsigned ScalarTargetTransformImpl::getJumpBufSize() const { return TLI->getJumpBufSize(); } + +//===----------------------------------------------------------------------===// +// +// Calls used by the vectorizers. +// +//===----------------------------------------------------------------------===// +int InstructionOpcodeToISD(unsigned Opcode) { + static const int OpToISDTbl[] = { + /*Instruction::Ret */ 0, // Opcode numbering start at #1. 
+ /*Instruction::Br */ 0, + /*Instruction::Switch */ 0, + /*Instruction::IndirectBr */ 0, + /*Instruction::Invoke */ 0, + /*Instruction::Resume */ 0, + /*Instruction::Unreachable */ 0, + /*Instruction::Add */ ISD::ADD, + /*Instruction::FAdd */ ISD::FADD, + /*Instruction::Sub */ ISD::SUB, + /*Instruction::FSub */ ISD::FSUB, + /*Instruction::Mul */ ISD::MUL, + /*Instruction::FMul */ ISD::FMUL, + /*Instruction::UDiv */ ISD::UDIV, + /*Instruction::SDiv */ ISD::UDIV, + /*Instruction::FDiv */ ISD::FDIV, + /*Instruction::URem */ ISD::UREM, + /*Instruction::SRem */ ISD::SREM, + /*Instruction::FRem */ ISD::FREM, + /*Instruction::Shl */ ISD::SHL, + /*Instruction::LShr */ ISD::SRL, + /*Instruction::AShr */ ISD::SRA, + /*Instruction::And */ ISD::AND, + /*Instruction::Or */ ISD::OR, + /*Instruction::Xor */ ISD::XOR, + /*Instruction::Alloca */ 0, + /*Instruction::Load */ ISD::LOAD, + /*Instruction::Store */ ISD::STORE, + /*Instruction::GetElementPtr */ 0, + /*Instruction::Fence */ 0, + /*Instruction::AtomicCmpXchg */ 0, + /*Instruction::AtomicRMW */ 0, + /*Instruction::Trunc */ ISD::TRUNCATE, + /*Instruction::ZExt */ ISD::ZERO_EXTEND, + /*Instruction::SExt */ ISD::SEXTLOAD, + /*Instruction::FPToUI */ ISD::FP_TO_UINT, + /*Instruction::FPToSI */ ISD::FP_TO_SINT, + /*Instruction::UIToFP */ ISD::UINT_TO_FP, + /*Instruction::SIToFP */ ISD::SINT_TO_FP, + /*Instruction::FPTrunc */ ISD::FP_ROUND, + /*Instruction::FPExt */ ISD::FP_EXTEND, + /*Instruction::PtrToInt */ ISD::BITCAST, + /*Instruction::IntToPtr */ ISD::BITCAST, + /*Instruction::BitCast */ ISD::BITCAST, + /*Instruction::ICmp */ ISD::SETCC, + /*Instruction::FCmp */ ISD::SETCC, + /*Instruction::PHI */ 0, + /*Instruction::Call */ 0, + /*Instruction::Select */ ISD::SELECT, + /*Instruction::UserOp1 */ 0, + /*Instruction::UserOp2 */ 0, + /*Instruction::VAArg */ 0, + /*Instruction::ExtractElement*/ ISD::EXTRACT_VECTOR_ELT, + /*Instruction::InsertElement */ ISD::INSERT_VECTOR_ELT, + /*Instruction::ShuffleVector */ ISD::VECTOR_SHUFFLE, 
+ /*Instruction::ExtractValue */ ISD::MERGE_VALUES, + /*Instruction::InsertValue */ ISD::MERGE_VALUES, + /*Instruction::LandingPad */ 0}; + + assert((Instruction::Ret == 1) && (Instruction::LandingPad == 58) && + "Instruction order had changed"); + + // Opcode numbering starts at #1 but the table starts at #0, so we subtract + // one from the opcode number. + return OpToISDTbl[Opcode - 1]; +} + +std::pair<unsigned, EVT> +VectorTargetTransformImpl::getTypeLegalizationCost(LLVMContext &C, + EVT Ty) const { + unsigned Cost = 1; + // We keep legalizing the type until we find a legal kind. We assume that + // the only operation that costs anything is the split. After splitting + // we need to handle two types. + while (true) { + TargetLowering::LegalizeKind LK = TLI->getTypeConversion(C, Ty); + + if (LK.first == TargetLowering::TypeLegal) + return std::make_pair(Cost, LK.second); + + if (LK.first == TargetLowering::TypeSplitVector) + Cost *= 2; + + // Keep legalizing the type. + Ty = LK.second; + } +} + +unsigned +VectorTargetTransformImpl::getInstrCost(unsigned Opcode, Type *Ty1, + Type *Ty2) const { + // Check if any of the operands are vector operands. + int ISD = InstructionOpcodeToISD(Opcode); + + // Selects on vectors are actually vector selects. + if (ISD == ISD::SELECT) { + assert(Ty2 && "Ty2 must hold the select type"); + if (Ty2->isVectorTy()) + ISD = ISD::VSELECT; + } + + // If we don't have any information about this instruction assume it costs 1. + if (ISD == 0) + return 1; + + assert(Ty1 && "We need to have at least one type"); + + // From this stage we look at the legalized type. + std::pair<unsigned, EVT> LT = + getTypeLegalizationCost(Ty1->getContext(), TLI->getValueType(Ty1)); + + if (TLI->isOperationLegalOrCustom(ISD, LT.second)) { + // The operation is legal. Assume it costs 1. Multiply + // by the type-legalization overhead. + return LT.first * 1; + } + + unsigned NumElem = + (LT.second.isVector() ? 
LT.second.getVectorNumElements() : 1); + + // We will probably scalarize this instruction. Assume that the cost is the + // number of the vector elements. + return LT.first * NumElem * 1; +} + +unsigned +VectorTargetTransformImpl::getBroadcastCost(Type *Tp) const { + return 1; +} + +unsigned +VectorTargetTransformImpl::getMemoryOpCost(unsigned Opcode, Type *Src, + unsigned Alignment, + unsigned AddressSpace) const { + // From this stage we look at the legalized type. + std::pair<unsigned, EVT> LT = + getTypeLegalizationCost(Src->getContext(), TLI->getValueType(Src)); + // Assume that all loads of legal types cost 1. + return LT.first; +} diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 9689180afd..708951126f 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -18,6 +18,7 @@ #include "llvm/MC/MCParser/MCAsmLexer.h" #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/MC/MCParser/MCParsedAsmOperand.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringSwitch.h" @@ -54,10 +55,13 @@ private: X86Operand *ParseOperand(); X86Operand *ParseATTOperand(); X86Operand *ParseIntelOperand(); + X86Operand *ParseIntelOffsetOfOperator(SMLoc StartLoc); X86Operand *ParseIntelMemOperand(unsigned SegReg, SMLoc StartLoc); X86Operand *ParseIntelBracExpression(unsigned SegReg, unsigned Size); X86Operand *ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + const MCExpr *ParseIntelDotOperator(const MCExpr *Disp); + bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); @@ -159,6 +163,7 @@ struct X86Operand : public MCParsedAsmOperand { } Kind; SMLoc StartLoc, EndLoc; + SMLoc OffsetOfLoc; union { struct { @@ -181,7 +186,6 @@ struct X86Operand : public MCParsedAsmOperand { unsigned IndexReg; unsigned Scale; unsigned Size; - bool OffsetOf; bool NeedSizeDir; } Mem; 
}; @@ -196,6 +200,8 @@ struct X86Operand : public MCParsedAsmOperand { /// getLocRange - Get the range between the first and last token of this /// operand. SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + /// getOffsetOfLoc - Get the location of the offset operator. + SMLoc getOffsetOfLoc() const { return OffsetOfLoc; } virtual void print(raw_ostream &OS) const {} @@ -321,8 +327,7 @@ struct X86Operand : public MCParsedAsmOperand { } bool isOffsetOf() const { - assert(Kind == Memory && "Invalid access!"); - return Mem.OffsetOf; + return OffsetOfLoc.getPointer(); } bool needSizeDirective() const { @@ -455,9 +460,11 @@ struct X86Operand : public MCParsedAsmOperand { return Res; } - static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc) { + static X86Operand *CreateReg(unsigned RegNo, SMLoc StartLoc, SMLoc EndLoc, + SMLoc OffsetOfLoc = SMLoc()) { X86Operand *Res = new X86Operand(Register, StartLoc, EndLoc); Res->Reg.RegNo = RegNo; + Res->OffsetOfLoc = OffsetOfLoc; return Res; } @@ -468,9 +475,8 @@ struct X86Operand : public MCParsedAsmOperand { } /// Create an absolute memory operand. 
- static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, - SMLoc EndLoc, unsigned Size = 0, - bool OffsetOf = false, bool NeedSizeDir = false){ + static X86Operand *CreateMem(const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, + unsigned Size = 0, bool NeedSizeDir = false){ X86Operand *Res = new X86Operand(Memory, StartLoc, EndLoc); Res->Mem.SegReg = 0; Res->Mem.Disp = Disp; @@ -478,7 +484,6 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = 0; Res->Mem.Scale = 1; Res->Mem.Size = Size; - Res->Mem.OffsetOf = OffsetOf; Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } @@ -487,8 +492,7 @@ struct X86Operand : public MCParsedAsmOperand { static X86Operand *CreateMem(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc, SMLoc EndLoc, - unsigned Size = 0, bool OffsetOf = false, - bool NeedSizeDir = false) { + unsigned Size = 0, bool NeedSizeDir = false) { // We should never just have a displacement, that should be parsed as an // absolute memory operand. 
assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!"); @@ -503,7 +507,6 @@ struct X86Operand : public MCParsedAsmOperand { Res->Mem.IndexReg = IndexReg; Res->Mem.Scale = Scale; Res->Mem.Size = Size; - Res->Mem.OffsetOf = OffsetOf; Res->Mem.NeedSizeDir = NeedSizeDir; return Res; } @@ -661,9 +664,10 @@ static unsigned getIntelMemOperandSize(StringRef OpStr) { X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, unsigned Size) { unsigned BaseReg = 0, IndexReg = 0, Scale = 1; - SMLoc Start = Parser.getTok().getLoc(), End; + const AsmToken &Tok = Parser.getTok(); + SMLoc Start = Tok.getLoc(), End; - const MCExpr *Disp = MCConstantExpr::Create(0, getParser().getContext()); + const MCExpr *Disp = MCConstantExpr::Create(0, getContext()); // Parse [ BaseReg + Scale*IndexReg + Disp ] or [ symbol ] // Eat '[' @@ -682,9 +686,9 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, return X86Operand::CreateMem(Disp, Start, End, Size); } } else if (getLexer().is(AsmToken::Integer)) { - int64_t Val = Parser.getTok().getIntVal(); + int64_t Val = Tok.getIntVal(); Parser.Lex(); - SMLoc Loc = Parser.getTok().getLoc(); + SMLoc Loc = Tok.getLoc(); if (getLexer().is(AsmToken::RBrac)) { // Handle '[' number ']' Parser.Lex(); @@ -696,7 +700,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, } else if (getLexer().is(AsmToken::Star)) { // Handle '[' Scale*IndexReg ']' Parser.Lex(); - SMLoc IdxRegLoc = Parser.getTok().getLoc(); + SMLoc IdxRegLoc = Tok.getLoc(); if (ParseRegister(IndexReg, IdxRegLoc, End)) return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; @@ -707,13 +711,13 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, if (getLexer().is(AsmToken::Plus) || getLexer().is(AsmToken::Minus)) { bool isPlus = getLexer().is(AsmToken::Plus); Parser.Lex(); - SMLoc PlusLoc = Parser.getTok().getLoc(); + SMLoc PlusLoc = Tok.getLoc(); if (getLexer().is(AsmToken::Integer)) { - int64_t Val = 
Parser.getTok().getIntVal(); + int64_t Val = Tok.getIntVal(); Parser.Lex(); if (getLexer().is(AsmToken::Star)) { Parser.Lex(); - SMLoc IdxRegLoc = Parser.getTok().getLoc(); + SMLoc IdxRegLoc = Tok.getLoc(); if (ParseRegister(IndexReg, IdxRegLoc, End)) return ErrorOperand(IdxRegLoc, "Expected register"); Scale = Val; @@ -724,7 +728,7 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, return ErrorOperand(PlusLoc, "unexpected token after +"); } else if (getLexer().is(AsmToken::Identifier)) { // This could be an index register or a displacement expression. - End = Parser.getTok().getLoc(); + End = Tok.getLoc(); if (!IndexReg) ParseRegister(IndexReg, Start, End); else if (getParser().ParseExpression(Disp, End)) return 0; @@ -734,11 +738,16 @@ X86Operand *X86AsmParser::ParseIntelBracExpression(unsigned SegReg, if (getLexer().isNot(AsmToken::RBrac)) if (getParser().ParseExpression(Disp, End)) return 0; - End = Parser.getTok().getLoc(); + End = Tok.getLoc(); if (getLexer().isNot(AsmToken::RBrac)) return ErrorOperand(End, "expected ']' token!"); Parser.Lex(); - End = Parser.getTok().getLoc(); + End = Tok.getLoc(); + + if (Tok.getString().startswith(".")) + Disp = ParseIntelDotOperator(Disp); + + End = Tok.getLoc(); // handle [-42] if (!BaseReg && !IndexReg) @@ -761,22 +770,10 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { Parser.Lex(); } - // Parse the 'offset' operator. This operator is used to specify the - // location rather then the content of a variable. - bool OffsetOf = false; - if(isParsingInlineAsm() && (Tok.getString() == "offset" || - Tok.getString() == "OFFSET")) { - OffsetOf = true; - Parser.Lex(); // Eat offset. 
- } - - if (getLexer().is(AsmToken::LBrac)) { - assert (!OffsetOf && "Unexpected offset operator!"); + if (getLexer().is(AsmToken::LBrac)) return ParseIntelBracExpression(SegReg, Size); - } if (!ParseRegister(SegReg, Start, End)) { - assert (!OffsetOf && "Unexpected offset operator!"); // Handel SegReg : [ ... ] if (getLexer().isNot(AsmToken::Colon)) return ErrorOperand(Start, "Expected ':' token!"); @@ -801,12 +798,74 @@ X86Operand *X86AsmParser::ParseIntelMemOperand(unsigned SegReg, SMLoc Start) { NeedSizeDir = Size > 0; } } - return X86Operand::CreateMem(Disp, Start, End, Size, OffsetOf, NeedSizeDir); + if (!isParsingInlineAsm()) + return X86Operand::CreateMem(Disp, Start, End, Size); + else + // When parsing inline assembly we set the base register to a non-zero value + // as we don't know the actual value at this time. This is necessary to + // get the matching correct in some cases. + return X86Operand::CreateMem(/*SegReg*/0, Disp, /*BaseReg*/1, /*IndexReg*/0, + /*Scale*/1, Start, End, Size, NeedSizeDir); +} + +/// Parse the '.' operator. +const MCExpr *X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp) { + AsmToken Tok = *&Parser.getTok(); + + // Drop the '.'. + StringRef DotDispStr = Tok.getString().drop_front(1); + + Lex(); // Eat .field. + + // .Imm gets lexed as a real. + if (Tok.is(AsmToken::Real)) { + APInt DotDisp; + DotDispStr.getAsInteger(10, DotDisp); + uint64_t DotDispVal = DotDisp.getZExtValue(); + + // Special case zero dot displacement. + if (!DotDispVal) return Disp; + + // FIXME: Handle non-constant expressions. + if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) { + uint64_t OrigDispVal = OrigDisp->getValue(); + return MCConstantExpr::Create(OrigDispVal + DotDispVal, getContext()); + } + } + return Disp; +} + +/// Parse the 'offset' operator. This operator is used to specify the +/// location rather then the content of a variable. 
+X86Operand *X86AsmParser::ParseIntelOffsetOfOperator(SMLoc Start) { + SMLoc OffsetOfLoc = Start; + Parser.Lex(); // Eat offset. + Start = Parser.getTok().getLoc(); + assert (Parser.getTok().is(AsmToken::Identifier) && "Expected an identifier"); + + SMLoc End; + const MCExpr *Val; + if (getParser().ParseExpression(Val, End)) + return 0; + + End = Parser.getTok().getLoc(); + + // The offset operator will have an 'r' constraint, thus we need to create + // register operand to ensure proper matching. Just pick a GPR based on + // the size of a pointer. + unsigned RegNo = is64BitMode() ? X86::RBX : X86::EBX; + return X86Operand::CreateReg(RegNo, Start, End, OffsetOfLoc); } X86Operand *X86AsmParser::ParseIntelOperand() { SMLoc Start = Parser.getTok().getLoc(), End; + // offset operator. + const AsmToken &Tok = Parser.getTok(); + if ((Tok.getString() == "offset" || Tok.getString() == "OFFSET") && + isParsingInlineAsm()) + return ParseIntelOffsetOfOperator(Start); + // immediate. if (getLexer().is(AsmToken::Integer) || getLexer().is(AsmToken::Real) || getLexer().is(AsmToken::Minus)) { diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index edad47312d..a4bd1147bc 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -34,11 +34,9 @@ using namespace llvm; void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const { - if (UseMarkup) - OS << "<reg:"; - OS << '%' << getRegisterName(RegNo); - if (UseMarkup) - OS << ">"; + OS << markup("<reg:") + << '%' << getRegisterName(RegNo) + << markup(">"); } void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, @@ -155,29 +153,21 @@ void X86ATTInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) { const MCOperand &Op = MI->getOperand(OpNo); if (Op.isReg()) { - if (UseMarkup) - O << "<reg:"; - O << '%' << getRegisterName(Op.getReg()); - if (UseMarkup) - O << 
">"; + printRegName(O, Op.getReg()); } else if (Op.isImm()) { - if (UseMarkup) - O << "<imm:"; // Print X86 immediates as signed values. - O << '$' << (int64_t)Op.getImm(); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '$' << (int64_t)Op.getImm() + << markup(">"); if (CommentStream && (Op.getImm() > 255 || Op.getImm() < -256)) *CommentStream << format("imm = 0x%" PRIX64 "\n", (uint64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); - if (UseMarkup) - O << "<imm:"; - O << '$' << *Op.getExpr(); - if (UseMarkup) - O << ">"; + O << markup("<imm:") + << '$' << *Op.getExpr() + << markup(">"); } } @@ -188,8 +178,7 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, const MCOperand &DispSpec = MI->getOperand(Op+3); const MCOperand &SegReg = MI->getOperand(Op+4); - if (UseMarkup) - O << "<mem:"; + O << markup("<mem:"); // If this has a segment register, print it. if (SegReg.getReg()) { @@ -216,17 +205,14 @@ void X86ATTInstPrinter::printMemReference(const MCInst *MI, unsigned Op, printOperand(MI, Op+2, O); unsigned ScaleVal = MI->getOperand(Op+1).getImm(); if (ScaleVal != 1) { - O << ','; - if (UseMarkup) - O << "<imm:"; - O << ScaleVal; - if (UseMarkup) - O << ">"; + O << ',' + << markup("<imm:") + << ScaleVal + << markup(">"); } } O << ')'; } - if (UseMarkup) - O << ">"; + O << markup(">"); } diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 1dc4aa9989..1b5c4d9753 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -162,7 +162,7 @@ def : Proc<"core2", [FeatureSSSE3, FeatureCMPXCHG16B, FeatureSlowBTMem]>; def : Proc<"penryn", [FeatureSSE41, FeatureCMPXCHG16B, FeatureSlowBTMem]>; -def : AtomProc<"atom", [ProcIntelAtom, FeatureSSE3, FeatureCMPXCHG16B, +def : AtomProc<"atom", [ProcIntelAtom, FeatureSSSE3, FeatureCMPXCHG16B, FeatureMOVBE, FeatureSlowBTMem, FeatureLeaForSP, FeatureSlowDivide]>; // "Arrandale" along with corei3 and corei5 diff --git 
a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index a6d2709b37..6786756c7f 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -88,6 +88,21 @@ def RetCC_X86_32_Fast : CallingConv<[ CCDelegateTo<RetCC_X86Common> ]>; +// Intel_OCL_BI return-value convention. +def RetCC_Intel_OCL_BI : CallingConv<[ + // Vector types are returned in XMM0,XMM1,XMMM2 and XMM3. + CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, + + // 256-bit FP vectors + // No more than 4 registers + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, + + // i32, i64 in the standard way + CCDelegateTo<RetCC_X86Common> +]>; + // X86-64 C return-value convention. def RetCC_X86_64_C : CallingConv<[ // The X86-64 calling convention always returns FP values in XMM0. @@ -128,6 +143,10 @@ def RetCC_X86_64 : CallingConv<[ // This is the return-value convention used for the entire X86 backend. def RetCC_X86 : CallingConv<[ + + // Check if this is the Intel OpenCL built-ins calling convention + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<RetCC_Intel_OCL_BI>>, + CCIfSubtarget<"is64Bit()", CCDelegateTo<RetCC_X86_64>>, CCDelegateTo<RetCC_X86_32> ]>; @@ -235,6 +254,29 @@ def CC_X86_Win64_C : CallingConv<[ CCIfType<[f80], CCAssignToStack<0, 0>> ]>; +// X86-64 Intel OpenCL built-ins calling convention. +def CC_Intel_OCL_BI : CallingConv<[ + CCIfType<[i32], CCIfSubtarget<"isTargetWin32()", CCAssignToStack<4, 4>>>, + + CCIfType<[i32], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[ECX, EDX, R8D, R9D]>>>, + CCIfType<[i64], CCIfSubtarget<"isTargetWin64()", CCAssignToReg<[RCX, RDX, R8, R9 ]>>>, + + CCIfType<[i32], CCAssignToReg<[EDI, ESI, EDX, ECX]>>, + CCIfType<[i64], CCAssignToReg<[RDI, RSI, RDX, RCX]>>, + + // The SSE vector arguments are passed in XMM registers. 
+ CCIfType<[f32, f64, v4i32, v2i64, v4f32, v2f64], + CCAssignToReg<[XMM0, XMM1, XMM2, XMM3]>>, + + // The 256-bit vector arguments are passed in YMM registers. + CCIfType<[v8f32, v4f64, v8i32, v4i64], + CCAssignToReg<[YMM0, YMM1, YMM2, YMM3]>>, + + CCIfSubtarget<"isTargetWin64()", CCDelegateTo<CC_X86_Win64_C>>, + CCDelegateTo<CC_X86_64_C> +]>; + + def CC_X86_64_GHC : CallingConv<[ // Promote i8/i16/i32 arguments to i64. CCIfType<[i8, i16, i32], CCPromoteToType<i64>>, @@ -324,7 +366,7 @@ def CC_X86_32_FastCall : CallingConv<[ CCIfNest<CCAssignToReg<[EAX]>>, // The first 2 integer arguments are passed in ECX/EDX - CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>, + CCIfInReg<CCIfType<[i32], CCAssignToReg<[ECX, EDX]>>>, // Otherwise, same as everything else. CCDelegateTo<CC_X86_32_Common> @@ -408,6 +450,7 @@ def CC_X86_64 : CallingConv<[ // This is the argument convention used for the entire X86 backend. def CC_X86 : CallingConv<[ + CCIfCC<"CallingConv::Intel_OCL_BI", CCDelegateTo<CC_Intel_OCL_BI>>, CCIfSubtarget<"is64Bit()", CCDelegateTo<CC_X86_64>>, CCDelegateTo<CC_X86_32> ]>; @@ -426,3 +469,17 @@ def CSR_64EHRet : CalleeSavedRegs<(add RAX, RDX, CSR_64)>; def CSR_Win64 : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R15, (sequence "XMM%u", 6, 15))>; + + +// Standard C + YMM6-15 +def CSR_Win64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, + R13, R14, R15, + (sequence "YMM%u", 6, 15))>; + +//Standard C + XMM 8-15 +def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64, + (sequence "XMM%u", 8, 15))>; + +//Standard C + YMM 8-15 +def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64, + (sequence "YMM%u", 8, 15))>; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index c9301bb2c9..b8f7a6881f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -282,8 +282,9 @@ X86FastISel::X86FastEmitStore(EVT VT, unsigned Val, const X86AddressMode &AM) { bool X86FastISel::X86FastEmitStore(EVT VT, const 
Value *Val, const X86AddressMode &AM) { // Handle 'null' like i32/i64 0. - if (isa<ConstantPointerNull>(Val)) - Val = Constant::getNullValue(TD.getIntPtrType(Val->getContext())); + if (isa<ConstantPointerNull>(Val)) { + Val = Constant::getNullValue(TD.getIntPtrType(Val->getType())); + } // If this is a store of a simple constant, fold the constant into the store. if (const ConstantInt *CI = dyn_cast<ConstantInt>(Val)) { @@ -908,8 +909,9 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, if (Op0Reg == 0) return false; // Handle 'null' like i32/i64 0. - if (isa<ConstantPointerNull>(Op1)) - Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getContext())); + if (isa<ConstantPointerNull>(Op1)) { + Op1 = Constant::getNullValue(TD.getIntPtrType(Op0->getType())); + } // We have two options: compare with register or immediate. If the RHS of // the compare is an immediate that we can fold into this compare, use diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 4141068806..5bfb5054b0 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -315,11 +315,11 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(MachineFunction &MF, if (CSI.empty()) return; std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - const DataLayout *TD = TM.getDataLayout(); + const X86RegisterInfo *RegInfo = TM.getRegisterInfo(); bool HasFP = hasFP(MF); // Calculate amount of bytes used for return address storing. - int stackGrowth = -TM.getFrameLowering()->getStackSlotSize(); // @LOCALMOD + int stackGrowth = -RegInfo->getSlotSize(); // FIXME: This is dirty hack. The code itself is pretty mess right now. // It should be rewritten from scratch and generalized sometimes. 
@@ -717,9 +717,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF) const { // ELSE => DW_CFA_offset_extended std::vector<MachineMove> &Moves = MMI.getFrameMoves(); - const DataLayout *TD = MF.getTarget().getDataLayout(); uint64_t NumBytes = 0; - int stackGrowth = -TM.getFrameLowering()->getStackSlotSize(); // @LOCALMOD + int stackGrowth = -SlotSize; if (HasFP) { // Calculate required stack adjustment. diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b9348a2b90..5d3c5f0347 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -959,6 +959,13 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); + // As there is no 64-bit GPR available, we need build a special custom + // sequence to convert from v2i32 to v2f32. 
+ if (!Subtarget->is64Bit()) + setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom); + setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom); setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom); @@ -1078,6 +1085,10 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); + setLoadExtAction(ISD::EXTLOAD, MVT::v4f32, Legal); setOperationAction(ISD::SRL, MVT::v16i16, Custom); @@ -1268,7 +1279,6 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); setTargetDAGCombine(ISD::TRUNCATE); - setTargetDAGCombine(ISD::UINT_TO_FP); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::SETCC); if (Subtarget->is64Bit()) @@ -2204,16 +2214,14 @@ X86TargetLowering::EmitTailCallLoadRetAddr(SelectionDAG &DAG, /// optimization is performed and it is required (FPDiff!=0). static SDValue EmitTailCallStoreRetAddr(SelectionDAG & DAG, MachineFunction &MF, - SDValue Chain, SDValue RetAddrFrIdx, - bool Is64Bit, int FPDiff, DebugLoc dl) { + SDValue Chain, SDValue RetAddrFrIdx, EVT PtrVT, + unsigned SlotSize, int FPDiff, DebugLoc dl) { // Store the return address to the appropriate stack slot. if (!FPDiff) return Chain; // Calculate the new stack slot for the return address. - int SlotSize = Is64Bit ? 8 : 4; int NewReturnAddrFI = MF.getFrameInfo()->CreateFixedObject(SlotSize, FPDiff-SlotSize, false); - EVT VT = Is64Bit ? 
MVT::i64 : MVT::i32; - SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, VT); + SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewReturnAddrFI, PtrVT); Chain = DAG.getStore(Chain, dl, RetAddrFrIdx, NewRetAddrFrIdx, MachinePointerInfo::getFixedStack(NewReturnAddrFI), false, false, 0); @@ -2482,7 +2490,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, &MemOpChains2[0], MemOpChains2.size()); // Store the return address to the appropriate stack slot. - Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, Is64Bit, + Chain = EmitTailCallStoreRetAddr(DAG, MF, Chain, RetAddrFrIdx, + getPointerTy(), RegInfo->getSlotSize(), FPDiff, dl); } @@ -2694,8 +2703,7 @@ X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize, unsigned StackAlignment = TFI.getStackAlignment(); uint64_t AlignMask = StackAlignment - 1; int64_t Offset = StackSize; - // @LOCALMOD - uint64_t SlotSize = Subtarget->is64Bit() ? 8 : 4; + unsigned SlotSize = RegInfo->getSlotSize(); if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) { // Number smaller than 12 so just add the difference. Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask)); @@ -3063,7 +3071,7 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { if (ReturnAddrIndex == 0) { // Set up a frame object for the return address. - uint64_t SlotSize = Subtarget->is64Bit() ? 8 : 4; // @LOCALMOD + unsigned SlotSize = RegInfo->getSlotSize(); ReturnAddrIndex = MF.getFrameInfo()->CreateFixedObject(SlotSize, -SlotSize, false); FuncInfo->setRAIndex(ReturnAddrIndex); @@ -6594,6 +6602,81 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasSSE2) { getShuffleSHUFImmediate(SVOp), DAG); } +// Reduce a vector shuffle to zext. +SDValue +X86TargetLowering::lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const { + // PMOVZX is only available from SSE41. 
+ if (!Subtarget->hasSSE41()) + return SDValue(); + + EVT VT = Op.getValueType(); + + // Only AVX2 supports 256-bit vector integer extending. + if (!Subtarget->hasAVX2() && VT.is256BitVector()) + return SDValue(); + + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + DebugLoc DL = Op.getDebugLoc(); + SDValue V1 = Op.getOperand(0); + SDValue V2 = Op.getOperand(1); + unsigned NumElems = VT.getVectorNumElements(); + + // Extending is a unary operation and the element type of the source vector + // won't be equal to or larger than i64. + if (V2.getOpcode() != ISD::UNDEF || !VT.isInteger() || + VT.getVectorElementType() == MVT::i64) + return SDValue(); + + // Find the expansion ratio, e.g. expanding from i8 to i32 has a ratio of 4. + unsigned Shift = 1; // Start from 2, i.e. 1 << 1. + while ((1 << Shift) < NumElems) { + if (SVOp->getMaskElt(1 << Shift) == 1) + break; + Shift += 1; + // The maximal ratio is 8, i.e. from i8 to i64. + if (Shift > 3) + return SDValue(); + } + + // Check the shuffle mask. + unsigned Mask = (1U << Shift) - 1; + for (unsigned i = 0; i != NumElems; ++i) { + int EltIdx = SVOp->getMaskElt(i); + if ((i & Mask) != 0 && EltIdx != -1) + return SDValue(); + if ((i & Mask) == 0 && (unsigned)EltIdx != (i >> Shift)) + return SDValue(); + } + + unsigned NBits = VT.getVectorElementType().getSizeInBits() << Shift; + EVT NeVT = EVT::getIntegerVT(*DAG.getContext(), NBits); + EVT NVT = EVT::getVectorVT(*DAG.getContext(), NeVT, NumElems >> Shift); + + if (!isTypeLegal(NVT)) + return SDValue(); + + // Simplify the operand as it's prepared to be fed into shuffle. 
+ unsigned SignificantBits = NVT.getSizeInBits() >> Shift; + if (V1.getOpcode() == ISD::BITCAST && + V1.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR && + V1.getOperand(0).getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + V1.getOperand(0) + .getOperand(0).getValueType().getSizeInBits() == SignificantBits) { + // (bitcast (sclr2vec (ext_vec_elt x))) -> (bitcast x) + SDValue V = V1.getOperand(0).getOperand(0).getOperand(0); + ConstantSDNode *CIdx = + dyn_cast<ConstantSDNode>(V1.getOperand(0).getOperand(0).getOperand(1)); + // If it's foldable, i.e. normal load with single use, we will let code + // selection to fold it. Otherwise, we will short the conversion sequence. + if (CIdx && CIdx->getZExtValue() == 0 && + (!ISD::isNormalLoad(V.getNode()) || !V.hasOneUse())) + V1 = DAG.getNode(ISD::BITCAST, DL, V1.getValueType(), V); + } + + return DAG.getNode(ISD::BITCAST, DL, VT, + DAG.getNode(X86ISD::VZEXT, DL, NVT, V1)); +} + SDValue X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); @@ -6624,6 +6707,11 @@ X86TargetLowering::NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const { return PromoteSplat(SVOp, DAG); } + // Check integer expanding shuffles. + SDValue NewOp = lowerVectorIntExtend(Op, DAG); + if (NewOp.getNode()) + return NewOp; + // If the shuffle can be profitably rewritten as a narrower shuffle, then // do it! 
if (VT == MVT::v8i16 || VT == MVT::v16i8 || @@ -8098,11 +8186,29 @@ SDValue X86TargetLowering::LowerUINT_TO_FP_i32(SDValue Op, return Sub; } +SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, + SelectionDAG &DAG) const { + SDValue N0 = Op.getOperand(0); + EVT SVT = N0.getValueType(); + DebugLoc dl = Op.getDebugLoc(); + + assert((SVT == MVT::v4i8 || SVT == MVT::v4i16 || + SVT == MVT::v8i8 || SVT == MVT::v8i16) && + "Custom UINT_TO_FP is not supported!"); + + EVT NVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, SVT.getVectorNumElements()); + return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), + DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); +} + SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const { SDValue N0 = Op.getOperand(0); DebugLoc dl = Op.getDebugLoc(); + if (Op.getValueType().isVector()) + return lowerUINT_TO_FP_vec(Op, DAG); + // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform // the optimization here. @@ -8276,6 +8382,30 @@ FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned, bool IsReplace) co } } +SDValue X86TargetLowering::lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const { + DebugLoc DL = Op.getDebugLoc(); + EVT VT = Op.getValueType(); + SDValue In = Op.getOperand(0); + EVT SVT = In.getValueType(); + + if (!VT.is256BitVector() || !SVT.is128BitVector() || + VT.getVectorNumElements() != SVT.getVectorNumElements()) + return SDValue(); + + assert(Subtarget->hasAVX() && "256-bit vector is observed without AVX!"); + + // AVX2 has better support of integer extending. 
+ if (Subtarget->hasAVX2()) + return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + + SDValue Lo = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, In); + static const int Mask[] = {4, 5, 6, 7, -1, -1, -1, -1}; + SDValue Hi = DAG.getNode(X86ISD::VZEXT, DL, MVT::v4i32, + DAG.getVectorShuffle(MVT::v8i16, DL, In, DAG.getUNDEF(MVT::v8i16), &Mask[0])); + + return DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i32, Lo, Hi); +} + SDValue X86TargetLowering::lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { DebugLoc DL = Op.getDebugLoc(); EVT VT = Op.getValueType(); @@ -10571,23 +10701,21 @@ SDValue X86TargetLowering::LowerRETURNADDR(SDValue Op, unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); DebugLoc dl = Op.getDebugLoc(); + EVT PtrVT = getPointerTy(); if (Depth > 0) { SDValue FrameAddr = LowerFRAMEADDR(Op, DAG); SDValue Offset = - // @LOCALMOD-BEGIN - DAG.getConstant(Subtarget->is64Bit() ? 8 : 4, - getPointerTy()); - // @LOCALMOD-END - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), - DAG.getNode(ISD::ADD, dl, getPointerTy(), + DAG.getConstant(RegInfo->getSlotSize(), PtrVT); + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), + DAG.getNode(ISD::ADD, dl, PtrVT, FrameAddr, Offset), MachinePointerInfo(), false, false, false, 0); } // Just load the return address. SDValue RetAddrFI = getReturnAddressFrameIndex(DAG); - return DAG.getLoad(getPointerTy(), dl, DAG.getEntryNode(), + return DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), RetAddrFI, MachinePointerInfo(), false, false, false, 0); } @@ -10609,10 +10737,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const { - // @LOCALMOD-START - int SlotSize = Subtarget->is64Bit() ? 
8 : 4; - return DAG.getIntPtrConstant(2*SlotSize); - // @LOCALMOD-END + return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize()); } SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { @@ -10620,17 +10745,17 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Offset = Op.getOperand(1); SDValue Handler = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); - // @LOCALMOD-START - bool Has64BitPtrs = Subtarget->has64BitPointers(); + // @LOCALMOD-START + bool Has64BitPointers = Subtarget->has64BitPointers(); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, - Has64BitPtrs ? X86::RBP : X86::EBP, + Has64BitPointers ? X86::RBP : X86::EBP, getPointerTy()); - unsigned StoreAddrReg = (Has64BitPtrs ? X86::RCX : X86::ECX); - int SlotSize = Subtarget->is64Bit() ? 8 : 4; - SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, - DAG.getIntPtrConstant(SlotSize)); + unsigned StoreAddrReg = (Has64BitPointers ? X86::RCX : X86::ECX); // @LOCALMOD-END + + SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, + DAG.getIntPtrConstant(RegInfo->getSlotSize())); StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), StoreAddr, Offset); Chain = DAG.getStore(Chain, dl, Handler, StoreAddr, MachinePointerInfo(), false, false, 0); @@ -11659,6 +11784,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); case ISD::TRUNCATE: return lowerTRUNCATE(Op, DAG); + case ISD::ZERO_EXTEND: return lowerZERO_EXTEND(Op, DAG); case ISD::FP_TO_SINT: return LowerFP_TO_SINT(Op, DAG); case ISD::FP_TO_UINT: return LowerFP_TO_UINT(Op, DAG); case ISD::FP_EXTEND: return lowerFP_EXTEND(Op, DAG); @@ -11797,6 +11923,22 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } return; } + case ISD::UINT_TO_FP: { + if (N->getOperand(0).getValueType() != MVT::v2i32 && + 
N->getValueType(0) != MVT::v2f32) + return; + SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, + N->getOperand(0)); + SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), + MVT::f64); + SDValue VBias = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v2f64, Bias, Bias); + SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn, + DAG.getNode(ISD::BITCAST, dl, MVT::v2i64, VBias)); + Or = DAG.getNode(ISD::BITCAST, dl, MVT::v2f64, Or); + SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias); + Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub)); + return; + } case ISD::FP_ROUND: { SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0)); Results.push_back(V); @@ -11999,6 +12141,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VSEXT_MOVL: return "X86ISD::VSEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; + case X86ISD::VZEXT: return "X86ISD::VZEXT"; + case X86ISD::VSEXT: return "X86ISD::VSEXT"; case X86ISD::VFPEXT: return "X86ISD::VFPEXT"; case X86ISD::VFPROUND: return "X86ISD::VFPROUND"; case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ"; @@ -16565,23 +16709,6 @@ static SDValue PerformBrCondCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue PerformUINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG) { - SDValue Op0 = N->getOperand(0); - EVT InVT = Op0->getValueType(0); - - // UINT_TO_FP(v4i8) -> SINT_TO_FP(ZEXT(v4i8 to v4i32)) - if (InVT == MVT::v8i8 || InVT == MVT::v4i8) { - DebugLoc dl = N->getDebugLoc(); - MVT DstVT = InVT == MVT::v4i8 ? MVT::v4i32 : MVT::v8i32; - SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); - // Notice that we use SINT_TO_FP because we know that the high bits - // are zero and SINT_TO_FP is better supported by the hardware. 
- return DAG.getNode(ISD::SINT_TO_FP, dl, N->getValueType(0), P); - } - - return SDValue(); -} - static SDValue PerformSINT_TO_FPCombine(SDNode *N, SelectionDAG &DAG, const X86TargetLowering *XTLI) { SDValue Op0 = N->getOperand(0); @@ -16727,6 +16854,21 @@ static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG, return OptimizeConditionalInDecrement(N, DAG); } +/// performVZEXTCombine - Fold (vzext (bitcast (vzext x))) into (vzext x). +static SDValue performVZEXTCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget *Subtarget) { + // (vzext (bitcast (vzext x))) -> (vzext x) + SDValue In = N->getOperand(0); + while (In.getOpcode() == ISD::BITCAST) + In = In.getOperand(0); + + if (In.getOpcode() != X86ISD::VZEXT) + return SDValue(); + + return DAG.getNode(X86ISD::VZEXT, N->getDebugLoc(), N->getValueType(0), In.getOperand(0)); +} + SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SelectionDAG &DAG = DCI.DAG; @@ -16749,7 +16891,6 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::XOR: return PerformXorCombine(N, DAG, DCI, Subtarget); case ISD::LOAD: return PerformLOADCombine(N, DAG, DCI, Subtarget); case ISD::STORE: return PerformSTORECombine(N, DAG, Subtarget); - case ISD::UINT_TO_FP: return PerformUINT_TO_FPCombine(N, DAG); case ISD::SINT_TO_FP: return PerformSINT_TO_FPCombine(N, DAG, this); case ISD::FADD: return PerformFADDCombine(N, DAG, Subtarget); case ISD::FSUB: return PerformFSUBCombine(N, DAG, Subtarget); @@ -16767,6 +16908,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SETCC: return PerformISDSETCCCombine(N, DAG); case X86ISD::SETCC: return PerformSETCCCombine(N, DAG, DCI, Subtarget); case X86ISD::BRCOND: return PerformBrCondCombine(N, DAG, DCI, Subtarget); + case X86ISD::VZEXT: return performVZEXTCombine(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::PALIGN: case X86ISD::UNPCKH: diff --git 
a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index f6f9e584af..8d8f3f5161 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -243,6 +243,12 @@ namespace llvm { // VSEXT_MOVL - Vector move low and sign extend. VSEXT_MOVL, + // VZEXT - Vector integer zero-extend. + VZEXT, + + // VSEXT - Vector integer signed-extend. + VSEXT, + // VFPEXT - Vector FP extend. VFPEXT, @@ -803,7 +809,9 @@ namespace llvm { SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG) const; SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue lowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerZERO_EXTEND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_SINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_UINT(SDValue Op, SelectionDAG &DAG) const; SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const; @@ -847,6 +855,8 @@ namespace llvm { SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const; + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 46281efa57..73ba0011df 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -90,6 +90,14 @@ def X86vsmovl : SDNode<"X86ISD::VSEXT_MOVL", def X86vzload : SDNode<"X86ISD::VZEXT_LOAD", SDTLoad, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86vzext : SDNode<"X86ISD::VZEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; + +def X86vsext : SDNode<"X86ISD::VSEXT", + SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisInt<0>, SDTCisInt<1>]>>; + def X86vfpext : SDNode<"X86ISD::VFPEXT", 
SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>, SDTCisFP<0>, SDTCisFP<1>]>>; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index cc1291a8a0..e9c7f3e7f1 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -5841,6 +5841,85 @@ let Predicates = [UseSSE41] in { (PMOVZXBQrm addr:$src)>; } +let Predicates = [HasAVX2] in { + def : Pat<(v16i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWYrr VR128:$src)>; + def : Pat<(v8i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQYrr VR128:$src)>; + + def : Pat<(v8i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDYrr VR128:$src)>; + def : Pat<(v4i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQYrr VR128:$src)>; + + def : Pat<(v4i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQYrr VR128:$src)>; +} + +let Predicates = [HasAVX] in { + def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (VPMOVZXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (VPMOVZXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (VPMOVZXDQrr VR128:$src)>; + + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXBWrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), + (VPMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + 
(VPMOVZXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (VPMOVZXWQrm addr:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (VPMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), + (VPMOVZXDQrm addr:$src)>; +} + +let Predicates = [UseSSE41] in { + def : Pat<(v8i16 (X86vzext (v16i8 VR128:$src))), (PMOVZXBWrr VR128:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 VR128:$src))), (PMOVZXBDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 VR128:$src))), (PMOVZXBQrr VR128:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 VR128:$src))), (PMOVZXWDrr VR128:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 VR128:$src))), (PMOVZXWQrr VR128:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 VR128:$src))), (PMOVZXDQrr VR128:$src)>; + + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXBWrm addr:$src)>; + def : Pat<(v8i16 (X86vzext (v16i8 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXBWrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXBDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v16i8 (bitconvert (v4i32 (scalar_to_vector (loadi16_anyext addr:$src))))))), + (PMOVZXBQrm addr:$src)>; + + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXWDrm addr:$src)>; + def : Pat<(v4i32 (X86vzext (v8i16 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXWDrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v8i16 (bitconvert (v4i32 
(scalar_to_vector (loadi32 addr:$src))))))), + (PMOVZXWQrm addr:$src)>; + + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (PMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2f64 (scalar_to_vector (loadf64 addr:$src))))))), + (PMOVZXDQrm addr:$src)>; + def : Pat<(v2i64 (X86vzext (v4i32 (bitconvert (v2i64 (X86vzload addr:$src)))))), + (PMOVZXDQrm addr:$src)>; +} + //===----------------------------------------------------------------------===// // SSE4.1 - Extract Instructions //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index f7a17cd7c1..9054345d35 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -234,15 +234,26 @@ const uint16_t * X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { bool callsEHReturn = false; bool ghcCall = false; + bool oclBiCall = false; + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); if (MF) { callsEHReturn = MF->getMMI().callsEHReturn(); const Function *F = MF->getFunction(); ghcCall = (F ? F->getCallingConv() == CallingConv::GHC : false); + oclBiCall = (F ? 
F->getCallingConv() == CallingConv::Intel_OCL_BI : false); } if (ghcCall) return CSR_NoRegs_SaveList; + if (oclBiCall) { + if (HasAVX && IsWin64) + return CSR_Win64_Intel_OCL_BI_AVX_SaveList; + if (HasAVX && Is64Bit) + return CSR_64_Intel_OCL_BI_AVX_SaveList; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_SaveList; + } if (Is64Bit) { if (IsWin64) return CSR_Win64_SaveList; @@ -257,6 +268,16 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* X86RegisterInfo::getCallPreservedMask(CallingConv::ID CC) const { + bool HasAVX = TM.getSubtarget<X86Subtarget>().hasAVX(); + + if (CC == CallingConv::Intel_OCL_BI) { + if (IsWin64 && HasAVX) + return CSR_Win64_Intel_OCL_BI_AVX_RegMask; + if (Is64Bit && HasAVX) + return CSR_64_Intel_OCL_BI_AVX_RegMask; + if (!HasAVX && !IsWin64 && Is64Bit) + return CSR_64_Intel_OCL_BI_RegMask; + } if (CC == CallingConv::GHC) return CSR_NoRegs_RegMask; if (!Is64Bit) diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index a102935b4b..6e53e7ac93 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -62,7 +62,8 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, if (const char *bzeroEntry = V && V->isNullValue() ? 
Subtarget->getBZeroEntry() : 0) { EVT IntPtr = TLI.getPointerTy(); - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + unsigned AS = DstPtrInfo.getAddrSpace(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext(), AS); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; Entry.Node = Dst; diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index c804195f27..e31bedf6de 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -51,7 +51,7 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, TSInfo(*this), TLInfo(*this), JITInfo(*this), - STTI(&TLInfo) { + STTI(&TLInfo), VTTI(&TLInfo) { } void X86_64TargetMachine::anchor() { } @@ -71,7 +71,7 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, TSInfo(*this), TLInfo(*this), JITInfo(*this), - STTI(&TLInfo) { + STTI(&TLInfo), VTTI(&TLInfo){ } /// X86TargetMachine ctor - Create an X86 target. diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index 9e7816e21f..eaa745ba9b 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -477,7 +477,8 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const { } // Lower to a call to __misaligned_load(BasePtr). - Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + unsigned AS = LD->getAddressSpace(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext(), AS); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; @@ -536,7 +537,8 @@ LowerSTORE(SDValue Op, SelectionDAG &DAG) const } // Lower to a call to __misaligned_store(BasePtr, Value). 
- Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext()); + unsigned AS = ST->getAddressSpace(); + Type *IntPtrTy = getDataLayout()->getIntPtrType(*DAG.getContext(), AS); TargetLowering::ArgListTy Args; TargetLowering::ArgListEntry Entry; diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index 0b7e3e10d4..d5a932c518 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -32,7 +32,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT, InstrInfo(), FrameLowering(Subtarget), TLInfo(*this), - TSInfo(*this), STTI(&TLInfo) { + TSInfo(*this), STTI(&TLInfo), VTTI(&TLInfo) { } namespace { diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 678189b3d6..3d5657fe6a 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -1500,7 +1500,7 @@ static GlobalVariable *PerformHeapAllocSRoA(GlobalVariable *GV, CallInst *CI, unsigned TypeSize = TD->getTypeAllocSize(FieldTy); if (StructType *ST = dyn_cast<StructType>(FieldTy)) TypeSize = TD->getStructLayout(ST)->getSizeInBytes(); - Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + Type *IntPtrTy = TD->getIntPtrType(GV->getType()); Value *NMI = CallInst::CreateMalloc(CI, IntPtrTy, FieldTy, ConstantInt::get(IntPtrTy, TypeSize), NElems, 0, @@ -1730,7 +1730,7 @@ static bool TryToOptimizeStoreOfMallocToGlobal(GlobalVariable *GV, // If this is a fixed size array, transform the Malloc to be an alloc of // structs. 
malloc [100 x struct],1 -> malloc struct, 100 if (ArrayType *AT = dyn_cast<ArrayType>(getMallocAllocatedType(CI, TLI))) { - Type *IntPtrTy = TD->getIntPtrType(CI->getContext()); + Type *IntPtrTy = TD->getIntPtrType(GV->getType()); unsigned TypeSize = TD->getStructLayout(AllocSTy)->getSizeInBytes(); Value *AllocSize = ConstantInt::get(IntPtrTy, TypeSize); Value *NumElements = ConstantInt::get(IntPtrTy, AT->getNumElements()); diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 44283ddce7..1c6477c022 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -206,9 +206,8 @@ bool FunctionComparator::isEquivalentType(Type *Ty1, return true; if (Ty1->getTypeID() != Ty2->getTypeID()) { if (TD) { - LLVMContext &Ctx = Ty1->getContext(); - if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ctx)) return true; - if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ctx)) return true; + if (isa<PointerType>(Ty1) && Ty2 == TD->getIntPtrType(Ty1)) return true; + if (isa<PointerType>(Ty2) && Ty1 == TD->getIntPtrType(Ty2)) return true; } return false; } diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 7467eca7ab..0e765f7aaa 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -208,7 +208,7 @@ private: bool ShouldChangeType(Type *From, Type *To) const; Value *dyn_castNegVal(Value *V) const; Value *dyn_castFNegVal(Value *V) const; - Type *FindElementAtOffset(Type *Ty, int64_t Offset, + Type *FindElementAtOffset(Type *Ty, int64_t Offset, Type *IntPtrTy, SmallVectorImpl<Value*> &NewIndices); Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 4f4c388a92..0958842d08 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ 
b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -996,9 +996,9 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // Conversion is ok if changing from one pointer type to another or from // a pointer to an integer of the same size. !((OldRetTy->isPointerTy() || !TD || - OldRetTy == TD->getIntPtrType(Caller->getContext())) && + OldRetTy == TD->getIntPtrType(NewRetTy)) && (NewRetTy->isPointerTy() || !TD || - NewRetTy == TD->getIntPtrType(Caller->getContext())))) + NewRetTy == TD->getIntPtrType(OldRetTy)))) return false; // Cannot transform this return value. if (!Caller->use_empty() && @@ -1057,11 +1057,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // Converting from one pointer type to another or between a pointer and an // integer of the same size is safe even if we do not have a body. + // FIXME: Not sure what to do here, so setting AS to 0. + // How can the AS for a function call be outside the default? bool isConvertible = ActTy == ParamTy || (TD && ((ParamTy->isPointerTy() || - ParamTy == TD->getIntPtrType(Caller->getContext())) && + ParamTy == TD->getIntPtrType(ActTy)) && (ActTy->isPointerTy() || - ActTy == TD->getIntPtrType(Caller->getContext())))); + ActTy == TD->getIntPtrType(ParamTy)))); if (Callee->isDeclaration() && !isConvertible) return false; } diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index f3f3f8f585..119d2f5c99 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -30,7 +30,7 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Scale = 0; return ConstantInt::get(Val->getType(), 0); } - + if (BinaryOperator *I = dyn_cast<BinaryOperator>(Val)) { // Cannot look past anything that might overflow. 
OverflowingBinaryOperator *OBI = dyn_cast<OverflowingBinaryOperator>(Val); @@ -47,19 +47,19 @@ static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale, Offset = 0; return I->getOperand(0); } - + if (I->getOpcode() == Instruction::Mul) { // This value is scaled by 'RHS'. Scale = RHS->getZExtValue(); Offset = 0; return I->getOperand(0); } - + if (I->getOpcode() == Instruction::Add) { - // We have X+C. Check to see if we really have (X*C2)+C1, + // We have X+C. Check to see if we really have (X*C2)+C1, // where C1 is divisible by C2. unsigned SubScale; - Value *SubVal = + Value *SubVal = DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset); Offset += RHS->getZExtValue(); Scale = SubScale; @@ -82,7 +82,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, if (!TD) return 0; PointerType *PTy = cast<PointerType>(CI.getType()); - + BuilderTy AllocaBuilder(*Builder); AllocaBuilder.SetInsertPoint(AI.getParent(), &AI); @@ -110,7 +110,7 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, uint64_t ArrayOffset; Value *NumElements = // See if the array size is a decomposable linear expr. DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset); - + // If we can now satisfy the modulus, by using a non-1 scale, we really can // do the xform. if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 || @@ -125,17 +125,17 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, // Insert before the alloca, not before the cast. 
Amt = AllocaBuilder.CreateMul(Amt, NumElements); } - + if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) { Value *Off = ConstantInt::get(AI.getArraySize()->getType(), Offset, true); Amt = AllocaBuilder.CreateAdd(Amt, Off); } - + AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt); New->setAlignment(AI.getAlignment()); New->takeName(&AI); - + // If the allocation has multiple real uses, insert a cast and change all // things that used it to use the new cast. This will also hack on CI, but it // will die soon. @@ -148,10 +148,10 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, return ReplaceInstUsesWith(CI, New); } -/// EvaluateInDifferentType - Given an expression that +/// EvaluateInDifferentType - Given an expression that /// CanEvaluateTruncated or CanEvaluateSExtd returns true for, actually /// insert the code to evaluate the expression. -Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, +Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned) { if (Constant *C = dyn_cast<Constant>(V)) { C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/); @@ -181,7 +181,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); Res = BinaryOperator::Create((Instruction::BinaryOps)Opc, LHS, RHS); break; - } + } case Instruction::Trunc: case Instruction::ZExt: case Instruction::SExt: @@ -190,7 +190,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, // new. if (I->getOperand(0)->getType() == Ty) return I->getOperand(0); - + // Otherwise, must be the same type of cast, so just reinsert a new one. // This also handles the case of zext(trunc(x)) -> zext(x). Res = CastInst::CreateIntegerCast(I->getOperand(0), Ty, @@ -212,11 +212,11 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, Res = NPN; break; } - default: + default: // TODO: Can handle more cases here. 
llvm_unreachable("Unreachable!"); } - + Res->takeName(I); return InsertNewInstWith(Res, *I); } @@ -224,7 +224,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty, /// This function is a wrapper around CastInst::isEliminableCastPair. It /// simply extracts arguments and returns what that function returns. -static Instruction::CastOps +static Instruction::CastOps isEliminableCastPair( const CastInst *CI, ///< The first cast instruction unsigned opcode, ///< The opcode of the second cast instruction @@ -238,19 +238,18 @@ isEliminableCastPair( // Get the opcodes of the two Cast instructions Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode()); Instruction::CastOps secondOp = Instruction::CastOps(opcode); - unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy, MidTy, DstTy, - TD ? TD->getIntPtrType(CI->getContext()) : 0); - + TD ? TD->getIntPtrType(DstTy) : 0); + // We don't want to form an inttoptr or ptrtoint that converts to an integer // type that differs from the pointer size. if ((Res == Instruction::IntToPtr && - (!TD || SrcTy != TD->getIntPtrType(CI->getContext()))) || + (!TD || SrcTy != TD->getIntPtrType(DstTy))) || (Res == Instruction::PtrToInt && - (!TD || DstTy != TD->getIntPtrType(CI->getContext())))) + (!TD || DstTy != TD->getIntPtrType(SrcTy)))) Res = 0; - + return Instruction::CastOps(Res); } @@ -262,18 +261,18 @@ bool InstCombiner::ShouldOptimizeCast(Instruction::CastOps opc, const Value *V, Type *Ty) { // Noop casts and casts of constants should be eliminated trivially. if (V->getType() == Ty || isa<Constant>(V)) return false; - + // If this is another cast that can be eliminated, we prefer to have it // eliminated. if (const CastInst *CI = dyn_cast<CastInst>(V)) if (isEliminableCastPair(CI, opc, Ty, TD)) return false; - + // If this is a vector sext from a compare, then we don't want to break the // idiom where each element of the extended vector is either zero or all ones. 
if (opc == Instruction::SExt && isa<CmpInst>(V) && Ty->isVectorTy()) return false; - + return true; } @@ -285,7 +284,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { // Many cases of "cast of a cast" are eliminable. If it's eliminable we just // eliminate it now. if (CastInst *CSrc = dyn_cast<CastInst>(Src)) { // A->B->C cast - if (Instruction::CastOps opc = + if (Instruction::CastOps opc = isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) { // The first cast (CSrc) is eliminable so we need to fix up or replace // the second cast (CI). CSrc will then have a good chance of being dead. @@ -308,7 +307,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) { if (Instruction *NV = FoldOpIntoPhi(CI)) return NV; } - + return 0; } @@ -327,15 +326,15 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) { // We can always evaluate constants in another type. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + Type *OrigTy = V->getType(); - + // If this is an extension from the dest type, we can eliminate it, even if it // has multiple uses. - if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && + if ((isa<ZExtInst>(I) || isa<SExtInst>(I)) && I->getOperand(0)->getType() == Ty) return true; @@ -420,29 +419,29 @@ static bool CanEvaluateTruncated(Value *V, Type *Ty) { // TODO: Can handle more cases here. break; } - + return false; } Instruction *InstCombiner::visitTrunc(TruncInst &CI) { if (Instruction *Result = commonCastTransforms(CI)) return Result; - - // See if we can simplify any instructions used by the input whose sole + + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(), *SrcTy = Src->getType(); - + // Attempt to truncate the entire input expression tree to the destination // type. 
Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also // strange. if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && CanEvaluateTruncated(Src, DestTy)) { - + // If this cast is a truncate, evaluting in a different type always // eliminates the cast, so it is always a win. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" @@ -459,7 +458,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Value *Zero = Constant::getNullValue(Src->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } - + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. Value *A = 0; ConstantInt *Cst = 0; if (Src->hasOneUse() && @@ -469,7 +468,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // ASize < MidSize and MidSize > ResultSize, but don't know the relation // between ASize and ResultSize. unsigned ASize = A->getType()->getPrimitiveSizeInBits(); - + // If the shift amount is larger than the size of A, then the result is // known to be zero because all the input bits got shifted out. if (Cst->getZExtValue() >= ASize) @@ -482,7 +481,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { Shift->takeName(Src); return CastInst::CreateIntegerCast(Shift, CI.getType(), false); } - + // Transform "trunc (and X, cst)" -> "and (trunc X), cst" so long as the dest // type isn't non-native. if (Src->hasOneUse() && isa<IntegerType>(Src->getType()) && @@ -505,7 +504,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // cast to integer to avoid the comparison. if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) { const APInt &Op1CV = Op1C->getValue(); - + // zext (x <s 0) to i32 --> x>>u31 true if signbit set. // zext (x >s -1) to i32 --> (x>>u31)^1 true if signbit clear. 
if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) || @@ -535,14 +534,14 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, // zext (X != 0) to i32 --> X>>1 iff X has only the 2nd bit set. // zext (X != 1) to i32 --> X^1 iff X has only the low bit set. // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set. - if ((Op1CV == 0 || Op1CV.isPowerOf2()) && + if ((Op1CV == 0 || Op1CV.isPowerOf2()) && // This only works for EQ and NE ICI->isEquality()) { // If Op1C some other power of two, convert: uint32_t BitWidth = Op1C->getType()->getBitWidth(); APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); ComputeMaskedBits(ICI->getOperand(0), KnownZero, KnownOne); - + APInt KnownZeroMask(~KnownZero); if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1? if (!DoXform) return ICI; @@ -556,7 +555,7 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, Res = ConstantExpr::getZExt(Res, CI.getType()); return ReplaceInstUsesWith(CI, Res); } - + uint32_t ShiftAmt = KnownZeroMask.logBase2(); Value *In = ICI->getOperand(0); if (ShiftAmt) { @@ -565,12 +564,12 @@ Instruction *InstCombiner::transformZExtICmp(ICmpInst *ICI, Instruction &CI, In = Builder->CreateLShr(In, ConstantInt::get(In->getType(),ShiftAmt), In->getName()+".lobit"); } - + if ((Op1CV != 0) == isNE) { // Toggle the low bit. Constant *One = ConstantInt::get(In->getType(), 1); In = Builder->CreateXor(In, One); } - + if (CI.getType() == In->getType()) return ReplaceInstUsesWith(CI, In); return CastInst::CreateIntegerCast(In, CI.getType(), false/*ZExt*/); @@ -643,19 +642,19 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { BitsToClear = 0; if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If the input is a truncate from the destination type, we can trivially // eliminate it. 
if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; - + // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; - + unsigned Opc = I->getOpcode(), Tmp; switch (Opc) { case Instruction::ZExt: // zext(zext(x)) -> zext(x). @@ -675,7 +674,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { // These can all be promoted if neither operand has 'bits to clear'. if (BitsToClear == 0 && Tmp == 0) return true; - + // If the operation is an AND/OR/XOR and the bits to clear are zero in the // other side, BitsToClear is ok. if (Tmp == 0 && @@ -688,10 +687,10 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { APInt::getHighBitsSet(VSize, BitsToClear))) return true; } - + // Otherwise, we don't know how to analyze this BitsToClear case yet. return false; - + case Instruction::LShr: // We can promote lshr(x, cst) if we can promote x. This requires the // ultimate 'and' to clear out the high zero bits we're clearing out though. @@ -713,7 +712,7 @@ static bool CanEvaluateZExtd(Value *V, Type *Ty, unsigned &BitsToClear) { Tmp != BitsToClear) return false; return true; - + case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider @@ -740,44 +739,44 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) return 0; - + // If one of the common conversion will work, do it. if (Instruction *Result = commonCastTransforms(CI)) return Result; - // See if we can simplify any instructions used by the input whose sole + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. 
if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); - + // Attempt to extend the entire input expression tree to the destination // type. Only do this if the dest type is a simple type, don't convert the // expression tree to something weird like i93 unless the source is also // strange. unsigned BitsToClear; if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) && - CanEvaluateZExtd(Src, DestTy, BitsToClear)) { + CanEvaluateZExtd(Src, DestTy, BitsToClear)) { assert(BitsToClear < SrcTy->getScalarSizeInBits() && "Unreasonable BitsToClear"); - + // Okay, we can transform this! Insert the new expression now. DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type" " to avoid zero extend: " << CI); Value *Res = EvaluateInDifferentType(Src, DestTy, false); assert(Res->getType() == DestTy); - + uint32_t SrcBitsKept = SrcTy->getScalarSizeInBits()-BitsToClear; uint32_t DestBitSize = DestTy->getScalarSizeInBits(); - + // If the high bits are already filled with zeros, just replace this // cast with the result. if (MaskedValueIsZero(Res, APInt::getHighBitsSet(DestBitSize, DestBitSize-SrcBitsKept))) return ReplaceInstUsesWith(CI, Res); - + // We need to emit an AND to clear the high bits. Constant *C = ConstantInt::get(Res->getType(), APInt::getLowBitsSet(DestBitSize, SrcBitsKept)); @@ -789,7 +788,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { // 'and' which will be much cheaper than the pair of casts. if (TruncInst *CSrc = dyn_cast<TruncInst>(Src)) { // A->B->C cast // TODO: Subsume this into EvaluateInDifferentType. - + // Get the sizes of the types involved. We know that the intermediate type // will be smaller than A or C, but don't know the relation between A and C. 
Value *A = CSrc->getOperand(0); @@ -806,7 +805,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *And = Builder->CreateAnd(A, AndConst, CSrc->getName()+".mask"); return new ZExtInst(And, CI.getType()); } - + if (SrcSize == DstSize) { APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize)); return BinaryOperator::CreateAnd(A, ConstantInt::get(A->getType(), @@ -815,7 +814,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { if (SrcSize > DstSize) { Value *Trunc = Builder->CreateTrunc(A, CI.getType()); APInt AndValue(APInt::getLowBitsSet(DstSize, MidSize)); - return BinaryOperator::CreateAnd(Trunc, + return BinaryOperator::CreateAnd(Trunc, ConstantInt::get(Trunc->getType(), AndValue)); } @@ -873,7 +872,7 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) { Value *New = Builder->CreateZExt(X, CI.getType()); return BinaryOperator::CreateXor(New, ConstantInt::get(CI.getType(), 1)); } - + return 0; } @@ -986,14 +985,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // If this is a constant, it can be trivially promoted. if (isa<Constant>(V)) return true; - + Instruction *I = dyn_cast<Instruction>(V); if (!I) return false; - + // If this is a truncate from the dest type, we can trivially eliminate it. if (isa<TruncInst>(I) && I->getOperand(0)->getType() == Ty) return true; - + // We can't extend or shrink something that has multiple uses: doing so would // require duplicating the instruction in general, which isn't profitable. if (!I->hasOneUse()) return false; @@ -1012,14 +1011,14 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // These operators can all arbitrarily be extended if their inputs can. 
return CanEvaluateSExtd(I->getOperand(0), Ty) && CanEvaluateSExtd(I->getOperand(1), Ty); - + //case Instruction::Shl: TODO //case Instruction::LShr: TODO - + case Instruction::Select: return CanEvaluateSExtd(I->getOperand(1), Ty) && CanEvaluateSExtd(I->getOperand(2), Ty); - + case Instruction::PHI: { // We can change a phi if we can change all operands. Note that we never // get into trouble with cyclic PHIs here because we only consider @@ -1033,7 +1032,7 @@ static bool CanEvaluateSExtd(Value *V, Type *Ty) { // TODO: Can handle more cases here. break; } - + return false; } @@ -1042,15 +1041,15 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // eliminated before we try to optimize this zext. if (CI.hasOneUse() && isa<TruncInst>(CI.use_back())) return 0; - + if (Instruction *I = commonCastTransforms(CI)) return I; - - // See if we can simplify any instructions used by the input whose sole + + // See if we can simplify any instructions used by the input whose sole // purpose is to compute bits we don't care about. if (SimplifyDemandedInstructionBits(CI)) return &CI; - + Value *Src = CI.getOperand(0); Type *SrcTy = Src->getType(), *DestTy = CI.getType(); @@ -1073,7 +1072,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // cast with the result. if (ComputeNumSignBits(Res) > DestBitSize - SrcBitSize) return ReplaceInstUsesWith(CI, Res); - + // We need to emit a shl + ashr to do the sign extend. Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); return BinaryOperator::CreateAShr(Builder->CreateShl(Res, ShAmt, "sext"), @@ -1086,7 +1085,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) { uint32_t SrcBitSize = SrcTy->getScalarSizeInBits(); uint32_t DestBitSize = DestTy->getScalarSizeInBits(); - + // We need to emit a shl + ashr to do the sign extend. 
Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize); Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext"); @@ -1122,7 +1121,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { A = Builder->CreateShl(A, ShAmtV, CI.getName()); return BinaryOperator::CreateAShr(A, ShAmtV); } - + return 0; } @@ -1144,7 +1143,7 @@ static Value *LookThroughFPExtensions(Value *V) { if (Instruction *I = dyn_cast<Instruction>(V)) if (I->getOpcode() == Instruction::FPExt) return LookThroughFPExtensions(I->getOperand(0)); - + // If this value is a constant, return the constant in the smallest FP type // that can accurately represent it. This allows us to turn // (float)((double)X+2.0) into x+2.0f. @@ -1163,14 +1162,14 @@ static Value *LookThroughFPExtensions(Value *V) { return V; // Don't try to shrink to various long double types. } - + return V; } Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { if (Instruction *I = commonCastTransforms(CI)) return I; - + // If we have fptrunc(fadd (fpextend x), (fpextend y)), where x and y are // smaller than the destination type, we can eliminate the truncate by doing // the add as the smaller type. 
This applies to fadd/fsub/fmul/fdiv as well @@ -1187,7 +1186,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Type *SrcTy = OpI->getType(); Value *LHSTrunc = LookThroughFPExtensions(OpI->getOperand(0)); Value *RHSTrunc = LookThroughFPExtensions(OpI->getOperand(1)); - if (LHSTrunc->getType() != SrcTy && + if (LHSTrunc->getType() != SrcTy && RHSTrunc->getType() != SrcTy) { unsigned DstSize = CI.getType()->getScalarSizeInBits(); // If the source types were both smaller than the destination type of @@ -1199,10 +1198,10 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { return BinaryOperator::Create(OpI->getOpcode(), LHSTrunc, RHSTrunc); } } - break; + break; } } - + // Fold (fptrunc (sqrt (fpext x))) -> (sqrtf x) CallInst *Call = dyn_cast<CallInst>(CI.getOperand(0)); if (Call && Call->getCalledFunction() && TLI->has(LibFunc::sqrtf) && @@ -1217,7 +1216,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { Arg->getOperand(0)->getType()->isFloatTy()) { Function *Callee = Call->getCalledFunction(); Module *M = CI.getParent()->getParent()->getParent(); - Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", + Constant *SqrtfFunc = M->getOrInsertFunction("sqrtf", Callee->getAttributes(), Builder->getFloatTy(), Builder->getFloatTy(), @@ -1225,15 +1224,15 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) { CallInst *ret = CallInst::Create(SqrtfFunc, Arg->getOperand(0), "sqrtfcall"); ret->setAttributes(Callee->getAttributes()); - - + + // Remove the old Call. With -fmath-errno, it won't get marked readnone. ReplaceInstUsesWith(*Call, UndefValue::get(Call->getType())); EraseInstFromFunction(*Call); return ret; } } - + return 0; } @@ -1251,7 +1250,7 @@ Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) { // This is safe if the intermediate type has enough bits in its mantissa to // accurately represent all values of X. For example, do not do this with // i64->float->i64. 
This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. + // 'X' value would cause an undefined result for the fptoui. if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && OpI->getOperand(0)->getType() == FI.getType() && (int)FI.getType()->getScalarSizeInBits() < /*extra bit for sign */ @@ -1265,19 +1264,19 @@ Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) { Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0)); if (OpI == 0) return commonCastTransforms(FI); - + // fptosi(sitofp(X)) --> X // fptosi(uitofp(X)) --> X // This is safe if the intermediate type has enough bits in its mantissa to // accurately represent all values of X. For example, do not do this with // i64->float->i64. This is also safe for sitofp case, because any negative - // 'X' value would cause an undefined result for the fptoui. + // 'X' value would cause an undefined result for the fptoui. if ((isa<UIToFPInst>(OpI) || isa<SIToFPInst>(OpI)) && OpI->getOperand(0)->getType() == FI.getType() && (int)FI.getType()->getScalarSizeInBits() <= OpI->getType()->getFPMantissaWidth()) return ReplaceInstUsesWith(FI, OpI->getOperand(0)); - + return commonCastTransforms(FI); } @@ -1298,17 +1297,17 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { if (CI.getOperand(0)->getType()->getScalarSizeInBits() > TD->getPointerSizeInBits(AS)) { Value *P = Builder->CreateTrunc(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); + TD->getIntPtrType(CI.getType())); return new IntToPtrInst(P, CI.getType()); } if (CI.getOperand(0)->getType()->getScalarSizeInBits() < TD->getPointerSizeInBits(AS)) { Value *P = Builder->CreateZExt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); + TD->getIntPtrType(CI.getType())); return new IntToPtrInst(P, CI.getType()); } } - + if (Instruction *I = commonCastTransforms(CI)) return I; @@ -1318,19 +1317,19 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) { /// @brief Implement the 
transforms for cast of pointer (bitcast/ptrtoint) Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Value *Src = CI.getOperand(0); - + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) { // If casting the result of a getelementptr instruction with no offset, turn // this into a cast of the original pointer! if (GEP->hasAllZeroIndices()) { // Changing the cast operand is usually not a good idea but it is safe - // here because the pointer operand is being replaced with another + // here because the pointer operand is being replaced with another // pointer operand so the opcode doesn't need to change. Worklist.Add(GEP); CI.setOperand(0, GEP->getOperand(0)); return &CI; } - + // If the GEP has a single use, and the base pointer is a bitcast, and the // GEP computes a constant offset, see if we can convert these three // instructions into fewer. This typically happens with unions and other @@ -1345,7 +1344,8 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Type *GEPIdxTy = cast<PointerType>(OrigBase->getType())->getElementType(); SmallVector<Value*, 8> NewIndices; - if (FindElementAtOffset(GEPIdxTy, Offset, NewIndices)) { + Type *IntPtrTy = TD->getIntPtrType(OrigBase->getType()); + if (FindElementAtOffset(GEPIdxTy, Offset, IntPtrTy, NewIndices)) { // If we were able to index down into an element, create the GEP // and bitcast the result. This eliminates one bitcast, potentially // two. 
@@ -1353,15 +1353,15 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) { Builder->CreateInBoundsGEP(OrigBase, NewIndices) : Builder->CreateGEP(OrigBase, NewIndices); NGEP->takeName(GEP); - + if (isa<BitCastInst>(CI)) return new BitCastInst(NGEP, CI.getType()); assert(isa<PtrToIntInst>(CI)); return new PtrToIntInst(NGEP, CI.getType()); - } + } } } - + return commonCastTransforms(CI); } @@ -1373,16 +1373,16 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) { if (TD) { if (CI.getType()->getScalarSizeInBits() < TD->getPointerSizeInBits(AS)) { Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); + TD->getIntPtrType(CI.getContext(), AS)); return new TruncInst(P, CI.getType()); } if (CI.getType()->getScalarSizeInBits() > TD->getPointerSizeInBits(AS)) { Value *P = Builder->CreatePtrToInt(CI.getOperand(0), - TD->getIntPtrType(CI.getContext())); + TD->getIntPtrType(CI.getContext(), AS)); return new ZExtInst(P, CI.getType()); } } - + return commonPointerCastTransforms(CI); } @@ -1397,33 +1397,33 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, // element size, or the input is a multiple of the output element size. // Convert the input type to have the same element type as the output. VectorType *SrcTy = cast<VectorType>(InVal->getType()); - + if (SrcTy->getElementType() != DestTy->getElementType()) { // The input types don't need to be identical, but for now they must be the // same size. There is no specific reason we couldn't handle things like // <4 x i16> -> <4 x i32> by bitcasting to <2 x i32> but haven't gotten - // there yet. + // there yet. 
if (SrcTy->getElementType()->getPrimitiveSizeInBits() != DestTy->getElementType()->getPrimitiveSizeInBits()) return 0; - + SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements()); InVal = IC.Builder->CreateBitCast(InVal, SrcTy); } - + // Now that the element types match, get the shuffle mask and RHS of the // shuffle to use, which depends on whether we're increasing or decreasing the // size of the input. SmallVector<uint32_t, 16> ShuffleMask; Value *V2; - + if (SrcTy->getNumElements() > DestTy->getNumElements()) { // If we're shrinking the number of elements, just shuffle in the low // elements from the input and use undef as the second shuffle input. V2 = UndefValue::get(SrcTy); for (unsigned i = 0, e = DestTy->getNumElements(); i != e; ++i) ShuffleMask.push_back(i); - + } else { // If we're increasing the number of elements, shuffle in all of the // elements from InVal and fill the rest of the result elements with zeros @@ -1437,7 +1437,7 @@ static Instruction *OptimizeVectorResize(Value *InVal, VectorType *DestTy, for (unsigned i = 0, e = DestTy->getNumElements()-SrcElts; i != e; ++i) ShuffleMask.push_back(SrcElts); } - + return new ShuffleVectorInst(InVal, V2, ConstantDataVector::get(V2->getContext(), ShuffleMask)); @@ -1464,7 +1464,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, Type *VecEltTy) { // Undef values never contribute useful bits to the result. if (isa<UndefValue>(V)) return true; - + // If we got down to a value of the right type, we win, try inserting into the // right element. if (V->getType() == VecEltTy) { @@ -1472,15 +1472,15 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (Constant *C = dyn_cast<Constant>(V)) if (C->isNullValue()) return true; - + // Fail if multiple elements are inserted into this slot. 
if (ElementIndex >= Elements.size() || Elements[ElementIndex] != 0) return false; - + Elements[ElementIndex] = V; return true; } - + if (Constant *C = dyn_cast<Constant>(V)) { // Figure out the # elements this provides, and bitcast it or slice it up // as required. @@ -1491,7 +1491,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (NumElts == 1) return CollectInsertionElements(ConstantExpr::getBitCast(C, VecEltTy), ElementIndex, Elements, VecEltTy); - + // Okay, this is a constant that covers multiple elements. Slice it up into // pieces and insert each element-sized piece into the vector. if (!isa<IntegerType>(C->getType())) @@ -1499,7 +1499,7 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, C->getType()->getPrimitiveSizeInBits())); unsigned ElementSize = VecEltTy->getPrimitiveSizeInBits(); Type *ElementIntTy = IntegerType::get(C->getContext(), ElementSize); - + for (unsigned i = 0; i != NumElts; ++i) { Constant *Piece = ConstantExpr::getLShr(C, ConstantInt::get(C->getType(), i*ElementSize)); @@ -1509,23 +1509,23 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, } return true; } - + if (!V->hasOneUse()) return false; - + Instruction *I = dyn_cast<Instruction>(V); if (I == 0) return false; switch (I->getOpcode()) { default: return false; // Unhandled case. 
case Instruction::BitCast: return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + Elements, VecEltTy); case Instruction::ZExt: if (!isMultipleOfTypeSize( I->getOperand(0)->getType()->getPrimitiveSizeInBits(), VecEltTy)) return false; return CollectInsertionElements(I->getOperand(0), ElementIndex, - Elements, VecEltTy); + Elements, VecEltTy); case Instruction::Or: return CollectInsertionElements(I->getOperand(0), ElementIndex, Elements, VecEltTy) && @@ -1537,11 +1537,11 @@ static bool CollectInsertionElements(Value *V, unsigned ElementIndex, if (CI == 0) return false; if (!isMultipleOfTypeSize(CI->getZExtValue(), VecEltTy)) return false; unsigned IndexShift = getTypeSizeIndex(CI->getZExtValue(), VecEltTy); - + return CollectInsertionElements(I->getOperand(0), ElementIndex+IndexShift, Elements, VecEltTy); } - + } } @@ -1576,11 +1576,11 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, Value *Result = Constant::getNullValue(CI.getType()); for (unsigned i = 0, e = Elements.size(); i != e; ++i) { if (Elements[i] == 0) continue; // Unset element. 
- + Result = IC.Builder->CreateInsertElement(Result, Elements[i], IC.Builder->getInt32(i)); } - + return Result; } @@ -1608,11 +1608,11 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecTy->getPrimitiveSizeInBits() / DestWidth); VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - + return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0)); } } - + // bitcast(trunc(lshr(bitcast(somevector), cst)) ConstantInt *ShAmt = 0; if (match(Src, m_Trunc(m_LShr(m_BitCast(m_Value(VecInput)), @@ -1629,7 +1629,7 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecTy->getPrimitiveSizeInBits() / DestWidth); VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - + unsigned Elt = ShAmt->getZExtValue() / DestWidth; return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } @@ -1653,12 +1653,12 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { PointerType *SrcPTy = cast<PointerType>(SrcTy); Type *DstElTy = DstPTy->getElementType(); Type *SrcElTy = SrcPTy->getElementType(); - + // If the address spaces don't match, don't eliminate the bitcast, which is // required for changing types. if (SrcPTy->getAddressSpace() != DstPTy->getAddressSpace()) return 0; - + // If we are casting a alloca to a pointer to a type of the same // size, rewrite the allocation instruction to allocate the "right" type. // There is no need to modify malloc calls because it is their bitcast that @@ -1666,14 +1666,14 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (AllocaInst *AI = dyn_cast<AllocaInst>(Src)) if (Instruction *V = PromoteCastOfAllocation(CI, *AI)) return V; - + // If the source and destination are pointers, and this cast is equivalent // to a getelementptr X, 0, 0, 0... turn it into the appropriate gep. // This can enhance SROA and other transforms that want type-safe pointers. 
Constant *ZeroUInt = Constant::getNullValue(Type::getInt32Ty(CI.getContext())); unsigned NumZeros = 0; - while (SrcElTy != DstElTy && + while (SrcElTy != DstElTy && isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() && SrcElTy->getNumContainedTypes() /* not "{}" */) { SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt); @@ -1686,7 +1686,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { return GetElementPtrInst::CreateInBounds(Src, Idxs); } } - + // Try to optimize int -> float bitcasts. if ((DestTy->isFloatTy() || DestTy->isDoubleTy()) && isa<IntegerType>(SrcTy)) if (Instruction *I = OptimizeIntToFloatBitCast(CI, *this)) @@ -1699,7 +1699,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast) } - + if (isa<IntegerType>(SrcTy)) { // If this is a cast from an integer to vector, check to see if the input // is a trunc or zext of a bitcast from vector. If so, we can replace all @@ -1712,7 +1712,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { cast<VectorType>(DestTy), *this)) return I; } - + // If the input is an 'or' instruction, we may be doing shifts and ors to // assemble the elements of the vector manually. Try to rip the code out // and replace it with insertelements. @@ -1723,7 +1723,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) { if (SrcVTy->getNumElements() == 1 && !DestTy->isVectorTy()) { - Value *Elem = + Value *Elem = Builder->CreateExtractElement(Src, Constant::getNullValue(Type::getInt32Ty(CI.getContext()))); return CastInst::Create(Instruction::BitCast, Elem, DestTy); @@ -1733,7 +1733,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) { // Okay, we have (bitcast (shuffle ..)). 
Check to see if this is // a bitcast to a vector with the same # elts. - if (SVI->hasOneUse() && DestTy->isVectorTy() && + if (SVI->hasOneUse() && DestTy->isVectorTy() && cast<VectorType>(DestTy)->getNumElements() == SVI->getType()->getNumElements() && SVI->getType()->getNumElements() == @@ -1742,9 +1742,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { // If either of the operands is a cast from CI.getType(), then // evaluating the shuffle in the casted destination's type will allow // us to eliminate at least one cast. - if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && + if (((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(0))) && Tmp->getOperand(0)->getType() == DestTy) || - ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && + ((Tmp = dyn_cast<BitCastInst>(SVI->getOperand(1))) && Tmp->getOperand(0)->getType() == DestTy)) { Value *LHS = Builder->CreateBitCast(SVI->getOperand(0), DestTy); Value *RHS = Builder->CreateBitCast(SVI->getOperand(1), DestTy); @@ -1754,7 +1754,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) { } } } - + if (SrcTy->isPointerTy()) return commonPointerCastTransforms(CI); return commonCastTransforms(CI); diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index e3e5ddae80..055c3b1514 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -371,7 +371,7 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, // an inbounds GEP because the index can't be out of range. if (!GEP->isInBounds() && Idx->getType()->getPrimitiveSizeInBits() > TD->getPointerSizeInBits(AS)) - Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext())); + Idx = Builder->CreateTrunc(Idx, TD->getIntPtrType(Idx->getContext(), AS)); // If the comparison is only true for one or two elements, emit direct // comparisons. 
@@ -539,7 +539,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { // we don't need to bother extending: the extension won't affect where the // computation crosses zero. if (VariableIdx->getType()->getPrimitiveSizeInBits() > IntPtrWidth) { - Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); + Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext(), AS); VariableIdx = IC.Builder->CreateTrunc(VariableIdx, IntPtrTy); } return VariableIdx; @@ -561,7 +561,7 @@ static Value *EvaluateGEPOffsetExpression(User *GEP, InstCombiner &IC) { return 0; // Okay, we can do this evaluation. Start by converting the index to intptr. - Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext()); + Type *IntPtrTy = TD.getIntPtrType(VariableIdx->getContext(), AS); if (VariableIdx->getType() != IntPtrTy) VariableIdx = IC.Builder->CreateIntCast(VariableIdx, IntPtrTy, true /*Signed*/); @@ -1554,8 +1554,7 @@ Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) { // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the // integer type is the same size as the pointer type. 
if (TD && LHSCI->getOpcode() == Instruction::PtrToInt && - TD->getPointerSizeInBits( - cast<PtrToIntInst>(LHSCI)->getPointerAddressSpace()) == + TD->getTypeSizeInBits(DestTy) == cast<IntegerType>(DestTy)->getBitWidth()) { Value *RHSOp = 0; if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) { @@ -2251,7 +2250,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { case Instruction::IntToPtr: // icmp pred inttoptr(X), null -> icmp pred X, 0 if (RHSC->isNullValue() && TD && - TD->getIntPtrType(RHSC->getContext()) == + TD->getIntPtrType(LHSI->getType()) == LHSI->getOperand(0)->getType()) return new ICmpInst(I.getPredicate(), LHSI->getOperand(0), Constant::getNullValue(LHSI->getOperand(0)->getType())); diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 4ab5b6e4a0..633ad93ad9 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -173,7 +173,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Ensure that the alloca array size argument has type intptr_t, so that // any casting is exposed early. 
if (TD) { - Type *IntPtrTy = TD->getIntPtrType(AI.getContext()); + Type *IntPtrTy = TD->getIntPtrType(AI.getType()); if (AI.getArraySize()->getType() != IntPtrTy) { Value *V = Builder->CreateIntCast(AI.getArraySize(), IntPtrTy, false); @@ -185,7 +185,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1 if (AI.isArrayAllocation()) { // Check C != 1 if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { - Type *NewTy = + Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); New->setAlignment(AI.getAlignment()); @@ -311,7 +311,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, Type *SrcPTy = SrcTy->getElementType(); - if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || + if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || DestPTy->isVectorTy()) { // If the source is an array, the code below will not succeed. Check to // see if a trivial 'gep P, 0, 0' will help matters. Only do this for @@ -328,7 +328,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, } if (IC.getDataLayout() && - (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || + (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || SrcPTy->isVectorTy()) && // Do not allow turning this into a load of an integer, which is then // casted to a pointer, this pessimizes pointer analysis a lot. @@ -339,7 +339,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, // Okay, we are casting from one integer or pointer type to another of // the same size. Instead of casting the pointer before the load, cast // the result of the loaded value. 
- LoadInst *NewLoad = + LoadInst *NewLoad = IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName()); NewLoad->setAlignment(LI.getAlignment()); NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope()); @@ -376,7 +376,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // None of the following transforms are legal for volatile/atomic loads. // FIXME: Some of it is okay for atomic loads; needs refactoring. if (!LI.isSimple()) return 0; - + // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. @@ -397,7 +397,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Constant::getNullValue(Op->getType()), &LI); return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType())); } - } + } // load null/undef -> unreachable // TODO: Consider a target hook for valid address spaces for this xform. @@ -416,7 +416,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { if (CE->isCast()) if (Instruction *Res = InstCombineLoadCast(*this, LI, TD)) return Res; - + if (Op->hasOneUse()) { // Change select and PHI nodes to select values instead of addresses: this // helps alias analysis out a lot, allows many others simplifications, and @@ -470,18 +470,18 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { Type *DestPTy = cast<PointerType>(CI->getType())->getElementType(); PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType()); if (SrcTy == 0) return 0; - + Type *SrcPTy = SrcTy->getElementType(); if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy()) return 0; - + /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep" /// to its first element. This allows us to handle things like: /// store i32 xxx, (bitcast {foo*, float}* %P to i32*) /// on 32-bit hosts. SmallVector<Value*, 4> NewGEPIndices; - + // If the source is an array, the code below will not succeed. 
Check to // see if a trivial 'gep P, 0, 0' will help matters. Only do this for // constants. @@ -489,7 +489,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { // Index through pointer. Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext())); NewGEPIndices.push_back(Zero); - + while (1) { if (StructType *STy = dyn_cast<StructType>(SrcPTy)) { if (!STy->getNumElements()) /* Struct can be empty {} */ @@ -503,24 +503,23 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { break; } } - + SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace()); } if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy()) return 0; - + // If the pointers point into different address spaces or if they point to // values with different sizes, we can't do the transformation. if (!IC.getDataLayout() || - SrcTy->getAddressSpace() != - cast<PointerType>(CI->getType())->getAddressSpace() || + SrcTy->getAddressSpace() != CI->getType()->getPointerAddressSpace() || IC.getDataLayout()->getTypeSizeInBits(SrcPTy) != IC.getDataLayout()->getTypeSizeInBits(DestPTy)) return 0; // Okay, we are casting from one integer or pointer type to another of - // the same size. Instead of casting the pointer before + // the same size. Instead of casting the pointer before // the store, cast the value to be stored. Value *NewCast; Value *SIOp0 = SI.getOperand(0); @@ -534,12 +533,12 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { if (SIOp0->getType()->isPointerTy()) opcode = Instruction::PtrToInt; } - + // SIOp0 is a pointer to aggregate and this is a store to the first field, // emit a GEP to index into its first field. 
if (!NewGEPIndices.empty()) CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices); - + NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"); SI.setOperand(0, NewCast); @@ -558,7 +557,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { static bool equivalentAddressValues(Value *A, Value *B) { // Test if the values are trivially equivalent. if (A == B) return true; - + // Test if the values come form identical arithmetic instructions. // This uses isIdenticalToWhenDefined instead of isIdenticalTo because // its only used to compare two uses within the same basic block, which @@ -571,7 +570,7 @@ static bool equivalentAddressValues(Value *A, Value *B) { if (Instruction *BI = dyn_cast<Instruction>(B)) if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) return true; - + // Otherwise they may not be equivalent. return false; } @@ -602,7 +601,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. if (Ptr->hasOneUse()) { - if (isa<AllocaInst>(Ptr)) + if (isa<AllocaInst>(Ptr)) return EraseInstFromFunction(SI); if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { if (isa<AllocaInst>(GEP->getOperand(0))) { @@ -625,8 +624,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) { ScanInsts++; continue; - } - + } + if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), @@ -638,7 +637,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { } break; } - + // If this is a load, we have to stop. However, if the loaded value is from // the pointer we're loading and is producing the pointer we're storing, // then *this* store is dead (X = load P; store X -> P). 
@@ -646,12 +645,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && LI->isSimple()) return EraseInstFromFunction(SI); - + // Otherwise, this is a load from some other location. Stores before it // may not be dead. break; } - + // Don't skip over loads or things that can modify memory. if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory()) break; @@ -681,11 +680,11 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (Instruction *Res = InstCombineStoreToCast(*this, SI)) return Res; - + // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. - BBI = &SI; + BBI = &SI; do { ++BBI; } while (isa<DbgInfoIntrinsic>(BBI) || @@ -694,7 +693,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (BI->isUnconditional()) if (SimplifyStoreAtEndOfBlock(SI)) return 0; // xform done! - + return 0; } @@ -708,12 +707,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { /// bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { BasicBlock *StoreBB = SI.getParent(); - + // Check to see if the successor block has exactly two incoming edges. If // so, see if the other predecessor contains a store to the same location. // if so, insert a PHI node (if needed) and move the stores down. BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0); - + // Determine whether Dest has exactly two predecessors and, if so, compute // the other predecessor. 
pred_iterator PI = pred_begin(DestBB); @@ -725,7 +724,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { if (++PI == pred_end(DestBB)) return false; - + P = *PI; if (P != StoreBB) { if (OtherBB) @@ -745,7 +744,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); if (!OtherBr || BBI == OtherBB->begin()) return false; - + // If the other block ends in an unconditional branch, check for the 'if then // else' case. there is an instruction before the branch. StoreInst *OtherStore = 0; @@ -767,10 +766,10 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { } else { // Otherwise, the other block ended with a conditional branch. If one of the // destinations is StoreBB, then we have the if/then case. - if (OtherBr->getSuccessor(0) != StoreBB && + if (OtherBr->getSuccessor(0) != StoreBB && OtherBr->getSuccessor(1) != StoreBB) return false; - + // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an // if/then triangle. See if there is a store to the same ptr as SI that // lives in OtherBB. @@ -788,7 +787,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { BBI == OtherBB->begin()) return false; } - + // In order to eliminate the store in OtherBr, we have to // make sure nothing reads or overwrites the stored value in // StoreBB. @@ -798,7 +797,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; } } - + // Insert a PHI node now if we need it. Value *MergedVal = OtherStore->getOperand(0); if (MergedVal != SI.getOperand(0)) { @@ -807,7 +806,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { PN->addIncoming(OtherStore->getOperand(0), OtherBB); MergedVal = InsertNewInstBefore(PN, DestBB->front()); } - + // Advance to a place where it is safe to insert the new store and // insert it. 
BBI = DestBB->getFirstInsertionPt(); @@ -817,7 +816,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { SI.getOrdering(), SI.getSynchScope()); InsertNewInstBefore(NewSI, *BBI); - NewSI->setDebugLoc(OtherStore->getDebugLoc()); + NewSI->setDebugLoc(OtherStore->getDebugLoc()); // Nuke the old stores. EraseInstFromFunction(SI); diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 7f8c3ae558..00b7fca681 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -738,7 +738,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) { /// or not there is a sequence of GEP indices into the type that will land us at /// the specified offset. If so, fill them into NewIndices and return the /// resultant element type, otherwise return null. -Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, +Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, Type *IntPtrTy, SmallVectorImpl<Value*> &NewIndices) { if (!TD) return 0; if (!Ty->isSized()) return 0; @@ -746,7 +746,6 @@ Type *InstCombiner::FindElementAtOffset(Type *Ty, int64_t Offset, // Start with the index over the outer type. Note that the type size // might be zero (even if the offset isn't zero) if the indexed type // is something like [0 x {int, int}] - Type *IntPtrTy = TD->getIntPtrType(Ty->getContext()); int64_t FirstIdx = 0; if (int64_t TySize = TD->getTypeAllocSize(Ty)) { FirstIdx = Offset/TySize; @@ -1055,7 +1054,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // by multiples of a zero size type with zero. 
if (TD) { bool MadeChange = false; - Type *IntPtrTy = TD->getIntPtrType(GEP.getContext()); + Type *IntPtrTy = TD->getIntPtrType(PtrOp->getType()); gep_type_iterator GTI = gep_type_begin(GEP); for (User::op_iterator I = GEP.op_begin() + 1, E = GEP.op_end(); @@ -1240,7 +1239,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1275,7 +1274,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { // Earlier transforms ensure that the index has type IntPtrType, which // considerably simplifies the logic by eliminating implicit casts. - assert(Idx->getType() == TD->getIntPtrType(GEP.getContext()) && + assert(Idx->getType() == TD->getIntPtrType(GEP.getType()) && "Index not cast to pointer width?"); bool NSW; @@ -1337,7 +1336,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) { SmallVector<Value*, 8> NewIndices; Type *InTy = cast<PointerType>(BCI->getOperand(0)->getType())->getElementType(); - if (FindElementAtOffset(InTy, Offset, NewIndices)) { + Type *IntPtrTy = TD->getIntPtrType(BCI->getOperand(0)->getType()); + if (FindElementAtOffset(InTy, Offset, IntPtrTy, NewIndices)) { Value *NGEP = GEP.isInBounds() ? 
Builder->CreateInBoundsGEP(BCI->getOperand(0), NewIndices) : Builder->CreateGEP(BCI->getOperand(0), NewIndices); diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp index 976b963046..dd36a00070 100644 --- a/lib/Transforms/Instrumentation/BoundsChecking.cpp +++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp @@ -143,7 +143,7 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) { Value *Offset = SizeOffset.second; ConstantInt *SizeCI = dyn_cast<ConstantInt>(Size); - IntegerType *IntTy = TD->getIntPtrType(Inst->getContext()); + IntegerType *IntTy = TD->getIntPtrType(Ptr->getType()); Value *NeededSizeVal = ConstantInt::get(IntTy, NeededSize); // three checks are required to ensure safety: diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index 2b42c75d14..74e310f7e7 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -935,7 +935,7 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for " << *MemoryInst); Type *IntPtrTy = - TLI->getDataLayout()->getIntPtrType(AccessTy->getContext()); + TLI->getDataLayout()->getIntPtrType(Addr->getType()); Value *Result = 0; diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index eb0da20abb..b6e15540e7 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -746,6 +746,15 @@ static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal, return true; } +/// Wrap TD.getIntPtrType, but return a vector type for vector inputs. 
+static Type *getIntPtrType(Type *Ty, const DataLayout &TD) { + Type *ITy = TD.getIntPtrType(Ty); + if (Ty->isVectorTy()) { + ITy = VectorType::get(ITy, Ty->getVectorNumElements()); + } + + return ITy; +} /// CoerceAvailableValueToLoadType - If we saw a store of a value to memory, and /// then a load from a must-aliased pointer of a different type, try to coerce @@ -769,24 +778,25 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, // If the store and reload are the same size, we can always reuse it. if (StoreSize == LoadSize) { // Pointer to Pointer -> use bitcast. - if (StoredValTy->isPointerTy() && LoadedTy->isPointerTy()) + if (StoredValTy->getScalarType()->isPointerTy() && + LoadedTy->getScalarType()->isPointerTy()) return new BitCastInst(StoredVal, LoadedTy, "", InsertPt); // Convert source pointers to integers, which can be bitcast. - if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = getIntPtrType(StoredValTy, TD); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } Type *TypeToCastTo = LoadedTy; - if (TypeToCastTo->isPointerTy()) - TypeToCastTo = TD.getIntPtrType(StoredValTy->getContext()); + if (TypeToCastTo->getScalarType()->isPointerTy()) + TypeToCastTo = getIntPtrType(StoredValTy, TD); if (StoredValTy != TypeToCastTo) StoredVal = new BitCastInst(StoredVal, TypeToCastTo, "", InsertPt); // Cast to pointer if the load needs a pointer type. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) StoredVal = new IntToPtrInst(StoredVal, LoadedTy, "", InsertPt); return StoredVal; @@ -798,8 +808,8 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, assert(StoreSize >= LoadSize && "CanCoerceMustAliasedValueToLoad fail"); // Convert source pointers to integers, which can be manipulated. 
- if (StoredValTy->isPointerTy()) { - StoredValTy = TD.getIntPtrType(StoredValTy->getContext()); + if (StoredValTy->getScalarType()->isPointerTy()) { + StoredValTy = getIntPtrType(StoredValTy, TD); StoredVal = new PtrToIntInst(StoredVal, StoredValTy, "", InsertPt); } @@ -824,7 +834,7 @@ static Value *CoerceAvailableValueToLoadType(Value *StoredVal, return StoredVal; // If the result is a pointer, inttoptr. - if (LoadedTy->isPointerTy()) + if (LoadedTy->getScalarType()->isPointerTy()) return new IntToPtrInst(StoredVal, LoadedTy, "inttoptr", InsertPt); // Otherwise, bitcast. @@ -1019,8 +1029,9 @@ static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset, // Compute which bits of the stored value are being used by the load. Convert // to an integer type to start with. - if (SrcVal->getType()->isPointerTy()) - SrcVal = Builder.CreatePtrToInt(SrcVal, TD.getIntPtrType(Ctx)); + if (SrcVal->getType()->getScalarType()->isPointerTy()) + SrcVal = Builder.CreatePtrToInt(SrcVal, + getIntPtrType(SrcVal->getType(), TD)); if (!SrcVal->getType()->isIntegerTy()) SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8)); @@ -1301,7 +1312,7 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, Value *V = SSAUpdate.GetValueInMiddleOfBlock(LI->getParent()); // If new PHI nodes were created, notify alias analysis. 
- if (V->getType()->isPointerTy()) { + if (V->getType()->getScalarType()->isPointerTy()) { AliasAnalysis *AA = gvn.getAliasAnalysis(); for (unsigned i = 0, e = NewPHIs.size(); i != e; ++i) @@ -1498,7 +1509,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumGVNLoad; @@ -1730,7 +1741,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { LI->replaceAllUsesWith(V); if (isa<PHINode>(V)) V->takeName(LI); - if (V->getType()->isPointerTy()) + if (V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(LI); ++NumPRELoad; @@ -1857,7 +1868,7 @@ bool GVN::processLoad(LoadInst *L) { // Replace the load! L->replaceAllUsesWith(AvailVal); - if (AvailVal->getType()->isPointerTy()) + if (AvailVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(AvailVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1914,7 +1925,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! L->replaceAllUsesWith(StoredVal); - if (StoredVal->getType()->isPointerTy()) + if (StoredVal->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(StoredVal); markInstructionForDeletion(L); ++NumGVNLoad; @@ -1943,7 +1954,7 @@ bool GVN::processLoad(LoadInst *L) { // Remove it! patchAndReplaceAllUsesWith(AvailableVal, L); - if (DepLI->getType()->isPointerTy()) + if (DepLI->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); ++NumGVNLoad; @@ -2184,7 +2195,7 @@ bool GVN::processInstruction(Instruction *I) { // "%z = and i32 %x, %y" becomes "%z = and i32 %x, %x" which we now simplify. 
if (Value *V = SimplifyInstruction(I, TD, TLI, DT)) { I->replaceAllUsesWith(V); - if (MD && V->getType()->isPointerTy()) + if (MD && V->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(V); markInstructionForDeletion(I); ++NumGVNSimpl; @@ -2284,7 +2295,7 @@ bool GVN::processInstruction(Instruction *I) { // Remove it! patchAndReplaceAllUsesWith(repl, I); - if (MD && repl->getType()->isPointerTy()) + if (MD && repl->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); return true; @@ -2532,7 +2543,7 @@ bool GVN::performPRE(Function &F) { addToLeaderTable(ValNo, Phi, CurrentBlock); Phi->setDebugLoc(CurInst->getDebugLoc()); CurInst->replaceAllUsesWith(Phi); - if (Phi->getType()->isPointerTy()) { + if (Phi->getType()->getScalarType()->isPointerTy()) { // Because we have added a PHI-use of the pointer value, it has now // "escaped" from alias analysis' perspective. We need to inform // AA of this. diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 82eb746467..8a2f093629 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1430,7 +1430,8 @@ FindLoopCounter(Loop *L, const SCEV *BECount, /// genLoopLimit - Help LinearFunctionTestReplace by generating a value that /// holds the RHS of the new loop test. static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, - SCEVExpander &Rewriter, ScalarEvolution *SE) { + SCEVExpander &Rewriter, ScalarEvolution *SE, + Type *IntPtrTy) { const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(IndVar)); assert(AR && AR->getLoop() == L && AR->isAffine() && "bad loop counter"); const SCEV *IVInit = AR->getStart(); @@ -1456,7 +1457,8 @@ static Value *genLoopLimit(PHINode *IndVar, const SCEV *IVCount, Loop *L, // We could handle pointer IVs other than i8*, but we need to compensate for // gep index scaling. 
See canExpandBackedgeTakenCount comments. assert(SE->getSizeOfExpr( - cast<PointerType>(GEPBase->getType())->getElementType())->isOne() + cast<PointerType>(GEPBase->getType())->getElementType(), + IntPtrTy)->isOne() && "unit stride pointer IV must be i8*"); IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); @@ -1555,7 +1557,9 @@ LinearFunctionTestReplace(Loop *L, CmpIndVar = IndVar; } - Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE); + Type *IntPtrTy = TD ? TD->getIntPtrType(IndVar->getType()) : + IntegerType::getInt64Ty(IndVar->getContext()); + Value *ExitCnt = genLoopLimit(IndVar, IVCount, L, Rewriter, SE, IntPtrTy); assert(ExitCnt->getType()->isPointerTy() == IndVar->getType()->isPointerTy() && "genLoopLimit missed a cast"); diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp index a44e798f12..e4b40f3d3a 100644 --- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -486,7 +486,9 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // would be unsafe to do if there is anything else in the loop that may read // or write to the aliased location. Check for any overlap by generating the // base pointer and checking the region. - unsigned AddrSpace = cast<PointerType>(DestPtr->getType())->getAddressSpace(); + assert(DestPtr->getType()->isPointerTy() + && "Must be a pointer type."); + unsigned AddrSpace = DestPtr->getType()->getPointerAddressSpace(); Value *BasePtr = Expander.expandCodeFor(Ev->getStart(), Builder.getInt8PtrTy(AddrSpace), Preheader->getTerminator()); @@ -505,7 +507,7 @@ processLoopStridedStore(Value *DestPtr, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. 
- Type *IntPtr = TD->getIntPtrType(DestPtr->getContext()); + Type *IntPtr = TD->getIntPtrType(DestPtr->getType()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), @@ -611,7 +613,7 @@ processLoopStoreOfLoopLoad(StoreInst *SI, unsigned StoreSize, // The # stored bytes is (BECount+1)*Size. Expand the trip count out to // pointer size if it isn't already. - Type *IntPtr = TD->getIntPtrType(SI->getContext()); + Type *IntPtr = TD->getIntPtrType(SI->getType()); BECount = SE->getTruncateOrZeroExtend(BECount, IntPtr); const SCEV *NumBytesS = SE->getAddExpr(BECount, SE->getConstant(IntPtr, 1), diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index 71c62257e7..af3a880cb9 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -582,7 +582,8 @@ private: P.Partitions.push_back(New); } - bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) { + bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset, + bool IsVolatile) { uint64_t Size = TD.getTypeStoreSize(Ty); // If this memory access can be shown to *statically* extend outside the @@ -603,7 +604,14 @@ private: return true; } - insertUse(I, Offset, Size); + // We allow splitting of loads and stores where the type is an integer type + // and which cover the entire alloca. Such integer loads and stores + // often require decomposition into fine grained loads and stores. 
+ bool IsSplittable = false; + if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) + IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; + + insertUse(I, Offset, Size, IsSplittable); return true; } @@ -624,7 +632,7 @@ private: bool visitLoadInst(LoadInst &LI) { assert((!LI.isSimple() || LI.getType()->isSingleValueType()) && "All simple FCA loads should have been pre-split"); - return handleLoadOrStore(LI.getType(), LI, Offset); + return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); } bool visitStoreInst(StoreInst &SI) { @@ -634,7 +642,7 @@ private: assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && "All simple FCA stores should have been pre-split"); - return handleLoadOrStore(ValOp->getType(), SI, Offset); + return handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); } @@ -1173,6 +1181,21 @@ Type *AllocaPartitioning::getCommonType(iterator I) const { UserTy = LI->getType(); } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { UserTy = SI->getValueOperand()->getType(); + } else { + return 0; // Bail if we have weird uses. + } + + if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { + // If the type is larger than the partition, skip it. We only encounter + // this for split integer operations where we want to use the type of the + // entity causing the split. + if (ITy->getBitWidth() > (I->EndOffset - I->BeginOffset)*8) + continue; + + // If we have found an integer type use covering the alloca, use that + // regardless of the other types, as integers are often used for a "bucket + // of bits" type. + return ITy; } if (Ty && Ty != UserTy) @@ -2138,6 +2161,14 @@ static bool isIntegerWideningViable(const DataLayout &TD, if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy)) return false; + // We need to ensure that an integer type with the appropriate bitwidth can + // be converted to the alloca type, whatever that is. 
We don't want to force + // the alloca itself to have an integer type if there is a more suitable one. + Type *IntTy = Type::getIntNTy(AllocaTy->getContext(), SizeInBits); + if (!canConvertValue(TD, AllocaTy, IntTy) || + !canConvertValue(TD, IntTy, AllocaTy)) + return false; + uint64_t Size = TD.getTypeStoreSize(AllocaTy); // Check the uses to ensure the uses are (likely) promoteable integer uses. @@ -2364,8 +2395,9 @@ private: Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { assert(BeginOffset >= NewAllocaBeginOffset); - unsigned AS = cast<PointerType>(PointerTy)->getAddressSpace(); - APInt Offset(TD.getPointerSizeInBits(AS), BeginOffset - NewAllocaBeginOffset); + assert(PointerTy->isPointerTy() && + "Type must be pointer type!"); + APInt Offset(TD.getTypeSizeInBits(PointerTy), BeginOffset - NewAllocaBeginOffset); return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); } @@ -2460,6 +2492,50 @@ private: if (VecTy) return rewriteVectorizedLoadInst(IRB, LI, OldOp); + + uint64_t Size = EndOffset - BeginOffset; + if (Size < TD.getTypeStoreSize(LI.getType())) { + assert(!LI.isVolatile()); + assert(LI.getType()->isIntegerTy() && + "Only integer type loads and stores are split"); + assert(LI.getType()->getIntegerBitWidth() == + TD.getTypeStoreSizeInBits(LI.getType()) && + "Non-byte-multiple bit width"); + assert(LI.getType()->getIntegerBitWidth() == + TD.getTypeSizeInBits(OldAI.getAllocatedType()) && + "Only alloca-wide loads can be split and recomposed"); + IntegerType *NarrowTy = Type::getIntNTy(LI.getContext(), Size * 8); + bool IsConvertable = (BeginOffset - NewAllocaBeginOffset == 0) && + canConvertValue(TD, NewAllocaTy, NarrowTy); + Value *V; + // Move the insertion point just past the load so that we can refer to it. 
+ IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); + if (IsConvertable) + V = convertValue(TD, IRB, + IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), + getName(".load")), + NarrowTy); + else + V = IRB.CreateAlignedLoad( + getAdjustedAllocaPtr(IRB, NarrowTy->getPointerTo()), + getPartitionTypeAlign(NarrowTy), getName(".load")); + // Create a placeholder value with the same type as LI to use as the + // basis for the new value. This allows us to replace the uses of LI with + // the computed value, and then replace the placeholder with LI, leaving + // LI only used for this computation. + Value *Placeholder + = IRB.CreateLoad(UndefValue::get(LI.getType()->getPointerTo())); + V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, + getName(".insert")); + LI.replaceAllUsesWith(V); + Placeholder->replaceAllUsesWith(&LI); + cast<Instruction>(Placeholder)->eraseFromParent(); + if (Pass.DeadSplitInsts.insert(&LI)) + Pass.DeadInsts.push_back(&LI); + DEBUG(dbgs() << " to: " << *V << "\n"); + return IsConvertable; + } + if (IntTy && LI.getType()->isIntegerTy()) return rewriteIntegerLoad(IRB, LI); @@ -2539,6 +2615,39 @@ private: if (VecTy) return rewriteVectorizedStoreInst(IRB, SI, OldOp); Type *ValueTy = SI.getValueOperand()->getType(); + + uint64_t Size = EndOffset - BeginOffset; + if (Size < TD.getTypeStoreSize(ValueTy)) { + assert(!SI.isVolatile()); + assert(ValueTy->isIntegerTy() && + "Only integer type loads and stores are split"); + assert(ValueTy->getIntegerBitWidth() == + TD.getTypeStoreSizeInBits(ValueTy) && + "Non-byte-multiple bit width"); + assert(ValueTy->getIntegerBitWidth() == + TD.getTypeSizeInBits(OldAI.getAllocatedType()) && + "Only alloca-wide stores can be split and recomposed"); + IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); + Value *V = extractInteger(TD, IRB, SI.getValueOperand(), NarrowTy, + BeginOffset, getName(".extract")); + StoreInst *NewSI; + bool IsConvertable = (BeginOffset - NewAllocaBeginOffset == 0) && + 
canConvertValue(TD, NarrowTy, NewAllocaTy); + if (IsConvertable) + NewSI = IRB.CreateAlignedStore(convertValue(TD, IRB, V, NewAllocaTy), + &NewAI, NewAI.getAlignment()); + else + NewSI = IRB.CreateAlignedStore( + V, getAdjustedAllocaPtr(IRB, NarrowTy->getPointerTo()), + getPartitionTypeAlign(NarrowTy)); + (void)NewSI; + if (Pass.DeadSplitInsts.insert(&SI)) + Pass.DeadInsts.push_back(&SI); + + DEBUG(dbgs() << " to: " << *NewSI << "\n"); + return IsConvertable; + } + if (IntTy && ValueTy->isIntegerTy()) return rewriteIntegerStore(IRB, SI); @@ -2687,9 +2796,8 @@ private: = P.getMemTransferOffsets(II); assert(OldPtr->getType()->isPointerTy() && "Must be a pointer type!"); - unsigned AS = cast<PointerType>(OldPtr->getType())->getAddressSpace(); // Compute the relative offset within the transfer. - unsigned IntPtrWidth = TD.getPointerSizeInBits(AS); + unsigned IntPtrWidth = TD.getTypeSizeInBits(OldPtr->getType()); APInt RelOffset(IntPtrWidth, BeginOffset - (IsDest ? MTO.DestBegin : MTO.SourceBegin)); @@ -3173,6 +3281,9 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, uint64_t Offset, uint64_t Size) { if (Offset == 0 && TD.getTypeAllocSize(Ty) == Size) return stripAggregateTypeWrapping(TD, Ty); + if (Offset > TD.getTypeAllocSize(Ty) || + (TD.getTypeAllocSize(Ty) - Offset) < Size) + return 0; if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) { // We can't partition pointers... @@ -3464,6 +3575,8 @@ void SROA::deleteDeadInstructions(SmallPtrSet<AllocaInst*, 4> &DeletedAllocas) { Instruction *I = DeadInsts.pop_back_val(); DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n"); + I->replaceAllUsesWith(UndefValue::get(I->getType())); + for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) if (Instruction *U = dyn_cast<Instruction>(*OI)) { // Zero out the operand and see if it becomes trivially dead. 
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index a46d09c320..a5446294e3 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -963,7 +963,7 @@ ConvertScalar_InsertValue(Value *SV, Value *Old, if (SV->getType()->isFloatingPointTy() || SV->getType()->isVectorTy()) SV = Builder.CreateBitCast(SV, IntegerType::get(SV->getContext(),SrcWidth)); else if (SV->getType()->isPointerTy()) - SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getContext())); + SV = Builder.CreatePtrToInt(SV, TD.getIntPtrType(SV->getType())); // Zero extend or truncate the value if needed. if (SV->getType() != AllocaType) { diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index c82a00fc2c..f3448bcd87 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -165,9 +165,10 @@ struct StpCpyOpt: public LibCallOptimization { uint64_t Len = GetStringLength(Src); if (Len == 0) return 0; - Value *LenV = ConstantInt::get(TD->getIntPtrType(*Context), Len); + Type *PT = FT->getParamType(0); + Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len); Value *DstEnd = B.CreateGEP(Dst, - ConstantInt::get(TD->getIntPtrType(*Context), + ConstantInt::get(TD->getIntPtrType(PT), Len - 1)); // We have enough information to now generate the memcpy call to do the @@ -220,9 +221,10 @@ struct StrNCpyOpt : public LibCallOptimization { // Let strncpy handle the zero padding if (Len > SrcLen+1) return 0; + Type *PT = FT->getParamType(0); // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant] B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + ConstantInt::get(TD->getIntPtrType(PT), Len), 1); return Dst; } @@ -508,10 +510,11 @@ struct MemCpyOpt : public LibCallOptimization { if (!TD) return 0; FunctionType *FT = Callee->getFunctionType(); + Type *PT = 
FT->getParamType(0); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) + FT->getParamType(2) != TD->getIntPtrType(PT)) return 0; // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1) @@ -530,10 +533,11 @@ struct MemMoveOpt : public LibCallOptimization { if (!TD) return 0; FunctionType *FT = Callee->getFunctionType(); + Type *PT = FT->getParamType(0); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) + FT->getParamType(2) != TD->getIntPtrType(PT)) return 0; // memmove(x, y, n) -> llvm.memmove(x, y, n, 1) @@ -552,10 +556,11 @@ struct MemSetOpt : public LibCallOptimization { if (!TD) return 0; FunctionType *FT = Callee->getFunctionType(); + Type *PT = FT->getParamType(0); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(*Context)) + FT->getParamType(2) != TD->getIntPtrType(PT)) return 0; // memset(p, v, n) -> llvm.memset(p, v, n, 1) @@ -980,8 +985,9 @@ struct SPrintFOpt : public LibCallOptimization { if (!TD) return 0; // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1) + Type *AT = CI->getArgOperand(0)->getType(); B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), // Copy the + ConstantInt::get(TD->getIntPtrType(AT), // Copy the FormatStr.size() + 1), 1); // nul byte. return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1108,8 +1114,9 @@ struct FPutsOpt : public LibCallOptimization { uint64_t Len = GetStringLength(CI->getArgOperand(0)); if (!Len) return 0; // Known to have no uses (see above). 
+ Type *PT = FT->getParamType(0); return EmitFWrite(CI->getArgOperand(0), - ConstantInt::get(TD->getIntPtrType(*Context), Len-1), + ConstantInt::get(TD->getIntPtrType(PT), Len-1), CI->getArgOperand(1), B, TD, TLI); } }; @@ -1134,8 +1141,9 @@ struct FPrintFOpt : public LibCallOptimization { // These optimizations require DataLayout. if (!TD) return 0; + Type *AT = CI->getArgOperand(1)->getType(); Value *NewCI = EmitFWrite(CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), + ConstantInt::get(TD->getIntPtrType(AT), FormatStr.size()), CI->getArgOperand(0), B, TD, TLI); return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0; diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index fa2faa2dad..bd28f10654 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -46,9 +46,8 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD, AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, ArrayRef<Attributes::AttrVal>(AVs, 2)); - LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI), - TD->getIntPtrType(Context), + TD->getIntPtrType(Ptr->getType()), B.getInt8PtrTy(), NULL); CallInst *CI = B.CreateCall(StrLen, CastToCStr(Ptr, B), "strlen"); @@ -73,11 +72,10 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B, AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, ArrayRef<Attributes::AttrVal>(AVs, 2)); - LLVMContext &Context = B.GetInsertBlock()->getContext(); Constant *StrNLen = M->getOrInsertFunction("strnlen", AttrListPtr::get(AWI), - TD->getIntPtrType(Context), + TD->getIntPtrType(Ptr->getType()), B.getInt8PtrTy(), - TD->getIntPtrType(Context), + TD->getIntPtrType(Ptr->getType()), NULL); CallInst *CI = B.CreateCall2(StrNLen, CastToCStr(Ptr, B), MaxLen, "strnlen"); if (const Function *F = 
dyn_cast<Function>(StrNLen->stripPointerCasts())) @@ -126,12 +124,12 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, ArrayRef<Attributes::AttrVal>(AVs, 2)); - LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), - TD->getIntPtrType(Context), NULL); + TD->getIntPtrType(Ptr1->getType()), + NULL); CallInst *CI = B.CreateCall3(StrNCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len, "strncmp"); @@ -201,14 +199,14 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize, AttributeWithIndex AWI; AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, Attributes::NoUnwind); - LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCpy = M->getOrInsertFunction("__memcpy_chk", AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt8PtrTy(), - TD->getIntPtrType(Context), - TD->getIntPtrType(Context), NULL); + TD->getIntPtrType(Dst->getType()), + TD->getIntPtrType(Src->getType()), + NULL); Dst = CastToCStr(Dst, B); Src = CastToCStr(Src, B); CallInst *CI = B.CreateCall4(MemCpy, Dst, Src, Len, ObjSize); @@ -230,12 +228,11 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val, Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind }; AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, ArrayRef<Attributes::AttrVal>(AVs, 2)); - LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI), B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), - TD->getIntPtrType(Context), + TD->getIntPtrType(Ptr->getType()), NULL); CallInst *CI = B.CreateCall3(MemChr, CastToCStr(Ptr, B), Val, Len, "memchr"); @@ -260,12 +257,12 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, AWI[2] = 
AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, ArrayRef<Attributes::AttrVal>(AVs, 2)); - LLVMContext &Context = B.GetInsertBlock()->getContext(); Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI), B.getInt32Ty(), B.getInt8PtrTy(), B.getInt8PtrTy(), - TD->getIntPtrType(Context), NULL); + TD->getIntPtrType(Ptr1->getType()), + NULL); CallInst *CI = B.CreateCall3(MemCmp, CastToCStr(Ptr1, B), CastToCStr(Ptr2, B), Len, "memcmp"); @@ -425,24 +422,24 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attributes::NoCapture); AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex, Attributes::NoUnwind); - LLVMContext &Context = B.GetInsertBlock()->getContext(); StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; + Type *PtrTy = Ptr->getType(); if (File->getType()->isPointerTy()) F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI), - TD->getIntPtrType(Context), + TD->getIntPtrType(PtrTy), B.getInt8PtrTy(), - TD->getIntPtrType(Context), - TD->getIntPtrType(Context), + TD->getIntPtrType(PtrTy), + TD->getIntPtrType(PtrTy), File->getType(), NULL); else - F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context), + F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(PtrTy), B.getInt8PtrTy(), - TD->getIntPtrType(Context), - TD->getIntPtrType(Context), + TD->getIntPtrType(PtrTy), + TD->getIntPtrType(PtrTy), File->getType(), NULL); CallInst *CI = B.CreateCall4(F, CastToCStr(Ptr, B), Size, - ConstantInt::get(TD->getIntPtrType(Context), 1), File); + ConstantInt::get(TD->getIntPtrType(PtrTy), 1), File); if (const Function *Fn = dyn_cast<Function>(F->stripPointerCasts())) CI->setCallingConv(Fn->getCallingConv()); @@ -464,12 +461,13 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, IRBuilder<> B(CI); if (Name == "__memcpy_chk") { + Type *PT = FT->getParamType(0); // Check if this has the right 
signature. if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(PT) || + FT->getParamType(3) != TD->getIntPtrType(PT)) return false; if (isFoldable(3, 2, false)) { @@ -488,11 +486,12 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, if (Name == "__memmove_chk") { // Check if this has the right signature. + Type *PT = FT->getParamType(0); if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(PT) || + FT->getParamType(3) != TD->getIntPtrType(PT)) return false; if (isFoldable(3, 2, false)) { @@ -506,11 +505,12 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, if (Name == "__memset_chk") { // Check if this has the right signature. + Type *PT = FT->getParamType(0); if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(PT) || + FT->getParamType(3) != TD->getIntPtrType(PT)) return false; if (isFoldable(3, 2, false)) { @@ -525,11 +525,12 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, if (Name == "__strcpy_chk" || Name == "__stpcpy_chk") { // Check if this has the right signature. 
+ Type *PT = FT->getParamType(0); if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(PT)) return 0; @@ -551,11 +552,12 @@ bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD, if (Name == "__strncpy_chk" || Name == "__stpncpy_chk") { // Check if this has the right signature. + Type *PT = FT->getParamType(0); if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || !FT->getParamType(2)->isIntegerTy() || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(3) != TD->getIntPtrType(PT)) return false; if (isFoldable(3, 2, false)) { diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index 9729687a83..c09d982d65 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -806,8 +806,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, const DataLayout *TD) { assert(V->getType()->isPointerTy() && "getOrEnforceKnownAlignment expects a pointer!"); - unsigned AS = cast<PointerType>(V->getType())->getAddressSpace(); - unsigned BitWidth = TD ? TD->getPointerSizeInBits(AS) : 64; + unsigned BitWidth = TD ? TD->getTypeSizeInBits(V->getType()) : 64; APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0); ComputeMaskedBits(V, KnownZero, KnownOne, TD); unsigned TrailZ = KnownZero.countTrailingOnes(); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index a008da67e9..870e2b2ade 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -392,7 +392,7 @@ static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) { // This is some kind of pointer constant. 
Turn it into a pointer-sized // ConstantInt if possible. - IntegerType *PtrTy = TD->getIntPtrType(V->getContext()); + IntegerType *PtrTy = TD->getIntPtrType(V->getType()); // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*). if (isa<ConstantPointerNull>(V)) @@ -532,9 +532,13 @@ Value *SimplifyCFGOpt::isValueEqualityComparison(TerminatorInst *TI) { CV = ICI->getOperand(0); // Unwrap any lossless ptrtoint cast. - if (TD && CV && CV->getType() == TD->getIntPtrType(CV->getContext())) - if (PtrToIntInst *PTII = dyn_cast<PtrToIntInst>(CV)) + if (TD && CV) { + PtrToIntInst *PTII = NULL; + if ((PTII = dyn_cast<PtrToIntInst>(CV)) && + CV->getType() == TD->getIntPtrType(CV->getContext(), + PTII->getPointerAddressSpace())) CV = PTII->getOperand(0); + } return CV; } @@ -981,7 +985,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI, // Convert pointer to int before we switch. if (CV->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without DataLayout"); - CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()), + CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getType()), "magicptr"); } @@ -2709,7 +2713,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD, if (CompVal->getType()->isPointerTy()) { assert(TD && "Cannot switch on pointer without DataLayout"); CompVal = Builder.CreatePtrToInt(CompVal, - TD->getIntPtrType(CompVal->getContext()), + TD->getIntPtrType(CompVal->getType()), "magicptr"); } diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index b15acdff63..162b29e829 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -102,14 +102,13 @@ struct MemCpyChkOpt : public InstFortifiedLibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { this->CI = CI; FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context 
= CI->getParent()->getContext(); // Check if this has the right signature. if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)) || + FT->getParamType(3) != TD->getIntPtrType(FT->getParamType(1))) return 0; if (isFoldable(3, 2, false)) { @@ -125,14 +124,13 @@ struct MemMoveChkOpt : public InstFortifiedLibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { this->CI = CI; FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getParent()->getContext(); // Check if this has the right signature. if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isPointerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)) || + FT->getParamType(3) != TD->getIntPtrType(FT->getParamType(1))) return 0; if (isFoldable(3, 2, false)) { @@ -148,14 +146,13 @@ struct MemSetChkOpt : public InstFortifiedLibCallOptimization { virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) { this->CI = CI; FunctionType *FT = Callee->getFunctionType(); - LLVMContext &Context = CI->getParent()->getContext(); // Check if this has the right signature. 
if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) || !FT->getParamType(0)->isPointerTy() || !FT->getParamType(1)->isIntegerTy() || - FT->getParamType(2) != TD->getIntPtrType(Context) || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)) || + FT->getParamType(3) != TD->getIntPtrType(FT->getParamType(0))) return 0; if (isFoldable(3, 2, false)) { @@ -180,7 +177,7 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getReturnType() != FT->getParamType(0) || FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || - FT->getParamType(2) != TD->getIntPtrType(Context)) + FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0))) return 0; Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1); @@ -205,8 +202,8 @@ struct StrCpyChkOpt : public InstFortifiedLibCallOptimization { Value *Ret = EmitMemCpyChk(Dst, Src, - ConstantInt::get(TD->getIntPtrType(Context), Len), - CI->getArgOperand(2), B, TD, TLI); + ConstantInt::get(TD->getIntPtrType(Dst->getType()), + Len), CI->getArgOperand(2), B, TD, TLI); return Ret; } return 0; @@ -225,7 +222,7 @@ struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization { FT->getParamType(0) != FT->getParamType(1) || FT->getParamType(0) != Type::getInt8PtrTy(Context) || !FT->getParamType(2)->isIntegerTy() || - FT->getParamType(3) != TD->getIntPtrType(Context)) + FT->getParamType(3) != TD->getIntPtrType(FT->getParamType(0))) return 0; if (isFoldable(3, 2, false)) { @@ -287,7 +284,8 @@ struct StrCatOpt : public LibCallOptimization { // We have enough information to now generate the memcpy call to do the // concatenation for us. Make a memcpy to copy the nul byte with align = 1. 
B.CreateMemCpy(CpyDst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1); + ConstantInt::get(TD->getIntPtrType(Src->getType()), + Len + 1), 1); return Dst; } }; @@ -359,8 +357,9 @@ struct StrChrOpt : public LibCallOptimization { if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32. return 0; + Type *PT = FT->getParamType(0); return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul. - ConstantInt::get(TD->getIntPtrType(*Context), Len), + ConstantInt::get(TD->getIntPtrType(PT), Len), B, TD, TLI); } @@ -454,8 +453,9 @@ struct StrCmpOpt : public LibCallOptimization { // These optimizations require DataLayout. if (!TD) return 0; + Type *PT = FT->getParamType(0); return EmitMemCmp(Str1P, Str2P, - ConstantInt::get(TD->getIntPtrType(*Context), + ConstantInt::get(TD->getIntPtrType(PT), std::min(Len1, Len2)), B, TD, TLI); } @@ -537,7 +537,7 @@ struct StrCpyOpt : public LibCallOptimization { // We have enough information to now generate the memcpy call to do the // copy for us. Make a memcpy to copy the nul byte with align = 1. B.CreateMemCpy(Dst, Src, - ConstantInt::get(TD->getIntPtrType(*Context), Len), 1); + ConstantInt::get(TD->getIntPtrType(Dst->getType()), Len), 1); return Dst; } }; diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index f944d9b4fc..423c7a4911 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -18,10 +18,13 @@ // // This pass has three parts: // 1. The main loop pass that drives the different parts. -// 2. LoopVectorizationLegality - A helper class that checks for the legality +// 2. LoopVectorizationLegality - A unit that checks for the legality // of the vectorization. -// 3. SingleBlockLoopVectorizer - A helper class that performs the actual +// 3. SingleBlockLoopVectorizer - A unit that performs the actual // widening of instructions. +// 4. 
LoopVectorizationCostModel - A unit that checks for the profitability +// of vectorization. It decides on the optimal vector width, which +// can be one, if vectorization is not profitable. //===----------------------------------------------------------------------===// // // The reduction-variable vectorization is based on the paper: @@ -51,13 +54,14 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AliasSetTracker.h" -#include "llvm/Transforms/Scalar.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" -#include "llvm/Analysis/ValueTracking.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/TargetTransformInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -67,13 +71,14 @@ using namespace llvm; static cl::opt<unsigned> -DefaultVectorizationFactor("default-loop-vectorize-width", - cl::init(4), cl::Hidden, - cl::desc("Set the default loop vectorization width")); +VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, + cl::desc("Set the default vectorization width. Zero is autoselect.")); + namespace { -// Forward declaration. +// Forward declarations. class LoopVectorizationLegality; +class LoopVectorizationCostModel; /// SingleBlockLoopVectorizer vectorizes loops which contain only one basic /// block to a specified vectorization factor (VF). @@ -203,7 +208,10 @@ public: enum ReductionKind { NoReduction = -1, /// Not a reduction. IntegerAdd = 0, /// Sum of numbers. - IntegerMult = 1 /// Product of numbers. + IntegerMult = 1, /// Product of numbers. + IntegerOr = 2, /// Bitwise or logical OR of numbers. 
+ IntegerAnd = 3, /// Bitwise or logical AND of numbers. + IntegerXor = 4 /// Bitwise or logical XOR of numbers. }; /// This POD struct holds information about reduction variables. @@ -229,11 +237,10 @@ public: /// of the reductions that were found in the loop. typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList; - /// Returns the maximum vectorization factor that we *can* use to vectorize - /// this loop. This does not mean that it is profitable to vectorize this - /// loop, only that it is legal to do so. This may be a large number. We - /// can vectorize to any SIMD width below this number. - unsigned getLoopMaxVF(); + /// Returns true if it is legal to vectorize this loop. + /// This does not mean that it is profitable to vectorize this + /// loop, only that it is legal to do so. + bool canVectorize(); /// Returns the Induction variable. PHINode *getInduction() {return Induction;} @@ -259,10 +266,6 @@ private: /// Returns true if BB is vectorizable bool canVectorizeMemory(BasicBlock &BB); - // Check if a pointer value is known to be disjoint. - // Example: Alloca, Global, NoAlias. - bool isIdentifiedSafeObject(Value* Val); - /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); @@ -290,6 +293,48 @@ private: SmallPtrSet<Value*, 4> AllowedExit; }; +/// LoopVectorizationCostModel - estimates the expected speedups due to +/// vectorization. +/// In many cases vectorization is not profitable. This can happen because +/// of a number of reasons. In this class we mainly attempt to predict +/// the expected speedup/slowdowns due to the supported instruction set. +/// We use the VectorTargetTransformInfo to query the different backends +/// for the cost of different operations. +class LoopVectorizationCostModel { +public: + /// C'tor. 
+ LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se, + LoopVectorizationLegality *Leg, + const VectorTargetTransformInfo *Vtti): + TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { } + + /// Returns the most profitable vectorization factor for the loop that is + /// smaller or equal to the VF argument. This method checks every power + /// of two up to VF. + unsigned findBestVectorizationFactor(unsigned VF = 4); + +private: + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. + unsigned expectedCost(unsigned VF); + + /// Returns the execution time cost of an instruction for a given vector + /// width. Vector width of one means scalar. + unsigned getInstructionCost(Instruction *I, unsigned VF); + + /// The loop that we evaluate. + Loop *TheLoop; + /// Scev analysis. + ScalarEvolution *SE; + + /// Vectorization legality. + LoopVectorizationLegality *Legal; + /// Vector target information. + const VectorTargetTransformInfo *VTTI; +}; + struct LoopVectorize : public LoopPass { static char ID; // Pass identification, replacement for typeid @@ -300,6 +345,7 @@ struct LoopVectorize : public LoopPass { ScalarEvolution *SE; DataLayout *DL; LoopInfo *LI; + TargetTransformInfo *TTI; virtual bool runOnLoop(Loop *L, LPPassManager &LPM) { // We only vectorize innermost loops. @@ -309,25 +355,42 @@ struct LoopVectorize : public LoopPass { SE = &getAnalysis<ScalarEvolution>(); DL = getAnalysisIfAvailable<DataLayout>(); LI = &getAnalysis<LoopInfo>(); + TTI = getAnalysisIfAvailable<TargetTransformInfo>(); DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); // Check if it is legal to vectorize the loop. 
LoopVectorizationLegality LVL(L, SE, DL); - unsigned MaxVF = LVL.getLoopMaxVF(); - - // Check that we can vectorize this loop using the chosen vectorization - // width. - if (MaxVF < DefaultVectorizationFactor) { - DEBUG(dbgs() << "LV: non-vectorizable MaxVF ("<< MaxVF << ").\n"); + if (!LVL.canVectorize()) { + DEBUG(dbgs() << "LV: Not vectorizing.\n"); return false; } - DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< MaxVF << ").\n"); + // Select the preferred vectorization factor. + unsigned VF = 1; + if (VectorizationFactor == 0) { + const VectorTargetTransformInfo *VTTI = 0; + if (TTI) + VTTI = TTI->getVectorTargetTransformInfo(); + // Use the cost model. + LoopVectorizationCostModel CM(L, SE, &LVL, VTTI); + VF = CM.findBestVectorizationFactor(); + + if (VF == 1) { + DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); + return false; + } + + } else { + // Use the user command flag. + VF = VectorizationFactor; + } + + DEBUG(dbgs() << "LV: Found a vectorizable loop ("<< VF << ").\n"); // If we decided that it is *legal* to vectorizer the loop then do it. - SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, DefaultVectorizationFactor); + SingleBlockLoopVectorizer LB(L, SE, LI, &LPM, VF); LB.vectorize(&LVL); DEBUG(verifyFunction(*L->getHeader()->getParent())); @@ -660,6 +723,13 @@ void SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal void SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that goes + // into the code below should also be implemented in + // the cost-model. + // + //===------------------------------------------------===// typedef SmallVector<PHINode*, 4> PhiVector; BasicBlock &BB = *OrigLoop->getHeader(); Constant *Zero = ConstantInt::get( @@ -914,14 +984,28 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Extract the first scalar. 
Value *Scalar0 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(0)); - // Extract and sum the remaining vector elements. + // Extract and reduce the remaining vector elements. for (unsigned i=1; i < VF; ++i) { Value *Scalar1 = Builder.CreateExtractElement(NewPhi, Builder.getInt32(i)); - if (RdxDesc.Kind == LoopVectorizationLegality::IntegerAdd) { - Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); - } else { - Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + switch (RdxDesc.Kind) { + case LoopVectorizationLegality::IntegerAdd: + Scalar0 = Builder.CreateAdd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerMult: + Scalar0 = Builder.CreateMul(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerOr: + Scalar0 = Builder.CreateOr(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerAnd: + Scalar0 = Builder.CreateAnd(Scalar0, Scalar1); + break; + case LoopVectorizationLegality::IntegerXor: + Scalar0 = Builder.CreateXor(Scalar0, Scalar1); + break; + default: + llvm_unreachable("Unknown reduction operation"); } } @@ -961,18 +1045,18 @@ void SingleBlockLoopVectorizer::cleanup() { SE->forgetLoop(OrigLoop); } -unsigned LoopVectorizationLegality::getLoopMaxVF() { +bool LoopVectorizationLegality::canVectorize() { if (!TheLoop->getLoopPreheader()) { assert(false && "No preheader!!"); DEBUG(dbgs() << "LV: Loop not normalized." << "\n"); - return 1; + return false; } // We can only vectorize single basic block loops. unsigned NumBlocks = TheLoop->getNumBlocks(); if (NumBlocks != 1) { DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n"); - return 1; + return false; } // We need to have a loop header. @@ -982,22 +1066,22 @@ unsigned LoopVectorizationLegality::getLoopMaxVF() { // Go over each instruction and look at memory deps. if (!canVectorizeBlock(*BB)) { DEBUG(dbgs() << "LV: Can't vectorize this loop header\n"); - return 1; + return false; } // ScalarEvolution needs to be able to find the exit count. 
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB); if (ExitCount == SE->getCouldNotCompute()) { DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n"); - return 1; + return false; } DEBUG(dbgs() << "LV: We can vectorize this loop!\n"); // Okay! We can vectorize. At this point we don't have any other mem analysis - // which may limit our maximum vectorization factor, so just return the - // maximum SIMD size. - return DefaultVectorizationFactor; + // which may limit our maximum vectorization factor, so just return true with + // no restrictions. + return true; } bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { @@ -1032,7 +1116,19 @@ bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) { continue; } if (AddReductionVar(Phi, IntegerMult)) { - DEBUG(dbgs() << "LV: Found an Mult reduction PHI."<< *Phi <<"\n"); + DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerOr)) { + DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerAnd)) { + DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n"); + continue; + } + if (AddReductionVar(Phi, IntegerXor)) { + DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } @@ -1178,7 +1274,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n"); return false; } @@ -1196,7 +1292,7 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { GetUnderlyingObjects(*I, TempObjects, DL); for (ValueVector::iterator it=TempObjects.begin(), e=TempObjects.end(); it != e; ++it) { - if (!isIdentifiedSafeObject(*it)) { + if (!isIdentifiedObject(*it)) { 
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n"); return false; } @@ -1213,19 +1309,6 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) { return true; } -/// Checks if the value is a Global variable or if it is an Arguments -/// marked with the NoAlias attribute. -bool LoopVectorizationLegality::isIdentifiedSafeObject(Value* Val) { - assert(Val && "Invalid value"); - if (isa<GlobalValue>(Val)) - return true; - if (isa<AllocaInst>(Val)) - return true; - if (Argument *A = dyn_cast<Argument>(Val)) - return A->hasNoAliasAttr(); - return false; -} - bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, ReductionKind Kind) { if (Phi->getNumIncomingValues() != 2) @@ -1319,6 +1402,12 @@ LoopVectorizationLegality::isReductionInstr(Instruction *I, case Instruction::UDiv: case Instruction::SDiv: return Kind == IntegerMult; + case Instruction::And: + return Kind == IntegerAnd; + case Instruction::Or: + return Kind == IntegerOr; + case Instruction::Xor: + return Kind == IntegerXor; } } @@ -1340,6 +1429,193 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) { return true; } +unsigned +LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) { + if (!VTTI) { + DEBUG(dbgs() << "LV: No vector target information. Not vectorizing. \n"); + return 1; + } + + float Cost = expectedCost(1); + unsigned Width = 1; + DEBUG(dbgs() << "LV: Scalar loop costs: "<< (int)Cost << ".\n"); + for (unsigned i=2; i <= VF; i*=2) { + // Notice that the vector loop needs to be executed less times, so + // we need to divide the cost of the vector loops by the width of + // the vector elements. 
+ float VectorCost = expectedCost(i) / (float)i; + DEBUG(dbgs() << "LV: Vector loop of width "<< i << " costs: " << + (int)VectorCost << ".\n"); + if (VectorCost < Cost) { + Cost = VectorCost; + Width = i; + } + } + + DEBUG(dbgs() << "LV: Selecting VF = : "<< Width << ".\n"); + return Width; +} + +unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { + // We can only estimate the cost of single basic block loops. + assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop"); + + BasicBlock *BB = TheLoop->getHeader(); + unsigned Cost = 0; + + // For each instruction in the old loop. + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + Instruction *Inst = it; + unsigned C = getInstructionCost(Inst, VF); + Cost += C; + DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF << + " For instruction: "<< *Inst << "\n"); + } + + return Cost; +} + +unsigned +LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { + assert(VTTI && "Invalid vector target transformation info"); + switch (I->getOpcode()) { + case Instruction::GetElementPtr: + return 0; + case Instruction::Br: { + return VTTI->getInstrCost(I->getOpcode()); + } + case Instruction::PHI: + return 0; + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + Type *VTy = VectorType::get(I->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VTy); + } + case Instruction::Select: { + SelectInst *SI = cast<SelectInst>(I); + Type *VTy = VectorType::get(I->getType(), VF); + const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); + bool 
ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); + Type *CondTy = SI->getCondition()->getType(); + if (ScalarCond) + CondTy = VectorType::get(CondTy, VF); + + return VTTI->getInstrCost(I->getOpcode(), VTy, CondTy); + } + case Instruction::ICmp: + case Instruction::FCmp: { + Type *VTy = VectorType::get(I->getOperand(0)->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), VTy); + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(I); + Type *VTy = VectorType::get(SI->getValueOperand()->getType(), VF); + + // Scalarized stores. + if (!Legal->isConsecutiveGep(SI->getPointerOperand())) { + unsigned Cost = 0; + if (VF != 1) { + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, + VTy); + // The cost of extracting from the value vector and pointer vector. + Cost += VF * (ExtCost * 2); + } + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), + VTy->getScalarType(), + SI->getAlignment(), + SI->getPointerAddressSpace()); + return Cost; + } + + // Wide stores. + return VTTI->getMemoryOpCost(I->getOpcode(), VTy, SI->getAlignment(), + SI->getPointerAddressSpace()); + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(I); + Type *VTy = VectorType::get(I->getType(), VF); + + // Scalarized loads. + if (!Legal->isConsecutiveGep(LI->getPointerOperand())) { + unsigned Cost = 0; + if (VF != 1) { + unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + unsigned ExCost = VTTI->getInstrCost(Instruction::ExtractValue, VTy); + + // The cost of inserting the loaded value into the result vector, and + // extracting from a vector of pointers. + Cost += VF * (InCost + ExCost); + } + // The cost of the scalar stores. + Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(), VTy->getScalarType(), + LI->getAlignment(), + LI->getPointerAddressSpace()); + return Cost; + } + + // Wide loads. 
+ return VTTI->getMemoryOpCost(I->getOpcode(), VTy, LI->getAlignment(), + LI->getPointerAddressSpace()); + } + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + Type *SrcTy = VectorType::get(I->getOperand(0)->getType(), VF); + Type *DstTy = VectorType::get(I->getType(), VF); + return VTTI->getInstrCost(I->getOpcode(), DstTy, SrcTy); + } + default: { + // We are scalarizing the instruction. Return the cost of the scalar + // instruction, plus the cost of insert and extract into vector + // elements, times the vector width. + unsigned Cost = 0; + Type *Ty = I->getType(); + + if (!Ty->isVoidTy()) { + Type *VTy = VectorType::get(Ty, VF); + unsigned InsCost = VTTI->getInstrCost(Instruction::InsertElement, VTy); + unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement, VTy); + Cost += VF * (InsCost + ExtCost); + } + + /// We don't have any information on the scalar instruction, but maybe + /// the target has. + /// TODO: This may be a target-specific intrinsic. + /// Need to add API for that. + Cost += VF * VTTI->getInstrCost(I->getOpcode(), Ty); + + return Cost; + } + }// end of switch. 
+} + + } // namespace char LoopVectorize::ID = 0; diff --git a/lib/VMCore/AsmWriter.cpp b/lib/VMCore/AsmWriter.cpp index 5e23e6fc78..b72c17f667 100644 --- a/lib/VMCore/AsmWriter.cpp +++ b/lib/VMCore/AsmWriter.cpp @@ -74,6 +74,7 @@ static void PrintCallingConv(unsigned cc, raw_ostream &Out) case CallingConv::X86_StdCall: Out << "x86_stdcallcc"; break; case CallingConv::X86_FastCall: Out << "x86_fastcallcc"; break; case CallingConv::X86_ThisCall: Out << "x86_thiscallcc"; break; + case CallingConv::Intel_OCL_BI: Out << "intel_ocl_bicc"; break; case CallingConv::ARM_APCS: Out << "arm_apcscc"; break; case CallingConv::ARM_AAPCS: Out << "arm_aapcscc"; break; case CallingConv::ARM_AAPCS_VFP:Out << "arm_aapcs_vfpcc"; break; diff --git a/lib/VMCore/DataLayout.cpp b/lib/VMCore/DataLayout.cpp index e6994be257..104e5da057 100644 --- a/lib/VMCore/DataLayout.cpp +++ b/lib/VMCore/DataLayout.cpp @@ -524,6 +524,14 @@ std::string DataLayout::getStringRepresentation() const { return OS.str(); } +unsigned DataLayout::getPointerTypeSizeInBits(Type *Ty) const +{ + if (Ty->isPointerTy()) return getTypeSizeInBits(Ty); + if (Ty->isVectorTy() + && cast<VectorType>(Ty)->getElementType()->isPointerTy()) + return getTypeSizeInBits(cast<VectorType>(Ty)->getElementType()); + return getPointerSizeInBits(0); +} uint64_t DataLayout::getTypeSizeInBits(Type *Ty) const { assert(Ty->isSized() && "Cannot getTypeInfo() on a type that is unsized!"); @@ -660,13 +668,32 @@ unsigned DataLayout::getPreferredTypeAlignmentShift(Type *Ty) const { return Log2_32(Align); } -/// getIntPtrType - Return an unsigned integer type that is the same size or -/// greater to the host pointer size. +/// getIntPtrType - Return an integer type that is the same size or +/// greater to the pointer size for the address space. 
IntegerType *DataLayout::getIntPtrType(LLVMContext &C, unsigned AddressSpace) const { return IntegerType::get(C, getPointerSizeInBits(AddressSpace)); } +/// getIntPtrType - Return an integer type that is the same size or +/// greater to the pointer size of the specific PointerType. +IntegerType *DataLayout::getIntPtrType(Type *Ty) const { + LLVMContext &C = Ty->getContext(); + // For pointers, we return the size for the specific address space. + if (Ty->isPointerTy()) return IntegerType::get(C, getTypeSizeInBits(Ty)); + // For vector of pointers, we return the size of the address space + // of the pointer type. + if (Ty->isVectorTy() && cast<VectorType>(Ty)->getElementType()->isPointerTy()) + return IntegerType::get(C, + getTypeSizeInBits(cast<VectorType>(Ty)->getElementType())); + // Otherwise return the address space for the default address space. + // An example of this occurring is that you want to get the IntPtr + // for all of the arguments in a function. However, the IntPtr + // for a non-pointer type cannot be determined by the type, so + // the default value is used. 
+ return getIntPtrType(C, 0); +} + uint64_t DataLayout::getIndexedOffset(Type *ptrTy, ArrayRef<Value *> Indices) const { diff --git a/lib/VMCore/Instructions.cpp b/lib/VMCore/Instructions.cpp index 13c4a5d257..e9b96d6cd2 100644 --- a/lib/VMCore/Instructions.cpp +++ b/lib/VMCore/Instructions.cpp @@ -2120,6 +2120,17 @@ bool CastInst::isNoopCast(Type *IntPtrTy) const { return isNoopCast(getOpcode(), getOperand(0)->getType(), getType(), IntPtrTy); } +/// @brief Determine if a cast is a no-op +bool CastInst::isNoopCast(const DataLayout &DL) const { + unsigned AS = 0; + if (getOpcode() == Instruction::PtrToInt) + AS = getOperand(0)->getType()->getPointerAddressSpace(); + else if (getOpcode() == Instruction::IntToPtr) + AS = getType()->getPointerAddressSpace(); + Type *IntPtrTy = DL.getIntPtrType(getContext(), AS); + return isNoopCast(getOpcode(), getOperand(0)->getType(), getType(), IntPtrTy); +} + /// This function determines if a pair of casts can be eliminated and what /// opcode should be used in the elimination. 
This assumes that there are two /// instructions like this: diff --git a/lib/VMCore/Type.cpp b/lib/VMCore/Type.cpp index 1a7a650989..54146e118c 100644 --- a/lib/VMCore/Type.cpp +++ b/lib/VMCore/Type.cpp @@ -233,7 +233,12 @@ unsigned Type::getVectorNumElements() const { } unsigned Type::getPointerAddressSpace() const { - return cast<PointerType>(this)->getAddressSpace(); + if (isPointerTy()) + return cast<PointerType>(this)->getAddressSpace(); + if (isVectorTy()) + return getSequentialElementType()->getPointerAddressSpace(); + llvm_unreachable("Should never reach here!"); + return 0; } diff --git a/lib/VMCore/User.cpp b/lib/VMCore/User.cpp index 5f35ce4b9a..e847ce6ee5 100644 --- a/lib/VMCore/User.cpp +++ b/lib/VMCore/User.cpp @@ -10,6 +10,7 @@ #include "llvm/Constant.h" #include "llvm/GlobalValue.h" #include "llvm/User.h" +#include "llvm/Operator.h" namespace llvm { @@ -78,4 +79,12 @@ void User::operator delete(void *Usr) { ::operator delete(Storage); } +//===----------------------------------------------------------------------===// +// Operator Class +//===----------------------------------------------------------------------===// + +Operator::~Operator() { + llvm_unreachable("should never destroy an Operator"); +} + } // End llvm namespace diff --git a/lib/VMCore/Verifier.cpp b/lib/VMCore/Verifier.cpp index fd629b485a..eb40b09d29 100644 --- a/lib/VMCore/Verifier.cpp +++ b/lib/VMCore/Verifier.cpp @@ -705,6 +705,7 @@ void Verifier::visitFunction(Function &F) { case CallingConv::Cold: case CallingConv::X86_FastCall: case CallingConv::X86_ThisCall: + case CallingConv::Intel_OCL_BI: case CallingConv::PTX_Kernel: case CallingConv::PTX_Device: Assert1(!F.isVarArg(), diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt index a9d2af6ad2..36751cd31d 100644 --- a/projects/CMakeLists.txt +++ b/projects/CMakeLists.txt @@ -14,8 +14,6 @@ endforeach(entry) # Also add in the compiler-rt tree if present and we have a sufficiently # recent version of CMake. 
if(${CMAKE_VERSION} VERSION_GREATER 2.8.7 AND - ${LLVM_BUILD_RUNTIME} AND - IS_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt AND - EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt/CMakeLists.txt) - add_subdirectory(compiler-rt) + ${LLVM_BUILD_RUNTIME}) + add_llvm_external_project(compiler-rt) endif() diff --git a/test/CodeGen/ARM/carry.ll b/test/CodeGen/ARM/carry.ll index f84774d9b6..bf51cd627b 100644 --- a/test/CodeGen/ARM/carry.ll +++ b/test/CodeGen/ARM/carry.ll @@ -45,3 +45,16 @@ entry: %0 = sub nsw i64 0, %x ret i64 %0 } + +; rdar://12559385 +define i64 @f5(i32 %vi) { +entry: +; CHECK: f5: +; CHECK: movw [[REG:r[0-9]+]], #36102 +; CHECK: sbc r{{[0-9]+}}, r{{[0-9]+}}, [[REG]] + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 +} diff --git a/test/CodeGen/Mips/mips64-sret.ll b/test/CodeGen/Mips/mips64-sret.ll index 498c5fe174..e26b0223b4 100644 --- a/test/CodeGen/Mips/mips64-sret.ll +++ b/test/CodeGen/Mips/mips64-sret.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O0 < %s +; RUN: llc -march=mips64el -mcpu=mips64r2 -mattr=n64 -O3 < %s | FileCheck %s %struct.S = type { [8 x i32] } @@ -6,6 +6,8 @@ define void @f(%struct.S* noalias sret %agg.result) nounwind { entry: +; CHECK: daddu $2, $zero, $4 + %0 = bitcast %struct.S* %agg.result to i8* call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* bitcast (%struct.S* @g to i8*), i64 32, i32 4, i1 false) ret void diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll new file mode 100644 index 0000000000..62aa7cf929 --- /dev/null +++ b/test/CodeGen/PowerPC/jaggedstructs.ll @@ -0,0 +1,48 @@ +; RUN: llc -mcpu=pwr7 -O0 < %s | FileCheck %s + +; This tests receiving and re-passing parameters consisting of structures +; of size 3, 5, 6, and 7. They are to be found/placed right-adjusted in +; the parameter registers. 
+ +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-unknown-linux-gnu" + +%struct.S3 = type { [3 x i8] } +%struct.S5 = type { [5 x i8] } +%struct.S6 = type { [6 x i8] } +%struct.S7 = type { [7 x i8] } + +define void @test(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7) nounwind { +entry: + call void @check(%struct.S3* byval %s3, %struct.S5* byval %s5, %struct.S6* byval %s6, %struct.S7* byval %s7) + ret void +} + +; CHECK: std 6, 216(1) +; CHECK: std 5, 208(1) +; CHECK: std 4, 200(1) +; CHECK: std 3, 192(1) +; CHECK: lbz {{[0-9]+}}, 199(1) +; CHECK: stb {{[0-9]+}}, 55(1) +; CHECK: lhz {{[0-9]+}}, 197(1) +; CHECK: sth {{[0-9]+}}, 53(1) +; CHECK: lbz {{[0-9]+}}, 207(1) +; CHECK: stb {{[0-9]+}}, 63(1) +; CHECK: lwz {{[0-9]+}}, 203(1) +; CHECK: stw {{[0-9]+}}, 59(1) +; CHECK: lhz {{[0-9]+}}, 214(1) +; CHECK: sth {{[0-9]+}}, 70(1) +; CHECK: lwz {{[0-9]+}}, 210(1) +; CHECK: stw {{[0-9]+}}, 66(1) +; CHECK: lbz {{[0-9]+}}, 223(1) +; CHECK: stb {{[0-9]+}}, 79(1) +; CHECK: lhz {{[0-9]+}}, 221(1) +; CHECK: sth {{[0-9]+}}, 77(1) +; CHECK: lwz {{[0-9]+}}, 217(1) +; CHECK: stw {{[0-9]+}}, 73(1) +; CHECK: ld 6, 72(1) +; CHECK: ld 5, 64(1) +; CHECK: ld 4, 56(1) +; CHECK: ld 3, 48(1) + +declare void @check(%struct.S3* byval, %struct.S5* byval, %struct.S6* byval, %struct.S7* byval) diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll index 43ba13b426..ef706af95d 100644 --- a/test/CodeGen/PowerPC/structsinregs.ll +++ b/test/CodeGen/PowerPC/structsinregs.ll @@ -188,17 +188,13 @@ entry: %add13 = add nsw i32 %add11, %6 ret i32 %add13 -; CHECK: sldi 9, 9, 8 -; CHECK: sldi 8, 8, 16 -; CHECK: sldi 7, 7, 24 -; CHECK: sldi 5, 5, 40 -; CHECK: stw 6, 76(1) -; CHECK: sth 4, 62(1) -; CHECK: stb 3, 55(1) ; CHECK: std 9, 96(1) ; CHECK: std 8, 88(1) ; CHECK: std 7, 80(1) +; CHECK: stw 6, 76(1) ; CHECK: std 5, 64(1) +; 
CHECK: sth 4, 62(1) +; CHECK: stb 3, 55(1) ; CHECK: lbz {{[0-9]+}}, 85(1) ; CHECK: lbz {{[0-9]+}}, 86(1) ; CHECK: lbz {{[0-9]+}}, 83(1) diff --git a/test/CodeGen/Thumb2/carry.ll b/test/CodeGen/Thumb2/carry.ll index de6f6e260d..85b4370fa5 100644 --- a/test/CodeGen/Thumb2/carry.ll +++ b/test/CodeGen/Thumb2/carry.ll @@ -20,3 +20,16 @@ entry: %tmp2 = sub i64 %tmp1, %b ret i64 %tmp2 } + +; rdar://12559385 +define i64 @f3(i32 %vi) { +entry: +; CHECK: f3: +; CHECK: movw [[REG:r[0-9]+]], #36102 +; CHECK: sbcs r{{[0-9]+}}, [[REG]] + %v0 = zext i32 %vi to i64 + %v1 = xor i64 %v0, -155057456198619 + %v4 = add i64 %v1, 155057456198619 + %v5 = add i64 %v4, %v1 + ret i64 %v5 +} diff --git a/test/CodeGen/X86/2012-01-18-vbitcast.ll b/test/CodeGen/X86/2012-01-18-vbitcast.ll index 8a3ccc8dfd..3ce7db6e41 100644 --- a/test/CodeGen/X86/2012-01-18-vbitcast.ll +++ b/test/CodeGen/X86/2012-01-18-vbitcast.ll @@ -2,8 +2,8 @@ ;CHECK: vcast define <2 x i32> @vcast(<2 x float> %a, <2 x float> %b) { -;CHECK: pshufd -;CHECK: pshufd +;CHECK: pmovzxdq +;CHECK: pmovzxdq %af = bitcast <2 x float> %a to <2 x i32> %bf = bitcast <2 x float> %b to <2 x i32> %x = sub <2 x i32> %af, %bf diff --git a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll index fec17e9f4a..c4b307e5a5 100644 --- a/test/CodeGen/X86/2012-03-15-build_vector_wl.ll +++ b/test/CodeGen/X86/2012-03-15-build_vector_wl.ll @@ -4,7 +4,7 @@ define <4 x i8> @build_vector_again(<16 x i8> %in) nounwind readnone { entry: %out = shufflevector <16 x i8> %in, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3> -; CHECK: shufb +; CHECK: pmovzxbd ret <4 x i8> %out ; CHECK: ret } diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll index 906b748fa4..4abdded38d 100644 --- a/test/CodeGen/X86/2012-07-10-extload64.ll +++ b/test/CodeGen/X86/2012-07-10-extload64.ll @@ -3,7 +3,7 @@ ; CHECK: load_store define void @load_store(<4 x i16>* %in) { entry: -; CHECK: movsd +; 
CHECK: pmovzxwd %A27 = load <4 x i16>* %in, align 4 %A28 = add <4 x i16> %A27, %A27 ; CHECK: movlpd @@ -27,6 +27,6 @@ define <2 x i32> @load_64(<2 x i32>* %ptr) { BB: %t = load <2 x i32>* %ptr ret <2 x i32> %t -;CHECK: movsd +;CHECK: pmovzxdq ;CHECK: ret } diff --git a/test/CodeGen/X86/avx-intel-ocl.ll b/test/CodeGen/X86/avx-intel-ocl.ll new file mode 100644 index 0000000000..1446b36a0f --- /dev/null +++ b/test/CodeGen/X86/avx-intel-ocl.ll @@ -0,0 +1,107 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck -check-prefix=NOT_WIN %s + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare <16 x float> @func_float16(<16 x float>, <16 x float>) +; WIN64: testf16_inp +; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN64: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN64: leaq {{.*}}(%rsp), %rcx +; WIN64: call +; WIN64: ret + +; WIN32: testf16_inp +; WIN32: movl %eax, (%esp) +; WIN32: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN32: vaddps {{.*}}, {{%ymm[0-1]}} +; WIN32: call +; WIN32: ret + +; NOT_WIN: testf16_inp +; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}} +; NOT_WIN: vaddps {{.*}}, {{%ymm[0-1]}} +; NOT_WIN: leaq {{.*}}(%rsp), %rdi +; NOT_WIN: call +; NOT_WIN: ret + +;test calling conventions - input parameters +define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %2, %1 + ret <16 x float> %3 +} + +;test calling conventions - preserved registers + +; preserved ymm6-ymm15 +; WIN64: testf16_regs +; WIN64: call +; WIN64: vaddps {{%ymm[6-7]}}, %ymm0, %ymm0 +; WIN64: vaddps {{%ymm[6-7]}}, %ymm1, 
%ymm1 +; WIN64: ret + +; preserved ymm8-ymm15 +; NOT_WIN: testf16_regs +; NOT_WIN: call +; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm0, %ymm0 +; NOT_WIN: vaddps {{%ymm[8-9]}}, %ymm1, %ymm1 +; NOT_WIN: ret + +define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %1, %b + %4 = fadd <16 x float> %2, %3 + ret <16 x float> %4 +} + +; test calling conventions - prolog and epilog +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: vmovaps {{%ymm([6-9]|1[0-5])}}, {{.*(%rsp).*}} # 32-byte Spill +; WIN64: call +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, 
{{%ymm([6-9]|1[0-5])}} # 32-byte Reload +; WIN64: vmovaps {{.*(%rsp).*}}, {{%ymm([6-9]|1[0-5])}} # 32-byte Reload + +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: vmovaps {{%ymm([8-9]|1[0-5])}}, {{.*}}(%rbp) ## 32-byte Spill +; NOT_WIN: call +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +; NOT_WIN: vmovaps {{.*}}(%rbp), {{%ymm([8-9]|1[0-5])}} ## 32-byte Reload +define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { + %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) + ret <16 x float> %c +} diff --git a/test/CodeGen/X86/cvtv2f32.ll b/test/CodeGen/X86/cvtv2f32.ll new file mode 100644 index 0000000000..466b096067 --- /dev/null +++ b/test/CodeGen/X86/cvtv2f32.ll @@ -0,0 +1,25 @@ +; RUN: llc < %s -mtriple=i686-linux-pc -mcpu=corei7 | FileCheck %s + +define <2 x float> @foo(i32 %x, i32 %y, <2 x float> %v) { + %t1 = uitofp i32 %x to float + %t2 = insertelement <2 x float> undef, float %t1, i32 0 + %t3 = uitofp i32 %y to float + %t4 = insertelement <2 x float> 
%t2, float %t3, i32 1 + %t5 = fmul <2 x float> %v, %t4 + ret <2 x float> %t5 +; CHECK: foo +; CHECK: or +; CHECK: subpd +; CHECK: cvtpd2ps +; CHECK: ret +} + +define <2 x float> @bar(<2 x i32> %in) { + %r = uitofp <2 x i32> %in to <2 x float> + ret <2 x float> %r +; CHECK: bar +; CHECK: or +; CHECK: subpd +; CHECK: cvtpd2ps +; CHECK: ret +} diff --git a/test/CodeGen/X86/fast-cc-callee-pops.ll b/test/CodeGen/X86/fast-cc-callee-pops.ll index ea10897c73..2c5b80ac4a 100644 --- a/test/CodeGen/X86/fast-cc-callee-pops.ll +++ b/test/CodeGen/X86/fast-cc-callee-pops.ll @@ -2,12 +2,12 @@ ; Check that a fastcc function pops its stack variables before returning. -define x86_fastcallcc void @func(i64 %X, i64 %Y, float %G, double %Z) nounwind { +define x86_fastcallcc void @func(i64 inreg %X, i64 %Y, float %G, double %Z) nounwind { ret void ; CHECK: ret{{.*}}20 } -define x86_thiscallcc void @func2(i32 %X, i64 %Y, float %G, double %Z) nounwind { +define x86_thiscallcc void @func2(i32 inreg %X, i64 %Y, float %G, double %Z) nounwind { ret void ; CHECK: ret{{.*}}20 } diff --git a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll index 14cb136f89..d591f9408b 100644 --- a/test/CodeGen/X86/fast-cc-merge-stack-adj.ll +++ b/test/CodeGen/X86/fast-cc-merge-stack-adj.ll @@ -3,7 +3,7 @@ target triple = "i686-pc-linux-gnu" -declare x86_fastcallcc void @func(i32*, i64) +declare x86_fastcallcc void @func(i32*, i64 inreg) define x86_fastcallcc void @caller(i32, i64) { %X = alloca i32 ; <i32*> [#uses=1] diff --git a/test/CodeGen/X86/fast-cc-pass-in-regs.ll b/test/CodeGen/X86/fast-cc-pass-in-regs.ll index a96e5043fe..b60b68bd38 100644 --- a/test/CodeGen/X86/fast-cc-pass-in-regs.ll +++ b/test/CodeGen/X86/fast-cc-pass-in-regs.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | FileCheck %s ; check that fastcc is passing stuff in regs. 
-declare x86_fastcallcc i64 @callee(i64) +declare x86_fastcallcc i64 @callee(i64 inreg) define i64 @caller() { %X = call x86_fastcallcc i64 @callee( i64 4294967299 ) ; <i64> [#uses=1] @@ -9,7 +9,7 @@ define i64 @caller() { ret i64 %X } -define x86_fastcallcc i64 @caller2(i64 %X) { +define x86_fastcallcc i64 @caller2(i64 inreg %X) { ret i64 %X ; CHECK: mov{{.*}}EAX, ECX } diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll index 59efa8d547..24d28adda8 100644 --- a/test/CodeGen/X86/ms-inline-asm.ll +++ b/test/CodeGen/X86/ms-inline-asm.ll @@ -38,3 +38,26 @@ entry: ; CHECK: .att_syntax ; CHECK: {{## InlineAsm End|#NO_APP}} } + +%struct.t18_type = type { i32, i32 } + +define i32 @t18() nounwind { +entry: + %foo = alloca %struct.t18_type, align 4 + %a = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 0 + store i32 1, i32* %a, align 4 + %b = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1 + store i32 2, i32* %b, align 4 + call void asm sideeffect inteldialect "lea ebx, foo\0A\09mov eax, [ebx].0\0A\09mov [ebx].4, ecx", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind + %b1 = getelementptr inbounds %struct.t18_type* %foo, i32 0, i32 1 + %0 = load i32* %b1, align 4 + ret i32 %0 +; CHECK: t18 +; CHECK: {{## InlineAsm Start|#APP}} +; CHECK: .intel_syntax +; CHECK: lea ebx, foo +; CHECK: mov eax, [ebx].0 +; CHECK: mov [ebx].4, ecx +; CHECK: .att_syntax +; CHECK: {{## InlineAsm End|#NO_APP}} +} diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll index 800fbedb4f..58423d1959 100644 --- a/test/CodeGen/X86/pointer-vector.ll +++ b/test/CodeGen/X86/pointer-vector.ll @@ -81,8 +81,7 @@ define <4 x i32*> @INT2PTR1(<4 x i8>* %p) nounwind { entry: %G = load <4 x i8>* %p ;CHECK: movl -;CHECK: movd -;CHECK: pshufb +;CHECK: pmovzxbd ;CHECK: pand %K = inttoptr <4 x i8> %G to <4 x i32*> ;CHECK: ret @@ -105,7 +104,7 @@ define <2 x i32*> @BITCAST1(<2 x i8*>* %p) nounwind { entry: %G = load <2 x i8*>* %p ;CHECK: 
movl -;CHECK: movsd +;CHECK: pmovzxdq %T = bitcast <2 x i8*> %G to <2 x i32*> ;CHECK: ret ret <2 x i32*> %T diff --git a/test/CodeGen/X86/pr14161.ll b/test/CodeGen/X86/pr14161.ll new file mode 100644 index 0000000000..ff4532eac3 --- /dev/null +++ b/test/CodeGen/X86/pr14161.ll @@ -0,0 +1,38 @@ +; RUN: llc < %s -mtriple=x86_64-linux-pc -mcpu=corei7 | FileCheck %s + +declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) + +define <2 x i16> @good(<4 x i32>*, <4 x i8>*) { +entry: + %2 = load <4 x i32>* %0, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) + %4 = extractelement <4 x i32> %3, i32 0 + %5 = extractelement <4 x i32> %3, i32 1 + %6 = extractelement <4 x i32> %3, i32 2 + %7 = extractelement <4 x i32> %3, i32 3 + %8 = bitcast i32 %4 to <2 x i16> + %9 = bitcast i32 %5 to <2 x i16> + ret <2 x i16> %8 +; CHECK: good +; CHECK: pminud +; CHECK-NEXT: pmovzxwq +; CHECK: ret +} + +define <2 x i16> @bad(<4 x i32>*, <4 x i8>*) { +entry: + %2 = load <4 x i32>* %0, align 16 + %3 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %2, <4 x i32> <i32 127, i32 127, i32 127, i32 127>) + %4 = extractelement <4 x i32> %3, i32 0 + %5 = extractelement <4 x i32> %3, i32 1 + %6 = extractelement <4 x i32> %3, i32 2 + %7 = extractelement <4 x i32> %3, i32 3 + %8 = bitcast i32 %4 to <2 x i16> + %9 = bitcast i32 %5 to <2 x i16> + ret <2 x i16> %9 +; CHECK: bad +; CHECK: pminud +; CHECK: pextrd +; CHECK: pmovzxwq +; CHECK: ret +} diff --git a/test/CodeGen/X86/promote.ll b/test/CodeGen/X86/promote.ll index 8b30dc718b..283f48cd37 100644 --- a/test/CodeGen/X86/promote.ll +++ b/test/CodeGen/X86/promote.ll @@ -20,7 +20,7 @@ entry: ; CHECK: shuff_f define i32 @shuff_f(<4 x i8>* %A) { entry: -; CHECK: pshufb +; CHECK: pmovzxbd ; CHECK: paddd ; CHECK: pshufb %0 = load <4 x i8>* %A, align 8 diff --git a/test/CodeGen/X86/sse-intel-ocl.ll b/test/CodeGen/X86/sse-intel-ocl.ll new file mode 100644 index 0000000000..188505072f --- 
/dev/null +++ b/test/CodeGen/X86/sse-intel-ocl.ll @@ -0,0 +1,93 @@ +; RUN: llc < %s -mtriple=i386-pc-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN32 %s +; RUN: llc < %s -mtriple=x86_64-win32 -mcpu=nehalem | FileCheck -check-prefix=WIN64 %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=nehalem | FileCheck -check-prefix=NOT_WIN %s + +declare <16 x float> @func_float16_ptr(<16 x float>, <16 x float> *) +declare <16 x float> @func_float16(<16 x float>, <16 x float>) +; WIN64: testf16_inp +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: addps {{.*}}, {{%xmm[0-3]}} +; WIN64: leaq {{.*}}(%rsp), %rcx +; WIN64: call +; WIN64: ret + +; WIN32: testf16_inp +; WIN32: movl %eax, (%esp) +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: addps {{.*}}, {{%xmm[0-3]}} +; WIN32: call +; WIN32: ret + +; NOT_WIN: testf16_inp +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: addps {{.*}}, {{%xmm[0-3]}} +; NOT_WIN: leaq {{.*}}(%rsp), %rdi +; NOT_WIN: call +; NOT_WIN: ret + +;test calling conventions - input parameters +define <16 x float> @testf16_inp(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %2, %1 + ret <16 x float> %3 +} + +;test calling conventions - preserved registers + +; preserved xmm6-xmm15 +; WIN64: testf16_regs +; WIN64: call +; WIN64: addps {{%xmm[6-9]}}, {{.*}} +; WIN64: addps {{%xmm[6-9]}}, {{.*}} +; WIN64: ret + +; preserved xmm8-xmm15 +; NOT_WIN: testf16_regs +; NOT_WIN: call +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, 
{{.*}} +; NOT_WIN: addps {{%xmm([8-9]|1[0-1])}}, {{.*}} +; NOT_WIN: ret + +define <16 x float> @testf16_regs(<16 x float> %a, <16 x float> %b) nounwind { + %y = alloca <16 x float>, align 16 + %x = fadd <16 x float> %a, %b + %1 = call intel_ocl_bicc <16 x float> @func_float16_ptr(<16 x float> %x, <16 x float>* %y) + %2 = load <16 x float>* %y, align 16 + %3 = fadd <16 x float> %1, %b + %4 = fadd <16 x float> %2, %3 + ret <16 x float> %4 +} + +; test calling conventions - prolog and epilog +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: movaps {{%xmm([8-9]|1[0-5])}}, {{.*(%rsp).*}} ## 16-byte Spill +; NOT_WIN: call +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +; NOT_WIN: movaps {{.*(%rsp).*}}, {{%xmm([8-9]|1[0-5])}} ## 16-byte Reload +define intel_ocl_bicc <16 x float> @test_prolog_epilog(<16 x float> %a, <16 x float> %b) nounwind { + %c = call <16 x float> @func_float16(<16 x float> %a, <16 x float> %b) + ret <16 x float> %c +} diff --git a/test/CodeGen/X86/trunc-ext-ld-st.ll 
b/test/CodeGen/X86/trunc-ext-ld-st.ll index 9877d7be16..1d22a185de 100644 --- a/test/CodeGen/X86/trunc-ext-ld-st.ll +++ b/test/CodeGen/X86/trunc-ext-ld-st.ll @@ -2,8 +2,7 @@ ;CHECK: load_2_i8 ; A single 16-bit load -;CHECK: movzwl -;CHECK: pshufb +;CHECK: pmovzxbq ;CHECK: paddq ;CHECK: pshufb ; A single 16-bit store @@ -19,8 +18,7 @@ define void @load_2_i8(<2 x i8>* %A) { ;CHECK: load_2_i16 ; Read 32-bits -;CHECK: movd -;CHECK: pshufb +;CHECK: pmovzxwq ;CHECK: paddq ;CHECK: pshufb ;CHECK: movd @@ -33,7 +31,7 @@ define void @load_2_i16(<2 x i16>* %A) { } ;CHECK: load_2_i32 -;CHECK: pshufd +;CHECK: pmovzxdq ;CHECK: paddq ;CHECK: pshufd ;CHECK: ret @@ -45,8 +43,7 @@ define void @load_2_i32(<2 x i32>* %A) { } ;CHECK: load_4_i8 -;CHECK: movd -;CHECK: pshufb +;CHECK: pmovzxbd ;CHECK: paddd ;CHECK: pshufb ;CHECK: ret @@ -58,7 +55,7 @@ define void @load_4_i8(<4 x i8>* %A) { } ;CHECK: load_4_i16 -;CHECK: punpcklwd +;CHECK: pmovzxwd ;CHECK: paddd ;CHECK: pshufb ;CHECK: ret @@ -70,7 +67,7 @@ define void @load_4_i16(<4 x i16>* %A) { } ;CHECK: load_8_i8 -;CHECK: punpcklbw +;CHECK: pmovzxbw ;CHECK: paddw ;CHECK: pshufb ;CHECK: ret diff --git a/test/CodeGen/X86/vec_compare-2.ll b/test/CodeGen/X86/vec_compare-2.ll index 46d6a23554..4da79538db 100644 --- a/test/CodeGen/X86/vec_compare-2.ll +++ b/test/CodeGen/X86/vec_compare-2.ll @@ -10,8 +10,7 @@ define void @blackDespeckle_wrapper(i8** %args_list, i64* %gtid, i64 %xend) { entry: ; CHECK: cfi_def_cfa_offset ; CHECK-NOT: set -; CHECK: punpcklwd -; CHECK: pshufd +; CHECK: pmovzxwq ; CHECK: pshufb %shr.i = ashr <4 x i32> zeroinitializer, <i32 3, i32 3, i32 3, i32 3> ; <<4 x i32>> [#uses=1] %cmp318.i = sext <4 x i1> zeroinitializer to <4 x i32> ; <<4 x i32>> [#uses=1] diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll index 79aa000502..224898c1a3 100644 --- a/test/CodeGen/X86/widen_load-2.ll +++ b/test/CodeGen/X86/widen_load-2.ll @@ -170,7 +170,7 @@ define void @add31i8(%i8vec31* nocapture sret %ret, 
%i8vec31* %ap, %i8vec31* %bp ; CHECK: rot %i8vec3pack = type { <3 x i8>, i8 } define %i8vec3pack @rot() nounwind { -; CHECK: movd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}} +; CHECK: pmovzxbd {{-?[0-9]+}}(%rsp), {{%xmm[0-9]}} entry: %X = alloca %i8vec3pack, align 4 %rot = alloca %i8vec3pack, align 4 diff --git a/test/MC/PowerPC/lit.local.cfg b/test/MC/PowerPC/lit.local.cfg new file mode 100644 index 0000000000..88488cdd04 --- /dev/null +++ b/test/MC/PowerPC/lit.local.cfg @@ -0,0 +1,5 @@ +config.suffixes = ['.ll', '.c', '.cpp', '.s'] + +targets = set(config.root.targets_to_build.split()) +if not 'PowerPC' in targets: + config.unsupported = True diff --git a/test/MC/PowerPC/ppc64-relocs-01.ll b/test/MC/PowerPC/ppc64-relocs-01.ll new file mode 100644 index 0000000000..5996af84f4 --- /dev/null +++ b/test/MC/PowerPC/ppc64-relocs-01.ll @@ -0,0 +1,66 @@ +;; RUN: llc -mtriple=powerpc64-unknown-linux-gnu -O3 \ +;; RUN: -filetype=obj %s -o - | \ +;; RUN: elf-dump --dump-section-data | FileCheck %s + +;; FIXME: this file need to be in .s form, change when asm parse is done. + +@number64 = global i64 10, align 8 + +define i64 @access_int64(i64 %a) nounwind readonly { +entry: + %0 = load i64* @number64, align 8 + %cmp = icmp eq i64 %0, %a + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +declare double @sin(double) nounwind + +define double @test_branch24 (double %x) nounwind readonly { +entry: + %add = call double @sin(double %x) nounwind + ret double %add +} + +;; The relocations in .rela.text are the 'number64' load using a +;; R_PPC64_TOC16_DS against the .toc and the 'sin' external function +;; address using a R_PPC64_REL24 +;; CHECK: '.rela.text' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000006 +;; CHECK-NEXT: 'r_type', 0x0000003f +;; CHECK: Relocation 1 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x0000000a +;; CHECK-NEXT: 'r_type', 0x0000000a + +;; The .opd entry for the 'access_int64' function creates 2 relocations: +;; 1. 
A R_PPC64_ADDR64 against the .text segment plus addend (the function +; address itself); +;; 2. And a R_PPC64_TOC against no symbol (the linker will replace for the +;; module's TOC base). +;; CHECK: '.rela.opd' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000002 +;; CHECK-NEXT: 'r_type', 0x00000026 +;; CHECK: Relocation 1 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000000 +;; CHECK-NEXT: 'r_type', 0x00000033 + +;; Finally the TOC creates the relocation for the 'number64'. +;; CHECK: '.rela.toc' +;; CHECK: Relocation 0 +;; CHECK-NEXT: 'r_offset', +;; CHECK-NEXT: 'r_sym', 0x00000008 +;; CHECK-NEXT: 'r_type', 0x00000026 + +;; Check if the relocation references are for correct symbols. +;; CHECK: Symbol 7 +;; CHECK-NEXT: 'access_int64' +;; CHECK: Symbol 8 +;; CHECK-NEXT: 'number64' +;; CHECK: Symbol 10 +;; CHECK-NEXT: 'sin' diff --git a/test/MC/X86/x86-32-ms-inline-asm.s b/test/MC/X86/x86-32-ms-inline-asm.s new file mode 100644 index 0000000000..a5e80b2c93 --- /dev/null +++ b/test/MC/X86/x86-32-ms-inline-asm.s @@ -0,0 +1,10 @@ +// RUN: llvm-mc -x86-asm-syntax=intel -triple i386-unknown-unknown --show-encoding %s | FileCheck %s + +mov eax, [ebx].0 +mov [ebx].4, ecx + +// CHECK: movl (%ebx), %eax +// CHECK: encoding: [0x8b,0x03] +// CHECK: movl %ecx, 4(%ebx) +// CHECK: encoding: [0x89,0x4b,0x04] + diff --git a/test/Other/multi-pointer-size.ll b/test/Other/multi-pointer-size.ll new file mode 100644 index 0000000000..95fa54b8f2 --- /dev/null +++ b/test/Other/multi-pointer-size.ll @@ -0,0 +1,43 @@ +; RUN: opt -instcombine %s | llvm-dis | FileCheck %s +target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16--p4:96:96:96-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32" + +define i32 @test_as0(i32 addrspace(0)* %A) { +entry: +; CHECK: %arrayidx = getelementptr i32* %A, i32 1 + %arrayidx = getelementptr i32 addrspace(0)* %A, i64 1 + %y = load i32 addrspace(0)* %arrayidx, align 4 + ret i32 %y +} + +define i32 @test_as1(i32 
addrspace(1)* %A) { +entry: +; CHECK: %arrayidx = getelementptr i32 addrspace(1)* %A, i64 1 + %arrayidx = getelementptr i32 addrspace(1)* %A, i32 1 + %y = load i32 addrspace(1)* %arrayidx, align 4 + ret i32 %y +} + +define i32 @test_as2(i32 addrspace(2)* %A) { +entry: +; CHECK: %arrayidx = getelementptr i32 addrspace(2)* %A, i8 1 + %arrayidx = getelementptr i32 addrspace(2)* %A, i32 1 + %y = load i32 addrspace(2)* %arrayidx, align 4 + ret i32 %y +} + +define i32 @test_as3(i32 addrspace(3)* %A) { +entry: +; CHECK: %arrayidx = getelementptr i32 addrspace(3)* %A, i16 1 + %arrayidx = getelementptr i32 addrspace(3)* %A, i32 1 + %y = load i32 addrspace(3)* %arrayidx, align 4 + ret i32 %y +} + +define i32 @test_as4(i32 addrspace(4)* %A) { +entry: +; CHECK: %arrayidx = getelementptr i32 addrspace(4)* %A, i96 1 + %arrayidx = getelementptr i32 addrspace(4)* %A, i32 1 + %y = load i32 addrspace(4)* %arrayidx, align 4 + ret i32 %y +} + diff --git a/test/Transforms/GVN/crash.ll b/test/Transforms/GVN/crash.ll index 31eae256c6..4a8c8e4589 100644 --- a/test/Transforms/GVN/crash.ll +++ b/test/Transforms/GVN/crash.ll @@ -163,3 +163,39 @@ entry: ret i8 %1 } + +; Test that a GEP in an unreachable block with the following form doesn't crash +; GVN: +; +; %x = gep %some.type %x, ... + +%struct.type = type { i64, i32, i32 } + +define fastcc void @func() nounwind uwtable ssp align 2 { +entry: + br label %reachable.bb + +;; Unreachable code. + +unreachable.bb: + %gep.val = getelementptr inbounds %struct.type* %gep.val, i64 1 + br i1 undef, label %u2.bb, label %u1.bb + +u1.bb: + %tmp1 = getelementptr inbounds %struct.type* %gep.val, i64 0, i32 0 + store i64 -1, i64* %tmp1, align 8 + br label %unreachable.bb + +u2.bb: + %0 = load i32* undef, align 4 + %conv.i.i.i.i.i = zext i32 %0 to i64 + br label %u2.bb + +;; Reachable code. 
+ +reachable.bb: + br label %r1.bb + +r1.bb: + br label %u2.bb +} diff --git a/test/Transforms/GVN/pr14166.ll b/test/Transforms/GVN/pr14166.ll new file mode 100644 index 0000000000..9f47e46426 --- /dev/null +++ b/test/Transforms/GVN/pr14166.ll @@ -0,0 +1,27 @@ +; RUN: opt -gvn -S < %s | FileCheck %s +target datalayout = "e-p:32:32:32" +target triple = "i386-pc-linux-gnu" +define <2 x i32> @test1() { + %v1 = alloca <2 x i32> + call void @anything(<2 x i32>* %v1) + %v2 = load <2 x i32>* %v1 + %v3 = inttoptr <2 x i32> %v2 to <2 x i8*> + %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>* + store <2 x i8*> %v3, <2 x i8*>* %v4 + %v5 = load <2 x i32>* %v1 + ret <2 x i32> %v5 +; CHECK: @test1 +; CHECK: %v1 = alloca <2 x i32> +; CHECK: call void @anything(<2 x i32>* %v1) +; CHECK: %v2 = load <2 x i32>* %v1 +; CHECK: %v3 = inttoptr <2 x i32> %v2 to <2 x i8*> +; CHECK: %v4 = bitcast <2 x i32>* %v1 to <2 x i8*>* +; CHECK: store <2 x i8*> %v3, <2 x i8*>* %v4 +; CHECK: %1 = ptrtoint <2 x i8*> %v3 to <2 x i32> +; CHECK: %2 = bitcast <2 x i32> %1 to i64 +; CHECK: %3 = bitcast i64 %2 to <2 x i32> +; CHECK: ret <2 x i32> %3 +} + +declare void @anything(<2 x i32>*) + diff --git a/test/Transforms/InstCombine/cast.ll b/test/Transforms/InstCombine/cast.ll index 899ffddd5b..b4eb69d436 100644 --- a/test/Transforms/InstCombine/cast.ll +++ b/test/Transforms/InstCombine/cast.ll @@ -891,3 +891,12 @@ define double @test80([100 x double]* %p, i32 %i) { ret double %l ; CHECK-NEXT: ret double } + +define double @test81(double *%p, float %f) { + %i = fptosi float %f to i64 + %q = bitcast double* %p to i8* + %pp = getelementptr i8* %q, i64 %i + %r = bitcast i8* %pp to double* + %l = load double* %r + ret double %l +} diff --git a/test/Transforms/InstCombine/constant-fold-gep-as-0.ll b/test/Transforms/InstCombine/constant-fold-gep-as-0.ll new file mode 100644 index 0000000000..74fe316137 --- /dev/null +++ b/test/Transforms/InstCombine/constant-fold-gep-as-0.ll @@ -0,0 +1,235 @@ +; "PLAIN" - No 
optimizations. This tests the target-independent +; constant folder. +; RUN: opt -S -o - < %s | FileCheck --check-prefix=PLAIN %s + +target datalayout = "e-p:128:128:128-p1:32:32:32-p2:8:8:8-p3:16:16:16-p4:64:64:64-p5:96:96:96-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:32" + +; PLAIN: ModuleID = '<stdin>' + +; The automatic constant folder in opt does not have targetdata access, so +; it can't fold gep arithmetic, in general. However, the constant folder run +; from instcombine and global opt can use targetdata. +; PLAIN: @G8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1) +@G8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1) +; PLAIN: @G1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -1) +@G1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -1) +; PLAIN: @F8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2) +@F8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2) +; PLAIN: @F1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -2) +@F1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 1 to i1 addrspace(2)*), i8 -2) +; PLAIN: @H8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* null, i32 -1) +@H8 = global i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 0 to i8 addrspace(1)*), i32 -1) +; PLAIN: @H1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i8 -1) +@H1 = global i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i8 0 to i1 addrspace(2)*), i8 -1) + + +; The target-independent folder should be able to do some clever +; simplifications on sizeof, alignof, and offsetof expressions. 
The +; target-dependent folder should fold these down to constants. +; PLAIN-X: @a = constant i64 mul (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2310) +@a = constant i64 mul (i64 3, i64 mul (i64 ptrtoint ({[7 x double], [7 x double]} addrspace(4)* getelementptr ({[7 x double], [7 x double]} addrspace(4)* null, i64 11) to i64), i64 5)) + +; PLAIN-X: @b = constant i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) +@b = constant i64 ptrtoint ([13 x double] addrspace(4)* getelementptr ({i1, [13 x double]} addrspace(4)* null, i64 0, i32 1) to i64) + +; PLAIN-X: @c = constant i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2) +@c = constant i64 ptrtoint (double addrspace(4)* getelementptr ({double, double, double, double} addrspace(4)* null, i64 0, i32 2) to i64) + +; PLAIN-X: @d = constant i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 11) +@d = constant i64 ptrtoint (double addrspace(4)* getelementptr ([13 x double] addrspace(4)* null, i64 0, i32 11) to i64) + +; PLAIN-X: @e = constant i64 ptrtoint (double addrspace(4)* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64) +@e = constant i64 ptrtoint (double addrspace(4)* getelementptr ({double, float, double, double} addrspace(4)* null, i64 0, i32 2) to i64) + +; PLAIN-X: @f = constant i64 1 +@f = constant i64 ptrtoint (<{ i16, i128 }> addrspace(4)* getelementptr ({i1, <{ i16, i128 }>} addrspace(4)* null, i64 0, i32 1) to i64) + +; PLAIN-X: @g = constant i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) +@g = constant i64 ptrtoint ({double, double} addrspace(4)* getelementptr ({i1, {double, double}} addrspace(4)* null, i64 0, i32 1) to i64) + +; PLAIN-X: @h = constant i64 ptrtoint (i1 addrspace(2)* getelementptr (i1 addrspace(2)* 
null, i32 1) to i64) +@h = constant i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i64 1) to i64) + +; PLAIN-X: @i = constant i64 ptrtoint (i1 addrspace(2)* getelementptr ({ i1, i1 addrspace(2)* }* null, i64 0, i32 1) to i64) +@i = constant i64 ptrtoint (double addrspace(4)* getelementptr ({i1, double} addrspace(4)* null, i64 0, i32 1) to i64) + +; The target-dependent folder should cast GEP indices to integer-sized pointers. + +; PLAIN: @M = constant i64 addrspace(5)* getelementptr (i64 addrspace(5)* null, i32 1) +; PLAIN: @N = constant i64 addrspace(5)* getelementptr ({ i64, i64 } addrspace(5)* null, i32 0, i32 1) +; PLAIN: @O = constant i64 addrspace(5)* getelementptr ([2 x i64] addrspace(5)* null, i32 0, i32 1) + +@M = constant i64 addrspace(5)* getelementptr (i64 addrspace(5)* null, i32 1) +@N = constant i64 addrspace(5)* getelementptr ({ i64, i64 } addrspace(5)* null, i32 0, i32 1) +@O = constant i64 addrspace(5)* getelementptr ([2 x i64] addrspace(5)* null, i32 0, i32 1) + +; Fold GEP of a GEP. Very simple cases are folded. + +; PLAIN-X: @Y = global [3 x { i32, i32 }]addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 2) +@ext = external addrspace(3) global [3 x { i32, i32 }] +@Y = global [3 x { i32, i32 }]addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 1), i64 1) + +; PLAIN-X: @Z = global i32addrspace(3)* getelementptr inbounds (i32addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 0, i64 1, i32 0), i64 1) +@Z = global i32addrspace(3)* getelementptr inbounds (i32addrspace(3)* getelementptr inbounds ([3 x { i32, i32 }]addrspace(3)* @ext, i64 0, i64 1, i32 0), i64 1) + + +; Duplicate all of the above as function return values rather than +; global initializers. 
+ +; PLAIN: define i8 addrspace(1)* @goo8() nounwind { +; PLAIN: %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1) to i8 addrspace(1)* +; PLAIN: ret i8 addrspace(1)* %t +; PLAIN: } +; PLAIN: define i1 addrspace(2)* @goo1() nounwind { +; PLAIN: %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -1) to i1 addrspace(2)* +; PLAIN: ret i1 addrspace(2)* %t +; PLAIN: } +; PLAIN: define i8 addrspace(1)* @foo8() nounwind { +; PLAIN: %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2) to i8 addrspace(1)* +; PLAIN: ret i8 addrspace(1)* %t +; PLAIN: } +; PLAIN: define i1 addrspace(2)* @foo1() nounwind { +; PLAIN: %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -2) to i1 addrspace(2)* +; PLAIN: ret i1 addrspace(2)* %t +; PLAIN: } +; PLAIN: define i8 addrspace(1)* @hoo8() nounwind { +; PLAIN: %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* null, i32 -1) to i8 addrspace(1)* +; PLAIN: ret i8 addrspace(1)* %t +; PLAIN: } +; PLAIN: define i1 addrspace(2)* @hoo1() nounwind { +; PLAIN: %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i32 -1) to i1 addrspace(2)* +; PLAIN: ret i1 addrspace(2)* %t +; PLAIN: } +define i8 addrspace(1)* @goo8() nounwind { + %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -1) to i8 addrspace(1)* + ret i8 addrspace(1)* %t +} +define i1 addrspace(2)* @goo1() nounwind { + %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -1) to i1 addrspace(2)* + ret i1 addrspace(2)* %t +} +define i8 addrspace(1)* @foo8() nounwind { + %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 1 to i8 addrspace(1)*), i32 -2) to i8 addrspace(1)* + ret i8 addrspace(1)* %t +} +define i1 addrspace(2)* 
@foo1() nounwind { + %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 1 to i1 addrspace(2)*), i32 -2) to i1 addrspace(2)* + ret i1 addrspace(2)* %t +} +define i8 addrspace(1)* @hoo8() nounwind { + %t = bitcast i8 addrspace(1)* getelementptr (i8 addrspace(1)* inttoptr (i32 0 to i8 addrspace(1)*), i32 -1) to i8 addrspace(1)* + ret i8 addrspace(1)* %t +} +define i1 addrspace(2)* @hoo1() nounwind { + %t = bitcast i1 addrspace(2)* getelementptr (i1 addrspace(2)* inttoptr (i32 0 to i1 addrspace(2)*), i32 -1) to i1 addrspace(2)* + ret i1 addrspace(2)* %t +} + +; PLAIN-X: define i64 @fa() nounwind { +; PLAIN-X: %t = bitcast i64 mul (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2310) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @fb() nounwind { +; PLAIN-X: %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @fc() nounwind { +; PLAIN-X: %t = bitcast i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 2) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @fd() nounwind { +; PLAIN-X: %t = bitcast i64 mul nuw (i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64), i64 11) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @fe() nounwind { +; PLAIN-X: %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({ double, float, double, double }* null, i64 0, i32 2) to i64) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @ff() nounwind { +; PLAIN-X: %t = bitcast i64 1 to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @fg() nounwind { +; PLAIN-X: %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({ i1, double }* null, i64 0, i32 1) to i64) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; 
PLAIN-X: define i64 @fh() nounwind { +; PLAIN-X: %t = bitcast i64 ptrtoint (i1 addrspace(2)* getelementptr (i1 addrspace(2)* null, i32 1) to i64) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +; PLAIN-X: define i64 @fi() nounwind { +; PLAIN-X: %t = bitcast i64 ptrtoint (i1 addrspace(2)* getelementptr ({ i1, i1 addrspace(2)* }* null, i64 0, i32 1) to i64) to i64 +; PLAIN-X: ret i64 %t +; PLAIN-X: } +define i64 @fa() nounwind { + %t = bitcast i64 mul (i64 3, i64 mul (i64 ptrtoint ({[7 x double], [7 x double]}* getelementptr ({[7 x double], [7 x double]}* null, i64 11) to i64), i64 5)) to i64 + ret i64 %t +} +define i64 @fb() nounwind { + %t = bitcast i64 ptrtoint ([13 x double] addrspace(4)* getelementptr ({i1, [13 x double]} addrspace(4)* null, i64 0, i32 1) to i64) to i64 + ret i64 %t +} +define i64 @fc() nounwind { + %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({double, double, double, double} addrspace(4)* null, i64 0, i32 2) to i64) to i64 + ret i64 %t +} +define i64 @fd() nounwind { + %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ([13 x double] addrspace(4)* null, i64 0, i32 11) to i64) to i64 + ret i64 %t +} +define i64 @fe() nounwind { + %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({double, float, double, double} addrspace(4)* null, i64 0, i32 2) to i64) to i64 + ret i64 %t +} +define i64 @ff() nounwind { + %t = bitcast i64 ptrtoint (<{ i16, i128 }> addrspace(4)* getelementptr ({i1, <{ i16, i128 }>} addrspace(4)* null, i64 0, i32 1) to i64) to i64 + ret i64 %t +} +define i64 @fg() nounwind { + %t = bitcast i64 ptrtoint ({double, double} addrspace(4)* getelementptr ({i1, {double, double}} addrspace(4)* null, i64 0, i32 1) to i64) to i64 + ret i64 %t +} +define i64 @fh() nounwind { + %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr (double addrspace(4)* null, i32 1) to i64) to i64 + ret i64 %t +} +define i64 @fi() nounwind { + %t = bitcast i64 ptrtoint (double addrspace(4)* getelementptr ({i1, 
double}addrspace(4)* null, i64 0, i32 1) to i64) to i64 + ret i64 %t +} + +; PLAIN: define i64* @fM() nounwind { +; PLAIN: %t = bitcast i64* getelementptr (i64* null, i32 1) to i64* +; PLAIN: ret i64* %t +; PLAIN: } +; PLAIN: define i64* @fN() nounwind { +; PLAIN: %t = bitcast i64* getelementptr ({ i64, i64 }* null, i32 0, i32 1) to i64* +; PLAIN: ret i64* %t +; PLAIN: } +; PLAIN: define i64* @fO() nounwind { +; PLAIN: %t = bitcast i64* getelementptr ([2 x i64]* null, i32 0, i32 1) to i64* +; PLAIN: ret i64* %t +; PLAIN: } + +define i64* @fM() nounwind { + %t = bitcast i64* getelementptr (i64* null, i32 1) to i64* + ret i64* %t +} +define i64* @fN() nounwind { + %t = bitcast i64* getelementptr ({ i64, i64 }* null, i32 0, i32 1) to i64* + ret i64* %t +} +define i64* @fO() nounwind { + %t = bitcast i64* getelementptr ([2 x i64]* null, i32 0, i32 1) to i64* + ret i64* %t +} + +; PLAIN: define i32 addrspace(1)* @fZ() nounwind { +; PLAIN: %t = bitcast i32 addrspace(1)* getelementptr inbounds (i32 addrspace(1)* getelementptr inbounds ([3 x { i32, i32 }] addrspace(1)* @ext2, i64 0, i64 1, i32 0), i64 1) to i32 addrspace(1)* +; PLAIN: ret i32 addrspace(1)* %t +; PLAIN: } +@ext2 = external addrspace(1) global [3 x { i32, i32 }] +define i32 addrspace(1)* @fZ() nounwind { + %t = bitcast i32 addrspace(1)* getelementptr inbounds (i32 addrspace(1)* getelementptr inbounds ([3 x { i32, i32 }] addrspace(1)* @ext2, i64 0, i64 1, i32 0), i64 1) to i32 addrspace(1)* + ret i32 addrspace(1)* %t +} diff --git a/test/Transforms/LoopUnroll/pr14167.ll b/test/Transforms/LoopUnroll/pr14167.ll new file mode 100644 index 0000000000..205ae44b72 --- /dev/null +++ b/test/Transforms/LoopUnroll/pr14167.ll @@ -0,0 +1,44 @@ +; RUN: opt < %s -S -loop-unroll -unroll-runtime | FileCheck %s +target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64" +target triple = "powerpc64-bgq-linux" + +define void @test1() nounwind { +; Ensure that we don't 
crash when the trip count == -1. +; CHECK: @test1 +entry: + br label %for.cond2.preheader + +for.cond2.preheader: ; preds = %for.end, %entry + br i1 false, label %middle.block, label %vector.ph + +vector.ph: ; preds = %for.cond2.preheader + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + br i1 undef, label %middle.block.loopexit, label %vector.body + +middle.block.loopexit: ; preds = %vector.body + br label %middle.block + +middle.block: ; preds = %middle.block.loopexit, %for.cond2.preheader + br i1 true, label %for.end, label %scalar.preheader + +scalar.preheader: ; preds = %middle.block + br label %for.body4 + +for.body4: ; preds = %for.body4, %scalar.preheader + %indvars.iv = phi i64 [ 16000, %scalar.preheader ], [ %indvars.iv.next, %for.body4 ] + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp ne i32 %lftr.wideiv, 16000 + br i1 %exitcond, label %for.body4, label %for.end.loopexit + +for.end.loopexit: ; preds = %for.body4 + br label %for.end + +for.end: ; preds = %for.end.loopexit, %middle.block + br i1 undef, label %for.cond2.preheader, label %for.end15 + +for.end15: ; preds = %for.end + ret void +} diff --git a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll index 5caaffc8dd..0176c9a189 100644 --- a/test/Transforms/LoopVectorize/2012-10-20-infloop.ll +++ b/test/Transforms/LoopVectorize/2012-10-20-infloop.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce ; Check that we don't fall into an infinite loop. 
define void @test() nounwind { diff --git a/test/Transforms/LoopVectorize/cost-model.ll b/test/Transforms/LoopVectorize/cost-model.ll new file mode 100644 index 0000000000..18abf2885e --- /dev/null +++ b/test/Transforms/LoopVectorize/cost-model.ll @@ -0,0 +1,38 @@ +; RUN: opt < %s -loop-vectorize -mtriple=x86_64-apple-macosx10.8.0 -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +@c = common global [2048 x i32] zeroinitializer, align 16 +@b = common global [2048 x i32] zeroinitializer, align 16 +@d = common global [2048 x i32] zeroinitializer, align 16 +@a = common global [2048 x i32] zeroinitializer, align 16 + +;CHECK: cost_model_1 +;CHECK-NOT: <4 x i32> +;CHECK: ret void +define void @cost_model_1() nounwind uwtable noinline ssp { +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %0 = shl nsw i64 %indvars.iv, 1 + %arrayidx = getelementptr inbounds [2048 x i32]* @c, i64 0, i64 %0 + %1 = load i32* %arrayidx, align 8 + %idxprom1 = sext i32 %1 to i64 + %arrayidx2 = getelementptr inbounds [2048 x i32]* @b, i64 0, i64 %idxprom1 + %2 = load i32* %arrayidx2, align 4 + %arrayidx4 = getelementptr inbounds [2048 x i32]* @d, i64 0, i64 %indvars.iv + %3 = load i32* %arrayidx4, align 4 + %idxprom5 = sext i32 %3 to i64 + %arrayidx6 = getelementptr inbounds [2048 x i32]* @a, i64 0, i64 %idxprom5 + store i32 %2, i32* %arrayidx6, align 4 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, 256 + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body + ret void +} diff --git a/test/Transforms/LoopVectorize/cpp-new-array.ll b/test/Transforms/LoopVectorize/cpp-new-array.ll new file mode 100644 index 
0000000000..26902eba9e --- /dev/null +++ b/test/Transforms/LoopVectorize/cpp-new-array.ll @@ -0,0 +1,46 @@ +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.8.0" + +;CHECK: @cpp_new_arrays +;CHECK: insertelement <4 x i32> +;CHECK: load <4 x float> +;CHECK: fadd <4 x float> +;CHECK: ret i32 +define i32 @cpp_new_arrays() uwtable ssp { +entry: + %call = call noalias i8* @_Znwm(i64 4) + %0 = bitcast i8* %call to float* + store float 1.000000e+03, float* %0, align 4 + %call1 = call noalias i8* @_Znwm(i64 4) + %1 = bitcast i8* %call1 to float* + store float 1.000000e+03, float* %1, align 4 + %call3 = call noalias i8* @_Znwm(i64 4) + %2 = bitcast i8* %call3 to float* + store float 1.000000e+03, float* %2, align 4 + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %idxprom = sext i32 %i.01 to i64 + %arrayidx = getelementptr inbounds float* %0, i64 %idxprom + %3 = load float* %arrayidx, align 4 + %idxprom5 = sext i32 %i.01 to i64 + %arrayidx6 = getelementptr inbounds float* %1, i64 %idxprom5 + %4 = load float* %arrayidx6, align 4 + %add = fadd float %3, %4 + %idxprom7 = sext i32 %i.01 to i64 + %arrayidx8 = getelementptr inbounds float* %2, i64 %idxprom7 + store float %add, float* %arrayidx8, align 4 + %inc = add nsw i32 %i.01, 1 + %cmp = icmp slt i32 %inc, 1000 + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body + %5 = load float* %2, align 4 + %conv10 = fptosi float %5 to i32 + ret i32 %conv10 +} + +declare noalias i8* @_Znwm(i64) diff --git a/test/Transforms/LoopVectorize/gcc-examples.ll b/test/Transforms/LoopVectorize/gcc-examples.ll index 6fb1792b2c..d8942ac861 100644 --- a/test/Transforms/LoopVectorize/gcc-examples.ll +++ 
b/test/Transforms/LoopVectorize/gcc-examples.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/increment.ll b/test/Transforms/LoopVectorize/increment.ll index e818d68562..069b7ea031 100644 --- a/test/Transforms/LoopVectorize/increment.ll +++ b/test/Transforms/LoopVectorize/increment.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/induction_plus.ll b/test/Transforms/LoopVectorize/induction_plus.ll index bd90113e52..b31bceb50d 100644 --- a/test/Transforms/LoopVectorize/induction_plus.ll +++ b/test/Transforms/LoopVectorize/induction_plus.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -instcombine -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -instcombine -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/non-const-n.ll b/test/Transforms/LoopVectorize/non-const-n.ll index 04c5c84a4f..7727b0a2dc 100644 --- a/test/Transforms/LoopVectorize/non-const-n.ll +++ b/test/Transforms/LoopVectorize/non-const-n.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s 
-loop-vectorize -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/read-only.ll b/test/Transforms/LoopVectorize/read-only.ll index 4095ea68ef..b4d1bac132 100644 --- a/test/Transforms/LoopVectorize/read-only.ll +++ b/test/Transforms/LoopVectorize/read-only.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/LoopVectorize/reduction.ll b/test/Transforms/LoopVectorize/reduction.ll index 3e871b229b..746a08c3ea 100644 --- a/test/Transforms/LoopVectorize/reduction.ll +++ b/test/Transforms/LoopVectorize/reduction.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" @@ -149,4 +149,83 @@ for.end: ; preds = %for.body, %entry ret i32 %sum.0.lcssa } +;CHECK: @reduction_and +;CHECK: and <4 x i32> +;CHECK: ret i32 +define i32 @reduction_and(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, 
%for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %and, %for.body ], [ -1, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %and = and i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ -1, %entry ], [ %and, %for.body ] + ret i32 %result.0.lcssa +} + +;CHECK: @reduction_or +;CHECK: or <4 x i32> +;CHECK: ret i32 +define i32 @reduction_or(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, %for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %or, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %or = or i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %or, %for.body ] + ret i32 %result.0.lcssa +} + +;CHECK: @reduction_xor +;CHECK: xor <4 x i32> +;CHECK: ret i32 +define i32 @reduction_xor(i32 %n, i32* nocapture %A, i32* nocapture %B) nounwind uwtable readonly { +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body, label %for.end + +for.body: ; preds = %entry, 
%for.body + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %result.08 = phi i32 [ %xor, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32* %A, i64 %indvars.iv + %0 = load i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32* %B, i64 %indvars.iv + %1 = load i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %xor = xor i32 %add, %result.08 + %indvars.iv.next = add i64 %indvars.iv, 1 + %lftr.wideiv = trunc i64 %indvars.iv.next to i32 + %exitcond = icmp eq i32 %lftr.wideiv, %n + br i1 %exitcond, label %for.end, label %for.body + +for.end: ; preds = %for.body, %entry + %result.0.lcssa = phi i32 [ 0, %entry ], [ %xor, %for.body ] + ret i32 %result.0.lcssa +} diff --git a/test/Transforms/LoopVectorize/scalar-select.ll b/test/Transforms/LoopVectorize/scalar-select.ll index 8d5b6fd8af..e537bde31b 100644 --- a/test/Transforms/LoopVectorize/scalar-select.ll +++ b/test/Transforms/LoopVectorize/scalar-select.ll @@ -1,4 +1,4 @@ -; RUN: opt < %s -loop-vectorize -dce -instcombine -licm -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-apple-macosx10.8.0" diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll index 03120f7a32..110950f76a 100644 --- a/test/Transforms/SROA/basictest.ll +++ b/test/Transforms/SROA/basictest.ll @@ -577,9 +577,17 @@ entry: %ai = load i24* %aiptr ; CHCEK-NOT: store ; CHCEK-NOT: load -; CHECK: %[[mask0:.*]] = and i24 undef, -256 -; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[mask0]], -65281 -; CHECK-NEXT: %[[mask2:.*]] = and i24 %[[mask1]], 65535 +; CHECK: %[[ext2:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift2:.*]] = shl i24 %[[ext2]], 16 +; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, 65535 +; CHECK-NEXT: %[[insert2:.*]] = 
or i24 %[[mask2]], %[[shift2]] +; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 +; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 +; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] +; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], -256 +; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[ext0]] %biptr = bitcast [3 x i8]* %b to i24* store i24 %ai, i24* %biptr @@ -591,10 +599,10 @@ entry: %b2 = load i8* %b2ptr ; CHCEK-NOT: store ; CHCEK-NOT: load -; CHECK: %[[trunc0:.*]] = trunc i24 %[[mask2]] to i8 -; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[mask2]], 8 +; CHECK: %[[trunc0:.*]] = trunc i24 %[[insert0]] to i8 +; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 -; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[mask2]], 16 +; CHECK-NEXT: %[[shift2:.*]] = lshr i24 %[[insert0]], 16 ; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[shift2]] to i8 %bsum0 = add i8 %b0, %b1 @@ -1064,6 +1072,49 @@ entry: ret void } +define i64 @PR14059.2({ float, float }* %phi) { +; Check that SROA can split up alloca-wide integer loads and stores where the +; underlying alloca has smaller components that are accessed independently. This +; shows up particularly with ABI lowering patterns coming out of Clang that rely +; on the particular register placement of a single large integer return value. 
+; CHECK: @PR14059.2 + +entry: + %retval = alloca { float, float }, align 4 + ; CHECK-NOT: alloca + + %0 = bitcast { float, float }* %retval to i64* + store i64 0, i64* %0 + ; CHECK-NOT: store + + %phi.realp = getelementptr inbounds { float, float }* %phi, i32 0, i32 0 + %phi.real = load float* %phi.realp + %phi.imagp = getelementptr inbounds { float, float }* %phi, i32 0, i32 1 + %phi.imag = load float* %phi.imagp + ; CHECK: %[[realp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 0 + ; CHECK-NEXT: %[[real:.*]] = load float* %[[realp]] + ; CHECK-NEXT: %[[imagp:.*]] = getelementptr inbounds { float, float }* %phi, i32 0, i32 1 + ; CHECK-NEXT: %[[imag:.*]] = load float* %[[imagp]] + + %real = getelementptr inbounds { float, float }* %retval, i32 0, i32 0 + %imag = getelementptr inbounds { float, float }* %retval, i32 0, i32 1 + store float %phi.real, float* %real + store float %phi.imag, float* %imag + ; CHECK-NEXT: %[[imag_convert:.*]] = bitcast float %[[imag]] to i32 + ; CHECK-NEXT: %[[imag_ext:.*]] = zext i32 %[[imag_convert]] to i64 + ; CHECK-NEXT: %[[imag_shift:.*]] = shl i64 %[[imag_ext]], 32 + ; CHECK-NEXT: %[[imag_mask:.*]] = and i64 undef, 4294967295 + ; CHECK-NEXT: %[[imag_insert:.*]] = or i64 %[[imag_mask]], %[[imag_shift]] + ; CHECK-NEXT: %[[real_convert:.*]] = bitcast float %[[real]] to i32 + ; CHECK-NEXT: %[[real_ext:.*]] = zext i32 %[[real_convert]] to i64 + ; CHECK-NEXT: %[[real_mask:.*]] = and i64 %[[imag_insert]], -4294967296 + ; CHECK-NEXT: %[[real_insert:.*]] = or i64 %[[real_mask]], %[[real_ext]] + + %1 = load i64* %0, align 1 + ret i64 %1 + ; CHECK-NEXT: ret i64 %[[real_insert]] +} + define void @PR14105({ [16 x i8] }* %ptr) { ; Ensure that when rewriting the GEP index '-1' for this alloca we preserve is ; sign as negative. 
We use a volatile memcpy to ensure promotion never actually diff --git a/test/Transforms/SROA/big-endian.ll b/test/Transforms/SROA/big-endian.ll index 532f8690cf..ce82d1f30b 100644 --- a/test/Transforms/SROA/big-endian.ll +++ b/test/Transforms/SROA/big-endian.ll @@ -26,9 +26,17 @@ entry: %ai = load i24* %aiptr ; CHCEK-NOT: store ; CHCEK-NOT: load -; CHECK: %[[mask0:.*]] = and i24 undef, 65535 -; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[mask0]], -65281 -; CHECK-NEXT: %[[mask2:.*]] = and i24 %[[mask1]], -256 +; CHECK: %[[ext2:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[mask2:.*]] = and i24 undef, -256 +; CHECK-NEXT: %[[insert2:.*]] = or i24 %[[mask2]], %[[ext2]] +; CHECK-NEXT: %[[ext1:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift1:.*]] = shl i24 %[[ext1]], 8 +; CHECK-NEXT: %[[mask1:.*]] = and i24 %[[insert2]], -65281 +; CHECK-NEXT: %[[insert1:.*]] = or i24 %[[mask1]], %[[shift1]] +; CHECK-NEXT: %[[ext0:.*]] = zext i8 0 to i24 +; CHECK-NEXT: %[[shift0:.*]] = shl i24 %[[ext0]], 16 +; CHECK-NEXT: %[[mask0:.*]] = and i24 %[[insert1]], 65535 +; CHECK-NEXT: %[[insert0:.*]] = or i24 %[[mask0]], %[[shift0]] %biptr = bitcast [3 x i8]* %b to i24* store i24 %ai, i24* %biptr @@ -40,11 +48,11 @@ entry: %b2 = load i8* %b2ptr ; CHCEK-NOT: store ; CHCEK-NOT: load -; CHECK: %[[shift0:.*]] = lshr i24 %[[mask2]], 16 +; CHECK: %[[shift0:.*]] = lshr i24 %[[insert0]], 16 ; CHECK-NEXT: %[[trunc0:.*]] = trunc i24 %[[shift0]] to i8 -; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[mask2]], 8 +; CHECK-NEXT: %[[shift1:.*]] = lshr i24 %[[insert0]], 8 ; CHECK-NEXT: %[[trunc1:.*]] = trunc i24 %[[shift1]] to i8 -; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[mask2]] to i8 +; CHECK-NEXT: %[[trunc2:.*]] = trunc i24 %[[insert0]] to i8 %bsum0 = add i8 %b0, %b1 %bsum1 = add i8 %bsum0, %b2 @@ -74,27 +82,26 @@ entry: %a0i16ptr = bitcast i8* %a0ptr to i16* store i16 1, i16* %a0i16ptr -; CHECK: %[[mask:.*]] = and i56 undef, 1099511627775 -; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1099511627776 +; CHECK: 
%[[mask0:.*]] = and i16 1, -16 %a1i4ptr = bitcast i8* %a1ptr to i4* store i4 1, i4* %a1i4ptr -; CHECK: %[[mask:.*]] = and i56 %[[or]], -16492674416641 -; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1099511627776 +; CHECK-NEXT: %[[insert0:.*]] = or i16 %[[mask0]], 1 store i8 1, i8* %a2ptr -; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -1095216660481 -; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 4294967296 +; CHECK-NEXT: %[[mask1:.*]] = and i40 undef, 4294967295 +; CHECK-NEXT: %[[insert1:.*]] = or i40 %[[mask1]], 4294967296 %a3i24ptr = bitcast i8* %a3ptr to i24* store i24 1, i24* %a3i24ptr -; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -4294967041 -; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 256 +; CHECK-NEXT: %[[mask2:.*]] = and i40 %[[insert1]], -4294967041 +; CHECK-NEXT: %[[insert2:.*]] = or i40 %[[mask2]], 256 %a2i40ptr = bitcast i8* %a2ptr to i40* store i40 1, i40* %a2i40ptr -; CHECK-NEXT: %[[mask:.*]] = and i56 %[[or]], -1099511627776 -; CHECK-NEXT: %[[or:.*]] = or i56 %[[mask]], 1 +; CHECK-NEXT: %[[ext3:.*]] = zext i40 1 to i56 +; CHECK-NEXT: %[[mask3:.*]] = and i56 undef, -1099511627776 +; CHECK-NEXT: %[[insert3:.*]] = or i56 %[[mask3]], %[[ext3]] ; CHCEK-NOT: store ; CHCEK-NOT: load @@ -103,6 +110,10 @@ entry: %ai = load i56* %aiptr %ret = zext i56 %ai to i64 ret i64 %ret -; CHECK: %[[ret:.*]] = zext i56 %[[or]] to i64 +; CHECK-NEXT: %[[ext4:.*]] = zext i16 %[[insert0]] to i56 +; CHECK-NEXT: %[[shift4:.*]] = shl i56 %[[ext4]], 40 +; CHECK-NEXT: %[[mask4:.*]] = and i56 %[[insert3]], 1099511627775 +; CHECK-NEXT: %[[insert4:.*]] = or i56 %[[mask4]], %[[shift4]] +; CHECK-NEXT: %[[ret:.*]] = zext i56 %[[insert4]] to i64 ; CHECK-NEXT: ret i64 %[[ret]] } diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 751c8e7da0..0390bc470a 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -39,6 +39,7 @@ #include "llvm/Support/SystemUtils.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/TargetSelect.h" 
#include "llvm/MC/SubtargetFeature.h" #include "llvm/LinkAllPasses.h" #include "llvm/LinkAllVMCore.h" @@ -564,6 +565,9 @@ int main(int argc, char **argv) { llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. LLVMContext &Context = getGlobalContext(); + InitializeAllTargets(); + InitializeAllTargetMCs(); + // Initialize passes PassRegistry &Registry = *PassRegistry::getPassRegistry(); initializeCore(Registry); diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp index 75e7006434..15eb6988f6 100644 --- a/unittests/ADT/DenseMapTest.cpp +++ b/unittests/ADT/DenseMapTest.cpp @@ -330,4 +330,37 @@ TEST(DenseMapCustomTest, FindAsTest) { EXPECT_TRUE(map.find_as("d") == map.end()); } +struct ContiguousDenseMapInfo { + static inline unsigned getEmptyKey() { return ~0; } + static inline unsigned getTombstoneKey() { return ~0U - 1; } + static unsigned getHashValue(const unsigned& Val) { return Val; } + static bool isEqual(const unsigned& LHS, const unsigned& RHS) { + return LHS == RHS; + } +}; + +// Test that filling a small dense map with exactly the number of elements in +// the map grows to have enough space for an empty bucket. +TEST(DenseMapCustomTest, SmallDenseMapGrowTest) { + SmallDenseMap<unsigned, unsigned, 32, ContiguousDenseMapInfo> map; + // Add some number of elements, then delete a few to leave us some tombstones. + // If we just filled the map with 32 elements we'd grow because of not enough + // tombstones which masks the issue here. + for (unsigned i = 0; i < 20; ++i) + map[i] = i + 1; + for (unsigned i = 0; i < 10; ++i) + map.erase(i); + for (unsigned i = 20; i < 32; ++i) + map[i] = i + 1; + + // Size tests + EXPECT_EQ(22u, map.size()); + + // Try to find an element which doesn't exist. There was a bug in + // SmallDenseMap which led to a map with num elements == small capacity not + // having an empty bucket any more. Finding an element not in the map would + // therefore never terminate. 
+ EXPECT_TRUE(map.find(32) == map.end()); +} + } diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt index b8a6daf9e0..116fa57522 100644 --- a/utils/TableGen/CMakeLists.txt +++ b/utils/TableGen/CMakeLists.txt @@ -9,6 +9,7 @@ add_tablegen(llvm-tblgen LLVM CodeEmitterGen.cpp CodeGenDAGPatterns.cpp CodeGenInstruction.cpp + CodeGenMapTable.cpp CodeGenRegisters.cpp CodeGenSchedule.cpp CodeGenTarget.cpp diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp new file mode 100644 index 0000000000..4bfd1ba798 --- /dev/null +++ b/utils/TableGen/CodeGenMapTable.cpp @@ -0,0 +1,608 @@ +//===- CodeGenMapTable.cpp - Instruction Mapping Table Generator ----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// CodeGenMapTable provides functionality for the TabelGen to create +// relation mapping between instructions. Relation models are defined using +// InstrMapping as a base class. This file implements the functionality which +// parses these definitions and generates relation maps using the information +// specified there. These maps are emitted as tables in the XXXGenInstrInfo.inc +// file along with the functions to query them. +// +// A relationship model to relate non-predicate instructions with their +// predicated true/false forms can be defined as follows: +// +// def getPredOpcode : InstrMapping { +// let FilterClass = "PredRel"; +// let RowFields = ["BaseOpcode"]; +// let ColFields = ["PredSense"]; +// let KeyCol = ["none"]; +// let ValueCols = [["true"], ["false"]]; } +// +// CodeGenMapTable parses this map and generates a table in XXXGenInstrInfo.inc +// file that contains the instructions modeling this relationship. 
This table +// is defined in the function +// "int getPredOpcode(uint16_t Opcode, enum PredSense inPredSense)" +// that can be used to retrieve the predicated form of the instruction by +// passing its opcode value and the predicate sense (true/false) of the desired +// instruction as arguments. +// +// Short description of the algorithm: +// +// 1) Iterate through all the records that derive from "InstrMapping" class. +// 2) For each record, filter out instructions based on the FilterClass value. +// 3) Iterate through this set of instructions and insert them into +// RowInstrMap map based on their RowFields values. RowInstrMap is keyed by the +// vector of RowFields values and contains vectors of Records (instructions) as +// values. RowFields is a list of fields that are required to have the same +// values for all the instructions appearing in the same row of the relation +// table. All the instructions in a given row of the relation table have some +// sort of relationship with the key instruction defined by the corresponding +// relationship model. +// +// Ex: RowInstrMap(RowVal1, RowVal2, ...) -> [Instr1, Instr2, Instr3, ... ] +// Here Instr1, Instr2, Instr3 have same values (RowVal1, RowVal2) for +// RowFields. These groups of instructions are later matched against ValueCols +// to determine the column they belong to, if any. +// +// While building the RowInstrMap map, collect all the key instructions in +// KeyInstrVec. These are the instructions having the same values as KeyCol +// for all the fields listed in ColFields. +// +// For Example: +// +// Relate non-predicate instructions with their predicated true/false forms. +// +// def getPredOpcode : InstrMapping { +// let FilterClass = "PredRel"; +// let RowFields = ["BaseOpcode"]; +// let ColFields = ["PredSense"]; +// let KeyCol = ["none"]; +// let ValueCols = [["true"], ["false"]]; } +// +// Here, only instructions that have "none" as PredSense will be selected as key +// instructions. 
+// +// 4) For each key instruction, get the group of instructions that share the +// same key-value as the key instruction from RowInstrMap. Iterate over the list +// of columns in ValueCols (it is defined as a list<list<string> >. Therefore, +// it can specify multi-column relationships). For each column, find the +// instruction from the group that matches all the values for the column. +// Multiple matches are not allowed. +// +//===----------------------------------------------------------------------===// + +#include "CodeGenTarget.h" +#include "llvm/Support/Format.h" +using namespace llvm; +typedef std::map<std::string, std::vector<Record*> > InstrRelMapTy; + +typedef std::map<std::vector<Init*>, std::vector<Record*> > RowInstrMapTy; + +namespace { + +//===----------------------------------------------------------------------===// +// This class is used to represent InstrMapping class defined in Target.td file. +class InstrMap { +private: + std::string Name; + std::string FilterClass; + ListInit *RowFields; + ListInit *ColFields; + ListInit *KeyCol; + std::vector<ListInit*> ValueCols; + +public: + InstrMap(Record* MapRec) { + Name = MapRec->getName(); + + // FilterClass - It's used to reduce the search space only to the + // instructions that define the kind of relationship modeled by + // this InstrMapping object/record. + const RecordVal *Filter = MapRec->getValue("FilterClass"); + FilterClass = Filter->getValue()->getAsUnquotedString(); + + // List of fields/attributes that need to be same across all the + // instructions in a row of the relation table. + RowFields = MapRec->getValueAsListInit("RowFields"); + + // List of fields/attributes that are constant across all the instruction + // in a column of the relation table. Ex: ColFields = 'predSense' + ColFields = MapRec->getValueAsListInit("ColFields"); + + // Values for the fields/attributes listed in 'ColFields'. 
+ // Ex: KeyCol = 'noPred' -- key instruction is non predicated + KeyCol = MapRec->getValueAsListInit("KeyCol"); + + // List of values for the fields/attributes listed in 'ColFields', one for + // each column in the relation table. + // + // Ex: ValueCols = [['true'],['false']] -- it results two columns in the + // table. First column requires all the instructions to have predSense + // set to 'true' and second column requires it to be 'false'. + ListInit *ColValList = MapRec->getValueAsListInit("ValueCols"); + + // Each instruction map must specify at least one column for it to be valid. + if (ColValList->getSize() == 0) + throw "InstrMapping record `" + MapRec->getName() + "' has empty " + + "`ValueCols' field!"; + + for (unsigned i = 0, e = ColValList->getSize(); i < e; i++) { + ListInit *ColI = dyn_cast<ListInit>(ColValList->getElement(i)); + + // Make sure that all the sub-lists in 'ValueCols' have same number of + // elements as the fields in 'ColFields'. + if (ColI->getSize() == ColFields->getSize()) + ValueCols.push_back(ColI); + else { + throw "Record `" + MapRec->getName() + "', field `" + "ValueCols" + + "' entries don't match with the entries in 'ColFields'!"; + } + } + } + + std::string getName() const { + return Name; + } + + std::string getFilterClass() { + return FilterClass; + } + + ListInit *getRowFields() const { + return RowFields; + } + + ListInit *getColFields() const { + return ColFields; + } + + ListInit *getKeyCol() const { + return KeyCol; + } + + const std::vector<ListInit*> &getValueCols() const { + return ValueCols; + } +}; +} // End anonymous namespace. + + +//===----------------------------------------------------------------------===// +// class MapTableEmitter : It builds the instruction relation maps using +// the information provided in InstrMapping records. It outputs these +// relationship maps as tables into XXXGenInstrInfo.inc file along with the +// functions to query them. 
+ +namespace { +class MapTableEmitter { +private: +// std::string TargetName; + const CodeGenTarget &Target; + // InstrMapDesc - InstrMapping record to be processed. + InstrMap InstrMapDesc; + + // InstrDefs - list of instructions filtered using FilterClass defined + // in InstrMapDesc. + std::vector<Record*> InstrDefs; + + // RowInstrMap - maps RowFields values to the instructions. It's keyed by the + // values of the row fields and contains vector of records as values. + RowInstrMapTy RowInstrMap; + + // KeyInstrVec - list of key instructions. + std::vector<Record*> KeyInstrVec; + DenseMap<Record*, std::vector<Record*> > MapTable; + +public: + MapTableEmitter(CodeGenTarget &Target, RecordKeeper &Records, Record *IMRec): + Target(Target), InstrMapDesc(IMRec) { + const std::string FilterClass = InstrMapDesc.getFilterClass(); + InstrDefs = Records.getAllDerivedDefinitions(FilterClass); + } + + void buildRowInstrMap(); + + // Returns true if an instruction is a key instruction, i.e., its ColFields + // have same values as KeyCol. + bool isKeyColInstr(Record* CurInstr); + + // Find column instruction corresponding to a key instruction based on the + // constraints for that column. + Record *getInstrForColumn(Record *KeyInstr, ListInit *CurValueCol); + + // Find column instructions for each key instruction based + // on ValueCols and store them into MapTable. + void buildMapTable(); + + void emitBinSearch(raw_ostream &OS, unsigned TableSize); + void emitTablesWithFunc(raw_ostream &OS); + unsigned emitBinSearchTable(raw_ostream &OS); + + // Lookup functions to query binary search tables. + void emitMapFuncBody(raw_ostream &OS, unsigned TableSize); + +}; +} // End anonymous namespace. + + +//===----------------------------------------------------------------------===// +// Process all the instructions that model this relation (alreday present in +// InstrDefs) and insert them into RowInstrMap which is keyed by the values of +// the fields listed as RowFields. 
It stores vectors of records as values. +// All the related instructions have the same values for the RowFields thus are +// part of the same key-value pair. +//===----------------------------------------------------------------------===// + +void MapTableEmitter::buildRowInstrMap() { + for (unsigned i = 0, e = InstrDefs.size(); i < e; i++) { + std::vector<Record*> InstrList; + Record *CurInstr = InstrDefs[i]; + std::vector<Init*> KeyValue; + ListInit *RowFields = InstrMapDesc.getRowFields(); + for (unsigned j = 0, endRF = RowFields->getSize(); j < endRF; j++) { + Init *RowFieldsJ = RowFields->getElement(j); + Init *CurInstrVal = CurInstr->getValue(RowFieldsJ)->getValue(); + KeyValue.push_back(CurInstrVal); + } + + // Collect key instructions into KeyInstrVec. Later, these instructions are + // processed to assign column position to the instructions sharing + // their KeyValue in RowInstrMap. + if (isKeyColInstr(CurInstr)) + KeyInstrVec.push_back(CurInstr); + + RowInstrMap[KeyValue].push_back(CurInstr); + } +} + +//===----------------------------------------------------------------------===// +// Return true if an instruction is a KeyCol instruction. +//===----------------------------------------------------------------------===// + +bool MapTableEmitter::isKeyColInstr(Record* CurInstr) { + ListInit *ColFields = InstrMapDesc.getColFields(); + ListInit *KeyCol = InstrMapDesc.getKeyCol(); + + // Check if the instruction is a KeyCol instruction. 
+ bool MatchFound = true; + for (unsigned j = 0, endCF = ColFields->getSize(); + (j < endCF) && MatchFound; j++) { + RecordVal *ColFieldName = CurInstr->getValue(ColFields->getElement(j)); + std::string CurInstrVal = ColFieldName->getValue()->getAsUnquotedString(); + std::string KeyColValue = KeyCol->getElement(j)->getAsUnquotedString(); + MatchFound = (CurInstrVal == KeyColValue); + } + return MatchFound; +} + +//===----------------------------------------------------------------------===// +// Build a map to link key instructions with the column instructions arranged +// according to their column positions. +//===----------------------------------------------------------------------===// + +void MapTableEmitter::buildMapTable() { + // Find column instructions for a given key based on the ColField + // constraints. + const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols(); + unsigned NumOfCols = ValueCols.size(); + for (unsigned j = 0, endKI = KeyInstrVec.size(); j < endKI; j++) { + Record *CurKeyInstr = KeyInstrVec[j]; + std::vector<Record*> ColInstrVec(NumOfCols); + + // Find the column instruction based on the constraints for the column. + for (unsigned ColIdx = 0; ColIdx < NumOfCols; ColIdx++) { + ListInit *CurValueCol = ValueCols[ColIdx]; + Record *ColInstr = getInstrForColumn(CurKeyInstr, CurValueCol); + ColInstrVec[ColIdx] = ColInstr; + } + MapTable[CurKeyInstr] = ColInstrVec; + } +} + +//===----------------------------------------------------------------------===// +// Find column instruction based on the constraints for that column. +//===----------------------------------------------------------------------===// + +Record *MapTableEmitter::getInstrForColumn(Record *KeyInstr, + ListInit *CurValueCol) { + ListInit *RowFields = InstrMapDesc.getRowFields(); + std::vector<Init*> KeyValue; + + // Construct KeyValue using KeyInstr's values for RowFields. 
+ for (unsigned j = 0, endRF = RowFields->getSize(); j < endRF; j++) { + Init *RowFieldsJ = RowFields->getElement(j); + Init *KeyInstrVal = KeyInstr->getValue(RowFieldsJ)->getValue(); + KeyValue.push_back(KeyInstrVal); + } + + // Get all the instructions that share the same KeyValue as the KeyInstr + // in RowInstrMap. We search through these instructions to find a match + // for the current column, i.e., the instruction which has the same values + // as CurValueCol for all the fields in ColFields. + const std::vector<Record*> &RelatedInstrVec = RowInstrMap[KeyValue]; + + ListInit *ColFields = InstrMapDesc.getColFields(); + Record *MatchInstr = NULL; + + for (unsigned i = 0, e = RelatedInstrVec.size(); i < e; i++) { + bool MatchFound = true; + Record *CurInstr = RelatedInstrVec[i]; + for (unsigned j = 0, endCF = ColFields->getSize(); + (j < endCF) && MatchFound; j++) { + Init *ColFieldJ = ColFields->getElement(j); + Init *CurInstrInit = CurInstr->getValue(ColFieldJ)->getValue(); + std::string CurInstrVal = CurInstrInit->getAsUnquotedString(); + Init *ColFieldJVallue = CurValueCol->getElement(j); + MatchFound = (CurInstrVal == ColFieldJVallue->getAsUnquotedString()); + } + + if (MatchFound) { + if (MatchInstr) // Already had a match + // Error if multiple matches are found for a column. + throw "Multiple matches found for `" + KeyInstr->getName() + + "', for the relation `" + InstrMapDesc.getName(); + else + MatchInstr = CurInstr; + } + } + return MatchInstr; +} + +//===----------------------------------------------------------------------===// +// Emit one table per relation. Only instructions with a valid relation of a +// given type are included in the table sorted by their enum values (opcodes). +// Binary search is used for locating instructions in the table. 
+//===----------------------------------------------------------------------===// + +unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) { + + const std::vector<const CodeGenInstruction*> &NumberedInstructions = + Target.getInstructionsByEnumValue(); + std::string TargetName = Target.getName(); + const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols(); + unsigned NumCol = ValueCols.size(); + unsigned TotalNumInstr = NumberedInstructions.size(); + unsigned TableSize = 0; + + OS << "static const uint16_t "<<InstrMapDesc.getName(); + // Number of columns in the table are NumCol+1 because key instructions are + // emitted as first column. + OS << "Table[]["<< NumCol+1 << "] = {\n"; + for (unsigned i = 0; i < TotalNumInstr; i++) { + Record *CurInstr = NumberedInstructions[i]->TheDef; + std::vector<Record*> ColInstrs = MapTable[CurInstr]; + std::string OutStr(""); + unsigned RelExists = 0; + if (ColInstrs.size()) { + for (unsigned j = 0; j < NumCol; j++) { + if (ColInstrs[j] != NULL) { + RelExists = 1; + OutStr += ", "; + OutStr += TargetName; + OutStr += "::"; + OutStr += ColInstrs[j]->getName(); + } else { OutStr += ", -1";} + } + + if (RelExists) { + OS << " { " << TargetName << "::" << CurInstr->getName(); + OS << OutStr <<" },\n"; + TableSize++; + } + } + } + if (!TableSize) { + OS << " { " << TargetName << "::" << "INSTRUCTION_LIST_END, "; + OS << TargetName << "::" << "INSTRUCTION_LIST_END }"; + } + OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n"; + return TableSize; +} + +//===----------------------------------------------------------------------===// +// Emit binary search algorithm as part of the functions used to query +// relation tables. 
+//===----------------------------------------------------------------------===// + +void MapTableEmitter::emitBinSearch(raw_ostream &OS, unsigned TableSize) { + OS << " unsigned mid;\n"; + OS << " unsigned start = 0;\n"; + OS << " unsigned end = " << TableSize << ";\n"; + OS << " while (start < end) {\n"; + OS << " mid = start + (end - start)/2;\n"; + OS << " if (Opcode == " << InstrMapDesc.getName() << "Table[mid][0]) {\n"; + OS << " break;\n"; + OS << " }\n"; + OS << " if (Opcode < " << InstrMapDesc.getName() << "Table[mid][0])\n"; + OS << " end = mid;\n"; + OS << " else\n"; + OS << " start = mid + 1;\n"; + OS << " }\n"; + OS << " if (start == end)\n"; + OS << " return -1; // Instruction doesn't exist in this table.\n\n"; +} + +//===----------------------------------------------------------------------===// +// Emit functions to query relation tables. +//===----------------------------------------------------------------------===// + +void MapTableEmitter::emitMapFuncBody(raw_ostream &OS, + unsigned TableSize) { + + ListInit *ColFields = InstrMapDesc.getColFields(); + const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols(); + + // Emit binary search algorithm to locate instructions in the + // relation table. If found, return opcode value from the appropriate column + // of the table. 
+ emitBinSearch(OS, TableSize); + + if (ValueCols.size() > 1) { + for (unsigned i = 0, e = ValueCols.size(); i < e; i++) { + ListInit *ColumnI = ValueCols[i]; + for (unsigned j = 0, ColSize = ColumnI->getSize(); j < ColSize; j++) { + std::string ColName = ColFields->getElement(j)->getAsUnquotedString(); + OS << " if (in" << ColName; + OS << " == "; + OS << ColName << "_" << ColumnI->getElement(j)->getAsUnquotedString(); + if (j < ColumnI->getSize() - 1) OS << " && "; + else OS << ")\n"; + } + OS << " return " << InstrMapDesc.getName(); + OS << "Table[mid]["<<i+1<<"];\n"; + } + OS << " return -1;"; + } + else + OS << " return " << InstrMapDesc.getName() << "Table[mid][1];\n"; + + OS <<"}\n\n"; +} + +//===----------------------------------------------------------------------===// +// Emit relation tables and the functions to query them. +//===----------------------------------------------------------------------===// + +void MapTableEmitter::emitTablesWithFunc(raw_ostream &OS) { + + // Emit function name and the input parameters : mostly opcode value of the + // current instruction. However, if a table has multiple columns (more than 2 + // since first column is used for the key instructions), then we also need + // to pass another input to indicate the column to be selected. + + ListInit *ColFields = InstrMapDesc.getColFields(); + const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols(); + OS << "// "<< InstrMapDesc.getName() << "\n"; + OS << "int "<< InstrMapDesc.getName() << "(uint16_t Opcode"; + if (ValueCols.size() > 1) { + for (unsigned i = 0, e = ColFields->getSize(); i < e; i++) { + std::string ColName = ColFields->getElement(i)->getAsUnquotedString(); + OS << ", enum " << ColName << " in" << ColName << ") {\n"; + } + } else { OS << ") {\n"; } + + // Emit map table. + unsigned TableSize = emitBinSearchTable(OS); + + // Emit rest of the function body. 
+ emitMapFuncBody(OS, TableSize); +} + +//===----------------------------------------------------------------------===// +// Emit enums for the column fields across all the instruction maps. +//===----------------------------------------------------------------------===// + +static void emitEnums(raw_ostream &OS, RecordKeeper &Records) { + + std::vector<Record*> InstrMapVec; + InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping"); + std::map<std::string, std::vector<Init*> > ColFieldValueMap; + + // Iterate over all InstrMapping records and create a map between column + // fields and their possible values across all records. + for (unsigned i = 0, e = InstrMapVec.size(); i < e; i++) { + Record *CurMap = InstrMapVec[i]; + ListInit *ColFields; + ColFields = CurMap->getValueAsListInit("ColFields"); + ListInit *List = CurMap->getValueAsListInit("ValueCols"); + std::vector<ListInit*> ValueCols; + unsigned ListSize = List->getSize(); + + for (unsigned j = 0; j < ListSize; j++) { + ListInit *ListJ = dyn_cast<ListInit>(List->getElement(j)); + + if (ListJ->getSize() != ColFields->getSize()) { + throw "Record `" + CurMap->getName() + "', field `" + "ValueCols" + + "' entries don't match with the entries in 'ColFields' !"; + } + ValueCols.push_back(ListJ); + } + + for (unsigned j = 0, endCF = ColFields->getSize(); j < endCF; j++) { + for (unsigned k = 0; k < ListSize; k++){ + std::string ColName = ColFields->getElement(j)->getAsUnquotedString(); + ColFieldValueMap[ColName].push_back((ValueCols[k])->getElement(j)); + } + } + } + + for (std::map<std::string, std::vector<Init*> >::iterator + II = ColFieldValueMap.begin(), IE = ColFieldValueMap.end(); + II != IE; II++) { + std::vector<Init*> FieldValues = (*II).second; + unsigned FieldSize = FieldValues.size(); + + // Delete duplicate entries from ColFieldValueMap + for (unsigned i = 0; i < FieldSize - 1; i++) { + Init *CurVal = FieldValues[i]; + for (unsigned j = i+1; j < FieldSize; j++) { + if (CurVal == 
FieldValues[j]) {
+          FieldValues.erase(FieldValues.begin()+j);
+        }
+      }
+    }
+
+    // Emit enumerated values for the column fields.
+    OS << "enum " << (*II).first << " {\n";
+    for (unsigned i = 0; i < FieldSize; i++) {
+      OS << "\t" << (*II).first << "_" << FieldValues[i]->getAsUnquotedString();
+      if (i != FieldValues.size() - 1)
+        OS << ",\n";
+      else
+        OS << "\n};\n\n";
+    }
+  }
+}
+
+namespace llvm {
+//===----------------------------------------------------------------------===//
+// Parse 'InstrMapping' records and use the information to form relationship
+// between instructions. These relations are emitted as tables along with the
+// functions to query them.
+//===----------------------------------------------------------------------===//
+void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
+  CodeGenTarget Target(Records);
+  std::string TargetName = Target.getName();
+  std::vector<Record*> InstrMapVec;
+  InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
+
+  if (!InstrMapVec.size())
+    return;
+
+  OS << "#ifdef GET_INSTRMAP_INFO\n";
+  OS << "#undef GET_INSTRMAP_INFO\n";
+  OS << "namespace llvm {\n\n";
+  OS << "namespace " << TargetName << " {\n\n";
+
+  // Emit column field names and their values as enums.
+  emitEnums(OS, Records);
+
+  // Iterate over all instruction mapping records and construct relationship
+  // maps based on the information specified there.
+  //
+  for (unsigned i = 0, e = InstrMapVec.size(); i < e; i++) {
+    MapTableEmitter IMap(Target, Records, InstrMapVec[i]);
+
+    // Build RowInstrMap to group instructions based on their values for
+    // RowFields. In the process, also collect key instructions into
+    // KeyInstrVec.
+    IMap.buildRowInstrMap();
+
+    // Build MapTable to map key instructions with the corresponding column
+    // instructions.
+    IMap.buildMapTable();
+
+    // Emit map tables and the functions to query them.
+ IMap.emitTablesWithFunc(OS); + } + OS << "} // End " << TargetName << " namespace\n"; + OS << "} // End llvm namespace\n"; + OS << "#endif // GET_INSTRMAP_INFO\n\n"; +} + +} // End llvm namespace diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp index f195b4e3fa..10064fdd16 100644 --- a/utils/TableGen/CodeGenRegisters.cpp +++ b/utils/TableGen/CodeGenRegisters.cpp @@ -600,7 +600,7 @@ struct TupleExpander : SetTheory::Expander { unsigned Length = ~0u; SmallVector<SetTheory::RecSet, 4> Lists(Dim); for (unsigned i = 0; i != Dim; ++i) { - ST.evaluate(SubRegs->getElement(i), Lists[i]); + ST.evaluate(SubRegs->getElement(i), Lists[i], Def->getLoc()); Length = std::min(Length, unsigned(Lists[i].size())); } @@ -728,7 +728,7 @@ CodeGenRegisterClass::CodeGenRegisterClass(CodeGenRegBank &RegBank, Record *R) // Alternative allocation orders may be subsets. SetTheory::RecSet Order; for (unsigned i = 0, e = AltOrders->size(); i != e; ++i) { - RegBank.getSets().evaluate(AltOrders->getElement(i), Order); + RegBank.getSets().evaluate(AltOrders->getElement(i), Order, R->getLoc()); Orders[1 + i].append(Order.begin(), Order.end()); // Verify that all altorder members are regclass members. while (!Order.empty()) { diff --git a/utils/TableGen/CodeGenSchedule.cpp b/utils/TableGen/CodeGenSchedule.cpp index fc101eec61..1cca3e3f85 100644 --- a/utils/TableGen/CodeGenSchedule.cpp +++ b/utils/TableGen/CodeGenSchedule.cpp @@ -38,8 +38,9 @@ static void dumpIdxVec(const SmallVectorImpl<unsigned> &V) { // (instrs a, b, ...) Evaluate and union all arguments. Identical to AddOp. 
struct InstrsOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts) { - ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts); + void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + ArrayRef<SMLoc> Loc) { + ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc); } }; @@ -55,13 +56,15 @@ struct InstRegexOp : public SetTheory::Operator { const CodeGenTarget &Target; InstRegexOp(const CodeGenTarget &t): Target(t) {} - void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts) { + void apply(SetTheory &ST, DagInit *Expr, SetTheory::RecSet &Elts, + ArrayRef<SMLoc> Loc) { SmallVector<Regex*, 4> RegexList; for (DagInit::const_arg_iterator AI = Expr->arg_begin(), AE = Expr->arg_end(); AI != AE; ++AI) { StringInit *SI = dyn_cast<StringInit>(*AI); if (!SI) - throw "instregex requires pattern string: " + Expr->getAsString(); + throw TGError(Loc, "instregex requires pattern string: " + + Expr->getAsString()); std::string pat = SI->getValue(); // Implement a python-style prefix match. if (pat[0] != '^') { diff --git a/utils/TableGen/InstrInfoEmitter.cpp b/utils/TableGen/InstrInfoEmitter.cpp index e447c16b16..8e670e3cbc 100644 --- a/utils/TableGen/InstrInfoEmitter.cpp +++ b/utils/TableGen/InstrInfoEmitter.cpp @@ -16,6 +16,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenSchedule.h" #include "CodeGenTarget.h" +#include "TableGenBackends.h" #include "SequenceToOffsetTable.h" #include "llvm/ADT/StringExtras.h" #include "llvm/TableGen/Record.h" @@ -415,6 +416,7 @@ namespace llvm { void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS) { InstrInfoEmitter(RK).run(OS); + EmitMapTable(RK, OS); } } // End llvm namespace diff --git a/utils/TableGen/SetTheory.cpp b/utils/TableGen/SetTheory.cpp index 5b760e7a23..33a8f0e337 100644 --- a/utils/TableGen/SetTheory.cpp +++ b/utils/TableGen/SetTheory.cpp @@ -27,20 +27,20 @@ typedef SetTheory::RecVec RecVec; // (add a, b, ...) Evaluate and union all arguments. 
struct AddOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) { - ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts); + void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { + ST.evaluate(Expr->arg_begin(), Expr->arg_end(), Elts, Loc); } }; // (sub Add, Sub, ...) Set difference. struct SubOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) { + void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { if (Expr->arg_size() < 2) - throw "Set difference needs at least two arguments: " + - Expr->getAsString(); + throw TGError(Loc, "Set difference needs at least two arguments: " + + Expr->getAsString()); RecSet Add, Sub; - ST.evaluate(*Expr->arg_begin(), Add); - ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Sub); + ST.evaluate(*Expr->arg_begin(), Add, Loc); + ST.evaluate(Expr->arg_begin() + 1, Expr->arg_end(), Sub, Loc); for (RecSet::iterator I = Add.begin(), E = Add.end(); I != E; ++I) if (!Sub.count(*I)) Elts.insert(*I); @@ -49,12 +49,13 @@ struct SubOp : public SetTheory::Operator { // (and S1, S2) Set intersection. 
struct AndOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) { + void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { if (Expr->arg_size() != 2) - throw "Set intersection requires two arguments: " + Expr->getAsString(); + throw TGError(Loc, "Set intersection requires two arguments: " + + Expr->getAsString()); RecSet S1, S2; - ST.evaluate(Expr->arg_begin()[0], S1); - ST.evaluate(Expr->arg_begin()[1], S2); + ST.evaluate(Expr->arg_begin()[0], S1, Loc); + ST.evaluate(Expr->arg_begin()[1], S2, Loc); for (RecSet::iterator I = S1.begin(), E = S1.end(); I != E; ++I) if (S2.count(*I)) Elts.insert(*I); @@ -65,17 +66,19 @@ struct AndOp : public SetTheory::Operator { struct SetIntBinOp : public SetTheory::Operator { virtual void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, - RecSet &Elts) =0; + RecSet &Elts, ArrayRef<SMLoc> Loc) =0; - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) { + void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { if (Expr->arg_size() != 2) - throw "Operator requires (Op Set, Int) arguments: " + Expr->getAsString(); + throw TGError(Loc, "Operator requires (Op Set, Int) arguments: " + + Expr->getAsString()); RecSet Set; - ST.evaluate(Expr->arg_begin()[0], Set); + ST.evaluate(Expr->arg_begin()[0], Set, Loc); IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[1]); if (!II) - throw "Second argument must be an integer: " + Expr->getAsString(); - apply2(ST, Expr, Set, II->getValue(), Elts); + throw TGError(Loc, "Second argument must be an integer: " + + Expr->getAsString()); + apply2(ST, Expr, Set, II->getValue(), Elts, Loc); } }; @@ -83,9 +86,10 @@ struct SetIntBinOp : public SetTheory::Operator { struct ShlOp : public SetIntBinOp { void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, - RecSet &Elts) { + RecSet &Elts, ArrayRef<SMLoc> Loc) { if (N < 0) - throw "Positive shift required: " + Expr->getAsString(); + throw TGError(Loc, 
"Positive shift required: " + + Expr->getAsString()); if (unsigned(N) < Set.size()) Elts.insert(Set.begin() + N, Set.end()); } @@ -95,9 +99,10 @@ struct ShlOp : public SetIntBinOp { struct TruncOp : public SetIntBinOp { void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, - RecSet &Elts) { + RecSet &Elts, ArrayRef<SMLoc> Loc) { if (N < 0) - throw "Positive length required: " + Expr->getAsString(); + throw TGError(Loc, "Positive length required: " + + Expr->getAsString()); if (unsigned(N) > Set.size()) N = Set.size(); Elts.insert(Set.begin(), Set.begin() + N); @@ -112,7 +117,7 @@ struct RotOp : public SetIntBinOp { void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, - RecSet &Elts) { + RecSet &Elts, ArrayRef<SMLoc> Loc) { if (Reverse) N = -N; // N > 0 -> rotate left, N < 0 -> rotate right. @@ -131,9 +136,10 @@ struct RotOp : public SetIntBinOp { struct DecimateOp : public SetIntBinOp { void apply2(SetTheory &ST, DagInit *Expr, RecSet &Set, int64_t N, - RecSet &Elts) { + RecSet &Elts, ArrayRef<SMLoc> Loc) { if (N <= 0) - throw "Positive stride required: " + Expr->getAsString(); + throw TGError(Loc, "Positive stride required: " + + Expr->getAsString()); for (unsigned I = 0; I < Set.size(); I += N) Elts.insert(Set[I]); } @@ -141,12 +147,12 @@ struct DecimateOp : public SetIntBinOp { // (interleave S1, S2, ...) Interleave elements of the arguments. struct InterleaveOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) { + void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { // Evaluate the arguments individually. SmallVector<RecSet, 4> Args(Expr->getNumArgs()); unsigned MaxSize = 0; for (unsigned i = 0, e = Expr->getNumArgs(); i != e; ++i) { - ST.evaluate(Expr->getArg(i), Args[i]); + ST.evaluate(Expr->getArg(i), Args[i], Loc); MaxSize = std::max(MaxSize, unsigned(Args[i].size())); } // Interleave arguments into Elts. 
@@ -159,38 +165,38 @@ struct InterleaveOp : public SetTheory::Operator { // (sequence "Format", From, To) Generate a sequence of records by name. struct SequenceOp : public SetTheory::Operator { - void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts) { + void apply(SetTheory &ST, DagInit *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { int Step = 1; if (Expr->arg_size() > 4) - throw "Bad args to (sequence \"Format\", From, To): " + - Expr->getAsString(); + throw TGError(Loc, "Bad args to (sequence \"Format\", From, To): " + + Expr->getAsString()); else if (Expr->arg_size() == 4) { if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[3])) { Step = II->getValue(); } else - throw "Stride must be an integer: " + Expr->getAsString(); + throw TGError(Loc, "Stride must be an integer: " + Expr->getAsString()); } std::string Format; if (StringInit *SI = dyn_cast<StringInit>(Expr->arg_begin()[0])) Format = SI->getValue(); else - throw "Format must be a string: " + Expr->getAsString(); + throw TGError(Loc, "Format must be a string: " + Expr->getAsString()); int64_t From, To; if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[1])) From = II->getValue(); else - throw "From must be an integer: " + Expr->getAsString(); + throw TGError(Loc, "From must be an integer: " + Expr->getAsString()); if (From < 0 || From >= (1 << 30)) - throw "From out of range"; + throw TGError(Loc, "From out of range"); if (IntInit *II = dyn_cast<IntInit>(Expr->arg_begin()[2])) To = II->getValue(); else - throw "From must be an integer: " + Expr->getAsString(); + throw TGError(Loc, "From must be an integer: " + Expr->getAsString()); if (To < 0 || To >= (1 << 30)) - throw "To out of range"; + throw TGError(Loc, "To out of range"); RecordKeeper &Records = cast<DefInit>(Expr->getOperator())->getDef()->getRecords(); @@ -206,7 +212,8 @@ struct SequenceOp : public SetTheory::Operator { OS << format(Format.c_str(), unsigned(From)); Record *Rec = Records.getDef(OS.str()); if (!Rec) - throw "No def named '" + 
Name + "': " + Expr->getAsString(); + throw TGError(Loc, "No def named '" + Name + "': " + + Expr->getAsString()); // Try to reevaluate Rec in case it is a set. if (const RecVec *Result = ST.expand(Rec)) Elts.insert(Result->begin(), Result->end()); @@ -225,7 +232,7 @@ struct FieldExpander : public SetTheory::Expander { FieldExpander(StringRef fn) : FieldName(fn) {} void expand(SetTheory &ST, Record *Def, RecSet &Elts) { - ST.evaluate(Def->getValueInit(FieldName), Elts); + ST.evaluate(Def->getValueInit(FieldName), Elts, Def->getLoc()); } }; } // end anonymous namespace @@ -259,7 +266,7 @@ void SetTheory::addFieldExpander(StringRef ClassName, StringRef FieldName) { addExpander(ClassName, new FieldExpander(FieldName)); } -void SetTheory::evaluate(Init *Expr, RecSet &Elts) { +void SetTheory::evaluate(Init *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc) { // A def in a list can be a just an element, or it may expand. if (DefInit *Def = dyn_cast<DefInit>(Expr)) { if (const RecVec *Result = expand(Def->getDef())) @@ -270,19 +277,19 @@ void SetTheory::evaluate(Init *Expr, RecSet &Elts) { // Lists simply expand. if (ListInit *LI = dyn_cast<ListInit>(Expr)) - return evaluate(LI->begin(), LI->end(), Elts); + return evaluate(LI->begin(), LI->end(), Elts, Loc); // Anything else must be a DAG. 
DagInit *DagExpr = dyn_cast<DagInit>(Expr); if (!DagExpr) - throw "Invalid set element: " + Expr->getAsString(); + throw TGError(Loc, "Invalid set element: " + Expr->getAsString()); DefInit *OpInit = dyn_cast<DefInit>(DagExpr->getOperator()); if (!OpInit) - throw "Bad set expression: " + Expr->getAsString(); + throw TGError(Loc, "Bad set expression: " + Expr->getAsString()); Operator *Op = Operators.lookup(OpInit->getDef()->getName()); if (!Op) - throw "Unknown set operator: " + Expr->getAsString(); - Op->apply(*this, DagExpr, Elts); + throw TGError(Loc, "Unknown set operator: " + Expr->getAsString()); + Op->apply(*this, DagExpr, Elts, Loc); } const RecVec *SetTheory::expand(Record *Set) { @@ -292,23 +299,19 @@ const RecVec *SetTheory::expand(Record *Set) { return &I->second; // This is the first time we see Set. Find a suitable expander. - try { - const std::vector<Record*> &SC = Set->getSuperClasses(); - for (unsigned i = 0, e = SC.size(); i != e; ++i) { - // Skip unnamed superclasses. - if (!dyn_cast<StringInit>(SC[i]->getNameInit())) - continue; - if (Expander *Exp = Expanders.lookup(SC[i]->getName())) { - // This breaks recursive definitions. - RecVec &EltVec = Expansions[Set]; - RecSet Elts; - Exp->expand(*this, Set, Elts); - EltVec.assign(Elts.begin(), Elts.end()); - return &EltVec; - } + const std::vector<Record*> &SC = Set->getSuperClasses(); + for (unsigned i = 0, e = SC.size(); i != e; ++i) { + // Skip unnamed superclasses. + if (!dyn_cast<StringInit>(SC[i]->getNameInit())) + continue; + if (Expander *Exp = Expanders.lookup(SC[i]->getName())) { + // This breaks recursive definitions. + RecVec &EltVec = Expansions[Set]; + RecSet Elts; + Exp->expand(*this, Set, Elts); + EltVec.assign(Elts.begin(), Elts.end()); + return &EltVec; } - } catch (const std::string &Error) { - throw TGError(Set->getLoc(), Error); } // Set is not expandable. 
diff --git a/utils/TableGen/SetTheory.h b/utils/TableGen/SetTheory.h index b394058f4c..122372ab33 100644 --- a/utils/TableGen/SetTheory.h +++ b/utils/TableGen/SetTheory.h @@ -49,6 +49,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/SetVector.h" +#include "llvm/Support/SourceMgr.h" #include <map> #include <vector> @@ -72,7 +73,8 @@ public: /// apply - Apply this operator to Expr's arguments and insert the result /// in Elts. - virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts) =0; + virtual void apply(SetTheory&, DagInit *Expr, RecSet &Elts, + ArrayRef<SMLoc> Loc) =0; }; /// Expander - A callback function that can transform a Record representing a @@ -119,13 +121,13 @@ public: void addOperator(StringRef Name, Operator*); /// evaluate - Evaluate Expr and append the resulting set to Elts. - void evaluate(Init *Expr, RecSet &Elts); + void evaluate(Init *Expr, RecSet &Elts, ArrayRef<SMLoc> Loc); /// evaluate - Evaluate a sequence of Inits and append to Elts. template<typename Iter> - void evaluate(Iter begin, Iter end, RecSet &Elts) { + void evaluate(Iter begin, Iter end, RecSet &Elts, ArrayRef<SMLoc> Loc) { while (begin != end) - evaluate(*begin++, Elts); + evaluate(*begin++, Elts, Loc); } /// expand - Expand a record into a set of elements if possible. Return a diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h index 2c00c40cfe..f0d25d8a2c 100644 --- a/utils/TableGen/TableGenBackends.h +++ b/utils/TableGen/TableGenBackends.h @@ -74,5 +74,6 @@ void EmitInstrInfo(RecordKeeper &RK, raw_ostream &OS); void EmitPseudoLowering(RecordKeeper &RK, raw_ostream &OS); void EmitRegisterInfo(RecordKeeper &RK, raw_ostream &OS); void EmitSubtarget(RecordKeeper &RK, raw_ostream &OS); +void EmitMapTable(RecordKeeper &RK, raw_ostream &OS); } // End llvm namespace |