265 files changed, 16166 insertions, 337 deletions
@@ -43,6 +43,11 @@ EXTRA_DIST := test unittests llvm.spec include win32 Xcode include $(LEVEL)/Makefile.config +ifeq ($(NACL_SANDBOX),1) + DIRS := $(filter-out tools/llvm-shlib runtime docs unittests, $(DIRS)) + OPTIONAL_DIRS := +endif + ifneq ($(ENABLE_SHARED),1) DIRS := $(filter-out tools/llvm-shlib, $(DIRS)) endif @@ -121,6 +126,7 @@ cross-compile-build-tools: fi; \ ($(MAKE) -C BuildTools \ BUILD_DIRS_ONLY=1 \ + NACL_SANDBOX=0 \ UNIVERSAL= \ UNIVERSAL_SDK_PATH= \ SDKROOT= \ diff --git a/Makefile.rules b/Makefile.rules index b2b02c25d4..51accc512b 100644 --- a/Makefile.rules +++ b/Makefile.rules @@ -655,6 +655,23 @@ else endif endif +ifeq ($(NACL_SANDBOX),1) + # NOTE: we specify --noirt to tell the driver that we should link + # against private (non-stable, non-IRT) libraries for the + # sandboxed translator. This could have been specified directly, + # except that LLVM slips in -lpthread elsewhere in the build system, + # and we need it to use -lpthread_private instead. + LIBS += -Wl,--noirt -lsrpc -limc_syscalls -lplatform -lgio -lpthread \ + -lm -lnacl -lnacl_dyncode -lnosys + ifeq ($(USE_TCMALLOC),1) + # Note: -ltcmalloc_minimal needs to stay last on the link line + LIBS += -ltcmalloc_minimal + CXX.Flags += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free + C.Flags += -fno-builtin-malloc -fno-builtin-calloc -fno-builtin-realloc -fno-builtin-free + endif +else + LIBS += +endif #---------------------------------------------------------- # Options To Invoke Tools @@ -1239,8 +1256,10 @@ endif $(LibName.SO): $(ObjectsO) $(ProjLibsPaths) $(LLVMLibsPaths) $(SharedLibDir)/.dir $(Echo) Linking $(BuildMode) $(SharedLibKindMessage) \ $(notdir $@) + # @LOCALMOD: the EXTRA_LIBS hack is necessary for LLVMgold.so + # c.f. llvm/tools/gold/Makefile $(Verb) $(Link) $(SharedLinkOptions) -o $@ $(ObjectsO) \ - $(ProjLibsOptions) $(LLVMLibsOptions) $(LIBS) + $(ProjLibsOptions) $(LLVMLibsOptions) $(LIBS) $(EXTRA_LIBS) else $(LibName.SO): $(ObjectsO) $(SharedLibDir)/.dir $(Echo) Linking $(BuildMode) Shared Library $(notdir $@) diff --git a/OWNERS b/OWNERS new file mode 100644 index 0000000000..3f2cc43ac7 --- /dev/null +++ b/OWNERS @@ -0,0 +1,7 @@ +dschuff@chromium.org +eliben@chromium.org +jvoung@chromium.org +mseaborn@chromium.org +robertm@chromium.org +sehr@chromium.org + diff --git a/PRESUBMIT.py b/PRESUBMIT.py new file mode 100644 index 0000000000..af60ae4f1e --- /dev/null +++ b/PRESUBMIT.py @@ -0,0 +1,66 @@ +# Copyright (c) 2012 The Native Client Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +# Documentation on PRESUBMIT.py can be found at: +# http://www.chromium.org/developers/how-tos/depottools/presubmit-scripts + +EXCLUDE_PROJECT_CHECKS_DIRS = [ '.' ] + +import subprocess +def CheckGitBranch(): + p = subprocess.Popen("git branch -vv", shell=True, + stdout=subprocess.PIPE) + output, _ = p.communicate() + + lines = output.split('\n') + for line in lines: + # output format for checked-out branch should be + # * branchname hash [TrackedBranchName ... + toks = line.split() + if '*' not in toks[0]: + continue + if not 'origin/master' in toks[3]: + warning = 'Warning: your current branch:\n' + line + warning += '\nis not tracking origin/master. git cl push may silently ' + warning += 'fail to push your change. 
To fix this, do\n' + warning += 'git branch --set-upstream '+ toks[1] + ' origin/master' + return warning + return None + print 'Warning: presubmit check could not determine local git branch' + return None + +def _CommonChecks(input_api, output_api): + """Checks for both upload and commit.""" + results = [] + results.extend(input_api.canned_checks.PanProjectChecks( + input_api, output_api, project_name='Native Client', + excluded_paths=tuple(EXCLUDE_PROJECT_CHECKS_DIRS))) + branch_warning = CheckGitBranch() + if branch_warning: + results.append(output_api.PresubmitPromptWarning(branch_warning)) + return results + +def CheckChangeOnUpload(input_api, output_api): + """Verifies all changes in all files. + Args: + input_api: the limited set of input modules allowed in presubmit. + output_api: the limited set of output modules allowed in presubmit. + """ + report = [] + report.extend(_CommonChecks(input_api, output_api)) + return report + +def CheckChangeOnCommit(input_api, output_api): + """Verifies all changes in all files and verifies that the + tree is open and can accept a commit. + Args: + input_api: the limited set of input modules allowed in presubmit. + output_api: the limited set of output modules allowed in presubmit. + """ + report = [] + report.extend(CheckChangeOnUpload(input_api, output_api)) + return report + +def GetPreferredTrySlaves(project, change): + return [] diff --git a/autoconf/config.sub b/autoconf/config.sub index 9942491533..a4f411f6c6 100755 --- a/autoconf/config.sub +++ b/autoconf/config.sub @@ -239,6 +239,10 @@ case $os in basic_machine=m68k-atari os=-mint ;; + -nacl*) + basic_machine=i686-pc + os=-nacl + ;; esac # Decode aliases for certain CPU-COMPANY combinations. @@ -347,6 +351,14 @@ case $basic_machine in i*86 | x86_64) basic_machine=$basic_machine-pc ;; + nacl64*) + basic_machine=x86_64-pc + os=-nacl + ;; + nacl*) + basic_machine=i686-pc + os=-nacl + ;; # Object if more than one company name word. *-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 @@ -1364,6 +1376,9 @@ case $os in ;; esac ;; + -nacl*) + os=-nacl + ;; -nto-qnx*) ;; -nto*) diff --git a/autoconf/configure.ac b/autoconf/configure.ac index f1842a6d8a..13134589af 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -266,6 +266,11 @@ AC_CACHE_CHECK([type of operating system we're going to host on], llvm_cv_no_link_all_option="-Wl,--no-whole-archive" llvm_cv_os_type="Freestanding" llvm_cv_platform_type="Unix" ;; + *-*-nacl*) + llvm_cv_link_all_option="-Wl,--whole-archive" + llvm_cv_no_link_all_option="-Wl,--no-whole-archive" + llvm_cv_os_type="Freestanding" + llvm_cv_platform_type="Unix" ;; *) llvm_cv_link_all_option="" llvm_cv_no_link_all_option="" diff --git a/codereview.settings b/codereview.settings new file mode 100644 index 0000000000..1940586a7f --- /dev/null +++ b/codereview.settings @@ -0,0 +1,10 @@ +# This file is used by gcl to get repository specific information. 
+CODE_REVIEW_SERVER: codereview.chromium.org +CC_LIST: native-client-reviews@googlegroups.com +VIEW_VC: https://gerrit.chromium.org/gerrit/gitweb?p=native_client/pnacl-llvm.git;a=commit;h= +STATUS: http://nativeclient-status.appspot.com/status +TRY_ON_UPLOAD: False +TRYSERVER_PROJECT: nacl +TRYSERVER_SVN_URL: svn://svn.chromium.org/chrome-try/try-nacl +PUSH_URL_CONFIG: url.ssh://gerrit.chromium.org.pushinsteadof +ORIGIN_URL_CONFIG: http://git.chromium.org @@ -3784,6 +3784,11 @@ else llvm_cv_no_link_all_option="-Wl,--no-whole-archive" llvm_cv_os_type="Freestanding" llvm_cv_platform_type="Unix" ;; + *-*-nacl*) + llvm_cv_link_all_option="-Wl,--whole-archive" + llvm_cv_no_link_all_option="-Wl,--no-whole-archive" + llvm_cv_os_type="Freestanding" + llvm_cv_platform_type="Unix" ;; *) llvm_cv_link_all_option="" llvm_cv_no_link_all_option="" diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h index f43d365e3d..864870bfe7 100644 --- a/include/llvm-c/lto.h +++ b/include/llvm-c/lto.h @@ -60,6 +60,13 @@ typedef enum { LTO_CODEGEN_PIC_MODEL_DYNAMIC_NO_PIC = 2 } lto_codegen_model; +/* @LOCALMOD-BEGIN */ +typedef enum { + LTO_OUTPUT_FORMAT_OBJECT = 0, /* object file */ + LTO_OUTPUT_FORMAT_SHARED = 1, /* shared library */ + LTO_OUTPUT_FORMAT_EXEC = 2 /* executable */ +} lto_output_format; +/* @LOCALMOD-END */ /** opaque reference to a loaded object module */ typedef struct LTOModule* lto_module_t; @@ -71,6 +78,17 @@ typedef struct LTOCodeGenerator* lto_code_gen_t; extern "C" { #endif + +/* @LOCALMOD-BEGIN */ + +/* Add a command-line option */ +void lto_add_command_line_option(const char* opt); + +/* Parse command line options */ +void lto_parse_command_line_options(); + +/* @LOCALMOD-END */ + /** * Returns a printable string. */ @@ -165,6 +183,36 @@ lto_module_get_target_triple(lto_module_t mod); extern void lto_module_set_target_triple(lto_module_t mod, const char *triple); +/* @LOCALMOD-BEGIN */ + +/** + * Get the module format for this module + */ +extern lto_output_format +lto_module_get_output_format(lto_module_t mod); + +/** + * Get the module soname + */ +extern const char* +lto_module_get_soname(lto_module_t mod); + + +/** + * Get the i'th library dependency. + * Returns NULL if i >= lto_module_get_num_library_deps() + */ +extern const char* +lto_module_get_library_dep(lto_module_t mod, unsigned int i); + + +/** + * Return the number of library dependencies of this module. + */ +extern unsigned int +lto_module_get_num_library_deps(lto_module_t mod); + +/* @LOCALMOD-END */ /** * Returns the number of symbols in the object module. @@ -211,7 +259,26 @@ lto_codegen_dispose(lto_code_gen_t); extern bool lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod); +/* @LOCALMOD-BEGIN */ +/** + * Add an object module to the set of modules for which code will be generated. + * This does not merge the module immediately, unlike lto_codegen_add_module. + * It will hold onto the module until the user calls + * lto_codegen_link_gathered_modules_and_dispose(). The lto_module_t + * should now by owned by the lto_code_gen_t, and will be freed when + * the link is done. + */ +extern bool +lto_codegen_gather_module_for_link(lto_code_gen_t cg, lto_module_t mod); +/** + * Merges modules that are part of the set of modules gathered by + * lto_codegen_gather_module_for_link(), and the also destroys the modules + * as lto_module_dispose() would. + */ +extern bool +lto_codegen_link_gathered_modules_and_dispose(lto_code_gen_t cg); +/* @LOCALMOD-END*/ /** * Sets if debug info should be generated. 
@@ -258,6 +325,56 @@ lto_codegen_set_assembler_args(lto_code_gen_t cg, const char **args, extern void lto_codegen_add_must_preserve_symbol(lto_code_gen_t cg, const char* symbol); +/* @LOCALMOD-BEGIN */ + +/** + * Sets the module type for the merged module + */ +extern void +lto_codegen_set_merged_module_output_format(lto_code_gen_t cg, + lto_output_format format); + +/** + * Sets the SOName for the merged module + */ +extern void +lto_codegen_set_merged_module_soname(lto_code_gen_t cg, + const char *soname); + +/** + * Add a library dependency to the merged module + */ +extern void +lto_codegen_add_merged_module_library_dep(lto_code_gen_t cg, + const char *lib); + +/** + * Wrap a symbol in the merged module. + */ +extern void +lto_codegen_wrap_symbol_in_merged_module(lto_code_gen_t cg, + const char *sym); + + +/** + * Set version of a defined symbol in the merged module + */ +extern const char * +lto_codegen_set_symbol_def_version(lto_code_gen_t cg, + const char *sym, + const char *version, + bool is_default); + + +/** + * Set version of an undefined symbol in the merged module + */ +extern const char * +lto_codegen_set_symbol_needed(lto_code_gen_t cg, + const char *sym, + const char *version, + const char *dynfile); +/* @LOCALMOD-END */ /** * Writes a new object file at the specified path that contains the * merged contents of all modules added so far. diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h index a92b85939f..02c5f422ce 100644 --- a/include/llvm/CodeGen/AsmPrinter.h +++ b/include/llvm/CodeGen/AsmPrinter.h @@ -94,6 +94,12 @@ namespace llvm { /// default, this is equal to CurrentFnSym. MCSymbol *CurrentFnSymForSize; + /// @LOCALMOD-BEGIN + /// Is the bitcode module a plain object? This is false + /// for shared (pso) and executable (pexe) files. + bool IsPlainObject; + /// @LOCALMOD-END + private: // GCMetadataPrinters - The garbage collection metadata printer table. void *GCMetadataPrinters; // Really a DenseMap. @@ -240,6 +246,18 @@ namespace llvm { // Targets can, or in the case of EmitInstruction, must implement these to // customize output. + // @LOCALMOD-START + /// UseReadOnlyJumpTables - true if JumpTableInfo must be in rodata. + virtual bool UseReadOnlyJumpTables() const { return false; } + /// GetTargetBasicBlockAlign - the target alignment for basic blocks. + virtual unsigned GetTargetBasicBlockAlign() const { return 0; } + /// GetTargetLabelAlign - Get optional alignment for TargetOpcode + /// labels E.g., EH_LABEL. + virtual unsigned GetTargetLabelAlign(const MachineInstr *MI) const { + return 0; + } + // @LOCALMOD-END + /// EmitStartOfAsmFile - This virtual method can be overridden by targets /// that want to emit something at the start of their file. virtual void EmitStartOfAsmFile(Module &) {} @@ -254,7 +272,12 @@ namespace llvm { /// EmitFunctionBodyEnd - Targets can override this to emit stuff after /// the last basic block in the function. - virtual void EmitFunctionBodyEnd() {} + virtual void EmitFunctionBodyEnd() { + // @LOCALMOD-START + unsigned NextFunctionAlignment = GetTargetBasicBlockAlign(); + if (NextFunctionAlignment) EmitAlignment(NextFunctionAlignment); + // @LOCALMOD-END + } /// EmitInstruction - Targets should implement this to emit instructions. 
virtual void EmitInstruction(const MachineInstr *) { diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h index 436918b1eb..c1bc3aba5e 100644 --- a/include/llvm/CodeGen/CallingConvLower.h +++ b/include/llvm/CodeGen/CallingConvLower.h @@ -165,6 +165,7 @@ private: SmallVector<uint32_t, 16> UsedRegs; unsigned FirstByValReg; bool FirstByValRegValid; + bool HasByValInRegPosition; // @LOCALMOD -- ARM only: see comment below. protected: ParmContext CallOrPrologue; @@ -313,6 +314,19 @@ public: void clearFirstByValReg() { FirstByValReg = 0; FirstByValRegValid = false; } bool isFirstByValRegValid() const { return FirstByValRegValid; } + // @LOCALMOD-BEGIN + // We disabled the splitting of byval between registers and memory. + // This separate flag indicates that a byval existed. We cannot reuse + // isFirstByValRegValid() because that is already used by the broken + // mechanism of splitting between stack and regs. We should check + // again if this mechanism is still broken later, or try to fix that + // mechanism. + // NOTE: this is only for ARM, so should be refactored. + bool hasByValInRegPosition() const { return HasByValInRegPosition; } + void setHasByValInRegPosition() { HasByValInRegPosition = true; } + void clearHasByValInRegPosition() { HasByValInRegPosition = false; } + // @LOCALMOD-END + ParmContext getCallOrPrologue() const { return CallOrPrologue; } private: diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index 5d0a3b4c70..47170e4e58 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -641,6 +641,19 @@ namespace ISD { /// is the chain and the second operand is the alloca pointer. LIFETIME_START, LIFETIME_END, + // @LOCALMOD-BEGIN + // NACL_* - Native Client instrinsics. + // NACL_READ_TP is a fast built-in version of NaCl's tls_get() IRT + // interface. + NACL_READ_TP, + // These correspond to functions in: + // native_client/src/untrusted/nacl/tls_params.h + NACL_TP_TLS_OFFSET, + NACL_TP_TDB_OFFSET, + // Expands to the target architecture enumeration value. + NACL_TARGET_ARCH, + // @LOCALMOD-END + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/CodeGen/IntrinsicLowering.h b/include/llvm/CodeGen/IntrinsicLowering.h index 5a3fb4b1a3..dcb013e1f9 100644 --- a/include/llvm/CodeGen/IntrinsicLowering.h +++ b/include/llvm/CodeGen/IntrinsicLowering.h @@ -16,6 +16,7 @@ #ifndef LLVM_CODEGEN_INTRINSICLOWERING_H #define LLVM_CODEGEN_INTRINSICLOWERING_H +#include "llvm/ADT/StringSet.h" // @LOCALMOD #include "llvm/Intrinsics.h" namespace llvm { @@ -26,12 +27,23 @@ namespace llvm { class IntrinsicLowering { const DataLayout& TD; - + static StringSet<> FuncNames; // @LOCALMOD + bool Warned; public: explicit IntrinsicLowering(const DataLayout &td) : TD(td), Warned(false) {} + /// @LOCALMOD-BEGIN + /// GetFuncNames - Get the names of all functions which may + /// be called by an intrinsic. + static const StringSet<> &GetFuncNames(); + + /// IsCalledByIntrinsic - Returns true if a function may be called + /// by an intrinsic. + static bool IsCalledByIntrinsic(const StringRef &FuncName); + /// @LOCALMOD-END + /// AddPrototypes - This method, if called, causes all of the prototypes /// that might be needed by an intrinsic lowering implementation to be /// inserted into the module specified. 
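The IntrinsicLowering queries added above (GetFuncNames and IsCalledByIntrinsic) only expose which external functions intrinsic lowering may end up calling, such as memcpy or memset. For context, here is a minimal hypothetical sketch, not part of this patch, of how a pass that hides module symbols could consult the query so those functions stay visible; the helper name safeToInternalize and the surrounding pass are assumptions.

#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/GlobalValue.h"

using namespace llvm;

// Hypothetical helper: true if GV can be given local linkage without
// breaking a later intrinsic lowering that may call it by name.
static bool safeToInternalize(const GlobalValue &GV) {
  if (IntrinsicLowering::IsCalledByIntrinsic(GV.getName()))
    return false;                  // e.g. memcpy, memset, ...
  return !GV.isDeclaration();      // only definitions can be internalized
}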
diff --git a/include/llvm/CodeGen/JITCodeEmitter.h b/include/llvm/CodeGen/JITCodeEmitter.h index 89f00e91f7..f95b8b6b84 100644 --- a/include/llvm/CodeGen/JITCodeEmitter.h +++ b/include/llvm/CodeGen/JITCodeEmitter.h @@ -290,7 +290,7 @@ public: /// getCurrentPCOffset - Return the offset from the start of the emitted /// buffer that we are currently writing to. - uintptr_t getCurrentPCOffset() const { + virtual uintptr_t getCurrentPCOffset() const { // @LOCALMOD return CurBufferPtr-BufferBegin; } @@ -335,6 +335,13 @@ public: /// getLabelLocations - Return the label locations map of the label IDs to /// their address. virtual DenseMap<MCSymbol*, uintptr_t> *getLabelLocations() { return 0; } + + // @LOCALMOD-START + virtual void beginBundleLock() {}; + virtual void endBundleLock() {}; + virtual void alignToBundleBeginning() {}; + virtual void alignToBundleEnd() {}; + // @LOCALMOD-END }; } // End llvm namespace diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h index 8414c64544..e1911cfd82 100644 --- a/include/llvm/CodeGen/LexicalScopes.h +++ b/include/llvm/CodeGen/LexicalScopes.h @@ -162,6 +162,12 @@ public: #ifndef NDEBUG IndentLevel = 0; #endif + // @LOCALMOD-BEGIN -- Hack for bug + // http://code.google.com/p/nativeclient/issues/detail?id=2786 + Desc.make_weak(); + InlinedAtLocation.make_weak(); + // @LOCALMOD-END + if (Parent) Parent->addChild(this); } diff --git a/include/llvm/CodeGen/MachineConstantPool.h b/include/llvm/CodeGen/MachineConstantPool.h index 8ed215d75b..827a9f81e8 100644 --- a/include/llvm/CodeGen/MachineConstantPool.h +++ b/include/llvm/CodeGen/MachineConstantPool.h @@ -57,6 +57,17 @@ public: virtual void addSelectionDAGCSEId(FoldingSetNodeID &ID) = 0; + // @LOCALMOD-START + /// getJumpTableIndex - Check if this is a reference to a jump table. + /// If so, return a pointer to the jump table index value that is stored + /// in the constant pool, else return 0. + /// The default behavior is to indicate that the value is not a jump table + /// index. This is used by BranchFolder::runOnMachineFunction() and only in + /// conjunction with ARM targets + /// TODO: this should be cleaned up as it does tripple duty: tester, setter, getter + virtual unsigned *getJumpTableIndex() { return 0; } + // @LOCALMOD-END + /// print - Implement operator<< virtual void print(raw_ostream &O) const = 0; }; diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h index 770685358a..01291e43c8 100644 --- a/include/llvm/CodeGen/MachineInstrBuilder.h +++ b/include/llvm/CodeGen/MachineInstrBuilder.h @@ -291,6 +291,21 @@ inline MachineInstrBuilder BuildMI(MachineBasicBlock &BB, return BuildMI(BB, MII, DL, MCID); } +// @LOCALMOD-BEGIN +/// BuildMI - This version of the builder inserts the newly-built +/// instruction before the given position in the given MachineBasicBlock, +/// does NOT take a destination register, and does not add implicit operands. +/// +inline MachineInstrBuilder BuildMI_NoImp(MachineBasicBlock &BB, + MachineBasicBlock::iterator I, + DebugLoc DL, + const MCInstrDesc &MCID) { + MachineInstr *MI = BB.getParent()->CreateMachineInstr(MCID, DL, true); + BB.insert(I, MI); + return MachineInstrBuilder(MI); +} +// @LOCALMOD-END + /// BuildMI - This version of the builder inserts the newly-built /// instruction at the end of the given MachineBasicBlock, and does NOT take a /// destination register. 
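The BuildMI_NoImp helper above pairs naturally with the bundle pseudo-opcodes this patch introduces further down (BUNDLE_LOCK and BUNDLE_UNLOCK in TargetOpcodes.h and Target.td), since those pseudos take no operands at all. The following is a hypothetical sketch, not code from this patch, of bracketing a half-open instruction range so that later emission keeps it inside one bundle; the function name and the pass it would live in are assumptions.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetOpcodes.h"

using namespace llvm;

// Hypothetical helper: wrap the instructions in [First, Last) in a
// BUNDLE_LOCK / BUNDLE_UNLOCK pair.
static void lockRange(MachineBasicBlock &MBB,
                      MachineBasicBlock::iterator First,
                      MachineBasicBlock::iterator Last,
                      const TargetInstrInfo *TII) {
  DebugLoc DL = First->getDebugLoc();
  // The pseudos have no outs/ins, so the NoImp variant suffices.
  BuildMI_NoImp(MBB, First, DL, TII->get(TargetOpcode::BUNDLE_LOCK));
  BuildMI_NoImp(MBB, Last, DL, TII->get(TargetOpcode::BUNDLE_UNLOCK));
}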
diff --git a/include/llvm/CodeGen/MachineRelocation.h b/include/llvm/CodeGen/MachineRelocation.h index 244b466e17..8d71930882 100644 --- a/include/llvm/CodeGen/MachineRelocation.h +++ b/include/llvm/CodeGen/MachineRelocation.h @@ -197,6 +197,14 @@ public: return Offset; } + // @LOCALMOD-START + /// setMachineCodeOffset() - Adjust the offset in the code buffer (this is + /// used when the instruction is moved after emission for bundle alignment) + void setMachineCodeOffset(intptr_t offset) { + Offset = offset; + } + // @LOCALMOD-END + /// getRelocationType - Return the target-specific relocation ID for this /// relocation. unsigned getRelocationType() const { diff --git a/include/llvm/ExecutionEngine/NaClJITMemoryManager.h b/include/llvm/ExecutionEngine/NaClJITMemoryManager.h new file mode 100644 index 0000000000..dcd06627df --- /dev/null +++ b/include/llvm/ExecutionEngine/NaClJITMemoryManager.h @@ -0,0 +1,237 @@ +//=-- NaClJITMemoryManager.h - Interface JIT uses to Allocate Mem -*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + + +#ifndef LLVM_EXECUTION_ENGINE_NACL_JIT_MEMMANAGER_H +#define LLVM_EXECUTION_ENGINE_NACL_JIT_MEMMANAGER_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ExecutionEngine/JITMemoryManager.h" +#include "llvm/Support/Allocator.h" + +namespace llvm { + +class Function; +class GlobalValue; + +struct SimpleSlab { + uint8_t *address; + size_t size; + uint8_t *next_free; +}; + +struct FreeListNode { + uint8_t *address; + uintptr_t size; + FreeListNode *Prev; + FreeListNode *Next; + FreeListNode *RemoveFromFreeList() { + assert(Next->Prev == this && Prev->Next == this && "Freelist broken!"); + Next->Prev = Prev; + return Prev->Next = Next; + } + void AddToFreeList(FreeListNode *FreeList) { + Next = FreeList; + Prev = FreeList->Prev; + Prev->Next = this; + Next->Prev = this; + } +}; + +class NaClJITMemoryManager : public JITMemoryManager { + // NaCl disallows writing into any code region, and disallows executing any + // data region. Thus we can never get any RWX memory and the the strategy + // used by the other allocators of colocation of allocation metadata + // with the allocated code won't work. + // Currently with NaCl we have one single pool of usable space between the + // text and rodata segments, defined by the linker + // so to support stub allocation in the middle of a function, we allocate + // them in slabs interspersed with the functions. + + static const size_t kStubSlabSize = 16 * 1024; + static const size_t kDataSlabSize = 16 * 1024; + static const size_t kCodeSlabSize = 64 * 1024; + + typedef DenseMap<uint8_t *, size_t> AllocationTable; + + uint8_t *AllocatableRegionStart; + uint8_t *AllocatableRegionLimit; + uint8_t *NextCode; + SimpleSlab CurrentStubSlab; + + // Allocation metadata must be kept separate from code, so the free list is + // allocated with new rather than being a header in the code blocks + FreeListNode *CodeFreeListHead; + FreeListNode *CurrentCodeBlock; + // Mapping from pointer to allocated function, to size of allocation + AllocationTable AllocatedFunctions; + + // Since Exception tables are allocated like functions (i.e. 
we don't know + // ahead of time how large they are) we use the same allocation method for + // simplicity even though it's not strictly necessary to separate the + // allocation metadata from the allocated data. + FreeListNode *DataFreeListHead; + FreeListNode *CurrentDataBlock; + AllocationTable AllocatedTables; + BumpPtrAllocator DataAllocator; + + uint8_t *GOTBase; // Target Specific reserved memory + + FreeListNode *allocateCodeSlab(size_t MinSize); + FreeListNode *allocateDataSlab(size_t MinSize); + SimpleSlab allocateStubSlab(size_t MinSize); + + // Functions for allocations using one of the free lists + void InitFreeList(FreeListNode **Head); + void DestroyFreeList(FreeListNode *Head); + FreeListNode *FreeListAllocate(uintptr_t &ActualSize, FreeListNode *Head, + FreeListNode * (NaClJITMemoryManager::*allocate)(size_t)); + void FreeListFinishAllocation(FreeListNode *Block, FreeListNode *Head, + uint8_t *AllocationStart, uint8_t *AllocationEnd, AllocationTable &table); + void FreeListDeallocate(FreeListNode *Head, AllocationTable &Table, + void *Body); + public: + // TODO(dschuff): how to find the real value? is it a flag? + static const int kBundleSize = 32; + static const intptr_t kJumpMask = -32; + NaClJITMemoryManager(); + virtual ~NaClJITMemoryManager(); + static inline bool classof(const JITMemoryManager*) { return true; } + + /// setMemoryWritable - No-op on NaCl - code is never writable + virtual void setMemoryWritable() {} + + /// setMemoryExecutable - No-op on NaCl - data is never executable + virtual void setMemoryExecutable() {} + + /// setPoisonMemory - No-op on NaCl - nothing unvalidated is ever executable + virtual void setPoisonMemory(bool poison) {} + + /// getPointerToNamedFunction - This method returns the address of the + /// specified function. As such it is only useful for resolving library + /// symbols, not code generated symbols. + /// + /// If AbortOnFailure is false and no function with the given name is + /// found, this function silently returns a null pointer. Otherwise, + /// it prints a message to stderr and aborts. + /// + virtual void *getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure = true) ; + + //===--------------------------------------------------------------------===// + // Global Offset Table Management + //===--------------------------------------------------------------------===// + + /// AllocateGOT - If the current table requires a Global Offset Table, this + /// method is invoked to allocate it. This method is required to set HasGOT + /// to true. + virtual void AllocateGOT(); + + /// getGOTBase - If this is managing a Global Offset Table, this method should + /// return a pointer to its base. + virtual uint8_t *getGOTBase() const { + return GOTBase; + } + + //===--------------------------------------------------------------------===// + // Main Allocation Functions + //===--------------------------------------------------------------------===// + + /// startFunctionBody - When we start JITing a function, the JIT calls this + /// method to allocate a block of free RWX memory, which returns a pointer to + /// it. If the JIT wants to request a block of memory of at least a certain + /// size, it passes that value as ActualSize, and this method returns a block + /// with at least that much space. If the JIT doesn't know ahead of time how + /// much space it will need to emit the function, it passes 0 for the + /// ActualSize. 
In either case, this method is required to pass back the size + /// of the allocated block through ActualSize. The JIT will be careful to + /// not write more than the returned ActualSize bytes of memory. + virtual uint8_t *startFunctionBody(const Function *F, + uintptr_t &ActualSize); + + /// allocateStub - This method is called by the JIT to allocate space for a + /// function stub (used to handle limited branch displacements) while it is + /// JIT compiling a function. For example, if foo calls bar, and if bar + /// either needs to be lazily compiled or is a native function that exists too + /// far away from the call site to work, this method will be used to make a + /// thunk for it. The stub should be "close" to the current function body, + /// but should not be included in the 'actualsize' returned by + /// startFunctionBody. + virtual uint8_t *allocateStub(const GlobalValue* F, unsigned StubSize, + unsigned Alignment); + + /// endFunctionBody - This method is called when the JIT is done codegen'ing + /// the specified function. At this point we know the size of the JIT + /// compiled function. This passes in FunctionStart (which was returned by + /// the startFunctionBody method) and FunctionEnd which is a pointer to the + /// actual end of the function. This method should mark the space allocated + /// and remember where it is in case the client wants to deallocate it. + virtual void endFunctionBody(const Function *F, uint8_t *FunctionStart, + uint8_t *FunctionEnd); + + /// allocateCodeSection - Allocate a memory block of (at least) the given + /// size suitable for executable code. The SectionID is a unique identifier + /// assigned by the JIT and passed through to the memory manager for + /// the instance class to use if it needs to communicate to the JIT about + /// a given section after the fact. + virtual uint8_t *allocateCodeSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID); + + /// allocateDataSection - Allocate a memory block of (at least) the given + /// size suitable for data. The SectionID is a unique identifier + /// assigned by the JIT and passed through to the memory manager for + /// the instance class to use if it needs to communicate to the JIT about + /// a given section after the fact. + virtual uint8_t *allocateDataSection(uintptr_t Size, unsigned Alignment, + unsigned SectionID); + + /// allocateSpace - Allocate a memory block of the given size. This method + /// cannot be called between calls to startFunctionBody and endFunctionBody. + virtual uint8_t *allocateSpace(intptr_t Size, unsigned Alignment); + + /// allocateGlobal - Allocate memory for a global. + virtual uint8_t *allocateGlobal(uintptr_t Size, unsigned Alignment); + + /// deallocateFunctionBody - Free the specified function body. The argument + /// must be the return value from a call to startFunctionBody() that hasn't + /// been deallocated yet. This is never called when the JIT is currently + /// emitting a function. + virtual void deallocateFunctionBody(void *Body); + + /// startExceptionTable - When we finished JITing the function, if exception + /// handling is set, we emit the exception table. + virtual uint8_t* startExceptionTable(const Function* F, + uintptr_t &ActualSize); + + /// endExceptionTable - This method is called when the JIT is done emitting + /// the exception table. + virtual void endExceptionTable(const Function *F, uint8_t *TableStart, + uint8_t *TableEnd, uint8_t* FrameRegister); + + /// deallocateExceptionTable - Free the specified exception table's memory. 
+ /// The argument must be the return value from a call to startExceptionTable() + /// that hasn't been deallocated yet. This is never called when the JIT is + /// currently emitting an exception table. + virtual void deallocateExceptionTable(void *ET); + + virtual size_t GetDefaultCodeSlabSize() { + return kCodeSlabSize; + } + virtual size_t GetDefaultDataSlabSize() { + return kDataSlabSize; + } + virtual size_t GetDefaultStubSlabSize() { + return kStubSlabSize; + } + +}; + +} + +#endif // LLVM_EXECUTION_ENGINE_NACL_JIT_MEMMANAGER_H diff --git a/include/llvm/GlobalValue.h b/include/llvm/GlobalValue.h index 7f7f74b1e2..aaab1922f5 100644 --- a/include/llvm/GlobalValue.h +++ b/include/llvm/GlobalValue.h @@ -76,6 +76,26 @@ public: removeDeadConstantUsers(); // remove any dead constants using this. } + // @LOCALMOD-BEGIN + /// Set the symbol version for this definition. + void setVersionDef(StringRef Version, bool IsDefault); + + /// Set the symbol version and dynamic source file (soname) + /// for this exterally provided global. + void setNeeded(StringRef Version, StringRef DynFile); + + /// Get the name of this symbol without the version suffix. + StringRef getUnversionedName() const; + + /// Get the version of this symbol. + /// Returns an empty string if the symbol is unversioned. + StringRef getVersion() const; + + /// Returns true if this is the default version of the symbol. + /// This may only be called if the symbol is versioned. + bool isDefaultVersion() const; + // @LOCALMOD-END + unsigned getAlignment() const { return (1u << Alignment) >> 1; } diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 8c164eb919..a6b7d31817 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -265,6 +265,8 @@ void initializeFinalizeMachineBundlesPass(PassRegistry&); void initializeLoopVectorizePass(PassRegistry&); void initializeBBVectorizePass(PassRegistry&); void initializeMachineFunctionPrinterPassPass(PassRegistry&); +void initializeExpandCtorsPass(PassRegistry&); // @LOCALMOD +void initializeNaClCcRewritePass(PassRegistry&); // @LOCALMOD } #endif diff --git a/include/llvm/Intrinsics.td b/include/llvm/Intrinsics.td index 2e1597fe6f..42b9da6914 100644 --- a/include/llvm/Intrinsics.td +++ b/include/llvm/Intrinsics.td @@ -457,6 +457,36 @@ def int_convertus : Intrinsic<[llvm_anyint_ty], def int_convertuu : Intrinsic<[llvm_anyint_ty], [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty]>; +// @LOCALMOD-BEGIN +//===----------------------- Native Client Intrinsics ---------------------===// +// TODO(sehr): conditionalize this on IsNaCl64 | IsNaCl32 | IsNaClArm. +// The expansions of these are in lib/Target/X86/X86InstrNacl.{td, cpp} and +// lib/Target/ARM/ARMInstrInfo.td. +def int_nacl_setjmp : Intrinsic<[llvm_i32_ty], [llvm_ptr_ty, llvm_ptr_ty]>, + GCCBuiltin<"__builtin_nacl_setjmp">; +def int_nacl_longjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty]>, + GCCBuiltin<"__builtin_nacl_longjmp">; + +// Fast built-in version of NaCl's tls_get() IRT interface. +def int_nacl_read_tp : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>, + GCCBuiltin<"__builtin_nacl_read_tp">; + +// The following intrinsics provide target-specific implementations of +// the interface in native_client/src/untrusted/nacl/tls_params.h. +// The intrinsic names are basically the functions there without the +// leading underscores. 
+def int_nacl_tp_tls_offset : Intrinsic<[llvm_i32_ty], [llvm_i32_ty]>, + GCCBuiltin<"__builtin_nacl_tp_tls_offset">; +def int_nacl_tp_tdb_offset : Intrinsic<[llvm_i32_ty], [llvm_i32_ty]>, + GCCBuiltin<"__builtin_nacl_tp_tdb_offset">; + +// The following intrinsic provides a target-specific constant value to +// indicate the target platform compiled to. The enum values are enumerated +// pnaclintrin.h. +def int_nacl_target_arch : Intrinsic<[llvm_i32_ty], []>, + GCCBuiltin<"__builtin_nacl_target_arch">; +// @LOCALMOD-END + //===----------------------------------------------------------------------===// // Target-specific intrinsics //===----------------------------------------------------------------------===// diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h index 72ed1a317c..f5ca3d5471 100644 --- a/include/llvm/MC/MCAsmBackend.h +++ b/include/llvm/MC/MCAsmBackend.h @@ -25,6 +25,7 @@ class MCInst; class MCInstFragment; class MCObjectWriter; class MCSection; +class MCStreamer; class MCValue; class raw_ostream; @@ -157,6 +158,23 @@ public: /// handleAssemblerFlag - Handle any target-specific assembler flags. /// By default, do nothing. virtual void handleAssemblerFlag(MCAssemblerFlag Flag) {} + + // @LOCALMOD-BEGIN + /// getBundleSize - Return the size (in bytes) of code bundling units + /// for this target. If 0, bundling is disabled. This is used exclusively + /// for Native Client. + virtual unsigned getBundleSize() const { + return 0; + } + + /// CustomExpandInst - + /// If the MCInst instruction has a custom expansion, write it to the + /// MCStreamer 'Out'. This can be used to perform "last minute" rewrites of + /// MCInst instructions for emission. + virtual bool CustomExpandInst(const MCInst &Inst, MCStreamer &Out) const { + return false; + } + // @LOCALMOD-END }; } // End llvm namespace diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h index 97aad71fd9..29ec1020c3 100644 --- a/include/llvm/MC/MCAsmInfo.h +++ b/include/llvm/MC/MCAsmInfo.h @@ -48,6 +48,14 @@ namespace llvm { /// Default is 4. unsigned PointerSize; + /// @LOCALMOD-BEGIN + /// TODO(pdox): Before upstreaming this, make sure every target backend + /// sets it correctly. + /// StackSlotSize - Stack slot size in bytes. + /// Default is 4. + unsigned StackSlotSize; + /// @LOCALMOD-END + /// IsLittleEndian - True if target is little endian. /// Default is true. bool IsLittleEndian; @@ -340,6 +348,13 @@ namespace llvm { return PointerSize; } + /// @LOCALMOD-BEGIN + /// getStackSlotSize - Get the stack slot size in bytes. + unsigned getStackSlotSize() const { + return StackSlotSize; + } + /// @LOCALMOD-END + /// islittleendian - True if the target is little endian. bool isLittleEndian() const { return IsLittleEndian; diff --git a/include/llvm/MC/MCAsmLayout.h b/include/llvm/MC/MCAsmLayout.h index cf79216d07..fdded4ffa7 100644 --- a/include/llvm/MC/MCAsmLayout.h +++ b/include/llvm/MC/MCAsmLayout.h @@ -80,6 +80,11 @@ public: /// \brief Get the offset of the given fragment inside its containing section. uint64_t getFragmentOffset(const MCFragment *F) const; + // @LOCALMOD-BEGIN + /// \brief Get the bundle padding of the given fragment. 
+ uint8_t getFragmentPadding(const MCFragment *F) const; + // @LOCALMOD-END + /// @} /// @name Utility Functions /// @{ diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h index 5771415c81..f67aa9b966 100644 --- a/include/llvm/MC/MCAssembler.h +++ b/include/llvm/MC/MCAssembler.h @@ -52,11 +52,39 @@ public: FT_Org, FT_Dwarf, FT_DwarfFrame, - FT_LEB + FT_LEB, + FT_Tiny // @LOCALMOD }; + // @LOCALMOD-BEGIN + enum BundleAlignType { + BundleAlignNone = 0, + BundleAlignStart = 1, + BundleAlignEnd = 2 + }; + // @LOCALMOD-END + private: - FragmentType Kind; + // @LOCALMOD-BEGIN + // Try to compress the layout of MCFragment by: + // 1) Making Kind, the bundling flags, and BundlePadding fit in 32 bits. + // 2) Move LayoutOrder to fit in the hole left by aligning for 64 bits. + + FragmentType Kind : 4; + + BundleAlignType BundleAlign : 2; + bool BundleGroupStart : 1; + bool BundleGroupEnd : 1; + + /// BundlePadding - The computed padding for this fragment. This is ~0 + /// until initialized. + uint8_t BundlePadding; + + /// LayoutOrder - The layout order of this fragment. + unsigned LayoutOrder; + + // @LOCALMOD-END + /// Parent - The data for the section this fragment is in. MCSectionData *Parent; @@ -75,9 +103,6 @@ private: /// initialized. uint64_t Offset; - /// LayoutOrder - The layout order of this fragment. - unsigned LayoutOrder; - /// @} protected: @@ -99,12 +124,44 @@ public: unsigned getLayoutOrder() const { return LayoutOrder; } void setLayoutOrder(unsigned Value) { LayoutOrder = Value; } + // @LOCALMOD-BEGIN + bool isBundleGroupStart() const { return BundleGroupStart; } + void setBundleGroupStart(bool Value) { BundleGroupStart = Value; } + + bool isBundleGroupEnd() const { return BundleGroupEnd; } + void setBundleGroupEnd(bool Value) { BundleGroupEnd = Value; } + + BundleAlignType getBundleAlign() const { return BundleAlign; } + void setBundleAlign(BundleAlignType Value) { BundleAlign = Value; } + // @LOCALMOD-END + void dump(); }; +// @LOCALMOD-BEGIN +// This is just a tiny data fragment with no fixups. +// (To help with memory usage) +class MCTinyFragment : public MCFragment { + private: + SmallString<6> Contents; + + public: + + MCTinyFragment(MCSectionData *SD = 0) : MCFragment(FT_Tiny, SD) {} + + SmallString<6> &getContents() { return Contents; } + const SmallString<6> &getContents() const { return Contents; } + + static bool classof(const MCFragment *F) { + return F->getKind() == MCFragment::FT_Tiny; + } + static bool classof(const MCTinyFragment *) { return true; } +}; +// @LOCALMOD-END + class MCDataFragment : public MCFragment { virtual void anchor(); - SmallString<32> Contents; + SmallString<6> Contents; // @LOCALMOD: Memory efficiency /// Fixups - The list of fixups in this fragment. std::vector<MCFixup> Fixups; @@ -119,8 +176,8 @@ public: /// @name Accessors /// @{ - SmallString<32> &getContents() { return Contents; } - const SmallString<32> &getContents() const { return Contents; } + SmallString<6> &getContents() { return Contents; } // @LOCALMOD + const SmallString<6> &getContents() const { return Contents; } // @LOCALMOD /// @} /// @name Fixup Access @@ -464,6 +521,29 @@ private: /// it. unsigned HasInstructions : 1; + // @LOCALMOD-BEGIN + bool BundlingEnabled : 1; + bool BundleLocked : 1; + + // Because ".bundle_lock" occurs before the fragment it applies to exists, + // we need to keep this flag around so we know to mark the next fragment + // as the start of a bundle group. 
A similar flag is not necessary for the + // last fragment, because when a .bundle_unlock occurs, the last fragment + // in the group already exists and can be marked directly. + bool BundleGroupFirstFrag : 1; + + typedef MCFragment::BundleAlignType BundleAlignType; + BundleAlignType BundleAlignNext : 2; + + // Optimization to reduce the number of fragments generated (for memory + // savings). Keep track of when we know the offset of the next point to + // emit an instruction. If we know the offset from a known alignment point, + // we can just append to the previous fragment. + bool BundleOffsetKnown : 1; + unsigned BundleSize; + unsigned BundleOffset; + // @LOCALMOD-END + /// @} public: @@ -485,6 +565,25 @@ public: unsigned getLayoutOrder() const { return LayoutOrder; } void setLayoutOrder(unsigned Value) { LayoutOrder = Value; } + // @LOCALMOD-BEGIN + bool isBundlingEnabled() const { return BundlingEnabled; } + + bool isBundleLocked() const { return BundleLocked; } + void setBundleLocked(bool Value) { BundleLocked = Value; } + + bool isBundleGroupFirstFrag() const { return BundleGroupFirstFrag; } + void setBundleGroupFirstFrag(bool Value) { BundleGroupFirstFrag = Value; } + + + BundleAlignType getBundleAlignNext() const { return BundleAlignNext; } + void setBundleAlignNext(BundleAlignType Value) { BundleAlignNext = Value; } + + void MarkBundleOffsetUnknown(); + bool ShouldCreateNewFragment(size_t Size); + void UpdateBundleOffset(size_t Size); + void AlignBundleOffsetTo(size_t AlignBase); + // @LOCALMOD-END + /// @name Fragment Access /// @{ @@ -743,6 +842,13 @@ private: bool fragmentNeedsRelaxation(const MCInstFragment *IF, const MCAsmLayout &Layout) const; + // @LOCALMOD-BEGIN + uint8_t ComputeBundlePadding(const MCAsmLayout &Layout, + MCFragment *F, + uint64_t FragmentOffset) const; + // @LOCALMOD-END + + /// layoutOnce - Perform one layout iteration and return true if any offsets /// were adjusted. bool layoutOnce(MCAsmLayout &Layout); @@ -809,6 +915,12 @@ public: MCAsmBackend &getBackend() const { return Backend; } + // @LOCALMOD-BEGIN + uint64_t getBundleSize() const; + uint64_t getBundleMask() const; + // @LOCALMOD-END + + MCCodeEmitter &getEmitter() const { return Emitter; } MCObjectWriter &getWriter() const { return Writer; } diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h index 38cdc7293b..8b0f191792 100644 --- a/include/llvm/MC/MCELFObjectWriter.h +++ b/include/llvm/MC/MCELFObjectWriter.h @@ -69,6 +69,12 @@ public: return ELF::ELFOSABI_FREEBSD; case Triple::Linux: return ELF::ELFOSABI_LINUX; + // @LOCALMOD-BEGIN + // This shouldn't be needed anymore (sel_ldr doesn't check for it), + // but removing it may require some changes in binutils also. 
+ case Triple::NativeClient: + return ELF::ELFOSABI_NACL; + // @LOCALMOD-END default: return ELF::ELFOSABI_NONE; } diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index 08b00f1c47..d9f72b7f42 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -68,6 +68,14 @@ public: unsigned AddrSpace); virtual void EmitULEB128Value(const MCExpr *Value); virtual void EmitSLEB128Value(const MCExpr *Value); + + // @LOCALMOD-BEGIN + void EmitBundleLock(); + void EmitBundleUnlock(); + void EmitBundleAlignStart(); + void EmitBundleAlignEnd(); + // @LOCALMOD-END + virtual void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol); virtual void ChangeSection(const MCSection *Section); virtual void EmitInstruction(const MCInst &Inst); diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h index 230d27ef2e..40f83bf5d5 100644 --- a/include/llvm/MC/MCStreamer.h +++ b/include/llvm/MC/MCStreamer.h @@ -467,6 +467,27 @@ namespace llvm { /// @} + // @LOCALMOD-BEGIN + /// @name Bundling Directives + /// @{ + + /// EmitBundleLock - Begin a group of instructions which cannot + /// cross a bundle boundary. + virtual void EmitBundleLock() = 0; + + /// EmitBundleUnlock - End a bundle-locked group of instructions. + virtual void EmitBundleUnlock() = 0; + + /// EmitBundleAlignStart - Guarantee that the next instruction or + /// bundle-locked group starts at the beginning of a bundle. + virtual void EmitBundleAlignStart() = 0; + + /// EmitBundleAlignEnd - Guarantee that the next instruction or + /// bundle-locked group finishes at the end of a bundle. + virtual void EmitBundleAlignEnd() = 0; + /// @} + // @LOCALMOD-END + /// EmitFileDirective - Switch to a new logical file. This is used to /// implement the '.file "foo.c"' assembler directive. virtual void EmitFileDirective(StringRef Filename) = 0; diff --git a/include/llvm/Module.h b/include/llvm/Module.h index e6303ac775..13b56433dc 100644 --- a/include/llvm/Module.h +++ b/include/llvm/Module.h @@ -186,6 +186,22 @@ public: : Behavior(B), Key(K), Val(V) {} }; + /// @LOCALMOD-BEGIN + /// An enumeration for describing the module format + enum OutputFormat { + ObjectOutputFormat, + SharedOutputFormat, + ExecutableOutputFormat + }; + + /// A structure describing the symbols needed from an external file. + struct NeededRecord { + std::string DynFile; // Source file (soname) + std::vector<std::string> Symbols; // List of symbol names + // (with version suffix) + }; + /// @LOCALMOD-END + /// @} /// @name Member Variables /// @{ @@ -203,6 +219,9 @@ private: std::string ModuleID; ///< Human readable identifier for the module std::string TargetTriple; ///< Platform target triple Module compiled on std::string DataLayout; ///< Target data description + // @LOCALMOD-BEGIN + mutable std::string ModuleSOName; ///< Module SOName (for shared format) + // @LOCALMOD-END void *NamedMDSymTab; ///< NamedMDNode names. friend class Constant; @@ -234,6 +253,24 @@ public: /// @returns a string containing the target triple. const std::string &getTargetTriple() const { return TargetTriple; } + // @LOCALMOD-BEGIN + + /// Get the module format + /// @returns the module format + OutputFormat getOutputFormat() const; + + /// Get the SOName of this module. + /// @returns a string containing the module soname + const std::string &getSOName() const; + + /// Record the needed information for a global value. + /// This creates a needed record for DynFile, if one does not already exist. 
+ void addNeededRecord(StringRef DynFile, GlobalValue *GV); + + // Fill NeededOut with all needed records present in the module. + void getNeededRecords(std::vector<NeededRecord> *NeededOut) const; + // @LOCALMOD-END + /// Get the target endian information. /// @returns Endianess - an enumeration for the endianess of the target Endianness getEndianness() const; @@ -263,6 +300,18 @@ public: /// Set the target triple. void setTargetTriple(StringRef T) { TargetTriple = T; } + /// @LOCALMOD-BEGIN + + /// Set the module format + void setOutputFormat(OutputFormat F); + + /// For modules with output format "shared", set the output soname. + void setSOName(StringRef Name); + + /// Wrap a global symbol. + void wrapSymbol(StringRef SymName); + /// @LOCALMOD-END + /// Set the module-scope inline assembly blocks. void setModuleInlineAsm(StringRef Asm) { GlobalScopeAsm = Asm; @@ -584,6 +633,11 @@ public: /// Dump the module to stderr (for debugging). void dump() const; + /// @LOCALMOD-BEGIN + /// Print the PNaCl metadata for the module. + void dumpMeta(raw_ostream &OS) const; + /// @LOCALMOD-END + /// This function causes all the subinstructions to "let go" of all references /// that they are maintaining. This allows one to 'delete' a whole class at /// a time, even though there may be circular references... first all diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index b676e91eba..a67a6ac09e 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -325,6 +325,7 @@ enum { ELFOSABI_C6000_ELFABI = 64, // Bare-metal TMS320C6000 ELFOSABI_C6000_LINUX = 65, // Linux TMS320C6000 ELFOSABI_ARM = 97, // ARM + ELFOSABI_NACL = 123, // Native Client // @LOCALMOD ELFOSABI_STANDALONE = 255 // Standalone (embedded) application }; diff --git a/include/llvm/Support/ValueHandle.h b/include/llvm/Support/ValueHandle.h index dbcf0fd11d..5e98fbd07a 100644 --- a/include/llvm/Support/ValueHandle.h +++ b/include/llvm/Support/ValueHandle.h @@ -104,6 +104,11 @@ protected: void setValPtrInt(unsigned K) { VP.setInt(K); } unsigned getValPtrInt() const { return VP.getInt(); } + // @LOCALMOD-BEGIN -- Hack for bug: + // http://code.google.com/p/nativeclient/issues/detail?id=2786 + void setKind(HandleBaseKind K) { PrevPair.setInt(K); } + // @LOCALMOD-END + static bool isValid(Value *V) { return V && V != DenseMapInfo<Value *>::getEmptyKey() && @@ -232,6 +237,15 @@ public: return getValPtr(); } + // @LOCALMOD-BEGIN -- Hack for bug: + // http://code.google.com/p/nativeclient/issues/detail?id=2786 + // This allows us to weaken the Asserting Value Handle in LexicalScopes.h, + // for Debug info only. + void make_weak() { + setKind(Weak); + } + // @LOCALMOD-END + ValueTy *operator->() const { return getValPtr(); } ValueTy &operator*() const { return *getValPtr(); } }; diff --git a/include/llvm/Support/support_macros.h b/include/llvm/Support/support_macros.h new file mode 100644 index 0000000000..83d62c722c --- /dev/null +++ b/include/llvm/Support/support_macros.h @@ -0,0 +1,25 @@ +// Define support macros for defining classes, etc. + +#ifndef LLVM_SUPPORT_SUPPORT_MACROS_H__ +#define LLVM_SUPPORT_SUPPORT_MACROS_H__ + +// Define macro, to use within a class declaration, to disallow constructor +// copy. Defines copy constructor declaration under the assumption that it +// is never defined. +#define DISALLOW_CLASS_COPY(class_name) \ + class_name(class_name& arg) // Do not implement + +// Define macro, to use within a class declaration, to disallow assignment. 
+// Defines assignment operation declaration under the assumption that it +// is never defined. +#define DISALLOW_CLASS_ASSIGN(class_name) \ + void operator=(class_name& arg) // Do not implement + +// Define macro to add copy and assignment declarations to a class file, +// for which no bodies will be defined, effectively disallowing these from +// being defined in the class. +#define DISALLOW_CLASS_COPY_AND_ASSIGN(class_name) \ + DISALLOW_CLASS_COPY(class_name); \ + DISALLOW_CLASS_ASSIGN(class_name) + +#endif // LLVM_SUPPORT_SUPPORT_MACROS_H__ diff --git a/include/llvm/Support/system_error.h b/include/llvm/Support/system_error.h index 0d164f688d..844013ed5d 100644 --- a/include/llvm/Support/system_error.h +++ b/include/llvm/Support/system_error.h @@ -597,7 +597,7 @@ enum _ { #else stream_timeout = ETIMEDOUT, #endif - text_file_busy = ETXTBSY, + text_file_busy = EINVAL, // @LOCALMOD timed_out = ETIMEDOUT, too_many_files_open_in_system = ENFILE, too_many_files_open = EMFILE, diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index 12f5c0eb30..5fb12f503e 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -766,6 +766,40 @@ def LIFETIME_END : Instruction { let AsmString = "LIFETIME_END"; let neverHasSideEffects = 1; } +// @LOCALMOD-BEGIN +def BUNDLE_ALIGN_START : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins); + let AsmString = ""; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let isNotDuplicable = 1; +} +def BUNDLE_ALIGN_END : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins); + let AsmString = ""; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let isNotDuplicable = 1; +} +def BUNDLE_LOCK : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins); + let AsmString = ""; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let isNotDuplicable = 1; +} +def BUNDLE_UNLOCK : Instruction { + let OutOperandList = (outs); + let InOperandList = (ins); + let AsmString = ""; + let neverHasSideEffects = 1; + let isAsCheapAsAMove = 1; + let isNotDuplicable = 1; +} +// @LOCALMOD-END } //===----------------------------------------------------------------------===// diff --git a/include/llvm/Target/TargetFrameLowering.h b/include/llvm/Target/TargetFrameLowering.h index d56db7b511..7df3bfa473 100644 --- a/include/llvm/Target/TargetFrameLowering.h +++ b/include/llvm/Target/TargetFrameLowering.h @@ -48,11 +48,19 @@ private: unsigned StackAlignment; unsigned TransientStackAlignment; int LocalAreaOffset; + + // @LOCALMOD-BEGIN + // TODO(pdox): Refactor this and upstream it, to get rid of the + // assumption that StackSlotSize == PointerSize. 
+ unsigned StackSlotSize; + // @LOCALMOD-END public: - TargetFrameLowering(StackDirection D, unsigned StackAl, int LAO, - unsigned TransAl = 1) + TargetFrameLowering(StackDirection D, + unsigned StackAl, int LAO, + unsigned TransAl = 1, + unsigned SlotSize = 0) // @LOCALMOD : StackDir(D), StackAlignment(StackAl), TransientStackAlignment(TransAl), - LocalAreaOffset(LAO) {} + LocalAreaOffset(LAO), StackSlotSize(SlotSize) {} virtual ~TargetFrameLowering(); @@ -63,6 +71,11 @@ public: /// StackDirection getStackGrowthDirection() const { return StackDir; } + // @LOCALMOD-BEGIN + /// getStackSlotSize - Return the size of a stack slot + unsigned getStackSlotSize() const { return StackSlotSize; } + // @LOCALMOD-END + /// getStackAlignment - This method returns the number of bytes to which the /// stack pointer must be aligned on entry to a function. Typically, this /// is the largest alignment for any data object in the target. diff --git a/include/llvm/Target/TargetJITInfo.h b/include/llvm/Target/TargetJITInfo.h index 044afd9b73..c2bb376131 100644 --- a/include/llvm/Target/TargetJITInfo.h +++ b/include/llvm/Target/TargetJITInfo.h @@ -129,6 +129,25 @@ namespace llvm { /// separately allocated heap memory rather than in the same /// code memory allocated by JITCodeEmitter. virtual bool allocateSeparateGVMemory() const { return false; } + + // @LOCALMOD-START + // NaCl-specific, target-specific stuff + typedef struct { uint8_t *ins; int len; } HaltInstruction; + /// Get a sequence of NOPs of length len. Returns a pointer to a buffer + /// containing a val + virtual const uint8_t *getNopSequence(size_t len) const { return NULL; } + /// Get the length and definition of the halt/roadblock instruction + virtual const HaltInstruction *getHalt() const { return NULL; } + virtual int getBundleSize() const { return 0; } + virtual int32_t getJumpMask() const { return 0; } + + /// Relocations cannot happen in-place in NaCl because we can't write to + /// code. This function takes a pointer to where the code has been emitted, + /// before it is copied to the code region. The subsequent call to + /// relocate takes pointers to the target code location, but rewrites the + /// code in the relocation buffer rather than at the target + virtual void setRelocationBuffer(unsigned char * BufferBegin) {} + // @LOCALMOD-END protected: bool useGOT; }; diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index 580a30fcd2..f8925f25a1 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -121,6 +121,18 @@ public: // mask (ex: x86 blends). }; + // @LOCALMOD-START + // This needs to be kept in sync with + // native_client/src/untrusted/nacl/pnaclintrin.h. + enum PnaclTargetArchitecture { + PnaclTargetArchitectureInvalid = 0, + PnaclTargetArchitectureX86_32, + PnaclTargetArchitectureX86_64, + PnaclTargetArchitectureARM_32, + PnaclTargetArchitectureARM_32_Thumb + }; + // @LOCALMOD-END + static ISD::NodeType getExtendForContent(BooleanContent Content) { switch (Content) { case UndefinedBooleanContent: diff --git a/include/llvm/Target/TargetOpcodes.h b/include/llvm/Target/TargetOpcodes.h index 516e0706b8..2c9459974a 100644 --- a/include/llvm/Target/TargetOpcodes.h +++ b/include/llvm/Target/TargetOpcodes.h @@ -91,7 +91,14 @@ namespace TargetOpcode { /// Lifetime markers. 
LIFETIME_START = 15, - LIFETIME_END = 16 + LIFETIME_END = 16, + + // @LOCALMOD-BEGIN + BUNDLE_ALIGN_START = 14, + BUNDLE_ALIGN_END = 15, + BUNDLE_LOCK = 16, + BUNDLE_UNLOCK = 17 + // @LOCALMOD-END }; } // end namespace TargetOpcode } // end namespace llvm diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h index 68ca567836..0a1b73e352 100644 --- a/include/llvm/Target/TargetOptions.h +++ b/include/llvm/Target/TargetOptions.h @@ -30,6 +30,12 @@ namespace llvm { }; } + // @LOCALMOD-BEGIN + /// TLSUseCall - This flag enables the use of a function call to get the + /// thread pointer for TLS accesses, instead of using inline code. + extern bool TLSUseCall; + // @LOCALMOD-END + namespace FPOpFusion { enum FPOpFusionMode { Fast, // Enable fusion of FP ops wherever it's profitable. diff --git a/include/llvm/Transforms/NaCl.h b/include/llvm/Transforms/NaCl.h new file mode 100644 index 0000000000..fe29463a8b --- /dev/null +++ b/include/llvm/Transforms/NaCl.h @@ -0,0 +1,21 @@ +//===-- NaCl.h - NaCl Transformations ---------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_NACL_H +#define LLVM_TRANSFORMS_NACL_H + +namespace llvm { + +class ModulePass; + +ModulePass *createExpandCtorsPass(); + +} + +#endif diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index a5d8eed746..1ddca844c9 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -372,7 +372,7 @@ extern char &InstructionSimplifierID; // "block_weights" metadata. FunctionPass *createLowerExpectIntrinsicPass(); - +FunctionPass *createNaClCcRewritePass(const TargetLowering *TLI = 0); } // End llvm namespace #endif diff --git a/include/llvm/Value.h b/include/llvm/Value.h index 5b19435eba..be218183e5 100644 --- a/include/llvm/Value.h +++ b/include/llvm/Value.h @@ -105,6 +105,12 @@ public: /// Type *getType() const { return VTy; } + // @LOCALMOD-START + // Currently only used for function type update during + // the NaCl calling convention rewrite pass + void setType(Type* t) { VTy = t; } + // @LOCALMOD-END + /// All values hold a context through their type. LLVMContext &getContext() const; diff --git a/include/llvm/Wrap/BCHeaderField.h b/include/llvm/Wrap/BCHeaderField.h new file mode 100644 index 0000000000..40a3714c9f --- /dev/null +++ b/include/llvm/Wrap/BCHeaderField.h @@ -0,0 +1,106 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +#ifndef LLVM_WRAP_BCHEADERFIELD_H +#define LLVM_WRAP_BCHEADERFIELD_H +#include <limits> +#include <stdint.h> +#include <stdio.h> +#include <string.h> + +// Class representing a variable-size metadata field in the bitcode header. +// Also contains the list of known Tag IDs. +// Contains a pointer to the data but does not own the data, so it can be +// copied with the trivial copy constructor/assignment operator. 
+ +// The serialized format has 2 fixed subfields (ID and length) and the +// variable-length data subfield +class BCHeaderField { + public: + typedef enum { + kInvalid = 0, + kBitcodeHash = 1, + kAndroidCompilerVersion = 0x4001, + kAndroidOptimizationLevel = 0x4002 + } Tag; + typedef uint16_t FixedSubfield; + + BCHeaderField(Tag ID, size_t len, uint8_t* data) : + ID_(ID), len_(len), data_(data) {} + size_t GetTotalSize() { + // Round up to 4 byte alignment + return (kTagLenSize + len_ + 3) & ~3; + } + + bool Write(uint8_t* buf, size_t buf_len) { + size_t fields_len = kTagLenSize + len_; + size_t pad_len = (4 - (fields_len & 3)) & 3; + // Ensure buffer is large enough and that length can be represented + // in 16 bits + if (buf_len < fields_len + pad_len || + len_ > std::numeric_limits<FixedSubfield>::max()) return false; + + WriteFixedSubfield(static_cast<FixedSubfield>(ID_), buf); + WriteFixedSubfield(static_cast<FixedSubfield>(len_), + buf + sizeof(FixedSubfield)); + memcpy(buf + kTagLenSize, data_, len_); + // Pad out to 4 byte alignment + if (pad_len) { + memset(buf + fields_len, 0, pad_len); + } + return true; + } + + bool Read(const uint8_t* buf, size_t buf_len) { + if (buf_len < kTagLenSize) return false; + FixedSubfield field; + ReadFixedSubfield(&field, buf); + ID_ = static_cast<Tag>(field); + ReadFixedSubfield(&field, buf + sizeof(FixedSubfield)); + len_ = static_cast<size_t>(field); + if (buf_len < kTagLenSize + len_) return false; + memcpy(data_, buf + kTagLenSize, len_); + return true; + } + + void Print() { + fprintf(stderr, "Field ID: %d, data length %d, total length %d\n", + ID_, static_cast<int>(len_), static_cast<int>(GetTotalSize())); + fprintf(stderr, "Data: "); + for (size_t i = 0; i < len_; i++) fprintf(stderr, "%02x", data_[i]); + fprintf(stderr, "\n"); + } + + // Get the data size from a serialized field to allow allocation + static size_t GetDataSizeFromSerialized(const uint8_t* buf) { + FixedSubfield len; + ReadFixedSubfield(&len, buf + sizeof(FixedSubfield)); + return len; + } + + Tag getID() const { + return ID_; + } + + size_t getLen() const { + return len_; + } + + private: + // Combined size of the fixed subfields + const static size_t kTagLenSize = 2 * sizeof(FixedSubfield); + static void WriteFixedSubfield(FixedSubfield value, uint8_t* buf) { + buf[0] = value & 0xFF; + buf[1] = (value >> 8) & 0xFF; + } + static void ReadFixedSubfield(FixedSubfield* value, const uint8_t* buf) { + *value = buf[0] | buf[1] << 8; + } + Tag ID_; + size_t len_; + uint8_t *data_; +}; + +#endif diff --git a/include/llvm/Wrap/bitcode_wrapperer.h b/include/llvm/Wrap/bitcode_wrapperer.h new file mode 100644 index 0000000000..89f2a4cbcc --- /dev/null +++ b/include/llvm/Wrap/bitcode_wrapperer.h @@ -0,0 +1,192 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +// Define utility class to wrap/unwrap bitcode files. Does wrapping/unwrapping +// in such a way that the wrappered bitcode file is still a bitcode file. 
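A minimal usage sketch for the BCHeaderField class defined above; the field choice, payload, and buffer size are illustrative only:

    uint8_t hash[4] = {0xDE, 0xAD, 0xBE, 0xEF};
    BCHeaderField field(BCHeaderField::kBitcodeHash, sizeof(hash), hash);
    uint8_t buf[8];                           // GetTotalSize() == 8 for a 4-byte payload
    bool ok = field.Write(buf, sizeof(buf));  // emits ID, length, data, then zero padding

Read() performs the inverse, and GetDataSizeFromSerialized() lets a reader size the destination buffer before calling Read().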
+ +#ifndef LLVM_WRAP_BITCODE_WRAPPERER_H__ +#define LLVM_WRAP_BITCODE_WRAPPERER_H__ + +#include <stdint.h> +#include <stddef.h> +#include <vector> + +#include "llvm/Support/support_macros.h" +#include "llvm/Wrap/BCHeaderField.h" +#include "llvm/Wrap/wrapper_input.h" +#include "llvm/Wrap/wrapper_output.h" + +// The bitcode wrapper header is the following 7 fixed 4-byte fields: +// 1) 0B17C0DE - The magic number expected by llvm for wrapped bitcodes +// 2) Version # 0 - The current version of wrapped bitcode files +// 3) (raw) bitcode offset +// 4) (raw) bitcode size +// 5) Android header version +// 6) Android target API +// 7) PNaCl Bitcode version +// plus 0 or more variable-length fields (consisting of ID, length, data) + +// Initial buffer size. It is expanded if needed to hold large variable-size +// fields. +static const size_t kBitcodeWrappererBufferSize = 1024; + +// Support class for outputting a wrapped bitcode file from a raw bitcode +// file (and optionally additional header fields), or for outputting a raw +// bitcode file from a wrapped one. +class BitcodeWrapperer { + public: + // Create a bitcode wrapperer using the following + // input and output files. + BitcodeWrapperer(WrapperInput* infile, WrapperOutput* outfile); + + // Returns true if the input file begins with a bitcode + // wrapper magic number. As a side effect, _wrapper_ fields are set. + bool IsInputBitcodeWrapper(); + + // Returns true if the input file begins with a bitcode + // file magic number. + bool IsInputBitcodeFile(); + + // Add a variable-length field to the header. The caller is responsible + // for freeing the data pointed to by the BCHeaderField. + void AddHeaderField(BCHeaderField* field); + + // Generate a wrapped bitcode file from the input bitcode file + // and the current header data. Return true on success. + bool GenerateWrappedBitcodeFile(); + + // Unwrap the wrapped bitcode file, to the corresponding + // outfile. Return true on success. + bool GenerateRawBitcodeFile(); + + // Print current wrapper header fields to stderr for debugging. + void PrintWrapperHeader(); + + ~BitcodeWrapperer(); + + private: + DISALLOW_CLASS_COPY_AND_ASSIGN(BitcodeWrapperer); + + // Refills the buffer with more bytes. Does this in a way + // such that it is maximally filled. + void FillBuffer(); + + // Returns the number of bytes in infile. + off_t GetInFileSize() { + if (infile_ != NULL) { + return infile_->Size(); + } else { + return 0; + } + } + + // Returns the offset of bitcode (i.e. the size of the wrapper header) + // if the output file were to be written now. + size_t BitcodeOffset(); + + // Returns true if we can read a word. If necessary, fills the buffer + // with enough characters so that there are at least a 32-bit value + // in the buffer. Returns false if there isn't a 32-bit value + // to read from the input file. + bool CanReadWord(); + + // Read a (32-bit) word from the input. Return true + // if able to read the word. + bool ReadWord(uint32_t& word); + + // Write a (32-bit) word to the output. Return true if successful + bool WriteWord(uint32_t word); + + // Write all variable-sized header fields to the output. Return true + // if successful. + bool WriteVariableFields(); + + // Parse the bitcode wrapper header in the infile, if any. Return true + // if successful. + bool ParseWrapperHeader(); + + // Returns the i-th character in front of the cursor in the buffer. + uint8_t BufferLookahead(int i) { return buffer_[cursor_ + i]; } + + // Returns how many unread bytes are in the buffer. 
+ size_t GetBufferUnreadBytes() { return buffer_size_ - cursor_; } + + + // Backs up the read cursor to the beginning of the input buffer. + void ResetCursor() { + cursor_ = 0; + } + + // Generates the header sequence for the wrapped bitcode being + // generated. + bool WriteBitcodeWrapperHeader(); + + // Copies size bytes of infile to outfile, using the buffer. + bool BufferCopyInToOut(uint32_t size); + + // Discards the old infile and replaces it with the given file. + void ReplaceInFile(WrapperInput* new_infile); + + // Discards the old outfile and replaces it with the given file. + void ReplaceOutFile(WrapperOutput* new_outfile); + + // Moves to the given position in the input file. Returns false + // if unsuccessful. + bool Seek(uint32_t pos); + + // Clear the buffer of all contents. + void ClearBuffer(); + + // The input file being processed. Can be either + // a bitcode file, a wrappered bitcode file, or a secondary + // file to be wrapped. + WrapperInput* infile_; + + // The output file being generated. Can be either + // a bitcode file, a wrappered bitcode file, or a secondary + // unwrapped file. + WrapperOutput* outfile_; + + // A buffer of bytes read from the input file. + std::vector<uint8_t> buffer_; + + // The number of bytes that were read from the input file + // into the buffer. + size_t buffer_size_; + + // The index to the current read point within the buffer. + size_t cursor_; + + // True when eof of input is reached. + bool infile_at_eof_; + + // The 32-bit value defining the offset of the raw bitcode in the input file. + uint32_t infile_bc_offset_; + + // The 32-bit value defining the generated offset of the wrapped bitcode. + // This value changes as new fields are added with AddHeaderField + uint32_t wrapper_bc_offset_; + + // The 32-bit value defining the size of the raw wrapped bitcode. + uint32_t wrapper_bc_size_; + + // Android header version and target API + uint32_t android_header_version_; + uint32_t android_target_api_; + + // PNaCl bitcode version + uint32_t pnacl_bc_version_; + + // Vector of variable header fields + std::vector<BCHeaderField> header_fields_; + // If any bufferdata from header fields is owned, it is stored here and + // freed on destruction. + std::vector<uint8_t*> variable_field_data_; + + // True if there was an error condition (e.g. the file is not bitcode) + bool error_; +}; + +#endif // LLVM_WRAP_BITCODE_WRAPPERER_H__ diff --git a/include/llvm/Wrap/file_wrapper_input.h b/include/llvm/Wrap/file_wrapper_input.h new file mode 100644 index 0000000000..9f3de004c4 --- /dev/null +++ b/include/llvm/Wrap/file_wrapper_input.h @@ -0,0 +1,48 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +// Defines utility allowing files for bitcode input wrapping. + +#ifndef FILE_WRAPPER_INPUT_H__ +#define FILE_WRAPPER_INPUT_H__ + +#include "llvm/Support/support_macros.h" +#include "llvm/Wrap/wrapper_input.h" + +#include <stdio.h> +#include <string> + +// Define a class to wrap named files. +class FileWrapperInput : public WrapperInput { + public: + FileWrapperInput(const std::string& name); + ~FileWrapperInput(); + // Tries to read the requested number of bytes into the buffer. Returns the + // actual number of bytes read. + virtual size_t Read(uint8_t* buffer, size_t wanted); + // Returns true if at end of file. Note: May return false + // until Read is called, and returns 0. 
+ virtual bool AtEof(); + // Returns the size of the file (in bytes). + virtual off_t Size(); + // Moves to the given offset within the file. Returns + // false if unable to move to that position. + virtual bool Seek(uint32_t pos); + private: + // The name of the file. + std::string _name; + // True once eof has been encountered. + bool _at_eof; + // True if size has been computed. + bool _size_found; + // The size of the file. + off_t _size; + // The corresponding (opened) file. + FILE* _file; + private: + DISALLOW_CLASS_COPY_AND_ASSIGN(FileWrapperInput); +}; + +#endif // FILE_WRAPPER_INPUT_H__ diff --git a/include/llvm/Wrap/file_wrapper_output.h b/include/llvm/Wrap/file_wrapper_output.h new file mode 100644 index 0000000000..714bd36a75 --- /dev/null +++ b/include/llvm/Wrap/file_wrapper_output.h @@ -0,0 +1,34 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +// Defines utility allowing files for bitcode output wrapping. + +#ifndef FILE_WRAPPER_OUTPUT_H__ +#define FILE_WRAPPER_OUTPUT_H__ + +#include "llvm/Support/support_macros.h" +#include "llvm/Wrap/wrapper_output.h" +#include <stdio.h> +#include <string> + +// Define a class to wrap named files. */ +class FileWrapperOutput : public WrapperOutput { + public: + FileWrapperOutput(const std::string& name); + ~FileWrapperOutput(); + // Writes a single byte, returning false if unable to write. + virtual bool Write(uint8_t byte); + // Writes the specified number of bytes in the buffer to + // output. Returns false if unable to write. + virtual bool Write(const uint8_t* buffer, size_t buffer_size); + private: + // The name of the file + std::string _name; + // The corresponding (opened) file. + FILE* _file; + private: + DISALLOW_CLASS_COPY_AND_ASSIGN(FileWrapperOutput); +}; +#endif // FILE_WRAPPER_OUTPUT_H__ diff --git a/include/llvm/Wrap/wrapper_input.h b/include/llvm/Wrap/wrapper_input.h new file mode 100644 index 0000000000..cde918083a --- /dev/null +++ b/include/llvm/Wrap/wrapper_input.h @@ -0,0 +1,38 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +// Define a generic interface to a file/memory region that contains +// a bitcode file, a wrapped bitcode file, or a data file to wrap. + +#ifndef LLVM_WRAP_WRAPPER_INPUT_H__ +#define LLVM_WRAP_WRAPPER_INPUT_H__ + +#include <stdint.h> +#include <sys/types.h> + +#include "llvm/Support/support_macros.h" + +// The following is a generic interface to a file/memory region that contains +// a bitcode file, a wrapped bitcode file, or data file to wrap. +class WrapperInput { + public: + WrapperInput() {} + virtual ~WrapperInput() {} + // Tries to read the requested number of bytes into the buffer. Returns the + // actual number of bytes read. + virtual size_t Read(uint8_t* buffer, size_t wanted) = 0; + // Returns true if at end of input. Note: May return false until + // Read is called, and returns 0. + virtual bool AtEof() = 0; + // Returns the size of the input (in bytes). + virtual off_t Size() = 0; + // Moves to the given offset within the input region. Returns false + // if unable to move to that position. 
+ virtual bool Seek(uint32_t pos) = 0; + private: + DISALLOW_CLASS_COPY_AND_ASSIGN(WrapperInput); +}; + +#endif // LLVM_WRAP_WRAPPER_INPUT_H__ diff --git a/include/llvm/Wrap/wrapper_output.h b/include/llvm/Wrap/wrapper_output.h new file mode 100644 index 0000000000..7045705991 --- /dev/null +++ b/include/llvm/Wrap/wrapper_output.h @@ -0,0 +1,34 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +// Defines a generic interface to a file/memory region that +// contains a generated wrapped bitcode file, bitcode file, +// or data file. + +#ifndef LLVM_WRAP_WRAPPER_OUTPUT_H__ +#define LLVM_WRAP_WRAPPER_OUTPUT_H__ + +#include <stdint.h> +#include <stddef.h> + +#include "llvm/Support/support_macros.h" + +// The following is a generic interface to a file/memory region +// that contains a generated bitcode file, wrapped bitcode file, +// or a data file. +class WrapperOutput { + public: + WrapperOutput() {} + virtual ~WrapperOutput() {} + // Writes a single byte, returning false if unable to write. + virtual bool Write(uint8_t byte) = 0; + // Writes the specified number of bytes in the buffer to + // output. Returns false if unable to write. + virtual bool Write(const uint8_t* buffer, size_t buffer_size); + private: + DISALLOW_CLASS_COPY_AND_ASSIGN(WrapperOutput); +}; + +#endif // LLVM_WRAP_WRAPPER_OUTPUT_H__ diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 279343c48c..96b3925ed7 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -1544,6 +1544,14 @@ bool BitcodeReader::ParseModule(bool Resume) { std::string S; if (ConvertToString(Record, 0, S)) return Error("Invalid MODULE_CODE_TRIPLE record"); + + // @LOCALMOD-BEGIN + // This hack is needed in order to get Clang compiled binaries + // working with the Gold plugin, until PNaCl backend is introduced + // in lib/Target/PNaCl. + if (S == "le32-unknown-nacl") + S = "armv7-none-linux-gnueabi"; + // @LOCALMOD-END TheModule->setTargetTriple(S); break; } @@ -2831,6 +2839,16 @@ bool BitcodeReader::isDematerializable(const GlobalValue *GV) const { const Function *F = dyn_cast<Function>(GV); if (!F || F->isDeclaration()) return false; + // @LOCALMOD-START + // Don't dematerialize functions with BBs which have their address taken; + // it will cause any referencing blockAddress constants to also be destroyed, + // but because they are GVs, they need to stay around until PassManager + // finalization. + for (Function::const_iterator BB = F->begin(); BB != F->end(); ++BB) { + if (BB->hasAddressTaken()) + return false; + } + // @LOCALMOD-END return DeferredFunctionInfo.count(const_cast<Function*>(F)); } @@ -2980,6 +2998,9 @@ Module *llvm::getStreamedBitcodeModule(const std::string &name, return 0; } R->setBufferOwned(false); // no buffer to delete + + R->materializeForwardReferencedFunctions(); + return M; } diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index d74a70362a..b4f0b174b5 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -156,6 +156,11 @@ bool AsmPrinter::doInitialization(Module &M) { MMI = getAnalysisIfAvailable<MachineModuleInfo>(); MMI->AnalyzeModule(M); + // @LOCALMOD-BEGIN + IsPlainObject = + (MMI->getModule()->getOutputFormat() == Module::ObjectOutputFormat); + // @LOCALMOD-END + // Initialize TargetLoweringObjectFile. 
const_cast<TargetLoweringObjectFile&>(getObjFileLowering()) .Initialize(OutContext, TM); @@ -272,6 +277,17 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) { MCSymbol *GVSym = Mang->getSymbol(GV); EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration()); + // @LOCALMOD-BEGIN + // For .pexe and .pso files, emit ELF type STT_OBJECT or STT_TLS instead + // of NOTYPE for undefined symbols. + // BUG= http://code.google.com/p/nativeclient/issues/detail?id=2527 + if (!GV->hasInitializer() && !IsPlainObject) { + OutStreamer.EmitSymbolAttribute(GVSym, + GV->isThreadLocal() ? MCSA_ELF_TypeTLS + : MCSA_ELF_TypeObject); + } + // @LOCALMOD-END + if (!GV->hasInitializer()) // External globals require no extra code. return; @@ -681,9 +697,14 @@ void AsmPrinter::EmitFunctionBody() { break; case TargetOpcode::EH_LABEL: - case TargetOpcode::GC_LABEL: + case TargetOpcode::GC_LABEL: { + // @LOCALMOD-START + unsigned LabelAlign = GetTargetLabelAlign(II); + if (LabelAlign) EmitAlignment(LabelAlign); + // @LOCALMOD-END OutStreamer.EmitLabel(II->getOperand(0).getMCSymbol()); break; + } case TargetOpcode::INLINEASM: EmitInlineAsm(II); break; @@ -699,6 +720,20 @@ void AsmPrinter::EmitFunctionBody() { case TargetOpcode::KILL: if (isVerbose()) emitKill(II, *this); break; + // @LOCALMOD-BEGIN + case TargetOpcode::BUNDLE_ALIGN_START: + OutStreamer.EmitBundleAlignStart(); + break; + case TargetOpcode::BUNDLE_ALIGN_END: + OutStreamer.EmitBundleAlignEnd(); + break; + case TargetOpcode::BUNDLE_LOCK: + OutStreamer.EmitBundleLock(); + break; + case TargetOpcode::BUNDLE_UNLOCK: + OutStreamer.EmitBundleUnlock(); + break; + // @LOCALMOD-END default: if (!TM.hasMCUseLoc()) MCLineEntry::Make(&OutStreamer, getCurrentSection()); @@ -848,6 +883,16 @@ bool AsmPrinter::doFinalization(Module &M) { const Function &F = *I; if (!F.isDeclaration()) continue; + + // @LOCALMOD-BEGIN + // For .pexe and .pso files, emit STT_FUNC for function declarations. + // BUG= http://code.google.com/p/nativeclient/issues/detail?id=2527 + if (!IsPlainObject) { + OutStreamer.EmitSymbolAttribute(Mang->getSymbol(&F), + MCSA_ELF_TypeFunction); + } + // @LOCALMOD-END + GlobalValue::VisibilityTypes V = F.getVisibility(); if (V == GlobalValue::DefaultVisibility) continue; @@ -1065,12 +1110,25 @@ void AsmPrinter::EmitJumpTableInfo() { if (// In PIC mode, we need to emit the jump table to the same section as the // function body itself, otherwise the label differences won't make sense. // FIXME: Need a better predicate for this: what about custom entries? - MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 || + (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 || // We should also do if the section name is NULL or function is declared // in discardable section // FIXME: this isn't the right predicate, should be based on the MCSection // for the function. - F->isWeakForLinker()) { + // @LOCALMOD-START + // the original code is a hack + // jumptables usually end up in .rodata + // but for functions with weak linkage there is a chance that the are + // not needed. So in order to be discard the function AND the jumptable + // they keep them both in .text. This fix only works if we never discard + // weak functions. This is guaranteed because the bitcode linker already + // throws out unused ones. + // TODO: Investigate the other case of concern -- PIC code. + // Concern is about jumptables being in a different section: can the + // rodata and text be too far apart for a RIP-relative offset? 
+ F->isWeakForLinker()) + && !UseReadOnlyJumpTables()) { + // @LOCALMOD-END OutStreamer.SwitchSection(getObjFileLowering().SectionForGlobal(F,Mang,TM)); } else { // Otherwise, drop it in the readonly section. @@ -1097,7 +1155,7 @@ void AsmPrinter::EmitJumpTableInfo() { // .set directive for each unique entry. This reduces the number of // relocations the assembler will generate for the jump table. if (MJTI->getEntryKind() == MachineJumpTableInfo::EK_LabelDifference32 && - MAI->hasSetDirective()) { + MAI->hasSetDirective() && !UseReadOnlyJumpTables()) { // @LOCALMOD SmallPtrSet<const MachineBasicBlock*, 16> EmittedSets; const TargetLowering *TLI = TM.getTargetLowering(); const MCExpr *Base = TLI->getPICJumpTableRelocBaseExpr(MF,JTI,OutContext); @@ -1180,7 +1238,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, // If we have emitted set directives for the jump table entries, print // them rather than the entries themselves. If we're emitting PIC, then // emit the table entries as differences between two text section labels. - if (MAI->hasSetDirective()) { + if (MAI->hasSetDirective() && !UseReadOnlyJumpTables()) { // @LOCALMOD // If we used .set, reference the .set's symbol. Value = MCSymbolRefExpr::Create(GetJTSetSymbol(UID, MBB->getNumber()), OutContext); @@ -1200,7 +1258,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI, OutStreamer.EmitValue(Value, EntrySize, /*addrspace*/0); } - /// EmitSpecialLLVMGlobal - Check to see if the specified global is a /// special global used by LLVM. If so, emit it and return true, otherwise /// do nothing and return false. diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 367b523079..22535fe5b4 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -571,7 +571,8 @@ DIE *DwarfDebug::constructScopeDIE(CompileUnit *TheCU, LexicalScope *Scope) { /// in the SourceIds map. This can update DirectoryNames and SourceFileNames /// maps as well. unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName, - StringRef DirName) { + StringRef DirName, + StringRef Extra) { // @LOCALMOD // If FE did not provide a file name, then assume stdin. if (FileName.empty()) return GetOrCreateSourceID("<stdin>", StringRef()); @@ -587,6 +588,9 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName, NamePair += DirName; NamePair += '\0'; // Zero bytes are not allowed in paths. NamePair += FileName; + // @LOCALMOD + NamePair += '\0'; // Zero bytes are not allowed in paths. + NamePair += Extra; StringMapEntry<unsigned> &Ent = SourceIdMap.GetOrCreateValue(NamePair, SrcId); if (Ent.getValue() != SrcId) @@ -598,13 +602,37 @@ unsigned DwarfDebug::GetOrCreateSourceID(StringRef FileName, return SrcId; } +// @LOCALMOD-BEGIN +// A special version of GetOrCreateSourceID for CompileUnits. +// It is possible that with bitcode linking, we end up with distinct +// compile units based on the same source file. +// E.g., compile foo.c with -DMACRO1 to foo1.bc, then compile +// foo.c again with -DMACRO2 to foo2.bc and link. +// We use additional information to form a unique ID in that case. +unsigned DwarfDebug::GetOrCreateCompileUnitID(StringRef Filename, + StringRef Dirname, + const MDNode *N) { + std::string DIUnitStr; + raw_string_ostream ostr(DIUnitStr); + + // Using information from the compile unit (N)'s getEnumTypes(), + // getRetainedTypes(), getSubprograms(), getGlobalVariables() + // could be pretty expensive. 
+ // Cheat and use the MDNode's address as an additional identifying factor. + // constructCompileUnit() is only called once per compile unit. + ostr << static_cast<const void*>(N); + return GetOrCreateSourceID(Filename, Dirname, ostr.str()); +} +// @LOCALMOD-END + /// constructCompileUnit - Create new CompileUnit for the given /// metadata node with tag DW_TAG_compile_unit. CompileUnit *DwarfDebug::constructCompileUnit(const MDNode *N) { DICompileUnit DIUnit(N); StringRef FN = DIUnit.getFilename(); CompilationDir = DIUnit.getDirectory(); - unsigned ID = GetOrCreateSourceID(FN, CompilationDir); + // @LOCALMOD + unsigned ID = GetOrCreateCompileUnitID(FN, CompilationDir, N); DIE *Die = new DIE(dwarf::DW_TAG_compile_unit); CompileUnit *NewCU = new CompileUnit(ID, DIUnit.getLanguage(), Die, diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 61d9a51a52..475c6f86d9 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -524,7 +524,16 @@ public: /// GetOrCreateSourceID - Look up the source id with the given directory and /// source file names. If none currently exists, create a new id and insert it /// in the SourceIds map. - unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName); + unsigned GetOrCreateSourceID(StringRef DirName, StringRef FullName, + StringRef Extra = ""); // @LOCALMOD for Extra + + // @LOCALMOD-BEGIN - Create an ID for CompileUnits, taking extra care + // in the case that we have multiple compile units coming from the + // same source file and directory. + unsigned GetOrCreateCompileUnitID(StringRef FileName, StringRef DirName, + const MDNode *N); + // @LOCALMOD-END + /// getStringPool - returns the entry into the start of the pool. MCSymbol *getStringPool(); diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp index 6f4c5a2f66..90f6eec831 100644 --- a/lib/CodeGen/BranchFolding.cpp +++ b/lib/CodeGen/BranchFolding.cpp @@ -20,6 +20,7 @@ #include "BranchFolding.h" #include "llvm/Function.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/MachineConstantPool.h" // @LOCALMOD #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -234,6 +235,21 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF, } } + // @LOCALMOD-START + // This currently only used on ARM targets where the ConstantPool + // subclass is overloading getJumpTableIndex() + const std::vector<MachineConstantPoolEntry>& CPs = + MF.getConstantPool()->getConstants(); + for (unsigned i = 0, e = CPs.size(); i != e; ++i) { + if (!CPs[i].isMachineConstantPoolEntry()) continue; + unsigned *JTIndex = CPs[i].Val.MachineCPVal->getJumpTableIndex(); + if (!JTIndex) continue; + // Remember that this JT is live. + JTIsLive.set(*JTIndex); + } + // @LOCALMOD-END + + // Finally, remove dead jump tables. This happens when the // indirect jump was unreachable (and thus deleted). for (unsigned i = 0, e = JTIsLive.size(); i != e; ++i) diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp index 22b9140924..6ae07dfb0b 100644 --- a/lib/CodeGen/CallingConvLower.cpp +++ b/lib/CodeGen/CallingConvLower.cpp @@ -33,6 +33,7 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf, StackOffset = 0; clearFirstByValReg(); + clearHasByValInRegPosition(); // @LOCALMOD. 
UsedRegs.resize((TRI.getNumRegs()+31)/32); } diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp index 6120ae56b4..b7c9f17df9 100644 --- a/lib/CodeGen/IntrinsicLowering.cpp +++ b/lib/CodeGen/IntrinsicLowering.cpp @@ -92,6 +92,46 @@ static CallInst *ReplaceCallWith(const char *NewFn, CallInst *CI, # define setjmp_undefined_for_msvc #endif +// @LOCALMOD-BEGIN +// Calls to these functions may materialize as part of a conversion +// from an intrinsics, e.g. llvm.memset -> memset +// So if these functions are available in bitcode form we need to: +// * make sure they do not get discarded -- if there is a chance that +// a caller might materialize +// * make sure they do not get specialized for a given callsite +// Both problems are avoided by pretending there are unknown callers. +// The function: IntrinsicLowering::AddPrototypes() below does just that. +// TODO(robertm): elaborate some more +static const char *IntrinsicNames[] = { + "abort", + "memcpy", "memset", "memmove", + "sqrtf", "sqrt", "sqrtl", + "sinf", "sin", "sinl", + "cosf", "cos", "cosl", + "powf", "pow", "powl", + "logf", "log", "logl", + "log2f", "log2", "log2l", + "log10f", "log10", "log10l", + "expf", "exp", "expl", + "exp2f", "exp2", "exp2l", + NULL +}; + +StringSet<> IntrinsicLowering::FuncNames; + +const StringSet<> &IntrinsicLowering::GetFuncNames() { + if (FuncNames.empty()) { + for (unsigned i=0; IntrinsicNames[i]; ++i) + FuncNames.insert(IntrinsicNames[i]); + } + return FuncNames; +} + +bool IntrinsicLowering::IsCalledByIntrinsic(const StringRef &FuncName) { + return IntrinsicLowering::GetFuncNames().count(FuncName) > 0; +} +// @LOCALMOD-END + void IntrinsicLowering::AddPrototypes(Module &M) { LLVMContext &Context = M.getContext(); for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 18d021d521..34b24b6085 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -164,7 +164,8 @@ MachineBasicBlock::SkipPHIsAndLabels(MachineBasicBlock::iterator I) { MachineBasicBlock::iterator MachineBasicBlock::getFirstTerminator() { iterator B = begin(), E = end(), I = E; - while (I != B && ((--I)->isTerminator() || I->isDebugValue())) + while (I != B && ((--I)->isTerminator() || I->isDebugValue() + || I->getOpcode() == TargetOpcode::BUNDLE_UNLOCK)) // @LOCALMOD ; /*noop */ while (I != E && !I->isTerminator()) ++I; diff --git a/lib/CodeGen/Passes.cpp b/lib/CodeGen/Passes.cpp index 4ea21d4ff7..7c7d2c8045 100644 --- a/lib/CodeGen/Passes.cpp +++ b/lib/CodeGen/Passes.cpp @@ -352,6 +352,16 @@ void TargetPassConfig::addIRPasses() { addPass(createTypeBasedAliasAnalysisPass()); addPass(createBasicAliasAnalysisPass()); + // @LOCALMOD-START + addPass(createNaClCcRewritePass(TM->getTargetLowering())); + // TODO: consider adding a cleanup pass, e.g. constant propagation + // Note: we run this before the verfier step because it may cause + // a *temporary* inconsistency: + // A function may have been rewritting before we are rewriting + // its callers - which would lead to a parameter mismatch complaint + // from the verifier. + // @LOCALMOD-END + // Before running any passes, run the verifier to determine if the input // coming from the front-end and/or optimizer is valid. 
if (!DisableVerify) diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3fbf7c2fe6..be3168618e 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -5278,6 +5278,28 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::donothing: // ignore return 0; + // @LOCALMOD-BEGIN + // Native Client Intrinsics for TLS setup / layout. + case Intrinsic::nacl_tp_tls_offset: { + SDValue tls_size = getValue(I.getArgOperand(0)); + setValue(&I, DAG.getNode(ISD::NACL_TP_TLS_OFFSET, dl, + tls_size.getValueType(), + tls_size)); + return 0; + } + case Intrinsic::nacl_tp_tdb_offset: { + SDValue tdb_size = getValue(I.getArgOperand(0)); + setValue(&I, DAG.getNode(ISD::NACL_TP_TDB_OFFSET, dl, + tdb_size.getValueType(), + tdb_size)); + return 0; + } + case Intrinsic::nacl_target_arch: { + EVT DestVT = TLI.getValueType(I.getType()); + setValue(&I, DAG.getNode(ISD::NACL_TARGET_ARCH, dl, DestVT)); + return 0; + } + // @LOCALMOD-END } } @@ -6454,7 +6476,10 @@ void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) { SDValue V = DAG.getVAArg(TLI.getValueType(I.getType()), getCurDebugLoc(), getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)), - TD.getABITypeAlignment(I.getType())); +// @LOCALMOD-BEGIN + TD.getCallFrameTypeAlignment(I.getType())); +// @LOCALMOD-END + setValue(&I, V); DAG.setRoot(V.getValue(1)); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 6f3ce7a44b..a870ee2ac8 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -313,6 +313,13 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETFALSE: return "setfalse"; case ISD::SETFALSE2: return "setfalse2"; } + + // @LOCALMOD-BEGIN + // NaCl intrinsics for TLS setup + case ISD::NACL_TP_TLS_OFFSET: return "nacl_tls_offset"; + case ISD::NACL_TP_TDB_OFFSET: return "nacl_tdb_offset"; + case ISD::NACL_TARGET_ARCH: return "nacl_target_arch"; + // @LOCALMOD-END } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index c314fa5b51..20afa3def3 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -564,7 +564,6 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { } DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); - if (ViewDAGCombine1) CurDAG->viewGraph("dag-combine1 input for " + BlockName); // Run the DAG combiner in pre-legalize mode. @@ -593,7 +592,6 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { if (Changed) { if (ViewDAGCombineLT) CurDAG->viewGraph("dag-combine-lt input for " + BlockName); - // Run the DAG combiner in post-type-legalize mode. { NamedRegionTimer T("DAG Combining after legalize types", GroupName, @@ -615,10 +613,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { NamedRegionTimer T("Type Legalization 2", GroupName, TimePassesIsEnabled); CurDAG->LegalizeTypes(); } - if (ViewDAGCombineLT) CurDAG->viewGraph("dag-combine-lv input for " + BlockName); - // Run the DAG combiner in post-type-legalize mode. 
{ NamedRegionTimer T("DAG Combining after legalize vectors", GroupName, @@ -629,19 +625,15 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); } - if (ViewLegalizeDAGs) CurDAG->viewGraph("legalize input for " + BlockName); - { NamedRegionTimer T("DAG Legalization", GroupName, TimePassesIsEnabled); CurDAG->Legalize(); } - DEBUG(dbgs() << "Legalized selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); if (ViewDAGCombine2) CurDAG->viewGraph("dag-combine2 input for " + BlockName); - // Run the DAG combiner in post-legalize mode. { NamedRegionTimer T("DAG Combining 2", GroupName, TimePassesIsEnabled); diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 8f5d770f66..6df4a0aa2a 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -55,8 +55,16 @@ TargetLoweringObjectFileELF::getCFIPersonalitySymbol(const GlobalValue *GV, case dwarf::DW_EH_PE_absptr: return Mang->getSymbol(GV); case dwarf::DW_EH_PE_pcrel: { + // @LOCALMOD-BEGIN + // The dwarf section label should not include the version suffix. + // Strip it off here. + StringRef Name = Mang->getSymbol(GV)->getName(); + size_t atpos = Name.find("@"); + if (atpos != StringRef::npos) + Name = Name.substr(0, atpos); + // @LOCALMOD-END return getContext().GetOrCreateSymbol(StringRef("DW.ref.") + - Mang->getSymbol(GV)->getName()); + Name); // @LOCALMOD } } } @@ -65,7 +73,15 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(MCStreamer &Streamer, const TargetMachine &TM, const MCSymbol *Sym) const { SmallString<64> NameData("DW.ref."); - NameData += Sym->getName(); + // @LOCALMOD-BEGIN + // The dwarf section label should not include the version suffix. + // Strip it off here. + StringRef Name = Sym->getName(); + size_t atpos = Name.find("@"); + if (atpos != StringRef::npos) + Name = Name.substr(0, atpos); + // @LOCALMOD-END + NameData += Name; // @LOCALMOD MCSymbol *Label = getContext().GetOrCreateSymbol(NameData); Streamer.EmitSymbolAttribute(Label, MCSA_Hidden); Streamer.EmitSymbolAttribute(Label, MCSA_Weak); diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index e16e2d112a..e3b90fdf78 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -322,7 +322,9 @@ GenericValue lle_X_abort(FunctionType *FT, const std::vector<GenericValue> &Args) { //FIXME: should we report or raise here? 
//report_fatal_error("Interpreted program raised SIGABRT"); - raise (SIGABRT); + //TODO(dschuff) fixme or figure out how to get raise() + abort(); // @LOCALMOD + //raise (SIGABRT); return GenericValue(); } diff --git a/lib/ExecutionEngine/JIT/JIT.h b/lib/ExecutionEngine/JIT/JIT.h index 2ae155bebf..338db8f454 100644 --- a/lib/ExecutionEngine/JIT/JIT.h +++ b/lib/ExecutionEngine/JIT/JIT.h @@ -210,6 +210,8 @@ public: private: static JITCodeEmitter *createEmitter(JIT &J, JITMemoryManager *JMM, TargetMachine &tm); + // Native client needs its own memory manager, so custom ones are unsupported + static JITCodeEmitter *createNaClEmitter(JIT &J, TargetMachine &tm); void runJITOnFunctionUnlocked(Function *F, const MutexGuard &locked); void updateFunctionStub(Function *F); void jitTheFunction(Function *F, const MutexGuard &locked); diff --git a/lib/ExecutionEngine/JIT/JITEmitter.cpp b/lib/ExecutionEngine/JIT/JITEmitter.cpp index ecafda7286..1c5abf751d 100644 --- a/lib/ExecutionEngine/JIT/JITEmitter.cpp +++ b/lib/ExecutionEngine/JIT/JITEmitter.cpp @@ -30,6 +30,7 @@ #include "llvm/ExecutionEngine/GenericValue.h" #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/JITMemoryManager.h" +#include "llvm/ExecutionEngine/NaClJITMemoryManager.h" #include "llvm/DataLayout.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetJITInfo.h" @@ -52,12 +53,15 @@ #ifndef NDEBUG #include <iomanip> #endif +#ifdef __native_client__ +#include <nacl/nacl_dyncode.h> +#endif using namespace llvm; STATISTIC(NumBytes, "Number of bytes of machine code compiled"); STATISTIC(NumRelos, "Number of relocations applied"); STATISTIC(NumRetries, "Number of retries with more memory"); - +STATISTIC(NumNopBytes, "Number of bytes of NOPs emitted"); // A declaration may stop being a declaration once it's fully read from bitcode. // This function returns true if F is fully read and is still a declaration. @@ -281,8 +285,6 @@ namespace { /// JITEmitter - The JIT implementation of the MachineCodeEmitter, which is /// used to output functions to memory for execution. class JITEmitter : public JITCodeEmitter { - JITMemoryManager *MemMgr; - // When outputting a function stub in the context of some other function, we // save BufferBegin/BufferEnd/CurBufferPtr here. uint8_t *SavedBufferBegin, *SavedBufferEnd, *SavedCurBufferPtr; @@ -292,11 +294,13 @@ namespace { // ask the memory manager for at least this much space. When we // successfully emit the function, we reset this back to zero. uintptr_t SizeEstimate; - +protected: //TODO:(dschuff): fix/move this once we do validation and are sure + // which functions/data we need in NaClJITEmitter. also add LOCALMOD + JITMemoryManager *MemMgr; /// Relocations - These are the relocations that the function needs, as /// emitted. std::vector<MachineRelocation> Relocations; - +private: /// MBBLocations - This vector is a mapping from MBB ID's to their address. /// It is filled in by the StartMachineBasicBlock callback and queried by /// the getMachineBasicBlockAddress callback. 
@@ -380,7 +384,7 @@ namespace { DE.reset(new JITDwarfEmitter(jit)); } } - ~JITEmitter() { + virtual ~JITEmitter() { // @LOCALMOD delete MemMgr; } @@ -393,10 +397,10 @@ namespace { void initJumpTableInfo(MachineJumpTableInfo *MJTI); void emitJumpTableInfo(MachineJumpTableInfo *MJTI); - void startGVStub(const GlobalValue* GV, + virtual void startGVStub(const GlobalValue* GV, unsigned StubSize, unsigned Alignment = 1); - void startGVStub(void *Buffer, unsigned StubSize); - void finishGVStub(); + virtual void startGVStub(void *Buffer, unsigned StubSize); + virtual void finishGVStub(); virtual void *allocIndirectGV(const GlobalValue *GV, const uint8_t *Buffer, size_t Size, unsigned Alignment); @@ -468,6 +472,360 @@ namespace { bool MayNeedFarStub); void *getPointerToGVIndirectSym(GlobalValue *V, void *Reference); }; + + // @LOCALMOD-START + class NaClJITEmitter : public JITEmitter { + /* There are two Nacl-specific requirements that must be dealt with: the + * first is that the data and code spaces are strictly separated, and code + * must be copied (by the service runtime/validator)to its destination + * after emission and relocation have finished. + * The second is bundle alignment: neither instructions nor multi- + * instruction pseudoinstruction groups may cross bundle boundaries. + * + * Requirement 1 is dealt with jointly by NaClJITMemoryManager and + * and NaClJITEmitter. NaClJITMemoryManager separates metadata from + * code and returns pointers in the proper space + * for code (startFunctionBody, allocateStub) and data (allocateSpace, + * startExceptionTable, etc). NaClJITEmitter emits code into a separate + * memory buffer (EmissionBuffer). After startFunction allocates the + * function's memory, NaClJITEmitter's startFunction points BufferBegin, + * CurBufferPtr and BufferEnd at the EmissionBuffer (this avoids having to + * override all of the actual emission methods from JITCodeEmitter) + * JITEmitter already uses this trick for emitting a stub in the middle + * of emitting a function so it doesn't seem so terrible to do our own + * similar swapping of the pointers. + * + * Requirement 2 is bundle alignment. + * X86CodeEmitter makes several calls into JITCodeEmitter per instruction, + * to add the various bytes, constants, etc. To implement bundle alignment, + * we add methods to start and end a bundle-locked group + * (the group can include just one instruction or several). + * The X86CodeEmitter will pass-through any such markers created by the + * rewriting passes (which surround multiple-instruction groups), + * and will also generate them surrounding each individual instruction + * (there should never be more than two-deep nesting). + * When beginBundleLock is called, the CurBufferPtr is marked. When + * endBundleLock is called, it checks that the group does not cross a + * bundle boundary; if it does, it inserts nop padding as necessary. + * If padding is added, the relocations must also be fixed up; this also + * happens in endBundleLock. 
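+   * For illustration, with a 32-byte bundle: a 10-byte locked group that
+   * starts at offset 58 would cross the bundle boundary at 64, so
+   * endBundleLock inserts 6 bytes of nop padding and the group moves to
+   * [64, 74); any relocations recorded inside the group are shifted by the
+   * same 6 bytes. A group marked with alignToBundleEnd is shifted further
+   * so that it ends exactly on the next bundle boundary.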
+ * + */ + public: + NaClJITEmitter(JIT &jit, TargetMachine &TM) : + JITEmitter(jit, new NaClJITMemoryManager(), TM), + BundleLockSavedCurBufferPtr(NULL), + BundleNestCount(0), + AlignNextGroup(kNone), + GroupRelocationCount(0), + JITInfo(&jit.getJITInfo()), + kBundleSize(jit.getJITInfo().getBundleSize()), + kJumpMask(jit.getJITInfo().getJumpMask()) { + uintptr_t CodeSlabSize = MemMgr->GetDefaultCodeSlabSize(); + EmissionBuffer = MemMgr->allocateSpace(CodeSlabSize, kBundleSize); + EmissionBufferSize = CodeSlabSize; + DEBUG(dbgs() << "EmissionBuffer " << EmissionBuffer << " size " + << EmissionBufferSize << "\n"); + StubEmissionBuffer = MemMgr->allocateSpace(kBundleSize, kBundleSize); + StubEmissionBufferSize = kBundleSize; + DEBUG(dbgs() << "StubEmissionBuffer " << StubEmissionBuffer << " size " + << StubEmissionBufferSize << "\n"); + JITInfo = &jit.getJITInfo(); + } + + virtual ~NaClJITEmitter() { + } + + static inline bool classof(const JITEmitter*) { return true; } + + virtual void startFunction(MachineFunction &F) { + JITEmitter::startFunction(F); + // Make sure the emission buffer is at least as big as the allocated + // function + if (BufferEnd - BufferBegin > (intptr_t)EmissionBufferSize) { + EmissionBufferSize = std::max((uintptr_t)(BufferEnd - BufferBegin), + 2 * EmissionBufferSize); + // BumpPtrAllocator doesn't do anything when you call Deallocate. it + // will be freed on destruction + EmissionBuffer = MemMgr->allocateSpace(EmissionBufferSize, + kBundleSize); + DEBUG(dbgs() << "new EmissionBuffer " << EmissionBuffer << " size " + << EmissionBufferSize << "\n"); + } + // We ensure that the emission buffer is bundle-aligned, and constant + // pool emission should not go into code space + assert((CurBufferPtr == BufferBegin || + (int)F.getFunction()->getAlignment() > kBundleSize) && + "Pre-function data should not be emitted into code space"); + if (CurBufferPtr > BufferBegin) { + // If CurBufferPtr has been bumped forward for alignment, we need to + // pad the space with nops + memcpy(EmissionBuffer, + JITInfo->getNopSequence(CurBufferPtr - BufferBegin), + CurBufferPtr - BufferBegin); + NumNopBytes += CurBufferPtr - BufferBegin; + } + FunctionDestination = BufferBegin; + setBufferPtrs(EmissionBuffer); + } + + virtual bool finishFunction(MachineFunction &F) { + uint8_t *end = CurBufferPtr; + emitAlignment(kBundleSize); + memcpy(end, JITInfo->getNopSequence(CurBufferPtr - end), + CurBufferPtr - end); + NumNopBytes += CurBufferPtr - end; + JITInfo->setRelocationBuffer(BufferBegin); + assert(BufferBegin == EmissionBuffer); + int FunctionSize = CurBufferPtr - BufferBegin; + setBufferPtrs(FunctionDestination); + bool result = JITEmitter::finishFunction(F); + // If we ran out of memory, don't bother validating, we'll just retry + if (result) return result; + + DEBUG({ + dbgs() << "Validating " << FunctionDestination << "-" << + FunctionDestination + FunctionSize << "\n"; + if (sys::hasDisassembler()) { + dbgs() << "Disassembled code:\n"; + dbgs() << sys::disassembleBuffer(EmissionBuffer, + FunctionSize, + (uintptr_t)FunctionDestination); + } else { + dbgs() << "Binary code:\n"; + uint8_t* q = BufferBegin; + for (int i = 0; q < CurBufferPtr; q += 4, ++i) { + if (i == 4) + i = 0; + if (i == 0) + dbgs() << "JIT: " << (long)(q - BufferBegin) << ": "; + bool Done = false; + for (int j = 3; j >= 0; --j) { + if (q + j >= CurBufferPtr) + Done = true; + else + dbgs() << (unsigned short)q[j]; + } + if (Done) + break; + dbgs() << ' '; + if (i == 3) + dbgs() << '\n'; + } + dbgs()<< '\n'; + } + }); 
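+    // The finished bytes are handed to the NaCl service runtime here:
+    // nacl_dyncode_create() validates the contents of EmissionBuffer and,
+    // only if validation succeeds, copies them into executable memory at
+    // FunctionDestination; the JIT never writes to the code region directly.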
+#ifdef __native_client__ + if(nacl_dyncode_create(FunctionDestination, EmissionBuffer, + FunctionSize) != 0) { + report_fatal_error("NaCl validation failed"); + } +#endif + return result; + } + + virtual void startGVStub(const GlobalValue* GV, + unsigned StubSize, unsigned Alignment = 1) { + JITEmitter::startGVStub(GV, StubSize, Alignment); + ReusedStub = false; + assert(StubSize <= StubEmissionBufferSize); + StubDestination = BufferBegin; + setBufferPtrs(StubEmissionBuffer); + } + virtual void startGVStub(void *Buffer, unsigned StubSize) { + JITEmitter::startGVStub(Buffer, StubSize); + ReusedStub = true; + assert(StubSize <= StubEmissionBufferSize); + StubDestination = BufferBegin; + setBufferPtrs(StubEmissionBuffer); + } + virtual void finishGVStub() { + assert(CurBufferPtr - BufferBegin == kBundleSize); + + DEBUG(dbgs() << "Validating "<< BufferBegin<<"-"<<StubDestination<<"\n"); + int ValidationResult; +#ifdef __native_client__ + if (!ReusedStub) { + ValidationResult = nacl_dyncode_create(StubDestination, BufferBegin, + CurBufferPtr - BufferBegin); + } else { + // This is not a thread-safe modification because it updates the whole + // stub rather than just a jump target. However it is only used by + // eager compilation to replace a stub which is not in use yet + // (it jumps to 0). + ValidationResult = nacl_dyncode_modify(StubDestination, BufferBegin, + CurBufferPtr - BufferBegin); + } +#endif + if (ValidationResult) { + dbgs() << "NaCl stub validation failed:\n"; + if (sys::hasDisassembler()) { + dbgs() << "Disassembled code:\n"; + dbgs() << sys::disassembleBuffer(BufferBegin, + CurBufferPtr-BufferBegin, + (uintptr_t)StubDestination); + } + report_fatal_error("Stub validation failed"); + } + setBufferPtrs(StubDestination); + JITEmitter::finishGVStub(); + } + + /// allocateSpace - Allocates *data* space, rather than space in the + // current code block. + virtual void *allocateSpace(uintptr_t Size, unsigned Alignment) { + return MemMgr->allocateSpace(Size, Alignment); + } + + virtual void StartMachineBasicBlock(MachineBasicBlock *MBB) { + uint8_t *end = CurBufferPtr; + emitAlignment(MBB->getAlignment()); + memcpy(end, JITInfo->getNopSequence(CurBufferPtr - end), + CurBufferPtr - end); + NumNopBytes += CurBufferPtr - end; + JITEmitter::StartMachineBasicBlock(MBB); + } + + /// beginBundleLock - Save the current location of CurBufferPtr so we can + // tell if the block crosses a bundle boundary + virtual void beginBundleLock() { + assert(BundleNestCount <= 2 && "Bundle-locked groups can't be nested"); + if (++BundleNestCount == 2) return; + DEBUG(dbgs() << "begin lock, buffer begin:end:cur "<<BufferBegin<<" "<< + BufferEnd<< " "<<CurBufferPtr << "\n"); + BundleLockSavedCurBufferPtr = CurBufferPtr; + GroupRelocationCount = 0; + } + + /// endBundleLock - Check if the group crosses a bundle boundary. If so + // (or if the group must be aligned to the end of a bundle), move the + // group and add appropriate padding + virtual void endBundleLock() { + assert(BundleNestCount > 0 && "mismatched bundle-lock start/end"); + if (--BundleNestCount > 0) return; + DEBUG(dbgs() <<"end lock, buffer begin:end:cur:savd "<<BufferBegin<<" "<< + BufferEnd<< " "<<CurBufferPtr <<" "<< + BundleLockSavedCurBufferPtr<<"\n"); + + int GroupLen = CurBufferPtr - BundleLockSavedCurBufferPtr; + if (BufferEnd - CurBufferPtr < + GroupLen + kBundleSize) { + // Added padding can be no more than kBundleSize. 
Retry if there's any + // possibility of overflow + CurBufferPtr = BufferEnd; + AlignNextGroup = kNone; + return; + } + // Space left in the current bundle + int SpaceLeft = (((intptr_t)BundleLockSavedCurBufferPtr + kBundleSize) + & kJumpMask) - (intptr_t)BundleLockSavedCurBufferPtr; + int TotalPadding = 0; + if (SpaceLeft < GroupLen || AlignNextGroup == kBegin) { + DEBUG(dbgs() << "space " << SpaceLeft <<" len "<<GroupLen<<"\n"); + memmove(BundleLockSavedCurBufferPtr + SpaceLeft, + BundleLockSavedCurBufferPtr, GroupLen); + memcpy(BundleLockSavedCurBufferPtr, JITInfo->getNopSequence(SpaceLeft), + SpaceLeft); + NumNopBytes += SpaceLeft; + assert(CurBufferPtr == BundleLockSavedCurBufferPtr + GroupLen); + CurBufferPtr += SpaceLeft; + BundleLockSavedCurBufferPtr += SpaceLeft; + TotalPadding = SpaceLeft; + SpaceLeft = kBundleSize; + } + + if (AlignNextGroup == kEnd) { + DEBUG(dbgs() << "alignend, space len "<<SpaceLeft<<" "<<GroupLen<<"\n"); + int MoveDistance = SpaceLeft - GroupLen; + memmove(BundleLockSavedCurBufferPtr + MoveDistance, + BundleLockSavedCurBufferPtr, GroupLen); + memcpy(BundleLockSavedCurBufferPtr, + JITInfo->getNopSequence(MoveDistance), MoveDistance); + NumNopBytes += MoveDistance; + CurBufferPtr += MoveDistance; + TotalPadding += MoveDistance; + } + + AlignNextGroup = kNone; + + assert(CurBufferPtr <= BufferEnd && "Bundled group caused buf overflow"); + if (TotalPadding && GroupRelocationCount) { + assert(Relocations.size() >= GroupRelocationCount && + "Too many relocations recorded for this group"); + for(std::vector<MachineRelocation>::reverse_iterator I = + Relocations.rbegin(); GroupRelocationCount > 0; + ++I, GroupRelocationCount--) { + int NewOffset = I->getMachineCodeOffset() + + TotalPadding; + I->setMachineCodeOffset(NewOffset); + } + } + } + + virtual void alignToBundleBeginning() { + // mark that the next locked group must be aligned to bundle start + // (e.g. an indirect branch target) + assert(AlignNextGroup == kNone && "Conflicting group alignments"); + AlignNextGroup = kBegin; + } + + virtual void alignToBundleEnd() { + // mark that the next locked group must be aligned to bundle end (e.g. a + // call) + assert(AlignNextGroup == kNone && "Conflicting group alignments"); + AlignNextGroup = kEnd; + } + + virtual uintptr_t getCurrentPCValue() const { + // return destination PC value rather than generating location + if (BufferBegin == EmissionBuffer) { + return (uintptr_t)(FunctionDestination + (CurBufferPtr - BufferBegin)); + } else if (BufferBegin == StubEmissionBuffer) { + return (uintptr_t)(StubDestination + (CurBufferPtr - BufferBegin)); + } else { + return (uintptr_t)CurBufferPtr; + } + } + + // addRelocation gets called in the middle of emitting an instruction, and + // creates the relocation based on the instruction's current position in + // the emission buffer; however it could get moved if it crosses the bundle + // boundary. so we intercept relocation creation and adjust newly-created + // relocations if necessary + virtual void addRelocation(const MachineRelocation &MR) { + GroupRelocationCount++; + JITEmitter::addRelocation(MR); + } + + private: + typedef enum _GroupAlign { kNone, kBegin, kEnd } GroupAlign; + // FunctionDestination points to the final destination for the function + // (i.e. 
where it will be copied after validation) + uint8_t *FunctionDestination; + uint8_t *BundleLockSavedCurBufferPtr; + int BundleNestCount; // should not exceed 2 + GroupAlign AlignNextGroup; + unsigned GroupRelocationCount; + uint8_t *EmissionBuffer; + uintptr_t EmissionBufferSize; + + bool ReusedStub; + uint8_t *StubDestination; + uint8_t *StubEmissionBuffer; + uintptr_t StubEmissionBufferSize; + + TargetJITInfo *JITInfo; + const int kBundleSize; + const int32_t kJumpMask; + + // Set the buffer pointers (begin, cur, end) so they point into the buffer + // at dest, preserving their relative positions + void setBufferPtrs(uint8_t* dest) { + BufferEnd = dest + (BufferEnd - BufferBegin); + CurBufferPtr = dest + (CurBufferPtr - BufferBegin); + BufferBegin = dest; + } +}; } void CallSiteValueMapConfig::onDelete(JITResolverState *JRS, Function *F) { @@ -934,6 +1292,12 @@ bool JITEmitter::finishFunction(MachineFunction &F) { // Mark code region readable and executable if it's not so already. MemMgr->setMemoryExecutable(); + // @LOCALMOD-START +#ifndef __native_client__ + // In NaCl, we haven't yet validated and copied the function code to the + // destination yet, so there is nothing to disassemble. Furthermore we can't + // touch the destination because it may not even be mapped yet + // @LOCALMOD-END DEBUG({ if (sys::hasDisassembler()) { dbgs() << "JIT: Disassembled code:\n"; @@ -963,6 +1327,7 @@ bool JITEmitter::finishFunction(MachineFunction &F) { dbgs()<< '\n'; } }); +#endif // @LOCALMOD if (JITExceptionHandling) { uintptr_t ActualSize = 0; @@ -1247,7 +1612,14 @@ void JITEmitter::EmittedFunctionConfig::onRAUW( JITCodeEmitter *JIT::createEmitter(JIT &jit, JITMemoryManager *JMM, TargetMachine &tm) { +// @LOCALMOD-START +#ifndef __native_client__ return new JITEmitter(jit, JMM, tm); +#else + assert(!JMM && "NaCl does not support custom memory managers"); + return new NaClJITEmitter(jit, tm); +#endif +// @LOCALMOD-END } // getPointerToFunctionOrStub - If the specified function has been diff --git a/lib/ExecutionEngine/JIT/NaClJITMemoryManager.cpp b/lib/ExecutionEngine/JIT/NaClJITMemoryManager.cpp new file mode 100644 index 0000000000..d44fee2292 --- /dev/null +++ b/lib/ExecutionEngine/JIT/NaClJITMemoryManager.cpp @@ -0,0 +1,430 @@ +//===-- NaClJITMemoryManager.cpp - Memory Allocator for JIT'd code --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the NaClJITMemoryManager class. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "llvm/ExecutionEngine/NaClJITMemoryManager.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/DynamicLibrary.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Config/config.h" +#include <vector> + +#if defined(__linux__) || defined(__native_client__) +#if defined(HAVE_SYS_STAT_H) +#include <sys/stat.h> +#endif +#include <fcntl.h> +#include <unistd.h> +#endif + +using namespace llvm; + +#ifdef __native_client__ +// etext is guarded by ifdef so the code still compiles on non-ELF platforms +extern char etext; +#endif + +// The way NaCl linking is currently setup, there is a gap between the text +// segment and the rodata segment where we can fill dyncode. 
The text ends +// at etext, but there's no symbol for the start of rodata. Currently the +// linker script puts it at 0x11000000 +// If we run out of space there, we can also allocate below the text segment +// and keep going downward until we run into code loaded by the dynamic +// linker. (TODO(dschuff): make that work) +// For now, just start at etext and go until we hit rodata + +// It's an open issue that lazy jitting is not thread safe (PR5184). However +// NaCl's dyncode_create solves exactly this problem, so in the future +// this allocator could (should?) be made thread safe + +const size_t NaClJITMemoryManager::kStubSlabSize; +const size_t NaClJITMemoryManager::kDataSlabSize; +const size_t NaClJITMemoryManager::kCodeSlabSize; + +// TODO(dschuff) fix allocation start (etext + 64M is hopefully after where +// glibc is loaded) and limit (maybe need a linker-provide symbol for the start +// of the IRT or end of the segment gap) +// (also fix allocateCodeSlab and maybe allocateStubSlab at that time) +// what we really need is a usable nacl_dyncode_alloc(), but this could still +// be improved upon using dl_iterate_phdr +const static intptr_t kNaClSegmentGapEnd = 0x11000000; + +NaClJITMemoryManager::NaClJITMemoryManager() : + AllocatableRegionLimit((uint8_t *)kNaClSegmentGapEnd), + NextCode(AllocatableRegionStart), GOTBase(NULL) { +#ifdef __native_client__ + AllocatableRegionStart = (uint8_t *)&etext + 1024*1024*64; +#else + assert(false && "NaClJITMemoryManager will not work outside NaCl sandbox"); +#endif + AllocatableRegionStart = + (uint8_t *)RoundUpToAlignment((uint64_t)AllocatableRegionStart, + kBundleSize); + NextCode = AllocatableRegionStart; + + // Allocate 1 stub slab to get us started + CurrentStubSlab = allocateStubSlab(0); + InitFreeList(&CodeFreeListHead); + InitFreeList(&DataFreeListHead); + + DEBUG(dbgs() << "NaClJITMemoryManager: AllocatableRegionStart " << + AllocatableRegionStart << " Limit " << AllocatableRegionLimit << "\n"); +} + +NaClJITMemoryManager::~NaClJITMemoryManager() { + delete [] GOTBase; + DestroyFreeList(CodeFreeListHead); + DestroyFreeList(DataFreeListHead); +} + +FreeListNode *NaClJITMemoryManager::allocateCodeSlab(size_t MinSize) { + FreeListNode *node = new FreeListNode(); + if (AllocatableRegionLimit - NextCode < (int)kCodeSlabSize) { + // TODO(dschuff): might be possible to try the space below text segment? + report_fatal_error("Ran out of code space"); + } + node->address = NextCode; + node->size = std::max(kCodeSlabSize, MinSize); + NextCode += node->size; + DEBUG(dbgs() << "allocated code slab " << NextCode - node->size << "-" << + NextCode << "\n"); + return node; +} + +SimpleSlab NaClJITMemoryManager::allocateStubSlab(size_t MinSize) { + SimpleSlab s; + DEBUG(dbgs() << "allocateStubSlab: "); + // It's a little weird to just allocate and throw away the FreeListNode, but + // since code region allocation is still a bit ugly and magical, I decided + // it's better to reuse allocateCodeSlab than duplicate the logic. 
+ FreeListNode *n = allocateCodeSlab(MinSize); + s.address = n->address; + s.size = n->size; + s.next_free = n->address; + delete n; + return s; +} + +FreeListNode *NaClJITMemoryManager::allocateDataSlab(size_t MinSize) { + FreeListNode *node = new FreeListNode; + size_t size = std::max(kDataSlabSize, MinSize); + node->address = (uint8_t*)DataAllocator.Allocate(size, kBundleSize); + node->size = size; + return node; +} + +void NaClJITMemoryManager::InitFreeList(FreeListNode **Head) { + // Make sure there is always at least one entry in the free list + *Head = new FreeListNode; + (*Head)->Next = (*Head)->Prev = *Head; + (*Head)->size = 0; +} + +void NaClJITMemoryManager::DestroyFreeList(FreeListNode *Head) { + FreeListNode *n = Head->Next; + while(n != Head) { + FreeListNode *next = n->Next; + delete n; + n = next; + } + delete Head; +} + +FreeListNode *NaClJITMemoryManager::FreeListAllocate(uintptr_t &ActualSize, + FreeListNode *Head, + FreeListNode * (NaClJITMemoryManager::*allocate)(size_t)) { + FreeListNode *candidateBlock = Head; + FreeListNode *iter = Head->Next; + + uintptr_t largest = candidateBlock->size; + // Search for the largest free block + while (iter != Head) { + if (iter->size > largest) { + largest = iter->size; + candidateBlock = iter; + } + iter = iter->Next; + } + + if (largest < ActualSize || largest == 0) { + candidateBlock = (this->*allocate)(ActualSize); + } else { + candidateBlock->RemoveFromFreeList(); + } + return candidateBlock; +} + +void NaClJITMemoryManager::FreeListFinishAllocation(FreeListNode *Block, + FreeListNode *Head, uint8_t *AllocationStart, uint8_t *AllocationEnd, + AllocationTable &Table) { + assert(AllocationEnd > AllocationStart); + assert(Block->address == AllocationStart); + uint8_t *End = (uint8_t *)RoundUpToAlignment((uint64_t)AllocationEnd, + kBundleSize); + assert(End <= Block->address + Block->size); + int AllocationSize = End - Block->address; + Table[AllocationStart] = AllocationSize; + + Block->size -= AllocationSize; + if (Block->size >= kBundleSize * 2) {//TODO(dschuff): better heuristic? 
+ Block->address = End; + Block->AddToFreeList(Head); + } else { + delete Block; + } + DEBUG(dbgs()<<"FinishAllocation size "<< AllocationSize <<" end "<<End<<"\n"); +} + +void NaClJITMemoryManager::FreeListDeallocate(FreeListNode *Head, + AllocationTable &Table, + void *Body) { + uint8_t *Allocation = (uint8_t *)Body; + DEBUG(dbgs() << "deallocating "<<Body<<" "); + assert(Table.count(Allocation) && "FreeList Deallocation not found in table"); + FreeListNode *Block = new FreeListNode; + Block->address = Allocation; + Block->size = Table[Allocation]; + Block->AddToFreeList(Head); + DEBUG(dbgs() << "deallocated "<< Allocation<< " size " << Block->size <<"\n"); +} + +uint8_t *NaClJITMemoryManager::startFunctionBody(const Function *F, + uintptr_t &ActualSize) { + CurrentCodeBlock = FreeListAllocate(ActualSize, CodeFreeListHead, + &NaClJITMemoryManager::allocateCodeSlab); + DEBUG(dbgs() << "startFunctionBody CurrentBlock " << CurrentCodeBlock << + " addr " << CurrentCodeBlock->address << "\n"); + ActualSize = CurrentCodeBlock->size; + return CurrentCodeBlock->address; +} + +void NaClJITMemoryManager::endFunctionBody(const Function *F, + uint8_t *FunctionStart, + uint8_t *FunctionEnd) { + DEBUG(dbgs() << "endFunctionBody "); + FreeListFinishAllocation(CurrentCodeBlock, CodeFreeListHead, + FunctionStart, FunctionEnd, AllocatedFunctions); + +} + +uint8_t *NaClJITMemoryManager::allocateCodeSection(uintptr_t Size, + unsigned Alignment, + unsigned SectionID) { + llvm_unreachable("Implement me! (or don't.)"); +} + +uint8_t *NaClJITMemoryManager::allocateDataSection(uintptr_t Size, + unsigned Alignment, + unsigned SectionID) { + return (uint8_t *)DataAllocator.Allocate(Size, Alignment); +} + +void NaClJITMemoryManager::deallocateFunctionBody(void *Body) { + DEBUG(dbgs() << "deallocateFunctionBody, "); + if (Body) FreeListDeallocate(CodeFreeListHead, AllocatedFunctions, Body); +} + +uint8_t *NaClJITMemoryManager::allocateStub(const GlobalValue* F, + unsigned StubSize, + unsigned Alignment) { + uint8_t *StartAddress = (uint8_t *)(uintptr_t) + RoundUpToAlignment((uintptr_t)CurrentStubSlab.next_free, Alignment); + if (StartAddress + StubSize > + CurrentStubSlab.address + CurrentStubSlab.size) { + CurrentStubSlab = allocateStubSlab(kStubSlabSize); + StartAddress = (uint8_t *)(uintptr_t) + RoundUpToAlignment((uintptr_t)CurrentStubSlab.next_free, Alignment); + } + CurrentStubSlab.next_free = StartAddress + StubSize; + DEBUG(dbgs() <<"allocated stub "<<StartAddress<< " size "<<StubSize<<"\n"); + return StartAddress; +} + +uint8_t *NaClJITMemoryManager::allocateSpace(intptr_t Size, + unsigned Alignment) { + uint8_t *r = (uint8_t*)DataAllocator.Allocate(Size, Alignment); + DEBUG(dbgs() << "allocateSpace " << Size <<"/"<<Alignment<<" ret "<<r<<"\n"); + return r; +} + +uint8_t *NaClJITMemoryManager::allocateGlobal(uintptr_t Size, + unsigned Alignment) { + uint8_t *r = (uint8_t*)DataAllocator.Allocate(Size, Alignment); + DEBUG(dbgs() << "allocateGlobal " << Size <<"/"<<Alignment<<" ret "<<r<<"\n"); + return r; +} + +uint8_t* NaClJITMemoryManager::startExceptionTable(const Function* F, + uintptr_t &ActualSize) { + CurrentDataBlock = FreeListAllocate(ActualSize, DataFreeListHead, + &NaClJITMemoryManager::allocateDataSlab); + DEBUG(dbgs() << "startExceptionTable CurrentBlock " << CurrentDataBlock << + " addr " << CurrentDataBlock->address << "\n"); + ActualSize = CurrentDataBlock->size; + return CurrentDataBlock->address; +} + +void NaClJITMemoryManager::endExceptionTable(const Function *F, + uint8_t *TableStart, + 
uint8_t *TableEnd, uint8_t* FrameRegister) { + DEBUG(dbgs() << "endExceptionTable "); + FreeListFinishAllocation(CurrentDataBlock, DataFreeListHead, + TableStart, TableEnd, AllocatedTables); +} + +void NaClJITMemoryManager::deallocateExceptionTable(void *ET) { + DEBUG(dbgs() << "deallocateExceptionTable, "); + if (ET) FreeListDeallocate(DataFreeListHead, AllocatedTables, ET); +} + +// Copy of DefaultJITMemoryManager's implementation +void NaClJITMemoryManager::AllocateGOT() { + assert(GOTBase == 0 && "Cannot allocate the got multiple times"); + GOTBase = new uint8_t[sizeof(void*) * 8192]; + HasGOT = true; +} + +//===----------------------------------------------------------------------===// +// getPointerToNamedFunction() implementation. +// This code is pasted directly from r153607 of JITMemoryManager.cpp and has +// never been tested. It most likely doesn't work inside the sandbox. +//===----------------------------------------------------------------------===// + +// AtExitHandlers - List of functions to call when the program exits, +// registered with the atexit() library function. +static std::vector<void (*)()> AtExitHandlers; + +/// runAtExitHandlers - Run any functions registered by the program's +/// calls to atexit(3), which we intercept and store in +/// AtExitHandlers. +/// +static void runAtExitHandlers() { + while (!AtExitHandlers.empty()) { + void (*Fn)() = AtExitHandlers.back(); + AtExitHandlers.pop_back(); + Fn(); + } +} + +//===----------------------------------------------------------------------===// +// Function stubs that are invoked instead of certain library calls +// +// Force the following functions to be linked in to anything that uses the +// JIT. This is a hack designed to work around the all-too-clever Glibc +// strategy of making these functions work differently when inlined vs. when +// not inlined, and hiding their real definitions in a separate archive file +// that the dynamic linker can't see. For more info, search for +// 'libc_nonshared.a' on Google, or read http://llvm.org/PR274. +#if defined(__linux__) +/* stat functions are redirecting to __xstat with a version number. On x86-64 + * linking with libc_nonshared.a and -Wl,--export-dynamic doesn't make 'stat' + * available as an exported symbol, so we have to add it explicitly. + */ +namespace { +class StatSymbols { +public: + StatSymbols() { + sys::DynamicLibrary::AddSymbol("stat", (void*)(intptr_t)stat); + sys::DynamicLibrary::AddSymbol("fstat", (void*)(intptr_t)fstat); + sys::DynamicLibrary::AddSymbol("lstat", (void*)(intptr_t)lstat); + sys::DynamicLibrary::AddSymbol("stat64", (void*)(intptr_t)stat64); + sys::DynamicLibrary::AddSymbol("\x1stat64", (void*)(intptr_t)stat64); + sys::DynamicLibrary::AddSymbol("\x1open64", (void*)(intptr_t)open64); + sys::DynamicLibrary::AddSymbol("\x1lseek64", (void*)(intptr_t)lseek64); + sys::DynamicLibrary::AddSymbol("fstat64", (void*)(intptr_t)fstat64); + sys::DynamicLibrary::AddSymbol("lstat64", (void*)(intptr_t)lstat64); + sys::DynamicLibrary::AddSymbol("atexit", (void*)(intptr_t)atexit); + sys::DynamicLibrary::AddSymbol("mknod", (void*)(intptr_t)mknod); + } +}; +} +static StatSymbols initStatSymbols; +#endif // __linux__ + +// jit_exit - Used to intercept the "exit" library call. +static void jit_exit(int Status) { + runAtExitHandlers(); // Run atexit handlers... + exit(Status); +} + +// jit_atexit - Used to intercept the "atexit" library call. +static int jit_atexit(void (*Fn)()) { + AtExitHandlers.push_back(Fn); // Take note of atexit handler... 
+ return 0; // Always successful +} + +static int jit_noop() { + return 0; +} + +//===----------------------------------------------------------------------===// +// +/// getPointerToNamedFunction - This method returns the address of the specified +/// function by using the dynamic loader interface. As such it is only useful +/// for resolving library symbols, not code generated symbols. +/// +void *NaClJITMemoryManager::getPointerToNamedFunction(const std::string &Name, + bool AbortOnFailure) { + // Check to see if this is one of the functions we want to intercept. Note, + // we cast to intptr_t here to silence a -pedantic warning that complains + // about casting a function pointer to a normal pointer. + if (Name == "exit") return (void*)(intptr_t)&jit_exit; + if (Name == "atexit") return (void*)(intptr_t)&jit_atexit; + + // We should not invoke parent's ctors/dtors from generated main()! + // On Mingw and Cygwin, the symbol __main is resolved to + // callee's(eg. tools/lli) one, to invoke wrong duplicated ctors + // (and register wrong callee's dtors with atexit(3)). + // We expect ExecutionEngine::runStaticConstructorsDestructors() + // is called before ExecutionEngine::runFunctionAsMain() is called. + if (Name == "__main") return (void*)(intptr_t)&jit_noop; + + const char *NameStr = Name.c_str(); + // If this is an asm specifier, skip the sentinal. + if (NameStr[0] == 1) ++NameStr; + + // If it's an external function, look it up in the process image... + void *Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr); + if (Ptr) return Ptr; + + // If it wasn't found and if it starts with an underscore ('_') character, + // try again without the underscore. + if (NameStr[0] == '_') { + Ptr = sys::DynamicLibrary::SearchForAddressOfSymbol(NameStr+1); + if (Ptr) return Ptr; + } + + // Darwin/PPC adds $LDBLStub suffixes to various symbols like printf. These + // are references to hidden visibility symbols that dlsym cannot resolve. + // If we have one of these, strip off $LDBLStub and try again. +#if defined(__APPLE__) && defined(__ppc__) + if (Name.size() > 9 && Name[Name.size()-9] == '$' && + memcmp(&Name[Name.size()-8], "LDBLStub", 8) == 0) { + // First try turning $LDBLStub into $LDBL128. If that fails, strip it off. + // This mirrors logic in libSystemStubs.a. 
+ std::string Prefix = std::string(Name.begin(), Name.end()-9); + if (void *Ptr = getPointerToNamedFunction(Prefix+"$LDBL128", false)) + return Ptr; + if (void *Ptr = getPointerToNamedFunction(Prefix, false)) + return Ptr; + } +#endif + + if (AbortOnFailure) { + report_fatal_error("Program used external function '"+Name+ + "' which could not be resolved!"); + } + return 0; +} diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt index e22b8cd406..f7f814b9cb 100644 --- a/lib/LLVMBuild.txt +++ b/lib/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = Analysis Archive AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker MC Object Support TableGen Target Transforms VMCore +subdirectories = Analysis Archive AsmParser Bitcode CodeGen DebugInfo ExecutionEngine Linker MC Object Support TableGen Target Transforms VMCore Wrap [component_0] type = Group diff --git a/lib/Linker/LinkArchives.cpp b/lib/Linker/LinkArchives.cpp index c16d1958cd..c5656a54c9 100644 --- a/lib/Linker/LinkArchives.cpp +++ b/lib/Linker/LinkArchives.cpp @@ -16,10 +16,24 @@ #include "llvm/Module.h" #include "llvm/ADT/SetOperations.h" #include "llvm/Bitcode/Archive.h" + +#include "llvm/Support/CommandLine.h" // @LOCALMOD + #include <memory> #include <set> using namespace llvm; +// @LOCALMOD-START +// NOTE: this has a similar effect as +// tools/llvm/llvm-preserve.ll +// which in turn is similar to the GNUS's attribute((used)) +// TODO(robertm): This is a little hackish for now +static cl::list<std::string> +UndefList("referenced-list", cl::value_desc("list"), + cl::desc("A list of symbols assumed to be referenced externally"), + cl::CommaSeparated); +// @LOCALMOD-END + /// GetAllUndefinedSymbols - calculates the set of undefined symbols that still /// exist in an LLVM module. This is a bit tricky because there may be two /// symbols with the same name but different LLVM types that will be resolved to @@ -36,7 +50,10 @@ static void GetAllUndefinedSymbols(Module *M, std::set<std::string> &UndefinedSymbols) { std::set<std::string> DefinedSymbols; UndefinedSymbols.clear(); - + // @LOCALMOD-START + UndefinedSymbols.insert(UndefList.begin(), UndefList.end()); + // @LOCALMOD-END + // If the program doesn't define a main, try pulling one in from a .a file. // This is needed for programs where the main function is defined in an // archive, such f2c'd programs. diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp index a6599bfe4f..b3426fb19f 100644 --- a/lib/Linker/LinkModules.cpp +++ b/lib/Linker/LinkModules.cpp @@ -933,6 +933,19 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { ValueMap[I] = DI; } + // @LOCALMOD-BEGIN + // Local patch for http://llvm.org/bugs/show_bug.cgi?id=11112 + // and http://llvm.org/bugs/show_bug.cgi?id=10887 + // Create an identity mapping for instructions so that alloca instructions + // do not get dropped and related debug info isn't lost. E.g., prevent + // call @llvm.dbg.declare(metadata !{i32 * %local_var}, ...) + // from becoming + // call @llvm.dbg.declare(null, ...) + for (Function::iterator BB = Src->begin(), BE = Src->end(); BB != BE; ++BB) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + ValueMap[I] = I; + // @LOCALMOD-END + if (Mode == Linker::DestroySource) { // Splice the body of the source function into the dest function. 
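The -referenced-list option introduced in the LinkArchives.cpp hunk above is a comma-separated cl::list whose entries are added to the undefined-symbol set, so archive members defining those names are pulled in even when nothing in the module references them. A self-contained sketch of how such an option behaves; the option string matches the patch, while the surrounding main() and the RefList name are invented:

  #include "llvm/Support/CommandLine.h"
  #include <set>
  #include <string>

  static llvm::cl::list<std::string>
  RefList("referenced-list", llvm::cl::CommaSeparated,
          llvm::cl::value_desc("list"),
          llvm::cl::desc("Symbols assumed to be referenced externally"));

  int main(int argc, char **argv) {
    llvm::cl::ParseCommandLineOptions(argc, argv);
    // e.g. -referenced-list=init,fini yields two entries; the linker then
    // treats both names as undefined so their definitions are not dropped.
    std::set<std::string> Undefined(RefList.begin(), RefList.end());
    return static_cast<int>(Undefined.size());
  }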
Dst->getBasicBlockList().splice(Dst->end(), Src->getBasicBlockList()); @@ -950,6 +963,13 @@ void ModuleLinker::linkFunctionBody(Function *Dst, Function *Src) { SmallVector<ReturnInst*, 8> Returns; // Ignore returns. CloneFunctionInto(Dst, Src, ValueMap, false, Returns, "", NULL, &TypeMap); } + + // @LOCALMOD-BEGIN + // There is no need for the identity mapping anymore. + for (Function::iterator BB = Src->begin(), BE = Src->end(); BB != BE; ++BB) + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) + ValueMap.erase(I); + // @LOCALMOD-END // There is no need to map the arguments anymore. for (Function::arg_iterator I = Src->arg_begin(), E = Src->arg_end(); diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index eda062376e..a94d51bb74 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -1568,4 +1568,5 @@ MCObjectWriter *llvm::createELFObjectWriter(MCELFObjectTargetWriter *MOTW, raw_ostream &OS, bool IsLittleEndian) { return new ELFObjectWriter(MOTW, OS, IsLittleEndian); + } diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp index 7ea0f3b85a..e0a83453df 100644 --- a/lib/MC/MCAsmInfo.cpp +++ b/lib/MC/MCAsmInfo.cpp @@ -24,6 +24,7 @@ using namespace llvm; MCAsmInfo::MCAsmInfo() { PointerSize = 4; + StackSlotSize = 4; // @LOCALMOD IsLittleEndian = true; StackGrowsUp = false; HasSubsectionsViaSymbols = false; diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp index 17a6323d0e..16d1fff8a6 100644 --- a/lib/MC/MCAsmStreamer.cpp +++ b/lib/MC/MCAsmStreamer.cpp @@ -205,6 +205,13 @@ public: virtual bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value = 0); + // @LOCALMOD-BEGIN + virtual void EmitBundleLock(); + virtual void EmitBundleUnlock(); + virtual void EmitBundleAlignStart(); + virtual void EmitBundleAlignEnd(); + // @LOCALMOD-END + virtual void EmitFileDirective(StringRef Filename); virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, StringRef Filename); @@ -783,6 +790,27 @@ bool MCAsmStreamer::EmitValueToOffset(const MCExpr *Offset, return false; } +// @LOCALMOD-BEGIN +void MCAsmStreamer::EmitBundleLock() { + OS << "\t.bundle_lock"; + EmitEOL(); +} + +void MCAsmStreamer::EmitBundleUnlock() { + OS << "\t.bundle_unlock"; + EmitEOL(); +} + +void MCAsmStreamer::EmitBundleAlignStart() { + OS << "\t.bundle_align_start"; + EmitEOL(); +} + +void MCAsmStreamer::EmitBundleAlignEnd() { + OS << "\t.bundle_align_end"; + EmitEOL(); +} +// @LOCALMOD-END void MCAsmStreamer::EmitFileDirective(StringRef Filename) { assert(MAI.hasSingleParameterDotFile()); diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp index 726ec5aba5..b6c3bb20b5 100644 --- a/lib/MC/MCAssembler.cpp +++ b/lib/MC/MCAssembler.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" // @LOCALMOD #include "llvm/Support/raw_ostream.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/LEB128.h" @@ -71,6 +72,26 @@ bool MCAsmLayout::isFragmentUpToDate(const MCFragment *F) const { } void MCAsmLayout::Invalidate(MCFragment *F) { + // @LOCALMOD-BEGIN + if (F->getParent()->isBundlingEnabled()) { + // If this fragment is part of a bundle locked group, + // we need to invalidate all the way to the first fragment + // in the group. 
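The loop that follows walks back from the changed fragment to the first fragment of its bundle-locked group, then steps back one fragment further so the group's padding and offset are recomputed. The same walk restated on a toy fragment type, purely as an illustration (Frag and FindInvalidationPoint are invented names):

  struct Frag {
    Frag *Prev;
    bool GroupStart;  // corresponds to isBundleGroupStart() above
  };

  // Returns the fragment from which layout must be recomputed, or null when
  // the group starts the section (the caller then invalidates the section).
  Frag *FindInvalidationPoint(Frag *F) {
    while (F && !F->GroupStart)
      F = F->Prev;
    return F ? F->Prev : nullptr;
  }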
+ while (F && !F->isBundleGroupStart()) + F = F->getPrevNode(); + assert(F); + // With padding enabled, we need to invalidate back one + // fragment further in in order to force the recalculuation + // of the padding and offset. + if (F->getPrevNode()) { + F = F->getPrevNode(); + } else { + LastValidFragment[F->getParent()] = NULL; + return; + } + } + // @LOCALMOD-END + // If this fragment wasn't already up-to-date, we don't need to do anything. if (!isFragmentUpToDate(F)) return; @@ -133,6 +154,15 @@ uint64_t MCAsmLayout::getSymbolOffset(const MCSymbolData *SD) const { assert(SD->getFragment() && "Invalid getOffset() on undefined symbol!"); return getFragmentOffset(SD->getFragment()) + SD->getOffset(); } + +// @LOCALMOD-BEGIN +uint8_t MCAsmLayout::getFragmentPadding(const MCFragment *F) const { + EnsureValid(F); + assert(F->BundlePadding != (uint8_t)~UINT8_C(0) && "Padding not set!"); + return F->BundlePadding; +} +// @LOCALMOD-END + uint64_t MCAsmLayout::getSectionAddressSize(const MCSectionData *SD) const { // The size is the last fragment's end offset. @@ -158,10 +188,32 @@ MCFragment::~MCFragment() { } MCFragment::MCFragment(FragmentType _Kind, MCSectionData *_Parent) - : Kind(_Kind), Parent(_Parent), Atom(0), Offset(~UINT64_C(0)) + : Kind(_Kind), + // @LOCALMOD-BEGIN + BundleAlign(BundleAlignNone), + BundleGroupStart(false), + BundleGroupEnd(false), + BundlePadding(~UINT8_C(0)), + // @LOCALMOD-END + Parent(_Parent), Atom(0), Offset(~UINT64_C(0)) { if (Parent) Parent->getFragmentList().push_back(this); + + // @LOCALMOD-BEGIN + if (Parent && Parent->isBundlingEnabled()) { + BundleAlign = Parent->getBundleAlignNext(); + Parent->setBundleAlignNext(MCFragment::BundleAlignNone); + if (Parent->isBundleLocked()) { + BundleGroupStart = Parent->isBundleGroupFirstFrag(); + BundleGroupEnd = false; + Parent->setBundleGroupFirstFrag(false); + } else { + BundleGroupStart = true; + BundleGroupEnd = true; + } + } + // @LOCALMOD-END } /* *** */ @@ -172,12 +224,91 @@ MCSectionData::MCSectionData(const MCSection &_Section, MCAssembler *A) : Section(&_Section), Ordinal(~UINT32_C(0)), Alignment(1), - HasInstructions(false) + HasInstructions(false), +// @LOCALMOD-BEGIN + BundlingEnabled(false), + BundleLocked(false), + BundleGroupFirstFrag(false), + BundleAlignNext(MCFragment::BundleAlignNone), + BundleOffsetKnown(false), + BundleOffset(0) +// @LOCALMOD-END { if (A) A->getSectionList().push_back(this); + + // @LOCALMOD-BEGIN + BundleSize = A->getBackend().getBundleSize(); + if (BundleSize && _Section.UseCodeAlign()) { + BundlingEnabled = true; + setAlignment(BundleSize); + } + // @LOCALMOD-END +} + +// @LOCALMOD-BEGIN +void MCSectionData::MarkBundleOffsetUnknown() { + BundleOffsetKnown = false; + BundleOffset = 0; } +// Only create a new fragment if: +// 1) we are emitting the first instruction of a bundle locked sequence. +// 2) we are not currently emitting a bundle locked sequence and we cannot +// guarantee the instruction would not span a bundle boundary. +// Otherwise, append to the current fragment to reduce the number of fragments. +bool MCSectionData::ShouldCreateNewFragment(size_t Size) { + // The first instruction of a bundle locked region starts a new fragment. + if (isBundleLocked() && isBundleGroupFirstFrag()) + return true; + // Unless we know the relative offset of the end of the current fragment, + // we need to create a new fragment. 
+ if (!isBundleLocked() && !BundleOffsetKnown) + return true; + assert(BundleSize != 0 && "BundleSize needs to be non-zero"); + assert(Size < BundleSize && "Instruction size must be less than BundleSize"); + // If inserting the instruction would overlap a bundle boundary, start a + // new fragment. + // TODO(sehr): we could still explicitly insert a NOP and continue here. + if (BundleOffset + (unsigned) Size > BundleSize) + return true; + return false; +} + +void MCSectionData::UpdateBundleOffset(size_t Size) { + // A bundle locked fragment could move if it spans a bundle boundary. + if (isBundleLocked()) { + BundleOffsetKnown = false; + return; + } + // If inserting the instruction would overlap a bundle boundary, starting a + // new fragment moves the known offset to the end of the instruction in the + // next bundle. + // TODO(sehr): we could insert a NOP and continue the fragment. + if (BundleOffset + (unsigned) Size > BundleSize) + BundleOffset = Size; + else + BundleOffset = BundleOffset + Size; +} + +void MCSectionData::AlignBundleOffsetTo(size_t AlignBase) { + // If BundleOffset is already known, an alignment just moves bundleOffset. + if (BundleOffsetKnown) { + BundleOffset = RoundUpToAlignment(BundleOffset, AlignBase); + return; + } + // Otherwise, if AlignBase is at least as big as a bundle, then we know the + // offset relative to a bundle start. + if (AlignBase >= BundleSize) { + BundleOffsetKnown = true; + BundleOffset = 0; + } else { + BundleOffsetKnown = false; + BundleOffset = 0; + } +} +// @LOCALMOD-END + /* *** */ MCSymbolData::MCSymbolData() : Symbol(0) {} @@ -319,7 +450,10 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, case MCFragment::FT_LEB: return cast<MCLEBFragment>(F).getContents().size(); - +// @LOCALMOD-BEGIN + case MCFragment::FT_Tiny: + return cast<MCTinyFragment>(F).getContents().size(); +// @LOCALMOD-END case MCFragment::FT_Align: { const MCAlignFragment &AF = cast<MCAlignFragment>(F); unsigned Offset = Layout.getFragmentOffset(&AF); @@ -375,15 +509,145 @@ void MCAsmLayout::LayoutFragment(MCFragment *F) { uint64_t Offset = 0; if (Prev) Offset += Prev->Offset + getAssembler().computeFragmentSize(*this, *Prev); - + // @LOCALMOD-BEGIN + F->BundlePadding = getAssembler().ComputeBundlePadding(*this, F, Offset); + Offset += F->BundlePadding; + // @LOCALMOD-END F->Offset = Offset; LastValidFragment[F->getParent()] = F; } +// @LOCALMOD-BEGIN +// Returns number of bytes of padding needed to align to bundle start. 
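ShouldCreateNewFragment and UpdateBundleOffset above track the current offset within a bundle so that fixup-free instructions can keep sharing one fragment until the next one would straddle a bundle boundary. A toy restatement of the unlocked-path bookkeeping, with invented class and method names (the bundle-locked case and AlignBundleOffsetTo are left out):

  #include <cstddef>

  class BundleOffsetTracker {
    size_t BundleSize;
    size_t Offset;
    bool Known;
  public:
    explicit BundleOffsetTracker(size_t Size)
        : BundleSize(Size), Offset(0), Known(true) {}
    void markUnknown() { Known = false; Offset = 0; }
    // True if an instruction of Size bytes has to start a fresh fragment.
    bool needsNewFragment(size_t Size) const {
      if (!Known)
        return true;
      return Offset + Size > BundleSize;  // would straddle a bundle boundary
    }
    // Advance past an instruction that was just emitted.
    void append(size_t Size) {
      if (!Known)
        return;
      Offset = (Offset + Size > BundleSize) ? Size : Offset + Size;
    }
  };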
+static uint64_t AddressToBundlePadding(uint64_t Address, uint64_t BundleMask) { + return (~Address + 1) & BundleMask; +} + +uint64_t MCAssembler::getBundleSize() const { + return getBackend().getBundleSize(); +} + +uint64_t MCAssembler::getBundleMask() const { + uint64_t BundleSize = getBundleSize(); + uint64_t BundleMask = BundleSize - 1; + assert(BundleSize != 0); + assert((BundleSize & BundleMask) == 0 && + "Bundle size must be a power of 2!"); + return BundleMask; +} + +static unsigned ComputeGroupSize(MCFragment *F) { + if (!F->isBundleGroupStart()) { + return 0; + } + + unsigned GroupSize = 0; + MCFragment *Cur = F; + while (Cur) { + switch (Cur->getKind()) { + default: llvm_unreachable("Unexpected fragment type in bundle!"); + case MCFragment::FT_Align: + case MCFragment::FT_Org: + case MCFragment::FT_Fill: + if (Cur == F && Cur->isBundleGroupEnd()) { + return 0; + } + llvm_unreachable(".bundle_lock cannot contain .align, .org, or .fill"); + case MCFragment::FT_Inst: + GroupSize += cast<MCInstFragment>(Cur)->getInstSize(); + break; + case MCFragment::FT_Data: + GroupSize += cast<MCDataFragment>(Cur)->getContents().size(); + break; + case MCFragment::FT_Tiny: + GroupSize += cast<MCTinyFragment>(Cur)->getContents().size(); + break; + } + if (Cur->isBundleGroupEnd()) + break; + Cur = Cur->getNextNode(); + } + return GroupSize; +} + +uint8_t MCAssembler::ComputeBundlePadding(const MCAsmLayout &Layout, + MCFragment *F, + uint64_t FragmentOffset) const { + if (!F->getParent()->isBundlingEnabled()) + return 0; + + uint64_t BundleSize = getBundleSize(); + uint64_t BundleMask = getBundleMask(); + unsigned GroupSize = ComputeGroupSize(F); + + if (GroupSize > BundleSize) { + // EmitFill creates large groups consisting of repeated single bytes. + // These should be safe at any alignment, and in any case we cannot + // fix them up here. + return 0; + } + + uint64_t Padding = 0; + uint64_t OffsetInBundle = FragmentOffset & BundleMask; + + if (OffsetInBundle + GroupSize > BundleSize || + F->getBundleAlign() == MCFragment::BundleAlignStart) { + // If this group would cross the bundle boundary, or this group must be + // aligned to the start of a bundle, then pad up to start of the next bundle + Padding += AddressToBundlePadding(OffsetInBundle, BundleMask); + OffsetInBundle = 0; + } + if (F->getBundleAlign() == MCFragment::BundleAlignEnd) { + // Push to the end of the bundle + Padding += AddressToBundlePadding(OffsetInBundle + GroupSize, BundleMask); + } + return Padding; +} +// @LOCALMOD-END + + + + +// @LOCALMOD-BEGIN +// Write out BundlePadding bytes in NOPs, being careful not to cross a bundle +// boundary. +static void WriteBundlePadding(const MCAssembler &Asm, + const MCAsmLayout &Layout, + uint64_t Offset, uint64_t TotalPadding, + MCObjectWriter *OW) { + uint64_t BundleSize = Asm.getBundleSize(); + uint64_t BundleMask = Asm.getBundleMask(); + uint64_t PaddingLeft = TotalPadding; + uint64_t StartPos = Offset; + + bool FirstWrite = true; + while (PaddingLeft > 0) { + uint64_t NopsToWrite = + FirstWrite ? AddressToBundlePadding(StartPos, BundleMask) : + BundleSize; + if (NopsToWrite > PaddingLeft) + NopsToWrite = PaddingLeft; + if (!Asm.getBackend().writeNopData(NopsToWrite, OW)) + report_fatal_error("unable to write nop sequence of " + + Twine(NopsToWrite) + " bytes"); + PaddingLeft -= NopsToWrite; + FirstWrite = false; + } +} +// @LOCALMOD-END + /// WriteFragmentData - Write the \p F data to the output file. 
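ComputeBundlePadding above reduces to a small piece of modular arithmetic. A standalone restatement with a worked example, covering only the do-not-straddle rule and not the .bundle_align_start/.bundle_align_end cases (PaddingForGroup is an invented name):

  #include <cassert>
  #include <cstdint>

  // Bytes needed to advance Address to the next bundle boundary
  // (0 if it is already aligned). BundleMask == BundleSize - 1.
  uint64_t AddressToBundlePadding(uint64_t Address, uint64_t BundleMask) {
    return (~Address + 1) & BundleMask;
  }

  // Padding inserted before a locked group of GroupSize bytes that would
  // otherwise start at OffsetInSection, for a power-of-two BundleSize.
  uint64_t PaddingForGroup(uint64_t OffsetInSection, uint64_t GroupSize,
                           uint64_t BundleSize) {
    assert((BundleSize & (BundleSize - 1)) == 0 && GroupSize <= BundleSize);
    uint64_t Mask = BundleSize - 1;
    uint64_t OffsetInBundle = OffsetInSection & Mask;
    if (OffsetInBundle + GroupSize > BundleSize)  // group would straddle
      return AddressToBundlePadding(OffsetInBundle, Mask);
    return 0;
  }

  // Example: a 12-byte locked group at bundle offset 0x1c with 32-byte
  // bundles gets 4 bytes of padding (0x1c -> 0x20), so it then sits entirely
  // inside the next bundle.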
static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment &F) { MCObjectWriter *OW = &Asm.getWriter(); + // @LOCALMOD-BEGIN + if (F.getParent()->isBundlingEnabled()) { + uint64_t BundlePadding = Layout.getFragmentPadding(&F); + uint64_t PaddingOffset = Layout.getFragmentOffset(&F) - BundlePadding; + WriteBundlePadding(Asm, Layout, PaddingOffset, BundlePadding, OW); + } + // @LOCALMOD-END + uint64_t Start = OW->getStream().tell(); (void) Start; @@ -412,6 +676,16 @@ static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout, // bytes left to fill use the Value and ValueSize to fill the rest. // If we are aligning with nops, ask that target to emit the right data. if (AF.hasEmitNops()) { + // @LOCALMOD-BEGIN + if (Asm.getBundleSize()) { + WriteBundlePadding(Asm, Layout, + Layout.getFragmentOffset(&F), + FragmentSize, + OW); + break; + } + // @LOCALMOD-END + if (!Asm.getBackend().writeNopData(Count, OW)) report_fatal_error("unable to write nop sequence of " + Twine(Count) + " bytes"); @@ -438,6 +712,15 @@ static void WriteFragmentData(const MCAssembler &Asm, const MCAsmLayout &Layout, break; } + // @LOCALMOD-BEGIN + case MCFragment::FT_Tiny: { + MCTinyFragment &TF = cast<MCTinyFragment>(F); + assert(FragmentSize == TF.getContents().size() && "Invalid size!"); + OW->WriteBytes(TF.getContents().str()); + break; + } + // @LOCALMOD-END + case MCFragment::FT_Fill: { MCFillFragment &FF = cast<MCFillFragment>(F); @@ -843,10 +1126,24 @@ void MCFragment::dump() { case MCFragment::FT_Dwarf: OS << "MCDwarfFragment"; break; case MCFragment::FT_DwarfFrame: OS << "MCDwarfCallFrameFragment"; break; case MCFragment::FT_LEB: OS << "MCLEBFragment"; break; + // @LOCALMOD-BEGIN + case MCFragment::FT_Tiny: OS << "MCTinyFragment"; break; + // @LOCALMOD-END } OS << "<MCFragment " << (void*) this << " LayoutOrder:" << LayoutOrder - << " Offset:" << Offset << ">"; + << " Offset:" << Offset; + // @LOCALMOD-BEGIN + if (BundleGroupStart) + OS << " BundleGroupStart"; + if (BundleGroupEnd) + OS << " BundleGroupEnd"; + if (BundleAlign == BundleAlignStart) + OS << " BundleAlign: Start"; + else if (BundleAlign == BundleAlignEnd) + OS << " BundleAlign: End"; + OS << ">"; + // @LOCALMOD-END switch (getKind()) { case MCFragment::FT_Align: { @@ -895,6 +1192,20 @@ void MCFragment::dump() { IF->getInst().dump_pretty(OS); break; } + // @LOCALMOD-BEGIN + case MCFragment::FT_Tiny: { + const MCTinyFragment *TF = cast<MCTinyFragment>(this); + OS << "\n "; + OS << " Contents:["; + const SmallVectorImpl<char> &Contents = TF->getContents(); + for (unsigned i = 0, e = Contents.size(); i != e; ++i) { + if (i) OS << ","; + OS << hexdigit((Contents[i] >> 4) & 0xF) << hexdigit(Contents[i] & 0xF); + } + OS << "] (" << Contents.size() << " bytes)"; + break; + } + // @LOCALMOD-END case MCFragment::FT_Org: { const MCOrgFragment *OF = cast<MCOrgFragment>(this); OS << "\n "; diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp index f71b266ad6..a1643b2da5 100644 --- a/lib/MC/MCDwarf.cpp +++ b/lib/MC/MCDwarf.cpp @@ -777,7 +777,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS, static int getDataAlignmentFactor(MCStreamer &streamer) { MCContext &context = streamer.getContext(); const MCAsmInfo &asmInfo = context.getAsmInfo(); - int size = asmInfo.getPointerSize(); + int size = asmInfo.getStackSlotSize(); // @LOCALMOD if (asmInfo.isStackGrowthDirectionUp()) return size; else diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 14fbc1ec83..b1bded288d 
100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -355,6 +355,7 @@ void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, unsigned AddrSpace) { fixSymbolsInTLSFixups(Value); MCObjectStreamer::EmitValueImpl(Value, Size, AddrSpace); + getCurrentSectionData()->MarkBundleOffsetUnknown(); // @LOCALMOD } @@ -423,10 +424,10 @@ void MCELFStreamer::EmitInstToFragment(const MCInst &Inst) { for (unsigned i = 0, e = F.getFixups().size(); i != e; ++i) fixSymbolsInTLSFixups(F.getFixups()[i].getValue()); + getCurrentSectionData()->MarkBundleOffsetUnknown(); // @LOCALMOD } void MCELFStreamer::EmitInstToData(const MCInst &Inst) { - MCDataFragment *DF = getOrCreateDataFragment(); SmallVector<MCFixup, 4> Fixups; SmallString<256> Code; @@ -437,12 +438,26 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst) { for (unsigned i = 0, e = Fixups.size(); i != e; ++i) fixSymbolsInTLSFixups(Fixups[i].getValue()); - // Add the fixups and data. - for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { - Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); - DF->addFixup(Fixups[i]); + // @LOCALMOD-BEGIN + MCSectionData *SD = getCurrentSectionData(); + + if (Fixups.size() > 0 || !SD->isBundlingEnabled()) { + MCDataFragment *DF = getOrCreateDataFragment(); + + // Add the fixups and data. + for (unsigned i = 0, e = Fixups.size(); i != e; ++i) { + Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size()); + DF->addFixup(Fixups[i]); + } + DF->getContents().append(Code.begin(), Code.end()); + } else { + MCTinyFragment *TF = dyn_cast_or_null<MCTinyFragment>(getCurrentFragment()); + if (!TF || SD->ShouldCreateNewFragment(Code.size())) + TF = new MCTinyFragment(SD); + TF->getContents().append(Code.begin(), Code.end()); } - DF->getContents().append(Code.begin(), Code.end()); + SD->UpdateBundleOffset(Code.size()); + // @LOCALMOD-END } void MCELFStreamer::FinishImpl() { diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp index 4c17d91551..46579d7b1f 100644 --- a/lib/MC/MCNullStreamer.cpp +++ b/lib/MC/MCNullStreamer.cpp @@ -83,6 +83,13 @@ namespace { virtual bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value = 0) { return false; } + // @LOCALMOD-BEGIN + virtual void EmitBundleLock() {} + virtual void EmitBundleUnlock() {} + virtual void EmitBundleAlignStart() {} + virtual void EmitBundleAlignEnd() {} + // @LOCALMOD-END + virtual void EmitFileDirective(StringRef Filename) {} virtual bool EmitDwarfFileDirective(unsigned FileNo, StringRef Directory, StringRef Filename) { diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index 2e1604d6b5..3338a17e5c 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -543,9 +543,16 @@ void MCObjectFileInfo::InitCOFFMCObjectFileInfo(Triple T) { SectionKind::getDataRel()); } +// @LOCALMOD-START +// TODO(petarj): HACK! Find a better way to set ELF::EF_MIPS_PIC flag. +// See also file lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp. 
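In the MCELFStreamer::EmitInstToData change above, instruction bytes that carry fixups (or are emitted with bundling disabled) still go into an MCDataFragment, while fixup-free bytes are appended to the current MCTinyFragment, or to a fresh one when the current fragment is not tiny or the section says a new fragment is needed. The decision distilled into a standalone function; ChooseFragment and its parameter names are invented:

  enum class FragmentChoice { Data, ExistingTiny, NewTiny };

  FragmentChoice ChooseFragment(bool BundlingEnabled, bool HasFixups,
                                bool CurrentIsTiny, bool NeedsOwnFragment) {
    if (HasFixups || !BundlingEnabled)
      return FragmentChoice::Data;        // fixups always use a data fragment
    if (!CurrentIsTiny || NeedsOwnFragment)
      return FragmentChoice::NewTiny;     // start a fresh tiny fragment
    return FragmentChoice::ExistingTiny;  // keep appending to the current one
  }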
+Reloc::Model RelocModelOption = Reloc::Default; +// @LOCALMOD-END + void MCObjectFileInfo::InitMCObjectFileInfo(StringRef TT, Reloc::Model relocm, CodeModel::Model cm, MCContext &ctx) { + RelocModelOption = relocm; // @LOCALMOD RelocM = relocm; CMModel = cm; Ctx = &ctx; diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index 774632306d..37a445fae0 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -16,6 +16,7 @@ #include "llvm/MC/MCDwarf.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCObjectWriter.h" +#include "llvm/MC/MCSection.h" // @LOCALMOD #include "llvm/MC/MCSymbol.h" #include "llvm/Support/ErrorHandling.h" using namespace llvm; @@ -54,6 +55,11 @@ MCFragment *MCObjectStreamer::getCurrentFragment() const { } MCDataFragment *MCObjectStreamer::getOrCreateDataFragment() const { + // @LOCALMOD-BEGIN + if (getCurrentSectionData()->isBundlingEnabled()) { + return new MCDataFragment(getCurrentSectionData()); + } + // @LOCALMOD-END MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment()); if (!F) F = new MCDataFragment(getCurrentSectionData()); @@ -153,6 +159,54 @@ void MCObjectStreamer::EmitWeakReference(MCSymbol *Alias, report_fatal_error("This file format doesn't support weak aliases."); } +// @LOCALMOD-BEGIN ======================================================== + +void MCObjectStreamer::EmitBundleAlignStart() { + MCSectionData *SD = getCurrentSectionData(); + assert(SD->isBundlingEnabled() && + ".bundle_align_start called, but bundling disabled!"); + assert(!SD->isBundleLocked() && + ".bundle_align_start while bundle locked"); + SD->setBundleAlignNext(MCFragment::BundleAlignStart); +} + +void MCObjectStreamer::EmitBundleAlignEnd() { + MCSectionData *SD = getCurrentSectionData(); + assert(SD->isBundlingEnabled() && + ".bundle_align_end called, but bundling disabled!"); + assert(!SD->isBundleLocked() && + ".bundle_align_end while bundle locked"); + SD->setBundleAlignNext(MCFragment::BundleAlignEnd); +} + +void MCObjectStreamer::EmitBundleLock() { + MCSectionData *SD = getCurrentSectionData(); + assert(SD->isBundlingEnabled() && + ".bundle_lock called, but bundling disabled!"); + assert(!SD->isBundleLocked() && + ".bundle_lock issued when bundle already locked"); + SD->setBundleLocked(true); + SD->setBundleGroupFirstFrag(true); +} + +void MCObjectStreamer::EmitBundleUnlock() { + MCSectionData *SD = getCurrentSectionData(); + assert(SD->isBundlingEnabled() && + ".bundle_unlock called, but bundling disabled!"); + assert(SD->isBundleLocked() && + ".bundle_unlock called when bundle not locked"); + // If there has been at least one fragment emitted inside + // this bundle lock, then we need to mark the last emitted + // fragment as the group end. + if (!SD->isBundleGroupFirstFrag()) { + assert(getCurrentFragment() != NULL); + getCurrentFragment()->setBundleGroupEnd(true); + } + SD->setBundleLocked(false); + SD->setBundleGroupFirstFrag(false); +} +// @LOCALMOD-END ========================================================== + void MCObjectStreamer::ChangeSection(const MCSection *Section) { assert(Section && "Cannot switch to a null section!"); @@ -160,6 +214,13 @@ void MCObjectStreamer::ChangeSection(const MCSection *Section) { } void MCObjectStreamer::EmitInstruction(const MCInst &Inst) { + + // @LOCALMOD-BEGIN + if (getAssembler().getBackend().CustomExpandInst(Inst, *this)) { + return; + } + // @LOCALMOD-END + // Scan for values. 
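EmitBundleLock and EmitBundleUnlock above bracket a group of fragments that layout then keeps inside a single bundle. A hedged sketch of how a caller holding an MCObjectStreamer might use the pair, assuming the typical NaCl pattern of keeping a masking instruction and the branch it guards together; the helper and its arguments are invented, only the Emit* calls come from the patch:

  #include "llvm/MC/MCInst.h"
  #include "llvm/MC/MCObjectStreamer.h"

  void EmitLockedPair(llvm::MCObjectStreamer &OS,
                      const llvm::MCInst &Mask, const llvm::MCInst &Branch) {
    OS.EmitBundleLock();        // the next fragment starts a locked group
    OS.EmitInstruction(Mask);
    OS.EmitInstruction(Branch);
    OS.EmitBundleUnlock();      // the last fragment emitted closes the group
  }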
for (unsigned i = Inst.getNumOperands(); i--; ) if (Inst.getOperand(i).isExpr()) @@ -235,6 +296,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, void MCObjectStreamer::EmitBytes(StringRef Data, unsigned AddrSpace) { assert(AddrSpace == 0 && "Address space must be 0!"); getOrCreateDataFragment()->getContents().append(Data.begin(), Data.end()); + getCurrentSectionData()->MarkBundleOffsetUnknown(); // @LOCALMOD } void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment, @@ -246,6 +308,10 @@ void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment, new MCAlignFragment(ByteAlignment, Value, ValueSize, MaxBytesToEmit, getCurrentSectionData()); + // @LOCALMOD-BEGIN + // Bump the bundle offset to account for alignment. + getCurrentSectionData()->AlignBundleOffsetTo(ByteAlignment); + // @LOCALMOD-END // Update the maximum alignment on the current section if necessary. if (ByteAlignment > getCurrentSectionData()->getAlignment()) getCurrentSectionData()->setAlignment(ByteAlignment); @@ -301,6 +367,7 @@ void MCObjectStreamer::EmitFill(uint64_t NumBytes, uint8_t FillValue, // FIXME: A MCFillFragment would be more memory efficient but MCExpr has // problems evaluating expressions across multiple fragments. getOrCreateDataFragment()->getContents().append(NumBytes, FillValue); + getCurrentSectionData()->MarkBundleOffsetUnknown(); } void MCObjectStreamer::FinishImpl() { diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 6f2e85e553..cf86a44d1b 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -295,6 +295,13 @@ private: // ".align{,32}", ".p2align{,w,l}" bool ParseDirectiveAlign(bool IsPow2, unsigned ValueSize); + // @LOCALMOD-BEGIN + bool ParseDirectiveBundleLock(); + bool ParseDirectiveBundleUnlock(); + bool ParseDirectiveBundleAlignStart(); + bool ParseDirectiveBundleAlignEnd(); + // @LOCALMOD-END + /// ParseDirectiveSymbolAttribute - Parse a directive like ".globl" which /// accepts a single symbol (which should be a label or an external). 
bool ParseDirectiveSymbolAttribute(MCSymbolAttr Attr); @@ -1283,6 +1290,17 @@ bool AsmParser::ParseStatement(ParseStatementInfo &Info) { if (IDVal == ".p2alignl") return ParseDirectiveAlign(/*IsPow2=*/true, /*ExprSize=*/4); + // @LOCALMOD-BEGIN + if (IDVal == ".bundle_lock") + return ParseDirectiveBundleLock(); + if (IDVal == ".bundle_unlock") + return ParseDirectiveBundleUnlock(); + if (IDVal == ".bundle_align_start") + return ParseDirectiveBundleAlignStart(); + if (IDVal == ".bundle_align_end") + return ParseDirectiveBundleAlignEnd(); + // @LOCALMOD-END + if (IDVal == ".org") return ParseDirectiveOrg(); @@ -2404,6 +2422,50 @@ bool AsmParser::ParseDirectiveAlign(bool IsPow2, unsigned ValueSize) { return false; } +// @LOCALMOD-BEGIN +bool AsmParser::ParseDirectiveBundleLock() { + CheckForValidSection(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.bundle_lock' directive"); + Lex(); + getStreamer().EmitBundleLock(); + return false; +} + +bool AsmParser::ParseDirectiveBundleUnlock() { + CheckForValidSection(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.bundle_unlock' directive"); + Lex(); + getStreamer().EmitBundleUnlock(); + return false; +} + +bool AsmParser::ParseDirectiveBundleAlignStart() { + CheckForValidSection(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.bundle_align_start' directive"); + Lex(); + getStreamer().EmitBundleAlignStart(); + return false; +} + +bool AsmParser::ParseDirectiveBundleAlignEnd() { + CheckForValidSection(); + + if (getLexer().isNot(AsmToken::EndOfStatement)) + return TokError("unexpected token in '.bundle_align_end' directive"); + Lex(); + getStreamer().EmitBundleAlignEnd(); + return false; +} + +// @LOCALMOD-END + + /// ParseDirectiveSymbolAttribute /// ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ] bool AsmParser::ParseDirectiveSymbolAttribute(MCSymbolAttr Attr) { diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp index 7625abd465..7f902f1dd7 100644 --- a/lib/MC/SubtargetFeature.cpp +++ b/lib/MC/SubtargetFeature.cpp @@ -370,5 +370,11 @@ void SubtargetFeatures::getDefaultSubtargetFeatures(const Triple& Triple) { AddFeature("64bit"); AddFeature("altivec"); } +// @LOCALMOD-BEGIN + } else if (Triple.getArch() == Triple::arm && + Triple.getOS() == Triple::NativeClient) { + AddFeature("-neon"); + AddFeature("+vfp2"); +// @LOCALMOD-END } } diff --git a/lib/Makefile b/lib/Makefile index fd575cd195..c59d77d009 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -11,7 +11,12 @@ LEVEL = .. include $(LEVEL)/Makefile.config PARALLEL_DIRS := VMCore AsmParser Bitcode Archive Analysis Transforms CodeGen \ - Target ExecutionEngine Linker MC Object DebugInfo + Target ExecutionEngine Linker MC Object Wrap DebugInfo + +ifeq ($(NACL_SANDBOX),1) + PARALLEL_DIRS := $(filter-out Archive Linker, \ + $(PARALLEL_DIRS)) +endif include $(LEVEL)/Makefile.common diff --git a/lib/Support/CrashRecoveryContext.cpp b/lib/Support/CrashRecoveryContext.cpp index e175056279..508bec4028 100644 --- a/lib/Support/CrashRecoveryContext.cpp +++ b/lib/Support/CrashRecoveryContext.cpp @@ -267,6 +267,7 @@ void CrashRecoveryContext::Enable() { gCrashRecoveryEnabled = true; +#if !defined(__native_client__) // Setup the signal handler. 
struct sigaction Handler; Handler.sa_handler = CrashRecoverySignalHandler; @@ -276,6 +277,9 @@ void CrashRecoveryContext::Enable() { for (unsigned i = 0; i != NumSignals; ++i) { sigaction(Signals[i], &Handler, &PrevActions[i]); } +#else +#warning Cannot setup the signal handler on this machine +#endif } void CrashRecoveryContext::Disable() { @@ -286,9 +290,11 @@ void CrashRecoveryContext::Disable() { gCrashRecoveryEnabled = false; +#if !defined(__native_client__) // Restore the previous signal handlers. for (unsigned i = 0; i != NumSignals; ++i) sigaction(Signals[i], &PrevActions[i], 0); +#endif } #endif diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp index 45fec361c1..d8884381ab 100644 --- a/lib/Support/DynamicLibrary.cpp +++ b/lib/Support/DynamicLibrary.cpp @@ -187,3 +187,4 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) { } #endif // LLVM_ON_WIN32 + diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp index 59bfcfcd25..7610d281f0 100644 --- a/lib/Support/LockFileManager.cpp +++ b/lib/Support/LockFileManager.cpp @@ -19,7 +19,7 @@ #include <unistd.h> #endif using namespace llvm; - +#ifndef __native_client__ /// \brief Attempt to read the lock file with the given name, if it exists. /// /// \param LockFileName The name of the lock file to read. @@ -214,3 +214,5 @@ void LockFileManager::waitForUnlock() { // Give up. } + +#endif diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index ec373e7f99..0423c7acb3 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -264,7 +264,7 @@ error_code MemoryBuffer::getFile(const char *Filename, static bool shouldUseMmap(int FD, size_t FileSize, size_t MapSize, - off_t Offset, + int64_t Offset, bool RequiresNullTerminator, int PageSize) { // We don't use mmap for small files because this can severely fragment our @@ -275,7 +275,6 @@ static bool shouldUseMmap(int FD, if (!RequiresNullTerminator) return true; - // If we don't know the file size, use fstat to find out. fstat on an open // file descriptor is cheaper than stat on a random path. 
// FIXME: this chunk of code is duplicated, but it avoids a fstat when @@ -335,8 +334,8 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator, PageSize)) { - off_t RealMapOffset = Offset & ~(PageSize - 1); - off_t Delta = Offset - RealMapOffset; + int64_t RealMapOffset = Offset & ~(PageSize - 1); + int64_t Delta = Offset - RealMapOffset; size_t RealMapSize = MapSize + Delta; if (const char *Pages = sys::Path::MapInFilePages(FD, diff --git a/lib/Support/Mutex.cpp b/lib/Support/Mutex.cpp index 4e4a026b2f..586392fc1e 100644 --- a/lib/Support/Mutex.cpp +++ b/lib/Support/Mutex.cpp @@ -60,7 +60,7 @@ MutexImpl::MutexImpl( bool recursive) assert(errorcode == 0); #if !defined(__FreeBSD__) && !defined(__OpenBSD__) && !defined(__NetBSD__) && \ - !defined(__DragonFly__) && !defined(__Bitrig__) + !defined(__DragonFly__) && !defined(__Bitrig__) && !defined(__native_client__) // Make it a process local mutex errorcode = pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_PRIVATE); assert(errorcode == 0); diff --git a/lib/Support/Unix/Host.inc b/lib/Support/Unix/Host.inc index 726e2fbcf0..aa06763258 100644 --- a/lib/Support/Unix/Host.inc +++ b/lib/Support/Unix/Host.inc @@ -19,7 +19,9 @@ #include "llvm/Config/config.h" #include "llvm/ADT/StringRef.h" #include "Unix.h" +#if !defined(__native_client__) #include <sys/utsname.h> +#endif // (__native_client__) #include <cctype> #include <string> #include <cstdlib> // ::getenv @@ -27,12 +29,16 @@ using namespace llvm; static std::string getOSVersion() { +#if !defined(__native_client__) struct utsname info; if (uname(&info)) return ""; return info.release; +#else // (__native_client__) + return ""; +#endif // (__native_client__) } std::string sys::getDefaultTargetTriple() { diff --git a/lib/Support/Unix/Memory.inc b/lib/Support/Unix/Memory.inc index 9a8abd27f1..f4cfbc65cf 100644 --- a/lib/Support/Unix/Memory.inc +++ b/lib/Support/Unix/Memory.inc @@ -12,9 +12,11 @@ //===----------------------------------------------------------------------===// #include "Unix.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/DataTypes.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Process.h" +#include "llvm/Support/Debug.h" #ifdef HAVE_SYS_MMAN_H #include <sys/mman.h> @@ -145,8 +147,12 @@ Memory::protectMappedMemory(const MemoryBlock &M, unsigned Flags) { return error_code(EINVAL, generic_category()); int Protect = getPosixProtectionFlags(Flags); - +#ifndef __native_client__ int Result = ::mprotect(M.Address, M.Size, Protect); +#else + int Result = -1; + llvm_unreachable("Native client does not support mprotect"); +#endif if (Result != 0) return error_code(errno, system_category()); @@ -194,8 +200,10 @@ Memory::AllocateRWX(size_t NumBytes, const MemoryBlock* NearBlock, void *pa = ::mmap(start, pageSize*NumPages, PROT_READ|PROT_EXEC, flags, fd, 0); #else +dbgs() << "calling mmap, start " << start << "\n"; void *pa = ::mmap(start, pageSize*NumPages, PROT_READ|PROT_WRITE|PROT_EXEC, flags, fd, 0); + DEBUG(dbgs() << "mmap returned " << pa<<"\n"); #endif if (pa == MAP_FAILED) { if (NearBlock) //Try again without a near hint diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index 6a5ebb8cd9..b82371a7b6 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -133,7 +133,9 @@ Path::GetRootDirectory() { Path Path::GetTemporaryDirectory(std::string *ErrMsg) { -#if defined(HAVE_MKDTEMP) +#if defined(__native_client__) + return Path(""); 
+#elif defined(HAVE_MKDTEMP) // The best way is with mkdtemp but that's not available on many systems, // Linux and FreeBSD have it. Others probably won't. char pathname[] = "/tmp/llvm_XXXXXX"; @@ -251,6 +253,7 @@ Path::GetUserHomeDirectory() { Path Path::GetCurrentDirectory() { +#if !defined(__native_client__) char pathname[MAXPATHLEN]; if (!getcwd(pathname, MAXPATHLEN)) { assert(false && "Could not query current working directory."); @@ -258,6 +261,9 @@ Path::GetCurrentDirectory() { } return Path(pathname); +#else // (__native_client__) + return Path("./"); +#endif // (__native_client__) } #if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ @@ -319,7 +325,9 @@ getprogpath(char ret[PATH_MAX], const char *bin) /// GetMainExecutable - Return the path to the main executable, given the /// value of argv[0] from program startup. Path Path::GetMainExecutable(const char *argv0, void *MainAddr) { -#if defined(__APPLE__) +#if defined(__native_client__) + return Path(std::string("./") + std::string(argv0)); +#elif defined(__APPLE__) // On OS X the executable path is saved to the stack by dyld. Reading it // from there is much faster than calling dladdr, especially for large // binaries with symbols. @@ -420,7 +428,11 @@ bool Path::getMagicNumber(std::string &Magic, unsigned len) const { bool Path::exists() const { +#if !defined(__native_client__) return 0 == access(path.c_str(), F_OK ); +#else // (__native_client__) + return true; +#endif // (__native_client__) } bool @@ -433,21 +445,33 @@ Path::isDirectory() const { bool Path::isSymLink() const { +#if defined(__native_client__) + return false; +#else struct stat buf; if (0 != lstat(path.c_str(), &buf)) return false; return S_ISLNK(buf.st_mode); +#endif } bool Path::canRead() const { +#if !defined(__native_client__) return 0 == access(path.c_str(), R_OK); +#else // (__native_client__) + return true; +#endif // (__native_client__) } bool Path::canWrite() const { +#if !defined(__native_client__) return 0 == access(path.c_str(), W_OK); +#else // (__native_client__) + return true; +#endif // (__native_client__) } bool @@ -466,6 +490,7 @@ Path::isRegularFile() const { bool Path::canExecute() const { +#if !defined(__native_client__) if (0 != access(path.c_str(), R_OK | X_OK )) return false; struct stat buf; @@ -473,6 +498,7 @@ Path::canExecute() const { return false; if (!S_ISREG(buf.st_mode)) return false; +#endif // (__native_client__) return true; } @@ -520,6 +546,7 @@ PathWithStatus::getFileStatus(bool update, std::string *ErrStr) const { } static bool AddPermissionBits(const Path &File, int bits) { +#if !defined(__native_client__) // Get the umask value from the operating system. We want to use it // when changing the file's permissions. Since calling umask() sets // the umask and returns its old value, we must call it a second @@ -535,6 +562,7 @@ static bool AddPermissionBits(const Path &File, int bits) { // that the umask would not disable. 
if ((chmod(File.c_str(), (buf.st_mode | (bits & ~mask)))) == -1) return false; +#endif // (__native_client__) return true; } @@ -558,6 +586,7 @@ bool Path::makeExecutableOnDisk(std::string* ErrMsg) { bool Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const { +#if !defined(__native_client__) DIR* direntries = ::opendir(path.c_str()); if (direntries == 0) return MakeErrMsg(ErrMsg, path + ": can't open directory"); @@ -583,6 +612,7 @@ Path::getDirectoryContents(std::set<Path>& result, std::string* ErrMsg) const { } closedir(direntries); +#endif return false; } @@ -635,7 +665,7 @@ Path::eraseSuffix() { } static bool createDirectoryHelper(char* beg, char* end, bool create_parents) { - +#if !defined(__native_client__) if (access(beg, R_OK | W_OK) == 0) return false; @@ -660,6 +690,9 @@ static bool createDirectoryHelper(char* beg, char* end, bool create_parents) { } return mkdir(beg, S_IRWXU | S_IRWXG) != 0; +#else // (__native_client__) + return false; +#endif // (__native_client__) } bool @@ -683,11 +716,13 @@ Path::createDirectoryOnDisk( bool create_parents, std::string* ErrMsg ) { bool Path::createFileOnDisk(std::string* ErrMsg) { +#if !defined(__native_client__) // Create the file int fd = ::creat(path.c_str(), S_IRUSR | S_IWUSR); if (fd < 0) return MakeErrMsg(ErrMsg, path + ": can't create file"); ::close(fd); +#endif // (__native_client__) return false; } @@ -707,6 +742,7 @@ Path::createTemporaryFileOnDisk(bool reuse_current, std::string* ErrMsg) { bool Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const { +#if !defined(__native_client__) // Get the status so we can determine if it's a file or directory. struct stat buf; if (0 != stat(path.c_str(), &buf)) { @@ -751,18 +787,26 @@ Path::eraseFromDisk(bool remove_contents, std::string *ErrStr) const { if (rmdir(pathname.c_str()) != 0) return MakeErrMsg(ErrStr, pathname + ": can't erase directory"); return false; +#else // (__native_client__) + MakeErrMsg(ErrStr, ": PNACL does not know how to erase directories!"); + return false; +#endif // (__native_client__) + } bool Path::renamePathOnDisk(const Path& newName, std::string* ErrMsg) { +#if !defined(__native_client__) if (0 != ::rename(path.c_str(), newName.c_str())) return MakeErrMsg(ErrMsg, std::string("can't rename '") + path + "' as '" + newName.str() + "'"); +#endif return false; } bool Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrStr) const { +#if !defined(__native_client__) struct utimbuf utb; utb.actime = si.modTime.toPosixTime(); utb.modtime = utb.actime; @@ -770,6 +814,7 @@ Path::setStatusInfoOnDisk(const FileStatus &si, std::string *ErrStr) const { return MakeErrMsg(ErrStr, path + ": can't set file modification time"); if (0 != ::chmod(path.c_str(),si.mode)) return MakeErrMsg(ErrStr, path + ": can't set mode"); +#endif // (__native_client__) return false; } diff --git a/lib/Support/Unix/PathV2.inc b/lib/Support/Unix/PathV2.inc index d04f590f87..59c5ae5808 100644 --- a/lib/Support/Unix/PathV2.inc +++ b/lib/Support/Unix/PathV2.inc @@ -117,7 +117,9 @@ error_code current_path(SmallVectorImpl<char> &result) { // For GNU Hurd result.reserve(1024); #endif - +#ifdef __native_client__ + llvm_unreachable("current_path() not implemented for Native Client"); +#else while (true) { if (::getcwd(result.data(), result.capacity()) == 0) { // See if there was a real error. 
@@ -130,6 +132,7 @@ error_code current_path(SmallVectorImpl<char> &result) { } result.set_size(strlen(result.data())); +#endif return error_code::success(); } @@ -193,6 +196,9 @@ error_code copy_file(const Twine &from, const Twine &to, copy_option copt) { } error_code create_directory(const Twine &path, bool &existed) { +#ifdef __native_client__ + llvm_unreachable("create_directory() not implemented for Native Client"); +#else SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); @@ -204,9 +210,13 @@ error_code create_directory(const Twine &path, bool &existed) { existed = false; return error_code::success(); +#endif } error_code create_hard_link(const Twine &to, const Twine &from) { +#ifdef __native_client__ + llvm_unreachable("create_hard_link() not implemented for Native Client"); +#else // Get arguments. SmallString<128> from_storage; SmallString<128> to_storage; @@ -217,9 +227,13 @@ error_code create_hard_link(const Twine &to, const Twine &from) { return error_code(errno, system_category()); return error_code::success(); +#endif } error_code create_symlink(const Twine &to, const Twine &from) { +#ifdef __native_client__ + llvm_unreachable("create_symlink() not implemented for Native Client"); +#else // Get arguments. SmallString<128> from_storage; SmallString<128> to_storage; @@ -230,9 +244,13 @@ error_code create_symlink(const Twine &to, const Twine &from) { return error_code(errno, system_category()); return error_code::success(); +#endif } error_code remove(const Twine &path, bool &existed) { +#ifdef __native_client__ + llvm_unreachable("remove() not implemented for Native Client"); +#else SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); @@ -242,11 +260,14 @@ error_code remove(const Twine &path, bool &existed) { existed = false; } else existed = true; - return error_code::success(); +#endif } error_code rename(const Twine &from, const Twine &to) { +#ifdef __native_client__ + llvm_unreachable("rename() not implemented for Native Client"); +#else // Get arguments. SmallString<128> from_storage; SmallString<128> to_storage; @@ -266,9 +287,13 @@ error_code rename(const Twine &from, const Twine &to) { } return error_code::success(); +#endif } error_code resize_file(const Twine &path, uint64_t size) { +#ifdef __native_client__ + llvm_unreachable("resize_file() not implemented for Native Client"); +#else SmallString<128> path_storage; StringRef p = path.toNullTerminatedStringRef(path_storage); @@ -276,6 +301,7 @@ error_code resize_file(const Twine &path, uint64_t size) { return error_code(errno, system_category()); return error_code::success(); +#endif } error_code exists(const Twine &path, bool &result) { @@ -390,6 +416,9 @@ error_code permissions(const Twine &path, perms prms) { error_code unique_file(const Twine &model, int &result_fd, SmallVectorImpl<char> &result_path, bool makeAbsolute, unsigned mode) { +#ifdef __native_client__ + llvm_unreachable("unique_file() not implemented for Native Client"); +#else SmallString<128> Model; model.toVector(Model); // Null terminate. @@ -463,9 +492,14 @@ rety_open_create: result_fd = RandomFD; return error_code::success(); +#endif } error_code mapped_file_region::init(int fd, uint64_t offset) { +#ifdef __native_client__ + // Newlib does not have ftruncate. + llvm_unreachable("mapped_file_region not implemented for native client"); +#else AutoFD FD(fd); // Figure out how large the file is. 
@@ -491,6 +525,7 @@ error_code mapped_file_region::init(int fd, uint64_t offset) { if (Mapping == MAP_FAILED) return error_code(errno, system_category()); return error_code::success(); +#endif // __native_client__ } mapped_file_region::mapped_file_region(const Twine &path, @@ -501,6 +536,9 @@ mapped_file_region::mapped_file_region(const Twine &path, : Mode(mode) , Size(length) , Mapping() { +#ifdef __native_client__ + llvm_unreachable("mapped_file_region not implemented for native client"); +#endif // Make sure that the requested size fits within SIZE_T. if (length > std::numeric_limits<size_t>::max()) { ec = make_error_code(errc::invalid_argument); @@ -529,6 +567,9 @@ mapped_file_region::mapped_file_region(int fd, : Mode(mode) , Size(length) , Mapping() { +#ifdef __native_client__ + llvm_unreachable("mapped_file_region not implemented for native client"); +#endif // Make sure that the requested size fits within SIZE_T. if (length > std::numeric_limits<size_t>::max()) { ec = make_error_code(errc::invalid_argument); diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 5204147ce3..b2983b21f7 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -36,6 +36,8 @@ # include <termios.h> #endif +#include <sys/unistd.h> + //===----------------------------------------------------------------------===// //=== WARNING: Implementation here must contain only generic UNIX code that //=== is guaranteed to work on *all* UNIX variants. @@ -54,9 +56,10 @@ Process::GetPageSize() const int page_size = 0x1000; #elif defined(HAVE_GETPAGESIZE) const int page_size = ::getpagesize(); -#elif defined(HAVE_SYSCONF) +#elif defined(HAVE_SYSCONF) && !defined(__native_client__) long page_size = ::sysconf(_SC_PAGE_SIZE); #else + const int page_size = 0; #warning Cannot get the page size on this machine #endif return static_cast<unsigned>(page_size); @@ -111,7 +114,7 @@ Process::GetTimeUsage(TimeValue& elapsed, TimeValue& user_time, TimeValue& sys_time) { elapsed = TimeValue::now(); -#if defined(HAVE_GETRUSAGE) +#if defined(HAVE_GETRUSAGE) && !defined(__native_client__) struct rusage usage; ::getrusage(RUSAGE_SELF, &usage); user_time = TimeValue( @@ -132,11 +135,23 @@ Process::GetTimeUsage(TimeValue& elapsed, TimeValue& user_time, } int Process::GetCurrentUserId() { +#if !defined(__native_client__) return getuid(); +#else // (__native_client__) +// TODO(abetul): What the proper return value should be for this function? +// What about having a reserved user_id or the user "nobody" for PNACL? + return -1; +#endif // (__native_client__) } int Process::GetCurrentGroupId() { +#if !defined(__native_client__) return getgid(); +#else // (__native_client__) +// TODO(abetul): What the proper return value should be for this function? +// What about having a reserved/unused group_id? 
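// The shape of the Process.inc stubs above, pulled out for reference: the POSIX
// query stays on the non-sandboxed path, and the sandboxed path falls back to -1,
// an id no real user or group holds. Minimal sketch that mirrors the hunk rather
// than adding new behavior:
#include <unistd.h>

static int CurrentUserIdSketch() {
#if !defined(__native_client__)
  return ::getuid();   // ordinary POSIX path
#else
  return -1;           // sandbox: no meaningful uid; -1 stands in for "none"
#endif
}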
+ return -1; +#endif // (__native_client__) } #if defined(HAVE_MACH_MACH_H) && !defined(__GNU__) @@ -332,3 +347,6 @@ unsigned llvm::sys::Process::GetRandomNumber() { return ::rand(); #endif } + +#if !defined(__native_client__) +#endif diff --git a/lib/Support/Unix/Program.inc b/lib/Support/Unix/Program.inc index e5990d06ec..049c41b742 100644 --- a/lib/Support/Unix/Program.inc +++ b/lib/Support/Unix/Program.inc @@ -103,6 +103,10 @@ Program::FindProgramByName(const std::string& progName) { } static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) { +#if defined(__native_client__) + MakeErrMsg(ErrMsg, "Cannot redirect I/O in NaCl"); + return true; +#else // (__native_client__) if (Path == 0) // Noop return false; const char *File; @@ -119,7 +123,6 @@ static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) { + (FD == 0 ? "input" : "output")); return true; } - // Install it as the requested FD if (dup2(InFD, FD) == -1) { MakeErrMsg(ErrMsg, "Cannot dup2"); @@ -128,6 +131,7 @@ static bool RedirectIO(const Path *Path, int FD, std::string* ErrMsg) { } close(InFD); // Close the original FD return false; +#endif // (__native_client__) } #ifdef HAVE_POSIX_SPAWN @@ -233,6 +237,7 @@ Program::Execute(const Path &path, const char **args, const char **envp, } #endif +#if !defined(__native_client__) // Create a child process. int child = fork(); switch (child) { @@ -293,6 +298,10 @@ Program::Execute(const Path &path, const char **args, const char **envp, Data_ = reinterpret_cast<void*>(child); return true; +#else // (__native_client__) + MakeErrMsg(ErrMsg, "PNACL does not know how to execute child processes!"); + return false; +#endif // (__native_client__) } int @@ -300,6 +309,7 @@ Program::Wait(const sys::Path &path, unsigned secondsToWait, std::string* ErrMsg) { +#if !defined(__native_client__) #ifdef HAVE_SYS_WAIT_H struct sigaction Act, Old; @@ -392,10 +402,16 @@ Program::Wait(const sys::Path &path, *ErrMsg = "Program::Wait is not implemented on this platform yet!"; return -1; #endif +#else // (__native_client__) +// TODO(abetul): What should the proper return value be here? + MakeErrMsg(ErrMsg, "PNACL does not know how to wait for a child process!"); + return -1; +#endif // (__native_client__) } bool Program::Kill(std::string* ErrMsg) { +#if !defined(__native_client__) if (Data_ == 0) { MakeErrMsg(ErrMsg, "Process not started!"); return true; @@ -410,6 +426,12 @@ Program::Kill(std::string* ErrMsg) { } return false; + +#else // (__native_client__) + MakeErrMsg(ErrMsg, "PNACL does not know how to kill processes!"); + return true; +#endif // (__native_client__) + } error_code Program::ChangeStdinToBinary(){ diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc index 9e94068c9c..264fa5dbde 100644 --- a/lib/Support/Unix/Signals.inc +++ b/lib/Support/Unix/Signals.inc @@ -82,6 +82,7 @@ static struct { static void RegisterHandler(int Signal) { +#if !defined(__native_client__) assert(NumRegisteredSignals < sizeof(RegisteredSignalInfo)/sizeof(RegisteredSignalInfo[0]) && "Out of space for signal handlers!"); @@ -97,6 +98,7 @@ static void RegisterHandler(int Signal) { &RegisteredSignalInfo[NumRegisteredSignals].SA); RegisteredSignalInfo[NumRegisteredSignals].SigNo = Signal; ++NumRegisteredSignals; +#endif // (__native_client__) } static void RegisterHandlers() { @@ -108,11 +110,13 @@ static void RegisterHandlers() { } static void UnregisterHandlers() { +#if !defined(__native_client__) // Restore all of the signal handlers to how they were before we showed up. 
for (unsigned i = 0, e = NumRegisteredSignals; i != e; ++i) sigaction(RegisteredSignalInfo[i].SigNo, &RegisteredSignalInfo[i].SA, 0); NumRegisteredSignals = 0; +#endif // (__native_client__) } @@ -155,10 +159,12 @@ static RETSIGTYPE SignalHandler(int Sig) { // instead of recursing in the signal handler. UnregisterHandlers(); +#if !defined(__native_client__) // Unmask all potentially blocked kill signals. sigset_t SigMask; sigfillset(&SigMask); sigprocmask(SIG_UNBLOCK, &SigMask, 0); +#endif SignalsMutex.acquire(); RemoveFilesToRemove(); diff --git a/lib/Support/Unix/TimeValue.inc b/lib/Support/Unix/TimeValue.inc index 5cf5a9d44e..0eb4ac8ad3 100644 --- a/lib/Support/Unix/TimeValue.inc +++ b/lib/Support/Unix/TimeValue.inc @@ -18,6 +18,13 @@ #include "Unix.h" +// @LOCALMOD-START +#ifndef timerclear +// Newlib does not have the timer{clear,add,sub} macros +#define timerclear(tvp) ((tvp)->tv_sec = (tvp)->tv_usec = 0) +#endif +// @LOCALMOD-END + namespace llvm { using namespace sys; diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h index 1446bbbb8e..0ac92f1ee8 100644 --- a/lib/Target/ARM/ARM.h +++ b/lib/Target/ARM/ARM.h @@ -20,6 +20,9 @@ #include "llvm/Support/DataTypes.h" #include "llvm/Target/TargetMachine.h" +// @LOCALMOD (for LowerARMMachineInstrToMCInstPCRel) +#include "llvm/MC/MCSymbol.h" + namespace llvm { class ARMAsmPrinter; @@ -44,9 +47,27 @@ FunctionPass *createMLxExpansionPass(); FunctionPass *createThumb2ITBlockPass(); FunctionPass *createThumb2SizeReductionPass(); +/* @LOCALMOD-START */ +FunctionPass *createARMNaClRewritePass(); +/* @LOCALMOD-END */ + void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); + +/* @LOCALMOD-START */ +// Used to lower the pc-relative MOVi16PIC / MOVTi16PIC pseudo instructions +// into the real MOVi16 / MOVTi16 instructions. +// See comment on MOVi16PIC for more details. +void LowerARMMachineInstrToMCInstPCRel(const MachineInstr *MI, + MCInst &OutMI, + ARMAsmPrinter &AP, + unsigned ImmIndex, + unsigned PCIndex, + MCSymbol *PCLabel, + unsigned PCAdjustment); +/* @LOCALMOD-END */ + } // end namespace llvm; #endif diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index 23974ad905..5c56b2dc47 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -220,8 +220,13 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [HasV6T2Ops, FeatureVFP2, // V7a Processors. def : ProcessorModel<"cortex-a8", CortexA8Model, - [ProcA8, HasV7Ops, FeatureNEON, FeatureDB, +// @LOCALMOD-BEGIN +// TODO(pdox): Resolve this mismatch. + [ProcA8, HasV7Ops, FeatureDB, +// FeatureNEON, FeatureDSPThumb2, FeatureHasRAS]>; +// @LOCALMOD-END + def : ProcessorModel<"cortex-a9", CortexA9Model, [ProcA9, HasV7Ops, FeatureNEON, FeatureDB, FeatureDSPThumb2, FeatureHasRAS]>; diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index d439d1d7cb..f67decc550 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -52,6 +52,13 @@ #include <cctype> using namespace llvm; +// @LOCALMOD-START +namespace llvm { + extern cl::opt<bool> FlagSfiBranch; + extern cl::opt<bool> FlagSfiData; +} +// @LOCALMOD-END + namespace { // Per section and per symbol attributes are not supported. 
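// FlagSfiBranch and FlagSfiData are only declared extern in the asm-printer hunk
// above; the defining side lives elsewhere in the NaCl-local ARM code. A minimal
// sketch of what such a definition looks like; the option strings, defaults and
// descriptions here are assumptions for illustration, not copied from the tree:
#include "llvm/Support/CommandLine.h"

namespace llvm {
cl::opt<bool> FlagSfiBranch("sfi-branch", cl::init(true),
    cl::desc("Align call and branch targets to NaCl bundle boundaries"));
cl::opt<bool> FlagSfiData("sfi-data", cl::init(true),
    cl::desc("Guard data embedded in the text section for NaCl SFI"));
}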
@@ -223,6 +230,75 @@ getDebugValueLocation(const MachineInstr *MI) const { return Location; } +// @LOCALMOD-START +// Make sure all jump targets are aligned and also all constant pools +void NaclAlignAllJumpTargetsAndConstantPools(MachineFunction &MF) { + // JUMP TABLE TARGETS + MachineJumpTableInfo *jt_info = MF.getJumpTableInfo(); + if (jt_info) { + const std::vector<MachineJumpTableEntry> &JT = jt_info->getJumpTables(); + for (unsigned i=0; i < JT.size(); ++i) { + std::vector<MachineBasicBlock*> MBBs = JT[i].MBBs; + + for (unsigned j=0; j < MBBs.size(); ++j) { + if (MBBs[j]->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY) { + continue; + } + MBBs[j]->setAlignment(4); + } + } + } + + // FIRST ENTRY IN A ConstanPool + bool last_bb_was_constant_pool = false; + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + if (I->isLandingPad()) { + I->setAlignment(4); + } + + if (I->empty()) continue; + + bool is_constant_pool = I->begin()->getOpcode() == ARM::CONSTPOOL_ENTRY; + + if (last_bb_was_constant_pool != is_constant_pool) { + I->setAlignment(4); + } + + last_bb_was_constant_pool = is_constant_pool; + } +} + +bool ARMAsmPrinter::UseReadOnlyJumpTables() const { + if (Subtarget->isTargetNaCl()) + return true; + return false; +} + +unsigned ARMAsmPrinter::GetTargetBasicBlockAlign() const { + if (Subtarget->isTargetNaCl()) + return 4; + return 0; +} + +unsigned ARMAsmPrinter::GetTargetLabelAlign(const MachineInstr *MI) const { + if (Subtarget->isTargetNaCl()) { + switch (MI->getOpcode()) { + default: return 0; + // These labels may indicate an indirect entry point that is + // externally reachable and hence must be bundle aligned. + // Note: these labels appear to be always at basic block beginnings + // so it may be possible to simply set the MBB alignment. + // However, it is unclear whether this always holds. + case TargetOpcode::EH_LABEL: + case TargetOpcode::GC_LABEL: + return 4; + } + } + return 0; +} +// @LOCALMOD-END + /// EmitDwarfRegOp - Emit dwarf register operation. void ARMAsmPrinter::EmitDwarfRegOp(const MachineLocation &MLoc) const { const TargetRegisterInfo *RI = TM.getRegisterInfo(); @@ -299,6 +375,17 @@ void ARMAsmPrinter::EmitFunctionEntryLabel() { OutStreamer.EmitThumbFunc(CurrentFnSym); } + // @LOCALMOD-START + // make sure function entry is aligned. We use XmagicX as our basis + // for alignment decisions (c.f. 
assembler sfi macros) + int alignment = MF->getAlignment(); + if (alignment < 4) alignment = 4; + EmitAlignment(alignment); + if (Subtarget->isTargetNaCl() && OutStreamer.hasRawTextSupport()) { + OutStreamer.EmitRawText(StringRef("\t.set XmagicX, .\n")); + } + // @LOCALMOD-END + OutStreamer.EmitLabel(CurrentFnSym); } @@ -325,6 +412,11 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) { AFI = MF.getInfo<ARMFunctionInfo>(); MCP = MF.getConstantPool(); + // @LOCALMOD-START + if (FlagSfiBranch) { + NaclAlignAllJumpTargetsAndConstantPools(MF); + } + // @LOCALMOD-END return AsmPrinter::runOnMachineFunction(MF); } @@ -360,10 +452,10 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, case MachineOperand::MO_GlobalAddress: { const GlobalValue *GV = MO.getGlobal(); if ((Modifier && strcmp(Modifier, "lo16") == 0) || - (TF & ARMII::MO_LO16)) + (TF == ARMII::MO_LO16)) // @LOCALMOD: TEMPORARY FIX O << ":lower16:"; else if ((Modifier && strcmp(Modifier, "hi16") == 0) || - (TF & ARMII::MO_HI16)) + (TF == ARMII::MO_HI16)) // @LOCALMOD: TEMPORARY FIX O << ":upper16:"; O << *Mang->getSymbol(GV); @@ -389,6 +481,7 @@ void ARMAsmPrinter::printOperand(const MachineInstr *MI, int OpNum, //===--------------------------------------------------------------------===// + MCSymbol *ARMAsmPrinter:: GetARMJTIPICJumpTableLabel2(unsigned uid, unsigned uid2) const { SmallString<60> Name; @@ -570,6 +663,8 @@ bool ARMAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, return false; } +void EmitSFIHeaders(raw_ostream &O); + void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { if (Subtarget->isTargetDarwin()) { Reloc::Model RelocM = TM.getRelocationModel(); @@ -629,8 +724,16 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) { // Emit ARM Build Attributes if (Subtarget->isTargetELF()) emitAttributes(); -} + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl() && OutStreamer.hasRawTextSupport()) { + std::string str; + raw_string_ostream OS(str); + EmitSFIHeaders(OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + // @LOCALMOD-END +} void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { if (Subtarget->isTargetDarwin()) { @@ -700,6 +803,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) { } } + //===----------------------------------------------------------------------===// // Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile() // FIXME: @@ -965,7 +1069,20 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) { PCRelExpr = MCBinaryExpr::CreateSub(PCRelExpr, DotExpr, OutContext); } Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, OutContext); + } else { // @LOCALMOD-BEGIN + // Check mustAddCurrentAddress() when getPCAdjustment() == 0, + // and make it actually *Subtract* the current address. + // A more appropriate name is probably "relativeToCurrentAddress", + // since the assembler can't actually handle "X + .", only "X - .". 
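// Self-contained sketch of the "X - ." construction described above; the patch's
// own version appears in the lines that follow. Emitting a temporary label pins
// down ".", and the subtraction yields an expression the assembler can fold:
MCSymbol *Dot = OutContext.CreateTempSymbol();
OutStreamer.EmitLabel(Dot);                                // "." is now Dot
const MCExpr *DotRef = MCSymbolRefExpr::Create(Dot, OutContext);
Expr = MCBinaryExpr::CreateSub(Expr, DotRef, OutContext);  // X - .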
+ if (ACPV->mustAddCurrentAddress()) { + MCSymbol *DotSym = OutContext.CreateTempSymbol(); + OutStreamer.EmitLabel(DotSym); + const MCExpr *DotExpr = MCSymbolRefExpr::Create(DotSym, OutContext); + Expr = MCBinaryExpr::CreateSub(Expr, DotExpr, OutContext); + } } + // @LOCALMOD-END + OutStreamer.EmitValue(Expr, Size); } @@ -1595,6 +1712,28 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { InConstantPool = true; } + + // @LOCALMOD-START + // NOTE: we also should make sure that the first data item + // is not in a code bundle + // NOTE: there may be issues with alignment constraints + if (Subtarget->isTargetNaCl() && OutStreamer.hasRawTextSupport()) { + const unsigned size = MI->getOperand(2).getImm(); + //assert(size == 4 || size == 8 && "Unsupported data item size"); + if (size == 8) { + // we cannot generate a size 8 constant at offset 12 (mod 16) + OutStreamer.EmitRawText(StringRef("sfi_nop_if_at_bundle_end\n")); + } + + if (FlagSfiData) { + SmallString<128> Str; + raw_svector_ostream OS(Str); + OS << "sfi_illegal_if_at_bundle_begining @ ========== SFI (" << + size << ")\n"; + OutStreamer.EmitRawText(OS.str()); + } + } + // @LOCALMOD-END OutStreamer.EmitLabel(GetCPISymbol(LabelId)); const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx]; @@ -1725,8 +1864,10 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Non-Darwin binutils don't yet support the "trap" mnemonic. // FIXME: Remove this special case when they do. if (!Subtarget->isTargetDarwin()) { - //.long 0xe7ffdefe @ trap - uint32_t Val = 0xe7ffdefeUL; + // @LOCALMOD-START + //.long 0xe7fedef0 @ trap + uint32_t Val = 0xe7fedef0UL; + // @LOCALMOD-END OutStreamer.AddComment("trap"); OutStreamer.EmitIntValue(Val, 4); return; @@ -2023,6 +2164,50 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) { } return; } + + // @LOCALMOD-BEGIN + // These are pseudo ops for MOVW / MOVT with operands relative to a PC label. + // See the comments on MOVi16PIC in the .td file for more details. + case ARM::MOVi16PIC: { + MCInst TmpInst; + // First, build an instruction w/ the real opcode. + TmpInst.setOpcode(ARM::MOVi16); + + unsigned ImmIndex = 1; + unsigned PIC_id_index = 2; + unsigned PCAdjustment = 8; + // NOTE: if getPICLabel was a method of "this", or otherwise in scope for + // LowerARMMachineInstrToMCInstPCRel, then we wouldn't need to create + // it here (as well as below). + MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(PIC_id_index).getImm(), + OutContext); + LowerARMMachineInstrToMCInstPCRel(MI, TmpInst, *this, ImmIndex, + PIC_id_index, PCLabel, PCAdjustment); + OutStreamer.EmitInstruction(TmpInst); + return; + } + case ARM::MOVTi16PIC: { + MCInst TmpInst; + // First, build an instruction w/ the real opcode. 
+ TmpInst.setOpcode(ARM::MOVTi16); + + unsigned ImmIndex = 2; + unsigned PIC_id_index = 3; + unsigned PCAdjustment = 8; + + MCSymbol *PCLabel = getPICLabel(MAI->getPrivateGlobalPrefix(), + getFunctionNumber(), + MI->getOperand(PIC_id_index).getImm(), + OutContext); + + LowerARMMachineInstrToMCInstPCRel(MI, TmpInst, *this, ImmIndex, + PIC_id_index, PCLabel, PCAdjustment); + OutStreamer.EmitInstruction(TmpInst); + return; + } + //@LOCALMOD-END } MCInst TmpInst; diff --git a/lib/Target/ARM/ARMAsmPrinter.h b/lib/Target/ARM/ARMAsmPrinter.h index c875b2cbdf..ee3604499f 100644 --- a/lib/Target/ARM/ARMAsmPrinter.h +++ b/lib/Target/ARM/ARMAsmPrinter.h @@ -72,9 +72,16 @@ public: virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE; virtual bool runOnMachineFunction(MachineFunction &F) LLVM_OVERRIDE; + // @LOCALMOD-START + // usually this does nothing on ARM as constants pools + // are handled with custom code. + // For the sfi case we do not use the custom logic and fall back + // to the default implementation. virtual void EmitConstantPool() LLVM_OVERRIDE { - // we emit constant pools customly! + if (FlagSfiDisableCP) AsmPrinter::EmitConstantPool(); } + // @LOCALMOD-END + virtual void EmitFunctionBodyEnd() LLVM_OVERRIDE; virtual void EmitFunctionEntryLabel() LLVM_OVERRIDE; virtual void EmitStartOfAsmFile(Module &M) LLVM_OVERRIDE; @@ -83,6 +90,17 @@ public: // lowerOperand - Convert a MachineOperand into the equivalent MCOperand. bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp); + + // @LOCALMOD-START + /// UseReadOnlyJumpTables - true if JumpTableInfo must be in rodata. + virtual bool UseReadOnlyJumpTables() const; + /// GetTargetBasicBlockAlign - Get the target alignment for basic blocks. + virtual unsigned GetTargetBasicBlockAlign() const; + /// GetTargetLabelAlign - Get optional alignment for TargetOpcode + /// labels E.g., EH_LABEL. + /// TODO(sehr,robertm): remove this if the labeled block has address taken. + virtual unsigned GetTargetLabelAlign(const MachineInstr *MI) const; + // @LOCALMOD-END private: // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile() diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 3c7bb24f42..5280abb40c 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1783,6 +1783,7 @@ void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB, // Build the new ADD / SUB. unsigned Opc = isSub ? ARM::SUBri : ARM::ADDri; + BuildMI(MBB, MBBI, dl, TII.get(Opc), DestReg) .addReg(BaseReg, RegState::Kill).addImm(ThisVal) .addImm((unsigned)Pred).addReg(PredReg).addReg(0) @@ -2250,6 +2251,7 @@ optimizeCompareInstr(MachineInstr *CmpInstr, unsigned SrcReg, unsigned SrcReg2, // be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. 
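// Units reminder for the alignment hooks added to ARMAsmPrinter above: values
// such as setAlignment(4) and GetTargetBasicBlockAlign() == 4 are log2 amounts,
// so 4 means 2^4 = 16 bytes, the ARM NaCl bundle size. A tiny sketch of that
// relationship (assumed constants, for illustration only):
static const unsigned NaClBundleAlignLog2 = 4;
static const unsigned NaClBundleSizeBytes = 1u << NaClBundleAlignLog2;  // 16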
for (unsigned i = 0, e = OperandsToUpdate.size(); i < e; i++) OperandsToUpdate[i].first->setImm(OperandsToUpdate[i].second); + return true; } } diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e5b300fc77..8f5be6a120 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -17,6 +17,7 @@ #include "ARMFrameLowering.h" #include "ARMMachineFunctionInfo.h" #include "ARMSubtarget.h" +#include "ARMTargetMachine.h" // @LOCALMOD #include "MCTargetDesc/ARMAddressingModes.h" #include "llvm/Constants.h" #include "llvm/DerivedTypes.h" @@ -60,8 +61,10 @@ ARMBaseRegisterInfo::ARMBaseRegisterInfo(const ARMBaseInstrInfo &tii, BasePtr(ARM::R6) { } +extern cl::opt<bool> ReserveR9; // @LOCALMOD const uint16_t* ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { + if (ReserveR9) return CSR_NaCl_SaveList; // @LOCALMOD bool ghcCall = false; if (MF) { @@ -80,6 +83,7 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const uint32_t* ARMBaseRegisterInfo::getCallPreservedMask(CallingConv::ID) const { + if (ReserveR9) return CSR_NaCl_RegMask; // @LOCALMOD return (STI.isTargetIOS() && !STI.isAAPCS_ABI()) ? CSR_iOS_RegMask : CSR_AAPCS_RegMask; } @@ -581,6 +585,13 @@ emitLoadConstPool(MachineBasicBlock &MBB, unsigned DestReg, unsigned SubIdx, int Val, ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const { + // @LOCALMOD-START + // In the sfi case we do not want to use the load const pseudo instr. + // Sadly, the ARM backend is not very consistent about using this + // pseudo instr. and hence checking this is not sufficient. + // But, it should help detect some regressions early. + assert(!FlagSfiDisableCP && "unexpected call to emitLoadConstPool"); + // @LOCALMOD-END MachineFunction &MF = *MBB.getParent(); MachineConstantPool *ConstantPool = MF.getConstantPool(); const Constant *C = diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td index b378b96626..a7544cd4cd 100644 --- a/lib/Target/ARM/ARMCallingConv.td +++ b/lib/Target/ARM/ARMCallingConv.td @@ -105,6 +105,10 @@ def CC_ARM_APCS_GHC : CallingConv<[ def CC_ARM_AAPCS_Common : CallingConv<[ + // @LOCALMOD-BEGIN (PR11018) + CCIfByVal<CCPassByVal<4, 4>>, + // @LOCALMOD-END + CCIfType<[i1, i8, i16], CCPromoteToType<i32>>, // i64/f64 is passed in even pairs of GPRs @@ -204,3 +208,9 @@ def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>; // add is a workaround for not being able to compile empty list: // def CSR_GHC : CalleeSavedRegs<()>; def CSR_GHC : CalleeSavedRegs<(add)>; + +// @LOCALMOD-START +// NaCl does not save R9, but otherwise uses the same order as AAPCS +def CSR_NaCl : CalleeSavedRegs<(add LR, R11, R10, R8, R7, R6, R5, R4, + (sequence "D%u", 15, 8))>; +// @LOCALMOD-END diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp index a57368fdb5..be19a20182 100644 --- a/lib/Target/ARM/ARMConstantIslandPass.cpp +++ b/lib/Target/ARM/ARMConstantIslandPass.cpp @@ -374,6 +374,7 @@ FunctionPass *llvm::createARMConstantIslandPass() { } bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) { + if (FlagSfiDisableCP) return false; // @LOCALMOD MF = &mf; MCP = mf.getConstantPool(); diff --git a/lib/Target/ARM/ARMConstantPoolValue.h b/lib/Target/ARM/ARMConstantPoolValue.h index ae531c4ea8..24f2fcb666 100644 --- a/lib/Target/ARM/ARMConstantPoolValue.h +++ b/lib/Target/ARM/ARMConstantPoolValue.h @@ -81,6 +81,9 @@ public: 
bool isBlockAddress() const { return Kind == ARMCP::CPBlockAddress; } bool isLSDA() const { return Kind == ARMCP::CPLSDA; } bool isMachineBasicBlock() const{ return Kind == ARMCP::CPMachineBasicBlock; } + // @LOCALMOD-START + bool isValue() const { return Kind == ARMCP::CPValue; } + // @LOCALMOD-END virtual unsigned getRelocationInfo() const { return 2; } diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index 8c45e0b98d..348f234f5c 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/Target/TargetFrameLowering.h" +#include "llvm/Target/TargetOptions.h" // @LOCALMOD for llvm::TLSUseCall #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/raw_ostream.h" // FIXME: for debug only. remove! @@ -43,6 +44,7 @@ namespace { const TargetRegisterInfo *TRI; const ARMSubtarget *STI; ARMFunctionInfo *AFI; + bool IsRelocPIC; // @LOCALMOD virtual bool runOnMachineFunction(MachineFunction &Fn); @@ -63,6 +65,16 @@ namespace { unsigned Opc, bool IsExt); void ExpandMOV32BitImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI); + // @LOCALMOD-BEGIN + void AddPICADD_MOVi16_PICID(MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + bool NotThumb, + unsigned PredReg, ARMCC::CondCodes Pred, + unsigned DstReg, bool DstIsDead, + MachineInstrBuilder &LO16, + MachineInstrBuilder &HI16); + // @LOCALMOD-END }; char ARMExpandPseudo::ID = 0; } @@ -478,13 +490,46 @@ void ARMExpandPseudo::ExpandVST(MachineBasicBlock::iterator &MBBI) { if (SrcIsKill && !SrcIsUndef) // Add an implicit kill for the super-reg. MIB->addRegisterKilled(SrcReg, TRI, true); TransferImpOps(MI, MIB, MIB); - // Transfer memoperands. MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - MI.eraseFromParent(); } +// @LOCALMOD-BEGIN +// AddPICADD_MOVi16_PICID - Inserts a PICADD into the given basic block, +// and adds the PC label ID (of the PICADD) as an operand of the LO16 / HI16 +// MOVs. The ID operand will follow the "Immediate" operand (assumes that +// operand is already added). +void ARMExpandPseudo::AddPICADD_MOVi16_PICID(MachineInstr &MI, + MachineBasicBlock &MBB, + MachineBasicBlock::iterator &MBBI, + bool NotThumb, + unsigned PredReg, ARMCC::CondCodes Pred, + unsigned DstReg, bool DstIsDead, + MachineInstrBuilder &LO16, + MachineInstrBuilder &HI16) { + // Throw in a PICADD, and tack on the PC label ID to the MOVT/MOVWs + MachineFunction &MF = *MI.getParent()->getParent(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + + // Make a unique ID for this PC by pulling from pool of constPoolIDs + unsigned PC_ID = AFI->createPICLabelUId(); + MachineInstrBuilder PicADD = + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(NotThumb ? ARM::PICADD : ARM::tPICADD)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addImm(PC_ID) + .addImm(Pred) + .addReg(PredReg); + (void)PicADD; // squelch unused warning. + + // Add the PC label ID after what would have been an absolute address. + LO16 = LO16.addImm(PC_ID); + HI16 = HI16.addImm(PC_ID); +} +// @LOCALMOD-END + /// ExpandLaneOp - Translate VLD*LN and VST*LN instructions with Q, QQ or QQQQ /// register operands to real instructions with D register operands. 
void ARMExpandPseudo::ExpandLaneOp(MachineBasicBlock::iterator &MBBI) { @@ -645,7 +690,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned LO16Opc = 0; unsigned HI16Opc = 0; - if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) { + // @LOCALMOD + bool isThumb2 = (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm); + if (isThumb2) { LO16Opc = ARM::t2MOVi16; HI16Opc = ARM::t2MOVTi16; } else { @@ -653,10 +700,28 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, HI16Opc = ARM::MOVTi16; } + // @LOCALMOD-BEGIN + // If constant pools are "disabled" (actually, moved to rodata), then + // many addresses (e.g., the addresses of what used to be the "pools") + // may not be materialized in a pc-relative manner, because MOVT / MOVW + // are used to materialize the addresses. + // We need to know if it matters that references are pc-relative + // (e.g., to be PIC). + // See the comments on MOVi16PIC / MOVTi16PIC for more details. + const bool ShouldUseMOV16PIC = FlagSfiDisableCP && IsRelocPIC && + (MO.isCPI() || MO.isJTI() || MO.isGlobal()); // TODO check this list. + if (ShouldUseMOV16PIC) { + if (isThumb2) + llvm_unreachable("FIXME: add PIC versions of t2MOVi16"); + LO16Opc = ARM::MOVi16PIC; + HI16Opc = ARM::MOVTi16PIC; + } + // @LOCALMOD-END + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LO16Opc), DstReg); HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(HI16Opc)) .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) - .addReg(DstReg); + .addReg(DstReg, RegState::Kill); // @LOCALMOD if (MO.isImm()) { unsigned Imm = MO.getImm(); @@ -664,13 +729,31 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB, unsigned Hi16 = (Imm >> 16) & 0xffff; LO16 = LO16.addImm(Lo16); HI16 = HI16.addImm(Hi16); - } else { + } else if (MO.isGlobal()) { // @LOCALMOD const GlobalValue *GV = MO.getGlobal(); unsigned TF = MO.getTargetFlags(); LO16 = LO16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_LO16); HI16 = HI16.addGlobalAddress(GV, MO.getOffset(), TF | ARMII::MO_HI16); + // @LOCALMOD-START - support for jumptable addresses and CPI + } else if (MO.isCPI()) { + int i = MO.getIndex(); + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addConstantPoolIndex(i, MO.getOffset(), TF|ARMII::MO_LO16); + HI16 = HI16.addConstantPoolIndex(i, MO.getOffset(), TF|ARMII::MO_HI16); + } else if (MO.isJTI()){ + unsigned TF = MO.getTargetFlags(); + LO16 = LO16.addJumpTableIndex(MO.getIndex(), TF | ARMII::MO_LO16); + HI16 = HI16.addJumpTableIndex(MO.getIndex(), TF | ARMII::MO_HI16); + } else { + assert (0 && "unexpected operand"); + // @LOCALMOD-END } - + // @LOCALMOD-BEGIN + if (ShouldUseMOV16PIC) { + AddPICADD_MOVi16_PICID(MI, MBB, MBBI, !isThumb2, + PredReg, Pred, DstReg, DstIsDead, LO16, HI16); + } + // @LOCALMOD-END LO16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); HI16->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); LO16.addImm(Pred).addReg(PredReg); @@ -848,13 +931,37 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, } case ARM::tTPsoft: case ARM::TPsoft: { + // @LOCALMOD-BEGIN + if (!STI->isTargetNaCl() || llvm::TLSUseCall) { + // Don't add implicit uses/defs for this call, otherwise + // liveness analysis passes get confused. MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, MI.getDebugLoc(), + BuildMI_NoImp(MBB, MBBI, MI.getDebugLoc(), // @LOCALMOD TII->get(Opcode == ARM::tTPsoft ? 
ARM::tBL : ARM::BL)) - .addExternalSymbol("__aeabi_read_tp", 0); + .addExternalSymbol("__aeabi_read_tp", 0); MIB->setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); - TransferImpOps(MI, MIB, MIB); + TransferImpOps(MI, MIB, MIB); + } else { + // Inline version for native client. + // See native_client/src/untrusted/nacl/aeabi_read_tp.S + // .nexe builds use this version, while irt builds use a call to + // __aeabi_read_tp. + if (FlagNaClUseM23ArmAbi) { + // mov r0, r9 + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::MOVr), ARM::R0) + .addReg(ARM::R9)) + .addReg(0); // Doesn't use/modify CPSR. + } else { + // ldr r0, [r9, #0] + AddDefaultPred(BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::LDRi12), ARM::R0) + .addReg(ARM::R9) + .addImm(0)); + } + } + // @LOCALMOD-END MI.eraseFromParent(); return true; } @@ -1210,6 +1317,62 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case ARM::VTBL4Pseudo: ExpandVTBL(MBBI, ARM::VTBL4, false); return true; case ARM::VTBX3Pseudo: ExpandVTBL(MBBI, ARM::VTBX3, true); return true; case ARM::VTBX4Pseudo: ExpandVTBL(MBBI, ARM::VTBX4, true); return true; + + // @LOCALMOD-BEGIN + case ARM::ARMeh_return: { + // This pseudo instruction is generated as part of the lowering of + // ISD::EH_RETURN (c.f. ARMISelLowering.cpp) + // we convert it to a stack increment by OffsetReg and + // indirect jump to TargetReg + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + unsigned OffsetReg = MI.getOperand(0).getReg(); + unsigned TargetReg = MI.getOperand(1).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::ADDrr), ARM::SP) + .addReg(OffsetReg) + .addReg(ARM::SP) + .addImm(Pred) + .addReg(PredReg) + .addReg(0); + + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(ARM::BX)) + .addReg(TargetReg); + MI.eraseFromParent(); + return true; + } + case ARM::MOVGOTAddr : { + // Expand the pseudo-inst that requests for the GOT address + // to be materialized into a register. We use MOVW/MOVT for this. + // See ARMISelLowering.cpp for a comment on the strategy. 
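// The TPsoft expansion above replaces the __aeabi_read_tp call with one of two
// inline forms inside the sandbox. Their semantics, restated as a sketch (r9 is
// the reserved NaCl thread-pointer register on ARM):
static inline void *ReadTpDirect(void *r9)   { return r9; }  // mov r0, r9
static inline void *ReadTpIndirect(void *r9) {               // ldr r0, [r9, #0]
  return *reinterpret_cast<void **>(r9);
}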
+ unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + unsigned DstReg = MI.getOperand(0).getReg(); + bool DstIsDead = MI.getOperand(0).isDead(); + MachineInstrBuilder LO16, HI16; + + LO16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::MOVi16PIC), + DstReg) + .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", ARMII::MO_LO16); + + HI16 = BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::MOVTi16PIC)) + .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead)) + .addReg(DstReg) + .addExternalSymbol("_GLOBAL_OFFSET_TABLE_", ARMII::MO_HI16); + + AddPICADD_MOVi16_PICID(MI, MBB, MBBI, true, + PredReg, Pred, DstReg, DstIsDead, LO16, HI16); + + (*LO16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + (*HI16).setMemRefs(MI.memoperands_begin(), MI.memoperands_end()); + LO16.addImm(Pred).addReg(PredReg); + HI16.addImm(Pred).addReg(PredReg); + TransferImpOps(MI, LO16, HI16); + MI.eraseFromParent(); + return true; + } + // @LOCALMOD-END } } @@ -1232,6 +1395,7 @@ bool ARMExpandPseudo::runOnMachineFunction(MachineFunction &MF) { TRI = TM.getRegisterInfo(); STI = &TM.getSubtarget<ARMSubtarget>(); AFI = MF.getInfo<ARMFunctionInfo>(); + IsRelocPIC = MF.getTarget().getRelocationModel() == Reloc::PIC_; bool Modified = false; for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); MFI != E; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 6611862ca0..033540ae7d 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -716,6 +716,11 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, EVT VT) { } unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { + // @LOCALMOD-START + // In the sfi case we do not want to use the ARM custom cp handling. + // This assert should help detect some regressions early. + assert(!FlagSfiDisableCP && "unexpected call to TargetMaterializeConstant"); + // @LOCALMOD-END EVT VT = TLI.getValueType(C->getType(), true); // Only handle simple types. diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp index 9392497fd0..c8ddbcfaec 100644 --- a/lib/Target/ARM/ARMFrameLowering.cpp +++ b/lib/Target/ARM/ARMFrameLowering.cpp @@ -26,6 +26,9 @@ #include "llvm/CodeGen/RegisterScavenging.h" #include "llvm/Target/TargetOptions.h" #include "llvm/Support/CommandLine.h" +// @LOCALMOD-START +#include "llvm/CodeGen/MachineModuleInfo.h" +// @LOCALMOD-END using namespace llvm; @@ -153,6 +156,14 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { int FramePtrSpillFI = 0; int D8SpillFI = 0; + // @LOCALMOD-START + MachineModuleInfo &MMI = MF.getMMI(); + // This condition was gleaned from x86 / PowerPC / XCore + bool needsFrameMoves = MMI.hasDebugInfo() || + !MF.getFunction()->doesNotThrow() || + MF.getFunction()->needsUnwindTableEntry(); + // @LOCALMOD-END + // All calls are tail calls in GHC calling conv, and functions have no // prologue/epilogue. if (MF.getFunction()->getCallingConv() == CallingConv::GHC) @@ -212,6 +223,42 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // Move past area 1. if (GPRCS1Size > 0) MBBI++; + // @LOCALMOD-START + if (needsFrameMoves && GPRCS1Size > 0) { + // we just skipped the initial callee save reg instructions, e.g. 
+ // push {r4, r5, r6, lr} + // NOTE: this likely is not the right thing to do for darwin as it does not + // treat all callee save regs uniformly + MCSymbol *AfterRegSave = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, TII.get(ARM::PROLOG_LABEL)).addSym(AfterRegSave); + // record the fact that the stack has moved + MachineLocation dst(MachineLocation::VirtualFP); + MachineLocation src(MachineLocation::VirtualFP, -GPRCS1Size); + MMI.getFrameMoves().push_back(MachineMove(AfterRegSave, dst, src)); + // for each callee saved register record where it has been saved + int offset = 0; + for (unsigned i = 0, e = CSI.size(); i != e; ++i) { + unsigned Reg = CSI[i].getReg(); + switch (Reg) { + case ARM::R4: + case ARM::R5: + case ARM::R6: + case ARM::R7: + case ARM::R8: + case ARM::R9: + case ARM::R10: + case ARM::R11: + case ARM::LR: + offset -= 4; + MachineLocation dst(MachineLocation::VirtualFP, offset); + MachineLocation src(Reg); + MMI.getFrameMoves().push_back(MachineMove(AfterRegSave, dst, src)); + break; + } + } + } + // @LOCALMOD-END + // Set FP to point to the stack slot that contains the previous FP. // For iOS, FP is R7, which has now been stored in spill area 1. // Otherwise, if this is not iOS, all the callee-saved registers go @@ -225,8 +272,29 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { .addFrameIndex(FramePtrSpillFI).addImm(0) .setMIFlag(MachineInstr::FrameSetup); AddDefaultCC(AddDefaultPred(MIB)); + // @LOCALMOD-START + if (needsFrameMoves) { + // we just emitted the fp pointer setup instruction, e.g. + // add r11, sp, #8 + MCSymbol *AfterFramePointerInit = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, + TII.get(ARM::PROLOG_LABEL)).addSym(AfterFramePointerInit); + // record the fact that the frame pointer is now tracking the "cfa" + // Note, gcc and llvm have a slightly different notion of where the + // frame pointer should be pointing. gcc points after the return address + // and llvm one word further down (two words = 8). + // This should be fine as long as we are consistent. + // NOTE: this is related to the offset computed for + // ISD::FRAME_TO_ARGS_OFFSET + MachineLocation dst(MachineLocation::VirtualFP); + MachineLocation src(FramePtr, 8); + MMI.getFrameMoves().push_back(MachineMove(AfterFramePointerInit, dst, src)); + } + // @LOCALMOD-END } + + // Move past area 2. if (GPRCS2Size > 0) MBBI++; @@ -275,6 +343,19 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF) const { // an inconsistent state (pointing to the middle of callee-saved area). // The interrupt handler can end up clobbering the registers. 
AFI->setShouldRestoreSPFromFP(true); + + // @LOCALMOD-START + // we only track sp changes if do not have the fp to figure out where + // stack frame lives + if (needsFrameMoves && !HasFP) { + MCSymbol *AfterStackUpdate = MMI.getContext().CreateTempSymbol(); + BuildMI(MBB, MBBI, dl, + TII.get(ARM::PROLOG_LABEL)).addSym(AfterStackUpdate); + MachineLocation dst(MachineLocation::VirtualFP); + MachineLocation src(MachineLocation::VirtualFP, - NumBytes - GPRCS1Size); + MMI.getFrameMoves().push_back(MachineMove(AfterStackUpdate, dst, src)); + } + // @LOCALMOD-END } if (STI.isTargetELF() && hasFP(MF)) @@ -670,7 +751,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Reg >= ARM::D8 && Reg < ARM::D8 + NumAlignedDPRCS2Regs) continue; - if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps()) { + if (Reg == ARM::LR && !isTailCall && !isVarArg && STI.hasV5TOps() && + false /* @LOCALMOD */) { Reg = ARM::PC; LdmOpc = AFI->isThumbFunction() ? ARM::t2LDMIA_RET : ARM::LDMIA_RET; // Fold the return instruction into the LDM. diff --git a/lib/Target/ARM/ARMFrameLowering.h b/lib/Target/ARM/ARMFrameLowering.h index a1c2b93562..a2280db515 100644 --- a/lib/Target/ARM/ARMFrameLowering.h +++ b/lib/Target/ARM/ARMFrameLowering.h @@ -27,7 +27,8 @@ protected: public: explicit ARMFrameLowering(const ARMSubtarget &sti) - : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4), + : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), 0, 4, + 4), // @LOCALMOD STI(sti) { } diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index efd6d2b839..90ae94b3b2 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -35,8 +35,17 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" +// @LOCALMOD-START +#include "llvm/Support/CommandLine.h" +namespace llvm { + extern cl::opt<bool> FlagSfiLoad; + extern cl::opt<bool> FlagSfiStore; +} +// @LOCALMOD-END + using namespace llvm; + static cl::opt<bool> DisableShifterOp("disable-shifter-op", cl::Hidden, cl::desc("Disable isel of shifter-op"), @@ -109,21 +118,24 @@ public: bool SelectAddrModeImm12(SDValue N, SDValue &Base, SDValue &OffImm); bool SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); - AddrMode2Type SelectAddrMode2Worker(SDValue N, SDValue &Base, + AddrMode2Type SelectAddrMode2Worker(SDNode *Op, SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); - bool SelectAddrMode2Base(SDValue N, SDValue &Base, SDValue &Offset, + bool SelectAddrMode2Base(SDNode *Op, + SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { - return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_BASE; + return SelectAddrMode2Worker(Op, N, Base, Offset, Opc) == AM2_BASE; } - bool SelectAddrMode2ShOp(SDValue N, SDValue &Base, SDValue &Offset, + bool SelectAddrMode2ShOp(SDNode *Op, + SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { - return SelectAddrMode2Worker(N, Base, Offset, Opc) == AM2_SHOP; + return SelectAddrMode2Worker(Op, N, Base, Offset, Opc) == AM2_SHOP; } - bool SelectAddrMode2(SDValue N, SDValue &Base, SDValue &Offset, + bool SelectAddrMode2(SDNode *Op, + SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { - SelectAddrMode2Worker(N, Base, Offset, Opc); + SelectAddrMode2Worker(Op, N, Base, Offset, Opc); // return SelectAddrMode2ShOp(N, Base, Offset, Opc); // This always matches one way or another. 
return true; @@ -136,7 +148,7 @@ public: bool SelectAddrMode2OffsetImmPre(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); bool SelectAddrOffsetNone(SDValue N, SDValue &Base); - bool SelectAddrMode3(SDValue N, SDValue &Base, + bool SelectAddrMode3(SDNode *Op, SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc); bool SelectAddrMode3Offset(SDNode *Op, SDValue N, SDValue &Offset, SDValue &Opc); @@ -434,6 +446,22 @@ bool ARMDAGToDAGISel::SelectRegShifterOperand(SDValue N, return true; } +// @LOCALMOD-START +static bool ShouldOperandBeUnwrappedForUseAsBaseAddress( + SDValue& N, const ARMSubtarget* Subtarget) { + assert (N.getOpcode() == ARMISD::Wrapper); + // Never use this transformation if constant island pools are disallowed + if (FlagSfiDisableCP) return false; + + // always apply this when we do not have movt/movw available + // (if we do have movt/movw we be able to get rid of the + // constant pool entry altogether) + if (!Subtarget->useMovt()) return true; + // explain why we do not want to use this for TargetGlobalAddress + if (N.getOperand(0).getOpcode() != ISD::TargetGlobalAddress) return true; + return false; +} +// @LOCALMOD-END bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, SDValue &Base, @@ -452,8 +480,8 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, } if (N.getOpcode() == ARMISD::Wrapper && - !(Subtarget->useMovt() && - N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + // @LOCALMOD + ShouldOperandBeUnwrappedForUseAsBaseAddress(N, Subtarget)) { Base = N.getOperand(0); } else Base = N; @@ -487,6 +515,11 @@ bool ARMDAGToDAGISel::SelectAddrModeImm12(SDValue N, bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { + // @LOCALMOD-BEGIN + // Disallow offsets of Reg + Reg (which may escape sandbox). + if (Subtarget->isTargetNaCl()) + return false; + // @LOCALMOD-END if (N.getOpcode() == ISD::MUL && ((!Subtarget->isLikeA9() && !Subtarget->isSwift()) || N.hasOneUse())) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { @@ -582,10 +615,24 @@ bool ARMDAGToDAGISel::SelectLdStSOReg(SDValue N, SDValue &Base, SDValue &Offset, //----- -AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, +AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDNode *Op, + SDValue N, SDValue &Base, SDValue &Offset, +// @LOCALMOD-START +// Note: In the code below we do not want "Offset" to be real register to +// not violate ARM sandboxing. +// @LOCALMOD-END SDValue &Opc) { + // @LOCALMOD-START + // Avoid two reg addressing mode for loads and stores + const bool restrict_addressing_modes_for_nacl = + (FlagSfiLoad && (Op->getOpcode() == ISD::LOAD)) || + (FlagSfiStore && (Op->getOpcode() == ISD::STORE)); + // This is neither a sandboxable load nor a sandboxable store. + if (!restrict_addressing_modes_for_nacl) { + // @LOCALMOD-END + if (N.getOpcode() == ISD::MUL && (!(Subtarget->isLikeA9() || Subtarget->isSwift()) || N.hasOneUse())) { if (ConstantSDNode *RHS = dyn_cast<ConstantSDNode>(N.getOperand(1))) { @@ -609,6 +656,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, } } } + } // @LOCALMOD if (N.getOpcode() != ISD::ADD && N.getOpcode() != ISD::SUB && // ISD::OR that is equivalent to an ADD. 
@@ -618,8 +666,8 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, int FI = cast<FrameIndexSDNode>(N)->getIndex(); Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); } else if (N.getOpcode() == ARMISD::Wrapper && - !(Subtarget->useMovt() && - N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + // @LOCALMOD + ShouldOperandBeUnwrappedForUseAsBaseAddress(N, Subtarget)) { Base = N.getOperand(0); } Offset = CurDAG->getRegister(0, MVT::i32); @@ -652,7 +700,7 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, return AM2_BASE; } } - + if ((Subtarget->isLikeA9() || Subtarget->isSwift()) && !N.hasOneUse()) { // Compute R +/- (R << N) and reuse it. Base = N; @@ -662,6 +710,24 @@ AddrMode2Type ARMDAGToDAGISel::SelectAddrMode2Worker(SDValue N, MVT::i32); return AM2_BASE; } + + // @LOCALMOD-START + // Keep load and store addressing modes simple + if (restrict_addressing_modes_for_nacl) { + Base = N; + if (N.getOpcode() == ISD::FrameIndex) { + int FI = cast<FrameIndexSDNode>(N)->getIndex(); + Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); + } else if (N.getOpcode() == ARMISD::Wrapper) { + Base = N.getOperand(0); + } + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM2Opc(ARM_AM::add, 0, + ARM_AM::no_shift), + MVT::i32); + return AM2_BASE; + } + // @LOCALMOD-END // Otherwise this is R +/- [possibly shifted] R. ARM_AM::AddrOpc AddSub = N.getOpcode() != ISD::SUB ? ARM_AM::add:ARM_AM::sub; @@ -730,13 +796,27 @@ bool ARMDAGToDAGISel::SelectAddrMode2OffsetReg(SDNode *Op, SDValue N, if (isScaledConstantInRange(N, /*Scale=*/1, 0, 0x1000, Val)) return false; + // @LOCALMOD-BEGIN + // Avoid two reg addressing mode for loads and stores + const bool restrict_addressing_modes_for_nacl = + (FlagSfiLoad && (Op->getOpcode() == ISD::LOAD)) || + (FlagSfiStore && (Op->getOpcode() == ISD::STORE)); + // @LOCALMOD-END + + Offset = N; ARM_AM::ShiftOpc ShOpcVal = ARM_AM::getShiftOpcForNode(N.getOpcode()); unsigned ShAmt = 0; if (ShOpcVal != ARM_AM::no_shift) { // Check to see if the RHS of the shift is a constant, if not, we can't fold // it. - if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + + //if (ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1))) { + ConstantSDNode *Sh = dyn_cast<ConstantSDNode>(N.getOperand(1)); + // @LOCALMOD-BEGIN + // Neither a sandboxable load nor a sandboxable store. + if (!restrict_addressing_modes_for_nacl && Sh ) { + // @LOCALMOD-END ShAmt = Sh->getZExtValue(); if (isShifterOpProfitable(N, ShOpcVal, ShAmt)) Offset = N.getOperand(0); @@ -799,16 +879,25 @@ bool ARMDAGToDAGISel::SelectAddrOffsetNone(SDValue N, SDValue &Base) { return true; } -bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, +bool ARMDAGToDAGISel::SelectAddrMode3(SDNode *Op, SDValue N, SDValue &Base, SDValue &Offset, SDValue &Opc) { + // @LOCALMOD-START + // Avoid two reg addressing mode for loads and stores + const bool restrict_addressing_modes_for_nacl = + (FlagSfiLoad && (Op->getOpcode() == ISD::LOAD)) || + (FlagSfiStore && (Op->getOpcode() == ISD::STORE)); + if (!restrict_addressing_modes_for_nacl) { + // @LOCALMOD-END if (N.getOpcode() == ISD::SUB) { + // X - C is canonicalize to X + -C, no need to handle it here. 
Base = N.getOperand(0); Offset = N.getOperand(1); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::sub, 0),MVT::i32); return true; } + } // @LOCALMOD-END if (!CurDAG->isBaseWithConstantOffset(N)) { Base = N; @@ -841,6 +930,16 @@ bool ARMDAGToDAGISel::SelectAddrMode3(SDValue N, return true; } + // @LOCALMOD-START + // A sandboxable load or a sandboxable store. + if (restrict_addressing_modes_for_nacl) { + Base = N; + Offset = CurDAG->getRegister(0, MVT::i32); + Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0),MVT::i32); + return true; + } + // @LOCALMOD-END + Base = N.getOperand(0); Offset = N.getOperand(1); Opc = CurDAG->getTargetConstant(ARM_AM::getAM3Opc(ARM_AM::add, 0), MVT::i32); @@ -875,8 +974,8 @@ bool ARMDAGToDAGISel::SelectAddrMode5(SDValue N, int FI = cast<FrameIndexSDNode>(N)->getIndex(); Base = CurDAG->getTargetFrameIndex(FI, TLI.getPointerTy()); } else if (N.getOpcode() == ARMISD::Wrapper && - !(Subtarget->useMovt() && - N.getOperand(0).getOpcode() == ISD::TargetGlobalAddress)) { + // @LOCALMOD + ShouldOperandBeUnwrappedForUseAsBaseAddress(N, Subtarget)) { Base = N.getOperand(0); } Offset = CurDAG->getTargetConstant(ARM_AM::getAM5Opc(ARM_AM::add, 0), @@ -2467,6 +2566,8 @@ SDNode *ARMDAGToDAGISel::Select(SDNode *N) { !ARM_AM::isSOImmTwoPartVal(Val)); // two instrs. } + if (FlagSfiDisableCP) UseCP = false; // @LOCALMOD + if (UseCP) { SDValue CPIdx = CurDAG->getTargetConstantPool(ConstantInt::get( diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index ff99b04078..0893826427 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -48,6 +48,15 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" + +// @LOCALMOD-START +namespace llvm { + extern cl::opt<bool> FlagSfiLoad; + extern cl::opt<bool> FlagSfiStore; + extern cl::opt<bool> FlagSfiDisableCP; +} +// @LOCALMOD-END + using namespace llvm; STATISTIC(NumTailCalls, "Number of tail calls"); @@ -71,6 +80,7 @@ ARMInterworking("arm-interworking", cl::Hidden, cl::init(true)); namespace { + class ARMCCState : public CCState { public: ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF, @@ -255,8 +265,8 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setLibcallName(RTLIB::SHL_I128, 0); setLibcallName(RTLIB::SRL_I128, 0); setLibcallName(RTLIB::SRA_I128, 0); - - if (Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) { + // @LOCALMOD: use standard names and calling conventions for pnacl + if (!Subtarget->isTargetNaCl() && Subtarget->isAAPCS_ABI() && !Subtarget->isTargetDarwin()) { // Double-precision floating-point arithmetic helper functions // RTABI chapter 4.1.2, Table 2 setLibcallName(RTLIB::ADD_F64, "__aeabi_dadd"); @@ -652,9 +662,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); - + // @LOCALMOD-START + if (!Subtarget->useInlineJumpTables()) + setOperationAction(ISD::JumpTable, MVT::i32, Custom); + // @LOCALMOD-END + setOperationAction(ISD::TRAP, MVT::Other, Legal); + // Use the default implementation. 
setOperationAction(ISD::VASTART, MVT::Other, Custom); setOperationAction(ISD::VAARG, MVT::Other, Expand); @@ -667,9 +682,18 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) // Non-Darwin platforms may return values in these registers via the // personality function. setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); + // @LOCALMOD-START setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); - setExceptionPointerRegister(ARM::R0); - setExceptionSelectorRegister(ARM::R1); + // we use the first caller saved regs here + // c.f.: llvm-gcc/llvm-gcc-4.2/gcc/unwind-dw2.c::uw_install_context + // NOTE: these are related to the _Unwind_PNaClSetResult{0,1} functions + setExceptionPointerRegister(ARM::R4); + setExceptionSelectorRegister(ARM::R5); + + setOperationAction(ISD::FRAME_TO_ARGS_OFFSET, MVT::i32, Custom); + + setOperationAction(ISD::EH_RETURN, MVT::Other, Custom); + // @LOCALMOD-END } setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Expand); @@ -755,8 +779,12 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) setOperationAction(ISD::BR_CC, MVT::i32, Custom); setOperationAction(ISD::BR_CC, MVT::f32, Custom); setOperationAction(ISD::BR_CC, MVT::f64, Custom); - setOperationAction(ISD::BR_JT, MVT::Other, Custom); - + // @LOCALMOD-START + //setOperationAction(ISD::BR_JT, MVT::Other, Custom); + setOperationAction(ISD::BR_JT, MVT::Other, + Subtarget->useInlineJumpTables() ? Custom : Expand); + // @LOCALMOD-END + // We don't support sin/cos/fmod/copysign/pow setOperationAction(ISD::FSIN, MVT::f64, Expand); setOperationAction(ISD::FSIN, MVT::f32, Expand); @@ -793,6 +821,14 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) } } + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl()) { + setOperationAction(ISD::NACL_TP_TLS_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::NACL_TP_TDB_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::NACL_TARGET_ARCH, MVT::i32, Custom); + } + // @LOCALMOD-END + // We have target-specific dag combine patterns for the following nodes: // ARMISD::VMOVRRD - No need to call setTargetDAGCombine setTargetDAGCombine(ISD::ADD); @@ -884,6 +920,10 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::WrapperDYN: return "ARMISD::WrapperDYN"; case ARMISD::WrapperPIC: return "ARMISD::WrapperPIC"; case ARMISD::WrapperJT: return "ARMISD::WrapperJT"; + // @LOCALMOD-START + case ARMISD::WrapperJT2: return "ARMISD::WrapperJT2"; + case ARMISD::EH_RETURN: return "ARMISD::EH_RETURN"; + // @LOCALMOD-END case ARMISD::CALL: return "ARMISD::CALL"; case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED"; case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK"; @@ -1662,6 +1702,27 @@ ARMTargetLowering::HandleByVal( assert((State->getCallOrPrologue() == Prologue || State->getCallOrPrologue() == Call) && "unhandled ParmContext"); + + // @LOCALMOD-BEGIN + // The original mechanism tries to split a byval argument between registers + // and the stack. It doesn't work correctly yet, so disable it. + // This leaves the entire byval argument on the stack, and the rest + // of the parameters will need to be on the stack as well, to have + // the correct order for var-args. We remember the fact that there was + // a byval param that forced this, so that we know not to use the + // handle var-args reg-save area. + // PR11018. 
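// Why HandleByVal now forces everything onto the stack (implemented in the lines
// that follow): va_arg walks memory assuming the named byval bytes and the
// anonymous arguments form one contiguous, in-order block. An illustrative
// C-level case with hypothetical types, not from the tree:
struct BigArg { int words[6]; };         // larger than the 4 GPRs r0-r3
int SumAfter(struct BigArg fixed, ...);  // splitting 'fixed' between registers
                                         // and stack would interleave it with
                                         // the variadic arguments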
+ if ((!State->isFirstByValRegValid()) && + (ARM::R0 <= reg) && (reg <= ARM::R3)) { + State->setHasByValInRegPosition(); + } + // Confiscate any remaining parameter registers to preclude their + // assignment to subsequent parameters. + while (State->AllocateReg(GPRArgRegs, 4)) + ; + return; + // @LOCALMOD-END + if ((!State->isFirstByValRegValid()) && (ARM::R0 <= reg) && (reg <= ARM::R3)) { if (Subtarget->isAAPCS_ABI() && Align > 4) { @@ -2062,7 +2123,14 @@ static SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) { } unsigned ARMTargetLowering::getJumpTableEncoding() const { - return MachineJumpTableInfo::EK_Inline; + // @LOCALMOD-BEGIN + if (Subtarget->useInlineJumpTables()) { + return MachineJumpTableInfo::EK_Inline; + } else { + // TODO: Find a better way to call the super-class. + return TargetLowering::getJumpTableEncoding(); + } + // @LOCALMOD-END } SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, @@ -2095,28 +2163,120 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op, return DAG.getNode(ARMISD::PIC_ADD, DL, PtrVT, Result, PICLabel); } +// @LOCALMOD-START +// more conventional jumptable implementation +SDValue ARMTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const { + assert(!Subtarget->useInlineJumpTables() && + "inline jump tables not custom lowered"); + const DebugLoc dl = Op.getDebugLoc(); + EVT PTy = getPointerTy(); + JumpTableSDNode *JT = cast<JumpTableSDNode>(Op); + SDValue JTI = DAG.getTargetJumpTable(JT->getIndex(), PTy); + return DAG.getNode(ARMISD::WrapperJT2, dl, MVT::i32, JTI); +} + +////////////////////////////////////////////////////////////////////// +// NaCl TLS setup / layout intrinsics. +// See: native_client/src/untrusted/stubs/tls_params.h +SDValue ARMTargetLowering::LowerNaClTpTlsOffset(SDValue Op, + SelectionDAG &DAG) const { + // ssize_t __nacl_tp_tls_offset (size_t tls_size) { + // return 8; + // } + return DAG.getConstant(8, Op.getValueType().getSimpleVT()); +} + +SDValue ARMTargetLowering::LowerNaClTpTdbOffset(SDValue Op, + SelectionDAG &DAG) const { + // ssize_t __nacl_tp_tdb_offset (size_t tdb_size) { + // return -tdb_size; + // } + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ISD::SUB, dl, Op.getValueType().getSimpleVT(), + DAG.getConstant(0, Op.getValueType().getSimpleVT()), + Op.getOperand(0)); +} + +SDValue +ARMTargetLowering::LowerNaClTargetArch(SDValue Op, SelectionDAG &DAG) const { + // size_t __nacl_target_arch () { + // return PnaclTargetArchitectureARM_32; + // } + return DAG.getConstant(PnaclTargetArchitectureARM_32, + Op.getValueType().getSimpleVT()); +} + +////////////////////////////////////////////////////////////////////// + +// @LOCALMOD-END + // Lower ISD::GlobalTLSAddress using the "general dynamic" model SDValue ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG) const { DebugLoc dl = GA->getDebugLoc(); EVT PtrVT = getPointerTy(); - unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; - MachineFunction &MF = DAG.getMachineFunction(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - ARMConstantPoolValue *CPV = - ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, - ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); - SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); - Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); - Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue Chain = Argument.getValue(1); + // @LOCALMOD-BEGIN + SDValue Chain; + SDValue Argument; + + if (FlagSfiDisableCP) { + // With constant pools "disabled" (moved to rodata), this constant pool + // entry is no longer in text, and simultaneous PC relativeness + // and CP Addr relativeness is no longer expressible. + // So, instead of having: + // + // .LCPI12_0: + // .long var(tlsgd)-((.LPC12_0+8) - .) + // ... + // ldr r2, .LCPI12_0 + // .LPC12_0: + // add r0, pc, r2 + // + // we have: + // + // .LCPI12_0: + // .long var(tlsgd) + // ... + // // get addr of .LCPI12_0 into r2 + // ldr r0, [r2] + // add r0, r2, r0 + // (1) No longer subtracting pc, so no longer adding that back + // (2) Not adding "." in the CP entry, so adding it via instructions. + // + unsigned char PCAdj = 0; + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::TLSGD, + false); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + Chain = Argument.getValue(1); + Argument = DAG.getNode(ISD::ADD, dl, PtrVT, Argument, CPAddr); + } else { // sort of @LOCALMOD-END + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + MachineFunction &MF = DAG.getMachineFunction(); + ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); + unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true); + Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4); // @ LOCALMOD + Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument); + Argument = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), Argument, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + Chain = Argument.getValue(1); // @LOCALMOD - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); - Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Argument = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Argument, PICLabel); + } // @LOCALMOD-END // call __tls_get_addr. ArgListTy Args; @@ -2153,25 +2313,49 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA, MachineFunction &MF = DAG.getMachineFunction(); ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); - // Initial exec model. - unsigned char PCAdj = Subtarget->isThumb() ? 
4 : 8; - ARMConstantPoolValue *CPV = + + // @LOCALMOD-BEGIN + if (FlagSfiDisableCP) { + // Similar to change to LowerToTLSGeneralDynamicModel, and + // for the same reason. + unsigned char PCAdj = 0; + ARMConstantPoolValue *CPV = + ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, + ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, + false); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + Offset = DAG.getLoad(PtrVT, dl, Chain, CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + Chain = Offset.getValue(1); + + Offset = DAG.getNode(ISD::ADD, dl, PtrVT, Offset, CPAddr); + + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + } else { // sort of @LOCALMOD-END (indentation) + // Initial exec model. + unsigned char PCAdj = Subtarget->isThumb() ? 4 : 8; + ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex, ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF, true); - Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); - Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - Chain = Offset.getValue(1); + Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4); + Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset); + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + Chain = Offset.getValue(1); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); - Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + Offset = DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Offset, PICLabel); - Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); + Offset = DAG.getLoad(PtrVT, dl, Chain, Offset, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + } // @LOCALMOD-END } else { // local exec model assert(model == TLSModel::LocalExec); @@ -2323,17 +2507,55 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, unsigned ARMPCLabelIndex = AFI->createPICLabelUId(); EVT PtrVT = getPointerTy(); DebugLoc dl = Op.getDebugLoc(); - unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; + + // @LOCALMOD-BEGIN + if (FlagSfiDisableCP) { + // With constant pools "disabled" (moved to rodata), the constant pool + // entry is no longer in text, and the PC relativeness is + // no longer expressible. + // + // Instead of having: + // + // .LCPI12_0: + // .long _GLOBAL_OFFSET_TABLE_-(.LPC12_0+8) + // ... + // ldr r2, .LCPI12_0 + // .LPC12_0: + // add r0, pc, r2 + // + // Things to try: + // (1) get the address of the GOT through a pc-relative MOVW / MOVT. + // + // movw r0, :lower16:_GLOBAL_OFFSET_TABLE_ - (.LPC12_0 + 8) + // movt r0, :upper16:_GLOBAL_OFFSET_TABLE_ - (.LPC12_0 + 8) + // .LPC12_0: + // add r0, pc, r0 + // + // (2) Make the constant pool entry relative to its own location + // + // .LCPI12_0: + // .long _GLOBAL_OFFSET_TABLE_-. + // ... + // // get address of LCPI12_0 into r0 (possibly 3 instructions for PIC) + // ldr r1, [r0] + // add r1, r0, r1 + // + // We will try (1) for now, since (2) takes about 3 more instructions + // (and one of them is a load). 
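+    // Hedged sketch of how (1) is carried through the rest of this patch:
+    // the WrapperGOT node returned below is matched by the MOVGOTAddr
+    // pseudo in ARMInstrInfo.td, which is expanded into the MOVi16PIC /
+    // MOVTi16PIC pair plus a PICADD, i.e. roughly:
+    //   movw r0, :lower16:_GLOBAL_OFFSET_TABLE_ - (.LPCn_0+8)
+    //   movt r0, :upper16:_GLOBAL_OFFSET_TABLE_ - (.LPCn_0+8)
+    // .LPCn_0:
+    //   add  r0, pc, r0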
+ return DAG.getNode(ARMISD::WrapperGOT, dl, MVT::i32); + } else { // Sort of LOCALMOD-END (indentation only + unsigned PCAdj = Subtarget->isThumb() ? 4 : 8; ARMConstantPoolValue *CPV = ARMConstantPoolSymbol::Create(*DAG.getContext(), "_GLOBAL_OFFSET_TABLE_", ARMPCLabelIndex, PCAdj); - SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); - CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); - SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, - MachinePointerInfo::getConstantPool(), - false, false, false, 0); - SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); - return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4); + CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr); + SDValue Result = DAG.getLoad(PtrVT, dl, DAG.getEntryNode(), CPAddr, + MachinePointerInfo::getConstantPool(), + false, false, false, 0); + SDValue PICLabel = DAG.getConstant(ARMPCLabelIndex, MVT::i32); + return DAG.getNode(ARMISD::PIC_ADD, dl, PtrVT, Result, PICLabel); + } // @LOCALMOD-END } SDValue @@ -2359,6 +2581,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, DebugLoc dl = Op.getDebugLoc(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. + case Intrinsic::nacl_read_tp: // @LOCALMOD case Intrinsic::arm_thread_pointer: { EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); return DAG.getNode(ARMISD::THREAD_POINTER, dl, PtrVT); @@ -2529,6 +2752,10 @@ ARMTargetLowering::computeRegArea(CCState &CCInfo, MachineFunction &MF, unsigned NumGPRs; if (CCInfo.isFirstByValRegValid()) NumGPRs = ARM::R4 - CCInfo.getFirstByValReg(); + // @LOCALMOD-BEGIN + else if (CCInfo.hasByValInRegPosition()) + NumGPRs = 0; + // @LOCALMOD-END else { unsigned int firstUnalloced; firstUnalloced = CCInfo.getFirstUnallocated(GPRArgRegs, @@ -2562,6 +2789,10 @@ ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG, unsigned firstRegToSaveIndex; if (CCInfo.isFirstByValRegValid()) firstRegToSaveIndex = CCInfo.getFirstByValReg() - ARM::R0; + // @LOCALMOD-BEGIN + else if (CCInfo.hasByValInRegPosition()) + firstRegToSaveIndex = 4; // Nothing to save. + // @LOCALMOD-END else { firstRegToSaveIndex = CCInfo.getFirstUnallocated (GPRArgRegs, sizeof(GPRArgRegs) / sizeof(GPRArgRegs[0])); @@ -5032,7 +5263,7 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { "unexpected types for extended operands to VMULL"); return DAG.getNode(NewOpc, DL, VT, Op0, Op1); } - + // Optimizing (zext A + zext B) * C, to (VMULL A, C) + (VMULL B, C) during // isel lowering to take advantage of no-stall back to back vmul + vmla. // vmull q0, d4, d6 @@ -5051,6 +5282,38 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) { DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1)); } +// @LOCALMOD-START +// An EH_RETURN is the result of lowering llvm.eh.return.i32 which in turn is +// generated from __builtin_eh_return (offset, handler) +// The effect of this is to adjust the stack pointer by "offset" +// and then branch to "handler". 
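+// As a concrete (hedged) sketch of that chain: unwinder code such as
+//   __builtin_eh_return(offset, handler);
+// is emitted by the front end as the intrinsic call
+//   call void @llvm.eh.return.i32(i32 %offset, i8* %handler)
+// which SelectionDAG builds as ISD::EH_RETURN and which LowerEH_RETURN
+// below turns into ARMISD::EH_RETURN.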
+SDValue ARMTargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) + const { + SDValue Chain = Op.getOperand(0); + SDValue Offset = Op.getOperand(1); + SDValue Handler = Op.getOperand(2); + DebugLoc dl = Op.getDebugLoc(); + + // Store stack offset in R2, jump target in R3, dummy return value in R0 + // The dummy return value is needed to make the use-def chains happy, + // because the EH_RETURN instruction uses the isReturn attribute, which + // means preceding code needs to define the return register (R0 on ARM). + // http://code.google.com/p/nativeclient/issues/detail?id=2643 + unsigned OffsetReg = ARM::R2; + unsigned AddrReg = ARM::R3; + unsigned ReturnReg = ARM::R0; + Chain = DAG.getCopyToReg(Chain, dl, OffsetReg, Offset); + Chain = DAG.getCopyToReg(Chain, dl, AddrReg, Handler); + Chain = DAG.getCopyToReg(Chain, dl, ReturnReg, DAG.getIntPtrConstant(0)); + return DAG.getNode(ARMISD::EH_RETURN, dl, + MVT::Other, + Chain, + DAG.getRegister(OffsetReg, MVT::i32), + DAG.getRegister(AddrReg, getPointerTy())); +} +// @LOCALMOD-END + + static SDValue LowerSDIV_v4i8(SDValue X, SDValue Y, DebugLoc dl, SelectionDAG &DAG) { // Convert to float @@ -5296,7 +5559,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { default: llvm_unreachable("Don't know how to custom lower this!"); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); - case ISD::GlobalAddress: + case ISD::JumpTable: return LowerJumpTable(Op, DAG); // @LOCALMOD + case ISD::GlobalAddress: return Subtarget->isTargetDarwin() ? LowerGlobalAddressDarwin(Op, DAG) : LowerGlobalAddressELF(Op, DAG); case ISD::GlobalTLSAddress: return LowerGlobalTLSAddress(Op, DAG); @@ -5315,6 +5579,17 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); case ISD::FRAMEADDR: return LowerFRAMEADDR(Op, DAG); + // @LOCALMOD-START + // The exact semantics of this ISD are not completely clear. + // LLVM seems to always point the fp after the push ra and the old fp, i.e. + // two register slots after the beginning of the stack frame. + // It is not clear what happens when there is no frame pointer but + // but llvm unlike gcc seems to always force one when this node is + // encountered. 
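+  // Put differently (a hedged reading of the above): with the saved lr and
+  // the old fp occupying the two 4-byte slots between where fp points and
+  // the incoming-argument area, that area sits at fp + 8, which is the
+  // 2*4 returned below.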
+ case ISD::FRAME_TO_ARGS_OFFSET: return DAG.getIntPtrConstant(2*4); + case ISD::EH_RETURN: return LowerEH_RETURN(Op, DAG); + // @LOCALMOD-END + case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); @@ -5345,6 +5620,11 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ATOMIC_LOAD: case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG); + // @LOCALMOD-BEGIN + case ISD::NACL_TP_TLS_OFFSET: return LowerNaClTpTlsOffset(Op, DAG); + case ISD::NACL_TP_TDB_OFFSET: return LowerNaClTpTdbOffset(Op, DAG); + case ISD::NACL_TARGET_ARCH: return LowerNaClTargetArch(Op, DAG); + // @LOCALMOD-END } } @@ -6555,7 +6835,11 @@ EmitStructByval(MachineInstr *MI, MachineBasicBlock *BB) const { AddDefaultPred(BuildMI(BB, dl, TII->get(ARM::t2MOVTi16), varEnd) .addReg(VReg1) .addImm(LoopSize >> 16)); - } else { + } else if (FlagSfiDisableCP) { // @LOCALMOD-START + BuildMI(BB, dl, TII->get(ARM::MOVi32imm)) + .addReg(varEnd, RegState::Define) + .addImm(LoopSize); + } else { // @LOCALMOD-END MachineConstantPool *ConstantPool = MF->getConstantPool(); Type *Int32Ty = Type::getInt32Ty(MF->getFunction()->getContext()); const Constant *C = ConstantInt::get(Int32Ty, LoopSize); @@ -9482,6 +9766,16 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, if (Subtarget->isThumb1Only()) return false; + // @LOCALMOD-START + // Avoid two reg addressing mode for loads and stores + const bool restrict_addressing_modes_for_nacl = + ((FlagSfiLoad && N->getOpcode() == ISD::LOAD) || + (FlagSfiStore && N->getOpcode() == ISD::STORE)); + if (restrict_addressing_modes_for_nacl) { + return false; + } + // @LOCALMOD-END + EVT VT; SDValue Ptr; bool isSEXTLoad = false; @@ -9520,7 +9814,15 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, SelectionDAG &DAG) const { if (Subtarget->isThumb1Only()) return false; - + // @LOCALMOD-START + // Avoid two reg addressing mode for loads and stores + const bool restrict_addressing_modes_for_nacl = + ((FlagSfiLoad && N->getOpcode() == ISD::LOAD) || + (FlagSfiStore && N->getOpcode() == ISD::STORE)); + if (restrict_addressing_modes_for_nacl) { + return false; + } + // @LOCALMOD-END EVT VT; SDValue Ptr; bool isSEXTLoad = false; diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 4eb3b2cb51..3302ec69a5 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -40,10 +40,13 @@ namespace llvm { WrapperPIC, // WrapperPIC - A wrapper node for TargetGlobalAddress in // PIC mode. WrapperJT, // WrapperJT - A wrapper node for TargetJumpTable - + // @LOCALMOD-START + WrapperJT2, // like WrapperJT but without the UID + WrapperGOT, // A Wrapper node for GOT addresses + EH_RETURN, // For LowerEH_RETURN + // @LOCALMOD-END // Add pseudo op to model memcpy for struct byval. COPY_STRUCT_BYVAL, - CALL, // Function call. CALL_PRED, // Function call that's predicable. CALL_NOLINK, // Function call with branch not branch-and-link. 
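A hedged sketch of what WrapperJT2 (declared above) buys relative to the stock
inline encoding, for a switch dispatching on r0 in the non-PIC case; the label
names are illustrative only:

    @ inline jump table (EK_Inline): table data lives in .text after the load
    ldr   pc, [pc, r0, lsl #2]
    .long .LBB0_1
    .long .LBB0_2

    @ non-inline table (WrapperJT2 -> MOVi32imm, table emitted outside .text)
    movw  r1, :lower16:.LJTI0_0
    movt  r1, :upper16:.LJTI0_0
    ldr   r1, [r1, r0, lsl #2]
    bx    r1        @ sandboxed by SFI_GUARD_INDIRECT_JMP under NaCl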
@@ -434,6 +437,14 @@ namespace llvm { SDValue LowerToTLSExecModels(GlobalAddressSDNode *GA, SelectionDAG &DAG, TLSModel::Model model) const; + // @LOCALMOD-START + SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerNaClTpTlsOffset(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerNaClTpTdbOffset(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerNaClTargetArch(SDValue Op, SelectionDAG &DAG) const; + // @LOCALMOD-END + SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerBR_JT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index df2e55ed5c..9f7e50cd27 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -95,6 +95,14 @@ def ARMWrapper : SDNode<"ARMISD::Wrapper", SDTIntUnaryOp>; def ARMWrapperDYN : SDNode<"ARMISD::WrapperDYN", SDTIntUnaryOp>; def ARMWrapperPIC : SDNode<"ARMISD::WrapperPIC", SDTIntUnaryOp>; def ARMWrapperJT : SDNode<"ARMISD::WrapperJT", SDTIntBinOp>; +// @LOCALMOD-START +// support non-inline jumptables +// we do not use the extre uid immediate that comes with ARMWrapperJT +// TODO(robertm): figure out what it is used for +def ARMWrapperJT2 : SDNode<"ARMISD::WrapperJT2", SDTIntUnaryOp>; +// Support for MOVW/MOVT'ing the GOT address directly into a register. +def ARMWrapperGOT : SDNode<"ARMISD::WrapperGOT", SDTPtrLeaf>; +// @LOCALMOD-END def ARMcallseq_start : SDNode<"ISD::CALLSEQ_START", SDT_ARMCallSeqStart, [SDNPHasChain, SDNPSideEffect, SDNPOutGlue]>; @@ -272,6 +280,11 @@ def DontUseVMOVSR : Predicate<"!Subtarget->isCortexA9() && Subtarget->useNEONFor def IsLE : Predicate<"TLI.isLittleEndian()">; def IsBE : Predicate<"TLI.isBigEndian()">; +// @LOCALMOD-BEGIN +def UseConstPool : Predicate<"Subtarget->useConstPool()">; +def DontUseConstPool : Predicate<"!Subtarget->useConstPool()">; +// @LOCALMOD-END + //===----------------------------------------------------------------------===// // ARM Flag Definitions. @@ -808,7 +821,8 @@ def postidx_reg : Operand<i32> { // use explicit imm vs. reg versions above (addrmode_imm12 and ldst_so_reg). def AddrMode2AsmOperand : AsmOperandClass { let Name = "AddrMode2"; } def addrmode2 : Operand<i32>, - ComplexPattern<i32, 3, "SelectAddrMode2", []> { + ComplexPattern<i32, 3, "SelectAddrMode2", [], + [SDNPWantRoot]> { // @LOCALMOD let EncoderMethod = "getAddrMode2OpValue"; let PrintMethod = "printAddrMode2Operand"; let ParserMatchClass = AddrMode2AsmOperand; @@ -848,7 +862,8 @@ def am2offset_imm : Operand<i32>, // FIXME: split into imm vs. reg versions. 
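 // Hedged note on the @LOCALMOD below (and the matching one on addrmode2
 // above): SDNPWantRoot makes TableGen pass the root node to
 // SelectAddrMode2/SelectAddrMode3 in ARMISelDAGToDAG.cpp, which is
 // presumably what lets those selectors see whether they are matching a
 // load or a store and refuse reg+reg offsets under -sfi-load/-sfi-store,
 // mirroring the getPre/PostIndexedAddressParts changes above.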
def AddrMode3AsmOperand : AsmOperandClass { let Name = "AddrMode3"; } def addrmode3 : Operand<i32>, - ComplexPattern<i32, 3, "SelectAddrMode3", []> { + ComplexPattern<i32, 3, "SelectAddrMode3", [], + [SDNPWantRoot]> { // @LOCALMOD let EncoderMethod = "getAddrMode3OpValue"; let PrintMethod = "printAddrMode3Operand"; let ParserMatchClass = AddrMode3AsmOperand; @@ -1568,6 +1583,42 @@ multiclass AI_str1nopc<bit isByte, string opc, InstrItinClass iii, // Instructions //===----------------------------------------------------------------------===// +// @LOCALMOD-START + +def SFI_GUARD_LOADSTORE : +PseudoInst<(outs GPR:$dst), (ins GPR:$a, pred:$p), NoItinerary, []>; + +let Defs = [CPSR] in +def SFI_GUARD_LOADSTORE_TST : +PseudoInst<(outs), (ins GPR:$a), NoItinerary, []>; + +// Like SFI_GUARD_LOADSTORE, but reserved for loads into SP. +def SFI_GUARD_SP_LOAD : +PseudoInst<(outs GPR:$dst), (ins GPR:$src, pred:$p), NoItinerary, []>; + +def SFI_GUARD_INDIRECT_CALL : +PseudoInst<(outs GPR:$dst), (ins GPR:$a, pred:$p), NoItinerary, []>; + +def SFI_GUARD_INDIRECT_JMP : +PseudoInst<(outs GPR:$dst), (ins GPR:$a, pred:$p), NoItinerary, []>; + +def SFI_GUARD_CALL : +PseudoInst<(outs), (ins pred:$p), NoItinerary, []>; + +// NOTE: the BX_RET instruction hardcodes lr as well +def SFI_GUARD_RETURN : +PseudoInst<(outs), (ins pred:$p), NoItinerary, []>; + +def SFI_NOP_IF_AT_BUNDLE_END : +PseudoInst<(outs), (ins), NoItinerary, []>; + +// Note: intention is that $src and $dst are the same register. +def SFI_DATA_MASK : +PseudoInst<(outs GPR:$dst), (ins GPR:$src, pred:$p), NoItinerary, []>; + +// @LOCALMOD-END + + //===----------------------------------------------------------------------===// // Miscellaneous Instructions. // @@ -1753,7 +1804,9 @@ let isBarrier = 1, isTerminator = 1 in def TRAP : AXI<(outs), (ins), MiscFrm, NoItinerary, "trap", [(trap)]>, Requires<[IsARM]> { - let Inst = 0xe7ffdefe; + // @LOCALMOD-START + let Inst = 0xe7fedef0; + // @LOCALMOD-END } // Address computation and loads and stores in PIC mode. @@ -1868,6 +1921,33 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { // SP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. +// @LOCALMOD-START +// Exception handling related Node and Instructions. +// The conversion sequence is: +// ISD::EH_RETURN -> ARMISD::EH_RETURN -> +// ARMeh_return -> (stack change + indirect branch) +// +// ARMeh_return takes the place of regular return instruction +// but takes two arguments. +// R2, R3 are used for storing the offset and return address respectively. +def SDT_ARMEHRET : SDTypeProfile<0, 2, [SDTCisInt<0>, SDTCisPtrTy<1>]>; + +def ARMehret : SDNode<"ARMISD::EH_RETURN", SDT_ARMEHRET, + [SDNPHasChain, SDNPOptInGlue]>; + + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + Defs = [SP], + Uses = [SP] in { + def ARMeh_return : PseudoInst<(outs), + (ins GPR:$spadj, GPR:$dst), + IIC_Br, + [(ARMehret GPR:$spadj, GPR:$dst)]>, + Requires<[IsARM]>; +} +// @LOCALMOD-END + + let isCall = 1, // FIXME: Do we really need a non-predicated version? If so, it should // at least be a pseudo instruction expanding to the predicated version @@ -2950,6 +3030,69 @@ def MOVTi16_ga_pcrel : PseudoInst<(outs GPR:$Rd), } // Constraints +// @LOCALMOD-BEGIN +// PIC / PC-relative versions of MOVi16/MOVTi16, which have an extra +// operand representing the ID of the PICADD instruction that corrects +// for relativity. 
This is used to materialize addresses into +// a register in a PC-relative manner. +// +// E.g. Rather than have an absolute address in $imm, and transferred to +// a register with: +// movw $Rd, :lower16:$imm +// movt $Rd, :upper16:$imm +// +// we will instead have a relative offset: +// movw $Rd, :lower16:$imm - ($pic_add_id + 8) +// ... +// movt $Rd, :upper16:$imm - ($pic_add_id + 8) +// ... +// $pic_add_id: +// add $Rd, pc, $Rd +// +// One way these pseudo instructions (and the corresponding PICADD) +// come about is during expansion of the MOVi32imm pseudo instruction +// (see ARMExpandPseudo::ExpandMBB). +// These pseudo instructions become real instructions when they are +// finally lowered to MCInsts (e.g., at ARMAsmPrinter::EmitInstruction), +// and the extra pclabel ID becomes part of the appropriate operand. +// +// NOTE: aside from adding the pclabel operand, all other operands should +// be the same as the non-PIC versions to simplify conversion to the +// non-pseudo instructions. +let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1, + neverHasSideEffects = 1 in +def MOVi16PIC : PseudoInst<(outs GPR:$Rd), (ins imm0_65535_expr:$imm, + pclabel:$pic_add_id, + pred:$p), + IIC_iMOVi, + []>, + Requires<[IsARM, HasV6T2]>, UnaryDP; + +let Constraints = "$src = $Rd" in +def MOVTi16PIC : PseudoInst<(outs GPR:$Rd), (ins GPR:$src, + imm0_65535_expr:$imm, + pclabel:$pic_add_id, + pred:$p), + IIC_iMOVi, + []>, + UnaryDP, Requires<[IsARM, HasV6T2]>; +// @LOCALMOD-END + +// @LOCALMOD-BEGIN +// Pseudo-instruction that will be expanded into MOVW / MOVT (PIC versions) w/ +// GOT as the operand. +// The alternative is to create a constant pool entry with the (relative) +// GOT address and load from the constant pool. This is currently used +// when constant islands are turned off, since MOVW / MOVT will be faster. +let isReMaterializable = 1, isMoveImm = 1, neverHasSideEffects = 1 in +def MOVGOTAddr : PseudoInst<(outs GPR:$dst), (ins), + IIC_iMOVix2, // will expand to two MOVi's + []>, + Requires<[IsARM, UseMovt]>; + +def : ARMPat<(ARMWrapperGOT), (MOVGOTAddr)>; +// @LOCALMOD-END + def : ARMPat<(or GPR:$src, 0xffff0000), (MOVTi16 GPR:$src, 0xffff)>, Requires<[IsARM, HasV6T2]>; @@ -3057,6 +3200,8 @@ def UBFX : I<(outs GPR:$Rd), // Arithmetic Instructions. 
// + + defm ADD : AsI1_bin_irs<0b0100, "add", IIC_iALUi, IIC_iALUr, IIC_iALUsr, BinOpFrag<(add node:$LHS, node:$RHS)>, 1>; @@ -4798,9 +4943,20 @@ def MOV_ga_pcrel_ldr : PseudoInst<(outs GPR:$dst), (ins i32imm:$addr), // ConstantPool, GlobalAddress, and JumpTable def : ARMPat<(ARMWrapper tglobaladdr :$dst), (LEApcrel tglobaladdr :$dst)>, Requires<[IsARM, DontUseMovt]>; -def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>; +// @LOCALMOD-START +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>, + Requires<[IsARM, DontUseMovt]>; +// @LOCALMOD-END def : ARMPat<(ARMWrapper tglobaladdr :$dst), (MOVi32imm tglobaladdr :$dst)>, Requires<[IsARM, UseMovt]>; +// @LOCALMOD-START +def : ARMPat<(ARMWrapper tconstpool :$dst), (MOVi32imm tconstpool :$dst)>, + Requires<[IsARM, UseMovt, DontUseConstPool]>; +def : ARMPat<(ARMWrapper tconstpool :$dst), (LEApcrel tconstpool :$dst)>, + Requires<[IsARM, UseMovt, UseConstPool]>; +def : ARMPat<(ARMWrapperJT2 tjumptable :$dst), (MOVi32imm tjumptable :$dst)>, + Requires<[IsARM, UseMovt]>; +// @LOCALMOD-END def : ARMPat<(ARMWrapperJT tjumptable:$dst, imm:$id), (LEApcrelJT tjumptable:$dst, imm:$id)>; @@ -5146,3 +5302,47 @@ def : InstAlias<"umull${s}${p} $RdLo, $RdHi, $Rn, $Rm", // 'it' blocks in ARM mode just validate the predicates. The IT itself // is discarded. def ITasm : ARMAsmPseudo<"it$mask $cc", (ins it_pred:$cc, it_mask:$mask)>; + +// @LOCALMOD-BEGIN +//===----------------------------------------------------------------------===// +// NativeClient intrinsics +// These provide the ability to implement several low-level features without +// having to link native ASM code on the client. +// This code has to be kept in sync with include/llvm/Intrinsics.td and +// lib/Target/X86InstrNaCl.{td, cpp}. +// TODO(sehr): conditionalize this on IsNaCl64 | IsNaCl32 | IsNaClArm. + +let Uses = [R0], Defs = [R0] in { + // Saves all the callee-saves registers, sp, and lr to the JMP_BUF structure + // pointed to by r0. The JMP_BUF structure is the maximum size over all + // supported architectures. + def NACL_SETJ : AXI<(outs), (ins), + MiscFrm, NoItinerary, + // Bundle start + "sfi_nop_if_at_bundle_end; " + "sfi_data_mask r0; " + "stmia r0!, {{r4, r5, r6, r7, r8, r10, r11, sp, lr}}; " + "mov r0, #0; ", + [(set R0, (int_nacl_setjmp R0, LR))]>; +} + +let isBranch = 1, isBarrier = 1, isTerminator = 1, Uses = [R0, R1] in { + // Restores all the callee-saves registers, sp, and lr from the JMP_BUF + // structure pointed to by r0. Returns the value in r1 at entry. This + // implements the tail of longjmp, with the normalization of the return value + // (if the caller passes zero to longjmp, it should return 1) done in the + // caller. 
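+  // Hedged sketch of the implied JMP_BUF word layout (from NACL_SETJ above):
+  //   [0..6] r4, r5, r6, r7, r8, r10, r11   [7] sp   [8] lr
+  // NACL_LONGJ below reloads slot 7 into r12, copies it to sp, and masks sp
+  // with sfi_data_mask in the same bundle, so a longjmp cannot leave an
+  // unsandboxed stack pointer live.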
+ def NACL_LONGJ : AXI<(outs), (ins), MiscFrm, NoItinerary, + // Bundle start + "ldmia r0!, {{r4, r5, r6, r7, r8, r10, r11, r12, lr}}; " + "sfi_nop_if_at_bundle_end; " + "mov sp, r12; " + "sfi_data_mask sp; " + "movs r0, r1; " + "moveq r0, #1; " + "sfi_nop_if_at_bundle_end; " + "sfi_code_mask lr; " + "bx lr; ", + [(int_nacl_longjmp R0, R1)]>; +} +// @LOCALMOD-END diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 002d64a2d0..c2800acccd 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -3533,12 +3533,24 @@ def t2MOV_ga_dyn : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr), // ConstantPool, GlobalAddress, and JumpTable def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2LEApcrel tglobaladdr :$dst)>, Requires<[IsThumb2, DontUseMovt]>; +// @LOCALMOD-START +def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>, + Requires<[IsThumb2, DontUseMovt]>; +// @LOCALMOD-END def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>; def : T2Pat<(ARMWrapper tglobaladdr :$dst), (t2MOVi32imm tglobaladdr :$dst)>, Requires<[IsThumb2, UseMovt]>; def : T2Pat<(ARMWrapperJT tjumptable:$dst, imm:$id), (t2LEApcrelJT tjumptable:$dst, imm:$id)>; +// @LOCALMOD-START +def : T2Pat<(ARMWrapper tconstpool :$dst), (t2MOVi32imm tconstpool :$dst)>, + Requires<[IsThumb2, UseMovt, DontUseConstPool]>; +def : T2Pat<(ARMWrapper tconstpool :$dst), (t2LEApcrel tconstpool :$dst)>, + Requires<[IsThumb2, UseMovt, UseConstPool]>; +def : T2Pat<(ARMWrapperJT2 tjumptable :$dst), (t2MOVi32imm tjumptable :$dst)>, + Requires<[IsThumb2, UseMovt]>; +// @LOCALMOD-END // Pseudo instruction that combines ldr from constpool and add pc. This should // be expanded into two instructions late to allow if-conversion and diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp index 0185289f3b..a8c8dce0cc 100644 --- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp +++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp @@ -707,6 +707,7 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc, /// ldmia rn, <ra, rb, rc> /// => /// ldmdb rn!, <ra, rb, rc> +/// @LOCALMOD This is especially useful for rn == sp bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool &Advance, @@ -1387,7 +1388,16 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) { /// mov pc, lr /// => /// ldmfd sp!, {..., pc} +// @LOCALMOD for sfi we do not want this to happen bool ARMLoadStoreOpt::MergeReturnIntoLDM(MachineBasicBlock &MBB) { + // @LOCALMOD-START + // For NaCl, do not load into PC directly for a return, since NaCl requires + // masking the address first. + if (STI->isTargetNaCl()) { + return false; + } + // @LOCALMOD-END + if (MBB.empty()) return false; MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr(); diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index e2ac9a466e..3dd0848058 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -123,3 +123,57 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, OutMI.addOperand(MCOp); } } + +// @LOCALMOD-BEGIN +// Unlike LowerARMMachineInstrToMCInst, the opcode has already been set. +// Otherwise, this is like LowerARMMachineInstrToMCInst, but with special +// handling where the "immediate" is PC Relative +// (used for MOVi16PIC / MOVTi16PIC, etc. 
-- see .td file) +void llvm::LowerARMMachineInstrToMCInstPCRel(const MachineInstr *MI, + MCInst &OutMI, + ARMAsmPrinter &AP, + unsigned ImmIndex, + unsigned PCIndex, + MCSymbol *PCLabel, + unsigned PCAdjustment) { + + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + if (i == ImmIndex) { + MCContext &Ctx = AP.OutContext; + const MCExpr *PCRelExpr = MCSymbolRefExpr::Create(PCLabel, Ctx); + if (PCAdjustment) { + const MCExpr *AdjExpr = MCConstantExpr::Create(PCAdjustment, Ctx); + PCRelExpr = MCBinaryExpr::CreateAdd(PCRelExpr, AdjExpr, Ctx); + } + + // Get the usual symbol operand, then subtract the PCRelExpr. + const MachineOperand &MOImm = MI->getOperand(ImmIndex); + MCOperand SymOp; + bool DidLower = AP.lowerOperand(MOImm, SymOp); + assert (DidLower && "Immediate-like operand should have been lowered"); + + const MCExpr *Expr = SymOp.getExpr(); + ARMMCExpr::VariantKind TargetKind = ARMMCExpr::VK_ARM_None; + /* Unwrap and rewrap the ARMMCExpr */ + if (Expr->getKind() == MCExpr::Target) { + const ARMMCExpr *TargetExpr = cast<ARMMCExpr>(Expr); + TargetKind = TargetExpr->getKind(); + Expr = TargetExpr->getSubExpr(); + } + Expr = MCBinaryExpr::CreateSub(Expr, PCRelExpr, Ctx); + if (TargetKind != ARMMCExpr::VK_ARM_None) { + Expr = ARMMCExpr::Create(TargetKind, Expr, Ctx); + } + MCOperand MCOp = MCOperand::CreateExpr(Expr); + OutMI.addOperand(MCOp); + } else if (i == PCIndex) { // dummy index already handled as PCLabel + continue; + } else { + MCOperand MCOp; + if (AP.lowerOperand(MI->getOperand(i), MCOp)) { + OutMI.addOperand(MCOp); + } + } + } +} +// @LOCALMOD-END diff --git a/lib/Target/ARM/ARMNaClHeaders.cpp b/lib/Target/ARM/ARMNaClHeaders.cpp new file mode 100644 index 0000000000..a0b89ab05f --- /dev/null +++ b/lib/Target/ARM/ARMNaClHeaders.cpp @@ -0,0 +1,176 @@ +//===-- ARMNaClHeaders.cpp - Print SFI headers to an ARM .s file -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the initial header string needed +// for the Native Client target in ARM assembly. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/raw_ostream.h" +#include "ARMNaClRewritePass.h" +#include <string> + +using namespace llvm; + +void EmitSFIHeaders(raw_ostream &O) { + O << " @ ========================================\n"; + O << "@ Branch: " << FlagSfiBranch << "\n"; + O << "@ Stack: " << FlagSfiStack << "\n"; + O << "@ Store: " << FlagSfiStore << "\n"; + O << "@ Data: " << FlagSfiData << "\n"; + + O << " @ ========================================\n"; + // NOTE: this macro does bundle alignment as follows + // if current bundle pos is X emit pX data items of value "val" + // NOTE: that pos will be one of: 0,4,8,12 + // + O << + "\t.macro sfi_long_based_on_pos p0 p1 p2 p3 val\n" + "\t.set pos, (. 
- XmagicX) % 16\n" + "\t.fill (((\\p3<<12)|(\\p2<<8)|(\\p1<<4)|\\p0)>>pos) & 15, 4, \\val\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_illegal_if_at_bundle_begining\n" + "\tsfi_long_based_on_pos 1 0 0 0 0xe125be70\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nop_if_at_bundle_end\n" + "\tsfi_long_based_on_pos 0 0 0 1 0xe320f000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nops_to_force_slot3\n" + "\tsfi_long_based_on_pos 3 2 1 0 0xe320f000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nops_to_force_slot2\n" + "\tsfi_long_based_on_pos 2 1 0 3 0xe320f000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nops_to_force_slot1\n" + "\tsfi_long_based_on_pos 1 0 3 2 0xe320f000\n" + "\t.endm\n" + "\n\n"; + + O << " @ ========================================\n"; + if (FlagSfiZeroMask) { + // This mode sets all mask to zero which makes them into nops + // this is useful for linking this code against non-sandboxed code + // for debugging purposes + O << + "\t.macro sfi_data_mask reg cond\n" + "\tbic\\cond \\reg, \\reg, #0\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_data_tst reg\n" + "\ttst \\reg, #0x00000000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_code_mask reg cond=\n" + "\tbic\\cond \\reg, \\reg, #0\n" + "\t.endm\n" + "\n\n"; + + } else { + O << + "\t.macro sfi_data_mask reg cond\n" + "\tbic\\cond \\reg, \\reg, #0xc0000000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_data_tst reg\n" + "\ttst \\reg, #0xc0000000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_code_mask reg cond=\n" + "\tbic\\cond \\reg, \\reg, #0xc000000f\n" + "\t.endm\n" + "\n\n"; + } + + O << " @ ========================================\n"; + if (FlagSfiBranch) { + O << + "\t.macro sfi_call_preamble cond=\n" + "\tsfi_nops_to_force_slot3\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_return_preamble reg cond=\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_code_mask \\reg \\cond\n" + "\t.endm\n" + "\n\n"; + + // This is used just before "bx rx" + O << + "\t.macro sfi_indirect_jump_preamble link cond=\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_code_mask \\link \\cond\n" + "\t.endm\n" + "\n\n"; + + // This is use just before "blx rx" + O << + "\t.macro sfi_indirect_call_preamble link cond=\n" + "\tsfi_nops_to_force_slot2\n" + "\tsfi_code_mask \\link \\cond\n" + "\t.endm\n" + "\n\n"; + + } + + if (FlagSfiStore) { + O << " @ ========================================\n"; + + O << + "\t.macro sfi_load_store_preamble reg cond\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_data_mask \\reg, \\cond\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_cstore_preamble reg\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_data_tst \\reg\n" + "\t.endm\n" + "\n\n"; + } else { + O << + "\t.macro sfi_load_store_preamble reg cond\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_cstore_preamble reg cond\n" + "\t.endm\n" + "\n\n"; + } + + O << " @ ========================================\n"; + O << "\t.text\n"; +} diff --git a/lib/Target/ARM/ARMNaClRewritePass.cpp b/lib/Target/ARM/ARMNaClRewritePass.cpp new file mode 100644 index 0000000000..f7f64601d7 --- /dev/null +++ b/lib/Target/ARM/ARMNaClRewritePass.cpp @@ -0,0 +1,883 @@ +//===-- ARMNaClRewritePass.cpp - Native Client Rewrite Pass ------*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// Native Client Rewrite Pass +// This final pass inserts the sandboxing instructions needed to run inside +// the Native Client sandbox. Native Client requires certain software fault +// isolation (SFI) constructions to be put in place, to prevent escape from +// the sandbox. Native Client refuses to execute binaries without the correct +// SFI sequences. +// +// Potentially dangerous operations which are protected include: +// * Stores +// * Branches +// * Changes to SP +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "arm-sfi" +#include "ARM.h" +#include "ARMBaseInstrInfo.h" +#include "ARMNaClRewritePass.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/CommandLine.h" +#include <set> +#include <stdio.h> + +using namespace llvm; + +namespace llvm { + +cl::opt<bool> +FlagSfiData("sfi-data", cl::desc("use illegal at data bundle beginning")); + +cl::opt<bool> +FlagSfiLoad("sfi-load", cl::desc("enable sandboxing for load")); + +cl::opt<bool> +FlagSfiStore("sfi-store", cl::desc("enable sandboxing for stores")); + +cl::opt<bool> +FlagSfiStack("sfi-stack", cl::desc("enable sandboxing for stack changes")); + +cl::opt<bool> +FlagSfiBranch("sfi-branch", cl::desc("enable sandboxing for branches")); + +cl::opt<bool> +FlagNaClUseM23ArmAbi("nacl-use-m23-arm-abi", + cl::desc("use the Chrome M23 ARM ABI")); + +} + +namespace { + class ARMNaClRewritePass : public MachineFunctionPass { + public: + static char ID; + ARMNaClRewritePass() : MachineFunctionPass(ID) {} + + const ARMBaseInstrInfo *TII; + const TargetRegisterInfo *TRI; + virtual void getAnalysisUsage(AnalysisUsage &AU) const; + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "ARM Native Client Rewrite Pass"; + } + + private: + + bool SandboxMemoryReferencesInBlock(MachineBasicBlock &MBB); + void SandboxMemory(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr &MI, + int AddrIdx, + bool CPSRLive, + bool IsLoad); + bool TryPredicating(MachineInstr &MI, ARMCC::CondCodes); + + bool SandboxBranchesInBlock(MachineBasicBlock &MBB); + bool SandboxStackChangesInBlock(MachineBasicBlock &MBB); + + void SandboxStackChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void LightweightVerify(MachineFunction &MF); + }; + char ARMNaClRewritePass::ID = 0; +} + +static bool IsReturn(const MachineInstr &MI) { + return (MI.getOpcode() == ARM::BX_RET); +} + +static bool IsIndirectJump(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: return false; + case ARM::BX: + case ARM::TAILJMPr: + return true; + } +} + +static bool IsIndirectCall(const MachineInstr &MI) { + return MI.getOpcode() == ARM::BLX; +} + +static bool IsDirectCall(const MachineInstr &MI) { + switch (MI.getOpcode()) { + default: return false; + case ARM::BL: + case ARM::BL_pred: + case ARM::TPsoft: + return true; + } +} + +static bool IsCPSRLiveOut(const MachineBasicBlock &MBB) { + // CPSR is live-out if any successor lists it as live-in. 
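+  // (Hedged context: this matters because the faster TST-based sandbox in
+  // SandboxMemory predicates the guarded access on EQ after an sfi_data_tst,
+  // which writes CPSR, so that form is only usable where CPSR is dead.)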
+ for (MachineBasicBlock::const_succ_iterator SI = MBB.succ_begin(), + E = MBB.succ_end(); + SI != E; + ++SI) { + const MachineBasicBlock *Succ = *SI; + if (Succ->isLiveIn(ARM::CPSR)) return true; + } + return false; +} + +static void DumpInstructionVerbose(const MachineInstr &MI) { + dbgs() << MI; + dbgs() << MI.getNumOperands() << " operands:" << "\n"; + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand& op = MI.getOperand(i); + dbgs() << " " << i << "(" << op.getType() << "):" << op << "\n"; + } + dbgs() << "\n"; +} + +static void DumpBasicBlockVerbose(const MachineBasicBlock &MBB) { + dbgs() << "\n<<<<< DUMP BASIC BLOCK START\n"; + for (MachineBasicBlock::const_iterator MBBI = MBB.begin(), MBBE = MBB.end(); + MBBI != MBBE; + ++MBBI) { + DumpInstructionVerbose(*MBBI); + } + dbgs() << "<<<<< DUMP BASIC BLOCK END\n\n"; +} + +/**********************************************************************/ +/* Exported functions */ + +namespace ARM_SFI { + +bool IsStackChange(const MachineInstr &MI, const TargetRegisterInfo *TRI) { + return MI.modifiesRegister(ARM::SP, TRI); +} + +bool NextInstrMasksSP(const MachineInstr &MI) { + MachineBasicBlock::const_iterator It = &MI; + const MachineBasicBlock *MBB = MI.getParent(); + + MachineBasicBlock::const_iterator next = ++It; + if (next == MBB->end()) { + return false; + } + + const MachineInstr &next_instr = *next; + unsigned opcode = next_instr.getOpcode(); + return (opcode == ARM::SFI_DATA_MASK) && + (next_instr.getOperand(0).getReg() == ARM::SP); +} + +bool IsSandboxedStackChange(const MachineInstr &MI) { + // Calls do not change the stack on ARM but they have implicit-defs, so + // make sure they do not get sandboxed. + if (MI.getDesc().isCall()) + return true; + + unsigned opcode = MI.getOpcode(); + switch (opcode) { + default: break; + + // Our mask instructions correctly update the stack pointer. + case ARM::SFI_DATA_MASK: + return true; + + // These just bump SP by a little (and access the stack), + // so that is okay due to guard pages. + case ARM::STMIA_UPD: + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + return true; + + // Similar, unless it is a load into SP... + case ARM::LDMIA_UPD: + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: { + bool dest_SP = false; + // Dest regs start at operand index 4. + for (unsigned i = 4; i < MI.getNumOperands(); ++i) { + const MachineOperand &DestReg = MI.getOperand(i); + dest_SP = dest_SP || (DestReg.getReg() == ARM::SP); + } + if (dest_SP) { + break; + } + return true; + } + + // Some localmods *should* prevent selecting a reg offset + // (see SelectAddrMode2 in ARMISelDAGToDAG.cpp). + // Otherwise, the store is already a potential violation. + case ARM::STR_PRE_REG: + case ARM::STR_PRE_IMM: + + case ARM::STRH_PRE: + + case ARM::STRB_PRE_REG: + case ARM::STRB_PRE_IMM: + return true; + + // Similar, unless it is a load into SP... 
+ case ARM::LDRi12: + case ARM::LDR_PRE_REG: + case ARM::LDR_PRE_IMM: + case ARM::LDRH_PRE: + case ARM::LDRB_PRE_REG: + case ARM::LDRB_PRE_IMM: + case ARM::LDRSH_PRE: + case ARM::LDRSB_PRE: { + const MachineOperand &DestReg = MI.getOperand(0); + if (DestReg.getReg() == ARM::SP) { + break; + } + return true; + } + + // Here, if SP is the base / write-back reg, we need to check if + // a reg is used as offset (otherwise it is not a small nudge). + case ARM::STR_POST_REG: + case ARM::STR_POST_IMM: + case ARM::STRH_POST: + case ARM::STRB_POST_REG: + case ARM::STRB_POST_IMM: { + const MachineOperand &WBReg = MI.getOperand(0); + const MachineOperand &OffReg = MI.getOperand(3); + if (WBReg.getReg() == ARM::SP && OffReg.getReg() != 0) { + break; + } + return true; + } + + // Similar, but also check that DestReg is not SP. + case ARM::LDR_POST_REG: + case ARM::LDR_POST_IMM: + case ARM::LDRB_POST_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRH_POST: + case ARM::LDRSH_POST: + case ARM::LDRSB_POST: { + const MachineOperand &DestReg = MI.getOperand(0); + if (DestReg.getReg() == ARM::SP) { + break; + } + const MachineOperand &WBReg = MI.getOperand(1); + const MachineOperand &OffReg = MI.getOperand(3); + if (WBReg.getReg() == ARM::SP && OffReg.getReg() != 0) { + break; + } + return true; + } + } + + return (NextInstrMasksSP(MI)); +} + +bool NeedSandboxStackChange(const MachineInstr &MI, + const TargetRegisterInfo *TRI) { + return (IsStackChange(MI, TRI) && !IsSandboxedStackChange(MI)); +} + +} // namespace ARM_SFI + +/**********************************************************************/ + +void ARMNaClRewritePass::getAnalysisUsage(AnalysisUsage &AU) const { + // Slight (possibly unnecessary) efficiency tweak: + // Promise not to modify the CFG. + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); +} + +/* + * A primitive validator to catch problems at compile time. + * E.g., it could be used along with bugpoint to reduce a bitcode file. + */ +void ARMNaClRewritePass::LightweightVerify(MachineFunction &MF) { + + for (MachineFunction::iterator MFI = MF.begin(), MFE = MF.end(); + MFI != MFE; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), MBBE = MBB.end(); + MBBI != MBBE; + ++MBBI) { + MachineInstr &MI = *MBBI; + + if (ARM_SFI::NeedSandboxStackChange(MI, TRI)) { + dbgs() << "LightWeightVerify for function: " + << MF.getFunction()->getName() << " (BAD STACK CHANGE)\n"; + DumpInstructionVerbose(MI); + DumpBasicBlockVerbose(MBB); + // assert(false && "LightweightVerify Failed"); + } + } + } +} + +void ARMNaClRewritePass::SandboxStackChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + // (1) Ensure there is room in the bundle for a data mask instruction + // (nop'ing to the next bundle if needed). + // (2) Do a data mask on SP after the instruction that updated SP. + MachineInstr &MI = *MBBI; + + // Use same predicate as current instruction. + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::SFI_NOP_IF_AT_BUNDLE_END)); + + // Get to next instr. 
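+  // Net effect, as a hedged sketch for an update such as "add sp, sp, r1"
+  // once the pseudos are expanded to the sfi_* macros:
+  //   sfi_nop_if_at_bundle_end
+  //   add sp, sp, r1
+  //   bic sp, sp, #0xc0000000     @ SFI_DATA_MASK, i.e. sfi_data_mask sp
+  // so the update and its mask always share one 16-byte bundle.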
+ MachineBasicBlock::iterator MBBINext = (++MBBI); + + BuildMI(MBB, MBBINext, MI.getDebugLoc(), + TII->get(ARM::SFI_DATA_MASK)) + .addReg(ARM::SP, RegState::Define) // modify SP (as dst) + .addReg(ARM::SP, RegState::Kill) // start with SP (as src) + .addImm((int64_t) Pred) // predicate condition + .addReg(PredReg); // predicate source register (CPSR) +} + +bool ARMNaClRewritePass::SandboxStackChangesInBlock(MachineBasicBlock &MBB) { + bool Modified = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; + ++MBBI) { + MachineInstr &MI = *MBBI; + if (ARM_SFI::NeedSandboxStackChange(MI, TRI)) { + SandboxStackChange(MBB, MBBI); + Modified |= true; + } + } + return Modified; +} + +bool ARMNaClRewritePass::SandboxBranchesInBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; + ++MBBI) { + MachineInstr &MI = *MBBI; + // Use same predicate as current instruction. + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + + if (IsReturn(MI)) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::SFI_GUARD_RETURN)) + .addImm((int64_t) Pred) // predicate condition + .addReg(PredReg); // predicate source register (CPSR) + Modified = true; + } + + if (IsIndirectJump(MI)) { + unsigned Addr = MI.getOperand(0).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::SFI_GUARD_INDIRECT_JMP)) + .addReg(Addr, RegState::Define) // Destination definition (as dst) + .addReg(Addr, RegState::Kill) // Destination read (as src) + .addImm((int64_t) Pred) // predicate condition + .addReg(PredReg); // predicate source register (CPSR) + Modified = true; + } + + if (IsDirectCall(MI)) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::SFI_GUARD_CALL)) + .addImm((int64_t) Pred) // predicate condition + .addReg(PredReg); // predicate source register (CPSR) + Modified = true; + } + + if (IsIndirectCall(MI)) { + unsigned Addr = MI.getOperand(0).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::SFI_GUARD_INDIRECT_CALL)) + .addReg(Addr, RegState::Define) // Destination definition (as dst) + .addReg(Addr, RegState::Kill) // Destination read (as src) + .addImm((int64_t) Pred) // predicate condition + .addReg(PredReg); // predicate source register (CPSR) + Modified = true; + } + } + + return Modified; +} + +bool ARMNaClRewritePass::TryPredicating(MachineInstr &MI, ARMCC::CondCodes Pred) { + // Can't predicate if it's already predicated. + // TODO(cbiffle): actually we can, if the conditions match. + if (TII->isPredicated(&MI)) return false; + + /* + * ARM predicate operands use two actual MachineOperands: an immediate + * holding the predicate condition, and a register referencing the flags. + */ + SmallVector<MachineOperand, 2> PredOperands; + PredOperands.push_back(MachineOperand::CreateImm((int64_t) Pred)); + PredOperands.push_back(MachineOperand::CreateReg(ARM::CPSR, false)); + + // This attempts to rewrite, but some instructions can't be predicated. + return TII->PredicateInstruction(&MI, PredOperands); +} + +static bool IsDangerousLoad(const MachineInstr &MI, int *AddrIdx) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: return false; + + // Instructions with base address register in position 0... + case ARM::LDMIA: + case ARM::LDMDA: + case ARM::LDMDB: + case ARM::LDMIB: + + case ARM::VLDMDIA: + case ARM::VLDMSIA: + *AddrIdx = 0; + break; + // Instructions with base address register in position 1... 
+ case ARM::LDMIA_UPD: // same reg at position 0 and position 1 + case ARM::LDMDA_UPD: + case ARM::LDMDB_UPD: + case ARM::LDMIB_UPD: + + case ARM::LDRSB: + case ARM::LDRH: + case ARM::LDRSH: + + case ARM::LDRi12: + case ARM::LDRrs: + case ARM::LDRBi12: + case ARM::LDRBrs: + case ARM::VLDMDIA_UPD: + case ARM::VLDMDDB_UPD: + case ARM::VLDMSIA_UPD: + case ARM::VLDMSDB_UPD: + case ARM::VLDRS: + case ARM::VLDRD: + + case ARM::LDREX: + case ARM::LDREXB: + case ARM::LDREXH: + *AddrIdx = 1; + break; + + // Instructions with base address register in position 2... + case ARM::LDR_PRE_REG: + case ARM::LDR_PRE_IMM: + case ARM::LDR_POST_REG: + case ARM::LDR_POST_IMM: + + case ARM::LDRB_PRE_REG: + case ARM::LDRB_PRE_IMM: + case ARM::LDRB_POST_REG: + case ARM::LDRB_POST_IMM: + case ARM::LDRSB_PRE: + case ARM::LDRSB_POST: + + case ARM::LDRH_PRE: + case ARM::LDRH_POST: + case ARM::LDRSH_PRE: + case ARM::LDRSH_POST: + + case ARM::LDRD: + *AddrIdx = 2; + break; + } + + if (MI.getOperand(*AddrIdx).getReg() == ARM::SP) { + // The contents of SP do not require masking. + return false; + } + + return true; +} + +/* + * Sandboxes a memory reference instruction by inserting an appropriate mask + * or check operation before it. + */ +void ARMNaClRewritePass::SandboxMemory(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr &MI, + int AddrIdx, + bool CPSRLive, + bool IsLoad) { + unsigned Addr = MI.getOperand(AddrIdx).getReg(); + + if (!FlagNaClUseM23ArmAbi && Addr == ARM::R9) { + // R9-relative loads are no longer sandboxed. + assert(IsLoad && "There should be no r9-relative stores"); + } else if (!CPSRLive && TryPredicating(MI, ARMCC::EQ)) { + /* + * For unconditional memory references where CPSR is not in use, we can use + * a faster sandboxing sequence by predicating the load/store -- assuming we + * *can* predicate the load/store. + */ + + // TODO(sehr): add SFI_GUARD_SP_LOAD_TST. + // Instruction can be predicated -- use the new sandbox. + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(ARM::SFI_GUARD_LOADSTORE_TST)) + .addReg(Addr); // Address read (as src) + } else { + unsigned Opcode; + if (IsLoad && (MI.getOperand(0).getReg() == ARM::SP)) { + Opcode = ARM::SFI_GUARD_SP_LOAD; + } else { + Opcode = ARM::SFI_GUARD_LOADSTORE; + } + // Use same predicate as current instruction. + unsigned PredReg = 0; + ARMCC::CondCodes Pred = llvm::getInstrPredicate(&MI, PredReg); + // Use the older BIC sandbox, which is universal, but incurs a stall. + BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode)) + .addReg(Addr, RegState::Define) // Address definition (as dst). + .addReg(Addr, RegState::Kill) // Address read (as src). + .addImm((int64_t) Pred) // predicate condition + .addReg(PredReg); // predicate source register (CPSR) + + /* + * This pseudo-instruction is intended to generate something resembling the + * following, but with alignment enforced. + * TODO(cbiffle): move alignment into this function, use the code below. + * + * // bic<cc> Addr, Addr, #0xC0000000 + * BuildMI(MBB, MBBI, MI.getDebugLoc(), + * TII->get(ARM::BICri)) + * .addReg(Addr) // rD + * .addReg(Addr) // rN + * .addImm(0xC0000000) // imm + * .addImm((int64_t) Pred) // predicate condition + * .addReg(PredReg) // predicate source register (CPSR) + * .addReg(0); // flag output register (0 == no flags) + */ + } +} + +static bool IsDangerousStore(const MachineInstr &MI, int *AddrIdx) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: return false; + + // Instructions with base address register in position 0... 
+ case ARM::STMIA: + case ARM::STMDA: + case ARM::STMDB: + case ARM::STMIB: + + case ARM::VSTMDIA: + case ARM::VSTMSIA: + *AddrIdx = 0; + break; + + // Instructions with base address register in position 1... + case ARM::STMIA_UPD: // same reg at position 0 and position 1 + case ARM::STMDA_UPD: + case ARM::STMDB_UPD: + case ARM::STMIB_UPD: + + case ARM::STRH: + case ARM::STRi12: + case ARM::STRrs: + case ARM::STRBi12: + case ARM::STRBrs: + case ARM::VSTMDIA_UPD: + case ARM::VSTMDDB_UPD: + case ARM::VSTMSIA_UPD: + case ARM::VSTMSDB_UPD: + case ARM::VSTRS: + case ARM::VSTRD: + *AddrIdx = 1; + break; + + // + // NEON stores + // + + // VST1 + case ARM::VST1d8: + case ARM::VST1d16: + case ARM::VST1d32: + case ARM::VST1d64: + case ARM::VST1q8: + case ARM::VST1q16: + case ARM::VST1q32: + case ARM::VST1q64: + case ARM::VST1d8wb_fixed: + case ARM::VST1d16wb_fixed: + case ARM::VST1d32wb_fixed: + case ARM::VST1d64wb_fixed: + case ARM::VST1q8wb_fixed: + case ARM::VST1q16wb_fixed: + case ARM::VST1q32wb_fixed: + case ARM::VST1q64wb_fixed: + case ARM::VST1d8wb_register: + case ARM::VST1d16wb_register: + case ARM::VST1d32wb_register: + case ARM::VST1d64wb_register: + case ARM::VST1q8wb_register: + case ARM::VST1q16wb_register: + case ARM::VST1q32wb_register: + case ARM::VST1q64wb_register: + + // VST1LN + case ARM::VST1LNd8: + case ARM::VST1LNd16: + case ARM::VST1LNd32: + case ARM::VST1LNd8_UPD: + case ARM::VST1LNd16_UPD: + case ARM::VST1LNd32_UPD: + + // VST2 + case ARM::VST2d8: + case ARM::VST2d16: + case ARM::VST2d32: + case ARM::VST2q8: + case ARM::VST2q16: + case ARM::VST2q32: + case ARM::VST2d8wb_fixed: + case ARM::VST2d16wb_fixed: + case ARM::VST2d32wb_fixed: + case ARM::VST2q8wb_fixed: + case ARM::VST2q16wb_fixed: + case ARM::VST2q32wb_fixed: + case ARM::VST2d8wb_register: + case ARM::VST2d16wb_register: + case ARM::VST2d32wb_register: + case ARM::VST2q8wb_register: + case ARM::VST2q16wb_register: + case ARM::VST2q32wb_register: + + // VST2LN + case ARM::VST2LNd8: + case ARM::VST2LNd16: + case ARM::VST2LNq16: + case ARM::VST2LNd32: + case ARM::VST2LNq32: + case ARM::VST2LNd8_UPD: + case ARM::VST2LNd16_UPD: + case ARM::VST2LNq16_UPD: + case ARM::VST2LNd32_UPD: + case ARM::VST2LNq32_UPD: + + // VST3 + case ARM::VST3d8: + case ARM::VST3d16: + case ARM::VST3d32: + case ARM::VST3q8: + case ARM::VST3q16: + case ARM::VST3q32: + case ARM::VST3d8_UPD: + case ARM::VST3d16_UPD: + case ARM::VST3d32_UPD: + case ARM::VST3q8_UPD: + case ARM::VST3q16_UPD: + case ARM::VST3q32_UPD: + + // VST3LN + case ARM::VST3LNd8: + case ARM::VST3LNd16: + case ARM::VST3LNq16: + case ARM::VST3LNd32: + case ARM::VST3LNq32: + case ARM::VST3LNd8_UPD: + case ARM::VST3LNd16_UPD: + case ARM::VST3LNq16_UPD: + case ARM::VST3LNd32_UPD: + case ARM::VST3LNq32_UPD: + + // VST4 + case ARM::VST4d8: + case ARM::VST4d16: + case ARM::VST4d32: + case ARM::VST4q8: + case ARM::VST4q16: + case ARM::VST4q32: + case ARM::VST4d8_UPD: + case ARM::VST4d16_UPD: + case ARM::VST4d32_UPD: + case ARM::VST4q8_UPD: + case ARM::VST4q16_UPD: + case ARM::VST4q32_UPD: + + // VST4LN + case ARM::VST4LNd8: + case ARM::VST4LNd16: + case ARM::VST4LNq16: + case ARM::VST4LNd32: + case ARM::VST4LNq32: + case ARM::VST4LNd8_UPD: + case ARM::VST4LNd16_UPD: + case ARM::VST4LNq16_UPD: + case ARM::VST4LNd32_UPD: + case ARM::VST4LNq32_UPD: + + *AddrIdx = 0; + break; + + // Instructions with base address register in position 2... 
+ case ARM::STR_PRE_REG: + case ARM::STR_PRE_IMM: + case ARM::STR_POST_REG: + case ARM::STR_POST_IMM: + + case ARM::STRB_PRE_REG: + case ARM::STRB_PRE_IMM: + case ARM::STRB_POST_REG: + case ARM::STRB_POST_IMM: + + case ARM::STRH_PRE: + case ARM::STRH_POST: + + + case ARM::STRD: + case ARM::STREX: + case ARM::STREXB: + case ARM::STREXH: + *AddrIdx = 2; + break; + } + + if (MI.getOperand(*AddrIdx).getReg() == ARM::SP) { + // The contents of SP do not require masking. + return false; + } + + return true; +} + +bool ARMNaClRewritePass::SandboxMemoryReferencesInBlock( + MachineBasicBlock &MBB) { + /* + * This is a simple local reverse-dataflow analysis to determine where CPSR + * is live. We cannot use the conditional store sequence anywhere that CPSR + * is live, or we'd affect correctness. The existing liveness analysis passes + * barf when applied pre-emit, after allocation, so we must do it ourselves. + */ + + // LOCALMOD(pdox): Short-circuit this function. Assume CPSR is always live, + // until we figure out why the assert is tripping. + bool Modified2 = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; + ++MBBI) { + MachineInstr &MI = *MBBI; + int AddrIdx; + + if (FlagSfiLoad && IsDangerousLoad(MI, &AddrIdx)) { + bool CPSRLive = true; + SandboxMemory(MBB, MBBI, MI, AddrIdx, CPSRLive, true); + Modified2 = true; + } + if (FlagSfiStore && IsDangerousStore(MI, &AddrIdx)) { + bool CPSRLive = true; + SandboxMemory(MBB, MBBI, MI, AddrIdx, CPSRLive, false); + Modified2 = true; + } + } + return Modified2; + // END LOCALMOD(pdox) + + bool CPSRLive = IsCPSRLiveOut(MBB); + + // Given that, record which instructions should not be altered to trash CPSR: + std::set<const MachineInstr *> InstrsWhereCPSRLives; + for (MachineBasicBlock::const_reverse_iterator MBBI = MBB.rbegin(), + E = MBB.rend(); + MBBI != E; + ++MBBI) { + const MachineInstr &MI = *MBBI; + // Check for kills first. + if (MI.modifiesRegister(ARM::CPSR, TRI)) CPSRLive = false; + // Then check for uses. + if (MI.readsRegister(ARM::CPSR)) CPSRLive = true; + + if (CPSRLive) InstrsWhereCPSRLives.insert(&MI); + } + + // Sanity check: + assert(CPSRLive == MBB.isLiveIn(ARM::CPSR) + && "CPSR Liveness analysis does not match cached live-in result."); + + // Now: find and sandbox stores. 
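+  // Hedged sketch of the two forms SandboxMemory can emit for "str r1, [r2]"
+  // (bundle-alignment nops omitted):
+  //   CPSR dead, predicable:  tst   r2, #0xc0000000
+  //                           streq r1, [r2]
+  //   otherwise:              bic   r2, r2, #0xc0000000
+  //                           str   r1, [r2]
+  // Note that despite the comment above, the loop below guards dangerous
+  // loads as well as stores, under -sfi-load and -sfi-store respectively.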
+ bool Modified = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; + ++MBBI) { + MachineInstr &MI = *MBBI; + int AddrIdx; + + if (FlagSfiLoad && IsDangerousLoad(MI, &AddrIdx)) { + bool CPSRLive = + (InstrsWhereCPSRLives.find(&MI) != InstrsWhereCPSRLives.end()); + SandboxMemory(MBB, MBBI, MI, AddrIdx, CPSRLive, true); + Modified = true; + } + if (FlagSfiStore && IsDangerousStore(MI, &AddrIdx)) { + bool CPSRLive = + (InstrsWhereCPSRLives.find(&MI) != InstrsWhereCPSRLives.end()); + SandboxMemory(MBB, MBBI, MI, AddrIdx, CPSRLive, false); + Modified = true; + } + } + + return Modified; +} + +/**********************************************************************/ + +bool ARMNaClRewritePass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const ARMBaseInstrInfo*>(MF.getTarget().getInstrInfo()); + TRI = MF.getTarget().getRegisterInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); + MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + + if (MBB.hasAddressTaken()) { + //FIXME: use symbolic constant or get this value from some configuration + MBB.setAlignment(4); + Modified = true; + } + + if (FlagSfiLoad || FlagSfiStore) + Modified |= SandboxMemoryReferencesInBlock(MBB); + if (FlagSfiBranch) Modified |= SandboxBranchesInBlock(MBB); + if (FlagSfiStack) Modified |= SandboxStackChangesInBlock(MBB); + } + DEBUG(LightweightVerify(MF)); + return Modified; +} + +/// createARMNaClRewritePass - returns an instance of the NaClRewritePass. +FunctionPass *llvm::createARMNaClRewritePass() { + return new ARMNaClRewritePass(); +} diff --git a/lib/Target/ARM/ARMNaClRewritePass.h b/lib/Target/ARM/ARMNaClRewritePass.h new file mode 100644 index 0000000000..c8854a54fc --- /dev/null +++ b/lib/Target/ARM/ARMNaClRewritePass.h @@ -0,0 +1,36 @@ +//===-- ARMNaClRewritePass.h - NaCl Sandboxing Pass ------- --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_ARMNACLREWRITEPASS_H +#define TARGET_ARMNACLREWRITEPASS_H + +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + extern cl::opt<bool> FlagSfiZeroMask; + extern cl::opt<bool> FlagSfiData; + extern cl::opt<bool> FlagSfiLoad; + extern cl::opt<bool> FlagSfiStore; + extern cl::opt<bool> FlagSfiStack; + extern cl::opt<bool> FlagSfiBranch; +} + +namespace ARM_SFI { + +bool IsStackChange(const llvm::MachineInstr &MI, + const llvm::TargetRegisterInfo *TRI); +bool IsSandboxedStackChange(const llvm::MachineInstr &MI); +bool NeedSandboxStackChange(const llvm::MachineInstr &MI, + const llvm::TargetRegisterInfo *TRI); + +} // namespace ARM_SFI + +#endif diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp index b33b3c915a..4c44f69f4d 100644 --- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp +++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp @@ -146,7 +146,7 @@ EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { // Use default for non AAPCS (or Darwin) subtargets - if (!Subtarget->isAAPCS_ABI() || Subtarget->isTargetDarwin()) + if (Subtarget->isTargetNaCl() || !Subtarget->isAAPCS_ABI() || Subtarget->isTargetDarwin()) // @LOCALMOD return SDValue(); const ARMTargetLowering &TLI = diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp index bcc9db4ae3..fc67d418ea 100644 --- a/lib/Target/ARM/ARMSubtarget.cpp +++ b/lib/Target/ARM/ARMSubtarget.cpp @@ -24,13 +24,22 @@ using namespace llvm; -static cl::opt<bool> +cl::opt<bool> // @LOCALMOD ReserveR9("arm-reserve-r9", cl::Hidden, cl::desc("Reserve R9, making it unavailable as GPR")); static cl::opt<bool> DarwinUseMOVT("arm-darwin-use-movt", cl::init(true), cl::Hidden); +// @LOCALMOD-START +// TODO: * JITing has not been tested at all +// * Thumb mode operation is also not clear: it seems jump tables +// for thumb are broken independent of this option +static cl::opt<bool> +NoInlineJumpTables("no-inline-jumptables", + cl::desc("Do not place jump tables inline in the code")); +// @LOCALMOD-END + static cl::opt<bool> UseFusedMulOps("arm-use-mulops", cl::init(true), cl::Hidden); @@ -64,6 +73,7 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, , NoARM(false) , PostRAScheduler(false) , IsR9Reserved(ReserveR9) + , UseInlineJumpTables(!NoInlineJumpTables) // @LOCALMOD , UseMovt(false) , SupportsTailCall(false) , HasFP16(false) @@ -126,6 +136,18 @@ ARMSubtarget::ARMSubtarget(const std::string &TT, const std::string &CPU, SupportsTailCall = !getTargetTriple().isOSVersionLT(5, 0); } + // @LOCALMOD-BEGIN + // Advanced SIMD and Q registers are part of the NaCl ARM ABI. The ARM + // EABI specifies only an 8 byte alignment, which can result in poor + // performance for these 16 byte data types if they straddle cache lines, etc. + // Therefore, NaCl aligns stack frames 0mod16. + if (isTargetNaCl()) + stackAlignment = 16; + // NaCl uses MovT to avoid generating constant islands. 
+ if (isTargetNaCl() && !useConstPool()) + UseMovt = true; + // @LOCALMOD-END + if (!isThumb() || hasThumb2()) PostRAScheduler = true; diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h index 8e6b650602..e99d1d4a48 100644 --- a/lib/Target/ARM/ARMSubtarget.h +++ b/lib/Target/ARM/ARMSubtarget.h @@ -23,6 +23,15 @@ #define GET_SUBTARGETINFO_HEADER #include "ARMGenSubtargetInfo.inc" +// @LOCALMOD-BEGIN +#include "llvm/Support/CommandLine.h" +namespace llvm { + extern cl::opt<bool> FlagSfiDisableCP; + extern cl::opt<bool> FlagNaClUseM23ArmAbi; +} +// @LOCALMOD-END + + namespace llvm { class GlobalValue; class StringRef; @@ -91,6 +100,11 @@ protected: /// IsR9Reserved - True if R9 is a not available as general purpose register. bool IsR9Reserved; + // @LOCALMOD-START + /// UseInlineJumpTables - True if jump tables should be in-line in the code. + bool UseInlineJumpTables; + // @LOCALMOD-END + /// UseMovt - True if MOVT / MOVW pairs are used for materialization of 32-bit /// imms (including global addresses). bool UseMovt; @@ -262,6 +276,9 @@ protected: bool useMovt() const { return UseMovt && hasV6T2Ops(); } bool supportsTailCall() const { return SupportsTailCall; } + // @LOCALMOD + bool useConstPool() const { return !FlagSfiDisableCP; } + bool allowsUnalignedMem() const { return AllowsUnalignedMem; } const std::string & getCPUString() const { return CPUString; } @@ -285,6 +302,8 @@ protected: /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect /// symbol. bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const; + + bool useInlineJumpTables() const {return UseInlineJumpTables;} // @LOCALMOD }; } // End llvm namespace diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index b486d4fe2e..fed2d99e65 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -28,6 +28,13 @@ EnableGlobalMerge("global-merge", cl::Hidden, cl::desc("Enable global merge pass"), cl::init(true)); +// @LOCALMOD-START +namespace llvm { +cl::opt<bool> FlagSfiDisableCP("sfi-disable-cp", + cl::desc("disable arm constant island pools")); +} +// @LOCALMOD-END + extern "C" void LLVMInitializeARMTarget() { // Register the target. RegisterTargetMachine<ARMTargetMachine> X(TheARMTarget); @@ -194,8 +201,24 @@ bool ARMPassConfig::addPreEmitPass() { addPass(&UnpackMachineBundlesID); } + // @LOCALMOD-START + // Note with FlagSfiDisableCP we effectively disable the + // ARMConstantIslandPass and rely on movt/movw to eliminate the need + // for constant islands + if (FlagSfiDisableCP) { + assert(getARMSubtarget().useMovt()); + } + // @LOCALMOD-END + addPass(createARMConstantIslandPass()); + // @LOCALMOD-START + // This pass does all the heavy sfi lifting. 
+ if (getARMSubtarget().isTargetNaCl()) { + addPass(createARMNaClRewritePass()); + } + // @LOCALMOD-END + return true; } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index ebdd5b4d64..cd6921e1ae 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -29,6 +29,13 @@ #include "llvm/MC/MCStreamer.h" #include "llvm/ADT/OwningPtr.h" +// @LOCALMOD-START +#include "llvm/Support/CommandLine.h" +namespace llvm { + extern cl::opt<bool> FlagSfiDisableCP; +} +// @LOCALMOD-END + namespace llvm { class ARMBaseTargetMachine : public LLVMTargetMachine { diff --git a/lib/Target/ARM/ARMTargetObjectFile.cpp b/lib/Target/ARM/ARMTargetObjectFile.cpp index 3d85ca7d69..22db332f2b 100644 --- a/lib/Target/ARM/ARMTargetObjectFile.cpp +++ b/lib/Target/ARM/ARMTargetObjectFile.cpp @@ -29,7 +29,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx, InitializeELF(isAAPCS_ABI); if (isAAPCS_ABI) { - LSDASection = NULL; + //LSDASection = NULL; } AttributesSection = diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index 377bd9243c..1ea4e00867 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -33,6 +33,8 @@ add_llvm_target(ARMCodeGen ARMLoadStoreOptimizer.cpp ARMMCInstLower.cpp ARMMachineFunctionInfo.cpp + ARMNaClHeaders.cpp + ARMNaClRewritePass.cpp ARMRegisterInfo.cpp ARMSelectionDAGInfo.cpp ARMSubtarget.cpp diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp index dcc41d93f5..beeabb6d42 100644 --- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp +++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp @@ -223,6 +223,71 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } + // @LOCALMOD-BEGIN + // TODO(pdox): Kill this code once we switch to MC object emission + const char *SFIInst = NULL; + unsigned SFIEmitDest = ~0; + unsigned SFIEmitPred = ~0; + switch (Opcode) { + case ARM::SFI_NOP_IF_AT_BUNDLE_END : + SFIInst = "sfi_nop_if_at_bundle_end"; + SFIEmitDest = ~0; + SFIEmitPred = ~0; + break; + case ARM::SFI_GUARD_LOADSTORE : + SFIInst = "sfi_load_store_preamble"; + SFIEmitDest = 0; + SFIEmitPred = 2; + break; + case ARM::SFI_GUARD_INDIRECT_CALL: + SFIInst = "sfi_indirect_call_preamble"; + SFIEmitDest = 0; + SFIEmitPred = 2; + break; + case ARM::SFI_GUARD_INDIRECT_JMP : + SFIInst = "sfi_indirect_jump_preamble"; + SFIEmitDest = 0; + SFIEmitPred = 2; + break; + case ARM::SFI_DATA_MASK : + SFIInst = "sfi_data_mask"; + SFIEmitDest = 0; + SFIEmitPred = 2; + break; + case ARM::SFI_GUARD_LOADSTORE_TST: + SFIInst = "sfi_cload_store_preamble"; + SFIEmitDest = 0; + SFIEmitPred = ~0; + break; + case ARM::SFI_GUARD_CALL : + SFIInst = "sfi_call_preamble"; + SFIEmitDest = ~0; + SFIEmitPred = 0; + break; + case ARM::SFI_GUARD_RETURN : + SFIInst = "sfi_return_preamble lr,"; + SFIEmitDest = ~0; + SFIEmitPred = 0; + break; + } + if (SFIInst) { + O << '\t' << SFIInst; + if (SFIEmitDest != (unsigned)~0) { + O << ' '; + printOperand(MI, SFIEmitDest, O); + } + if (SFIEmitDest != (unsigned)~0 && SFIEmitPred != (unsigned)~0) { + O << ','; + } + if (SFIEmitPred != (unsigned)~0) { + O << ' '; + printPredicateOperand(MI, SFIEmitPred, O); + } + O << '\n'; + return; + } + // @LOCALMOD-END + if (Opcode == ARM::tLDMIA) { bool Writeback = true; unsigned BaseReg = MI->getOperand(0).getReg(); diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1ba6ab039f..8abf449206 100644 --- 
a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -11,6 +11,7 @@ #include "MCTargetDesc/ARMBaseInfo.h" #include "MCTargetDesc/ARMFixupKinds.h" #include "MCTargetDesc/ARMAddressingModes.h" +#include "MCTargetDesc/ARMMCNaCl.h" // @LOCALMOD #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCDirectives.h" @@ -232,8 +233,16 @@ bool ARMAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { const uint32_t nopEncoding = hasNOP() ? ARMv6T2_NopEncoding : ARMv4_NopEncoding; uint64_t NumNops = Count / 4; + // @LOCALMOD-BEGIN-UPSTREAM + // FIXME: e1a00000 vs e320f000 + // e1a00000 is mov r0, r0 which may result in a stall + // but the real nop instruction is not available on early hw.... + // Perhaps this really needs to be switched on the Subtarget?? + // GNU as likes to emit e320f000... for (uint64_t i = 0; i != NumNops; ++i) - OW->Write32(nopEncoding); + OW->Write32(0xe320f000); // regular NOP + // @LOCALMOD-END + // FIXME: should this function return false when unable to write exactly // 'Count' bytes with NOP encodings? switch (Count % 4) { @@ -559,13 +568,31 @@ namespace { class ELFARMAsmBackend : public ARMAsmBackend { public: uint8_t OSABI; + Triple::OSType OSType; // @LOCALMOD: kept OSTYPE vs upstream. FIXME: remove. ELFARMAsmBackend(const Target &T, const StringRef TT, - uint8_t _OSABI) - : ARMAsmBackend(T, TT), OSABI(_OSABI) { } + uint8_t _OSABI, + Triple::OSType _OSType) + : ARMAsmBackend(T, TT), OSABI(_OSABI), OSType(_OSType) { } void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize, uint64_t Value) const; + // @LOCALMOD-BEGIN + // FIXME! NaCl should INHERIT from ELFARMAsmBackend, not + // add to it. + unsigned getBundleSize() const { + return (OSType == Triple::NativeClient) ? 16 : 0; + } + + bool CustomExpandInst(const MCInst &Inst, MCStreamer &Out) const { + if (OSType == Triple::NativeClient) { + return CustomExpandInstNaClARM(Inst, Out); + } + return false; + } + + // @LOCALMOD-END + MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createARMELFObjectWriter(OS, OSABI); } @@ -705,5 +732,5 @@ MCAsmBackend *llvm::createARMAsmBackend(const Target &T, StringRef TT, StringRef assert(0 && "Windows not supported on ARM"); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(Triple(TT).getOS()); - return new ELFARMAsmBackend(T, TT, OSABI); + return new ELFARMAsmBackend(T, TT, OSABI, TheTriple.getOS()); } diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp index 99e4f713f6..253d1fa2ab 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp @@ -71,10 +71,11 @@ const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm, const MCFixup &Fixup, bool IsPCRel) const { const MCSymbol &Symbol = Target.getSymA()->getSymbol().AliasedSymbol(); + const MCSymbol &ASymbol = Symbol.AliasedSymbol(); bool EmitThisSym = false; const MCSectionELF &Section = - static_cast<const MCSectionELF&>(Symbol.getSection()); + static_cast<const MCSectionELF&>(ASymbol.getSection()); bool InNormalSection = true; unsigned RelocType = 0; RelocType = GetRelocTypeInner(Target, Fixup, IsPCRel); @@ -137,9 +138,9 @@ const MCSymbol *ARMELFObjectWriter::ExplicitRelSym(const MCAssembler &Asm, } if (EmitThisSym) - return &Symbol; + return &ASymbol; if (! 
Symbol.isTemporary() && InNormalSection) { - return &Symbol; + return &ASymbol; } return NULL; } diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h index f0b289c6f3..059ee99f1c 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h +++ b/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.h @@ -28,6 +28,12 @@ namespace llvm { virtual void anchor(); public: explicit ARMELFMCAsmInfo(); + // @LOCALMOD-BEGIN + // Exceptions handling + void setExceptionsType(ExceptionHandling::ExceptionsType ExType) { + ExceptionsType = ExType; + } + // @LOCALMOD-END }; } // namespace llvm diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCNaCl.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCNaCl.cpp new file mode 100644 index 0000000000..98ee80c358 --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMMCNaCl.cpp @@ -0,0 +1,330 @@ +//=== ARMMCNaCl.cpp - Expansion of NaCl pseudo-instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "arm-mc-nacl" + +#include "MCTargetDesc/ARMBaseInfo.h" +#include "MCTargetDesc/ARMMCExpr.h" +#include "MCTargetDesc/ARMMCTargetDesc.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +namespace llvm { + cl::opt<bool> FlagSfiZeroMask("sfi-zero-mask"); +} + +/// Two helper functions for emitting the actual guard instructions + +static void EmitBICMask(MCStreamer &Out, + unsigned Addr, int64_t Pred, unsigned Mask) { + // bic\Pred \Addr, \Addr, #Mask + MCInst BICInst; + BICInst.setOpcode(ARM::BICri); + BICInst.addOperand(MCOperand::CreateReg(Addr)); // rD + BICInst.addOperand(MCOperand::CreateReg(Addr)); // rS + if (FlagSfiZeroMask) { + BICInst.addOperand(MCOperand::CreateImm(0)); // imm + } else { + BICInst.addOperand(MCOperand::CreateImm(Mask)); // imm + } + BICInst.addOperand(MCOperand::CreateImm(Pred)); // predicate + BICInst.addOperand(MCOperand::CreateReg(ARM::CPSR)); // CPSR + BICInst.addOperand(MCOperand::CreateReg(0)); // flag out + Out.EmitInstruction(BICInst); +} + +static void EmitTST(MCStreamer &Out, unsigned Reg) { + // tst \reg, #\MASK typically 0xc0000000 + const unsigned Mask = 0xC0000000; + MCInst TSTInst; + TSTInst.setOpcode(ARM::TSTri); + TSTInst.addOperand(MCOperand::CreateReg(Reg)); // rS + if (FlagSfiZeroMask) { + TSTInst.addOperand(MCOperand::CreateImm(0)); // imm + } else { + TSTInst.addOperand(MCOperand::CreateImm(Mask)); // imm + } + TSTInst.addOperand(MCOperand::CreateImm((int64_t)ARMCC::AL)); // Always + TSTInst.addOperand(MCOperand::CreateImm(0)); // flag out + Out.EmitInstruction(TSTInst); +} + + +// This is ONLY used for sandboxing stack changes. +// The reason why SFI_NOP_IF_AT_BUNDLE_END gets handled here is that +// it must ensure that the two instructions are in the same bundle. 
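For orientation, a sketch of what these mask constants accomplish, assuming the usual ARM NaCl layout of a 1 GiB sandbox with 16-byte bundles: the data mask clears only the top two address bits, while the code mask used by the call, jump and return guards below also clears the low four bits so that indirect targets stay bundle aligned.

    #include <cstdint>

    // Data guard: bic addr, addr, #0xC0000000 keeps the address below 1 GiB.
    uint32_t MaskDataAddr(uint32_t Addr) { return Addr & ~0xC0000000u; }

    // Code guard: bic target, target, #0xC000000F additionally forces the
    // target onto a 16-byte bundle boundary.
    uint32_t MaskCodeAddr(uint32_t Addr) { return Addr & ~0xC000000Fu; }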
+// It just so happens that the SFI_NOP_IF_AT_BUNDLE_END is always +// emitted in conjunction with a SFI_DATA_MASK +// +static void EmitDataMask(int I, MCInst Saved[], MCStreamer &Out) { + assert(I == 3 && + (ARM::SFI_NOP_IF_AT_BUNDLE_END == Saved[0].getOpcode()) && + (ARM::SFI_DATA_MASK == Saved[2].getOpcode()) && + "Unexpected SFI Pseudo while lowering"); + + unsigned Addr = Saved[2].getOperand(0).getReg(); + int64_t Pred = Saved[2].getOperand(2).getImm(); + assert((ARM::SP == Addr) && "Unexpected register at stack guard"); + + Out.EmitBundleLock(); + Out.EmitInstruction(Saved[1]); + EmitBICMask(Out, Addr, Pred, 0xC0000000); + Out.EmitBundleUnlock(); +} + +static void EmitDirectGuardCall(int I, MCInst Saved[], + MCStreamer &Out) { + // sfi_call_preamble cond= + // sfi_nops_to_force_slot3 + assert(I == 2 && (ARM::SFI_GUARD_CALL == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_CALL"); + Out.EmitBundleAlignEnd(); + Out.EmitBundleLock(); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitIndirectGuardCall(int I, MCInst Saved[], + MCStreamer &Out) { + // sfi_indirect_call_preamble link cond= + // sfi_nops_to_force_slot2 + // sfi_code_mask \link \cond + assert(I == 2 && (ARM::SFI_GUARD_INDIRECT_CALL == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_CALL"); + unsigned Reg = Saved[0].getOperand(0).getReg(); + int64_t Pred = Saved[0].getOperand(2).getImm(); + Out.EmitBundleAlignEnd(); + Out.EmitBundleLock(); + EmitBICMask(Out, Reg, Pred, 0xC000000F); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitIndirectGuardJmp(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_indirect_jump_preamble link cond= + // sfi_nop_if_at_bundle_end + // sfi_code_mask \link \cond + assert(I == 2 && (ARM::SFI_GUARD_INDIRECT_JMP == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_CALL"); + unsigned Reg = Saved[0].getOperand(0).getReg(); + int64_t Pred = Saved[0].getOperand(2).getImm(); + + Out.EmitBundleLock(); + EmitBICMask(Out, Reg, Pred, 0xC000000F); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitGuardReturn(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_return_preamble reg cond= + // sfi_nop_if_at_bundle_end + // sfi_code_mask \reg \cond + assert(I == 2 && (ARM::SFI_GUARD_RETURN == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_RETURN"); + int64_t Pred = Saved[0].getOperand(0).getImm(); + + Out.EmitBundleLock(); + EmitBICMask(Out, ARM::LR, Pred, 0xC000000F); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitGuardLoadOrStore(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_store_preamble reg cond ----> + // sfi_nop_if_at_bundle_end + // sfi_data_mask \reg, \cond + assert(I == 2 && (ARM::SFI_GUARD_LOADSTORE == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_RETURN"); + unsigned Reg = Saved[0].getOperand(0).getReg(); + int64_t Pred = Saved[0].getOperand(2).getImm(); + + Out.EmitBundleLock(); + EmitBICMask(Out, Reg, Pred, 0xC0000000); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitGuardLoadOrStoreTst(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_cstore_preamble reg --> + // sfi_nop_if_at_bundle_end + // sfi_data_tst \reg + assert(I == 2 && (ARM::SFI_GUARD_LOADSTORE_TST == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering"); + unsigned Reg = Saved[0].getOperand(0).getReg(); + + Out.EmitBundleLock(); + 
EmitTST(Out, Reg); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +// This is ONLY used for loads into the stack pointer. +static void EmitGuardSpLoad(int I, MCInst Saved[], MCStreamer &Out) { + assert(I == 4 && + (ARM::SFI_GUARD_SP_LOAD == Saved[0].getOpcode()) && + (ARM::SFI_NOP_IF_AT_BUNDLE_END == Saved[1].getOpcode()) && + (ARM::SFI_DATA_MASK == Saved[3].getOpcode()) && + "Unexpected SFI Pseudo while lowering"); + + unsigned AddrReg = Saved[0].getOperand(0).getReg(); + unsigned SpReg = Saved[3].getOperand(0).getReg(); + int64_t Pred = Saved[3].getOperand(2).getImm(); + assert((ARM::SP == SpReg) && "Unexpected register at stack guard"); + + Out.EmitBundleLock(); + EmitBICMask(Out, AddrReg, Pred, 0xC0000000); + Out.EmitInstruction(Saved[2]); + EmitBICMask(Out, SpReg, Pred, 0xC0000000); + Out.EmitBundleUnlock(); +} + +namespace llvm { +// CustomExpandInstNaClARM - +// If Inst is a NaCl pseudo instruction, emits the substitute +// expansion to the MCStreamer and returns true. +// Otherwise, returns false. +// +// NOTE: Each time this function calls Out.EmitInstruction(), it will be +// called again recursively to rewrite the new instruction being emitted. +// Care must be taken to ensure that this does not result in an infinite +// loop. Also, global state must be managed carefully so that it is +// consistent during recursive calls. +// +// We need global state to keep track of the explicit prefix (PREFIX_*) +// instructions. Unfortunately, the assembly parser prefers to generate +// these instead of combined instructions. At this time, having only +// one explicit prefix is supported. + + +bool CustomExpandInstNaClARM(const MCInst &Inst, MCStreamer &Out) { + const int MaxSaved = 4; + static MCInst Saved[MaxSaved]; + static int SaveCount = 0; + static int I = 0; + // This routine only executes if RecurseGuard == 0 + static bool RecurseGuard = false; + + // If we are emitting to .s, just emit all pseudo-instructions directly. + if (Out.hasRawTextSupport()) { + return false; + } + + //No recursive calls allowed; + if (RecurseGuard) return false; + + unsigned Opc = Inst.getOpcode(); + + DEBUG(dbgs() << "CustomExpandInstNaClARM("; Inst.dump(); dbgs() << ")\n"); + + // Note: SFI_NOP_IF_AT_BUNDLE_END is only emitted directly as part of + // a stack guard in conjunction with a SFI_DATA_MASK + + // Logic: + // This is somewhat convoluted, but in the current model, the SFI + // guard pseudo instructions occur PRIOR to the actual instruction. + // So, the bundling/alignment operation has to refer to the FOLLOWING + // one or two instructions. + // + // When a SFI_* pseudo is detected, it is saved. Then, the saved SFI_* + // pseudo and the very next one or two instructions are used as arguments to + // the Emit*() functions in this file. This is the reason why we have a + // doublely nested switch here. First, to save the SFI_* pseudo, then to + // emit it and the next instruction + + // By default, we only need to save two or three instructions + + if ((I == 0) && (SaveCount == 0)) { + // Base State, no saved instructions. + // If the current instruction is a SFI instruction, set the SaveCount + // and fall through. + switch (Opc) { + default: + SaveCount = 0; // Nothing to do. + return false; // Handle this Inst elsewhere. + case ARM::SFI_NOP_IF_AT_BUNDLE_END: + SaveCount = 3; + break; + case ARM::SFI_DATA_MASK: + SaveCount = 0; // Do nothing. 
+ break; + case ARM::SFI_GUARD_CALL: + case ARM::SFI_GUARD_INDIRECT_CALL: + case ARM::SFI_GUARD_INDIRECT_JMP: + case ARM::SFI_GUARD_RETURN: + case ARM::SFI_GUARD_LOADSTORE: + case ARM::SFI_GUARD_LOADSTORE_TST: + SaveCount = 2; + break; + case ARM::SFI_GUARD_SP_LOAD: + SaveCount = 4; + break; + } + } + + if (I < SaveCount) { + // Othewise, save the current Inst and return + Saved[I++] = Inst; + if (I < SaveCount) + return true; + // Else fall through to next stat + } + + if (SaveCount > 0) { + assert(I == SaveCount && "Bookeeping Error"); + SaveCount = 0; // Reset for next iteration + // The following calls may call Out.EmitInstruction() + // which must not again call CustomExpandInst ... + // So set RecurseGuard = 1; + RecurseGuard = true; + + switch (Saved[0].getOpcode()) { + default: /* No action required */ break; + case ARM::SFI_NOP_IF_AT_BUNDLE_END: + EmitDataMask(I, Saved, Out); + break; + case ARM::SFI_DATA_MASK: + assert(0 && "Unexpected NOP_IF_AT_BUNDLE_END as a Saved Inst"); + break; + case ARM::SFI_GUARD_CALL: + EmitDirectGuardCall(I, Saved, Out); + break; + case ARM::SFI_GUARD_INDIRECT_CALL: + EmitIndirectGuardCall(I, Saved, Out); + break; + case ARM::SFI_GUARD_INDIRECT_JMP: + EmitIndirectGuardJmp(I, Saved, Out); + break; + case ARM::SFI_GUARD_RETURN: + EmitGuardReturn(I, Saved, Out); + break; + case ARM::SFI_GUARD_LOADSTORE: + EmitGuardLoadOrStore(I, Saved, Out); + break; + case ARM::SFI_GUARD_LOADSTORE_TST: + EmitGuardLoadOrStoreTst(I, Saved, Out); + break; + case ARM::SFI_GUARD_SP_LOAD: + EmitGuardSpLoad(I, Saved, Out); + break; + } + I = 0; // Reset I for next. + assert(RecurseGuard && "Illegal Depth"); + RecurseGuard = false; + return true; + } + + return false; +} + +} // namespace llvm diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCNaCl.h b/lib/Target/ARM/MCTargetDesc/ARMMCNaCl.h new file mode 100644 index 0000000000..de7ed50662 --- /dev/null +++ b/lib/Target/ARM/MCTargetDesc/ARMMCNaCl.h @@ -0,0 +1,19 @@ +//===-- ARMMCNaCl.h - Prototype for CustomExpandInstNaClARM ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef ARMMCNACL_H +#define ARMMCNACL_H + +namespace llvm { + class MCInst; + class MCStreamer; + bool CustomExpandInstNaClARM(const MCInst &Inst, MCStreamer &Out); +} + +#endif diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 00ffc94ac7..7a57e40a17 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -89,7 +89,13 @@ std::string ARM_MC::ParseARMTriple(StringRef TT, StringRef CPU) { ARMArchFeature = "+v7,+neon,+db,+t2dsp,+t2xtpk"; else // Use CPU to figure out the exact features. - ARMArchFeature = "+v7"; + // @LOCALMOD-BEGIN + // Orig: ARMArchFeature = "+v7"; + // TODO(pdox): Eliminate this strange exception, possibly + // with our own cpu tag. (neon doesn't work, but vfp2 does). + // We also don't seem to handle The DSP features. 
+ ARMArchFeature = "+v7,+db,+vfp2"; + // @LOCALMOD-END } } else if (SubVer == '6') { if (Len >= Idx+3 && TT[Idx+1] == 't' && TT[Idx+2] == '2') @@ -154,7 +160,18 @@ static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) { if (TheTriple.isOSDarwin()) return new ARMMCAsmInfoDarwin(); - return new ARMELFMCAsmInfo(); + // @LOCALMOD-BEGIN + ARMELFMCAsmInfo *MAI = new ARMELFMCAsmInfo(); + if (TheTriple.getOS() == Triple::NativeClient) { + // NativeClient uses Dwarf exception handling + MAI->setExceptionsType(ExceptionHandling::DwarfCFI); + // Initial state of the frame ARM:SP points to cfa + MachineLocation Dst(MachineLocation::VirtualFP); + MachineLocation Src(ARM::SP, 0); + MAI->addInitialFrameState(0, Dst, Src); + } + return MAI; + // @LOCALMOD-END } static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM, diff --git a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt index 256599412e..3ee853c822 100644 --- a/lib/Target/ARM/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/ARM/MCTargetDesc/CMakeLists.txt @@ -4,6 +4,7 @@ add_llvm_library(LLVMARMDesc ARMMCAsmInfo.cpp ARMMCCodeEmitter.cpp ARMMCExpr.cpp + ARMMCNaCl.cpp # LOCALMOD ARMMCTargetDesc.cpp ARMMachObjectWriter.cpp ARMELFObjectWriter.cpp diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 9a35bb6bd7..9a94c75e2f 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -14,6 +14,7 @@ #include "MipsFixupKinds.h" #include "MCTargetDesc/MipsMCTargetDesc.h" +#include "MCTargetDesc/MipsMCNaCl.h" // @LOCALMOD #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCDirectives.h" @@ -239,6 +240,20 @@ public: OW->Write32(0); return true; } + + // @LOCALMOD-BEGIN + // FIXME! NaCl should INHERIT from MipsAsmBackend, not add to it. + unsigned getBundleSize() const { + return (OSType == Triple::NativeClient) ? 16 : 0; + } + + bool CustomExpandInst(const MCInst &Inst, MCStreamer &Out) const { + if (OSType == Triple::NativeClient) { + return CustomExpandInstNaClMips(Inst, Out); + } + return false; + } + // @LOCALMOD-END }; // class MipsAsmBackend } // namespace diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp index 5d240fe847..6ad8669d04 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp @@ -18,6 +18,13 @@ #include "llvm/Support/ErrorHandling.h" #include <list> +// @LOCALMOD-START +// TODO(petarj): HACK! Find better way to set ELF::EF_MIPS_PIC flag. +// See also file lib/MC/MCObjectFileInfo.cpp. 
+#include "llvm/Support/CodeGen.h" +extern llvm::Reloc::Model RelocModelOption; +// @LOCALMOD-END + using namespace llvm; namespace { @@ -71,6 +78,10 @@ unsigned MipsELFObjectWriter::getEFlags() const { Flag |= ELF::EF_MIPS_ARCH_64R2; else Flag |= ELF::EF_MIPS_ARCH_32R2; + /* @LOCLAMOD-START */ + if (RelocModelOption == Reloc::PIC_ || RelocModelOption == Reloc::Default) + Flag |= ELF::EF_MIPS_PIC; + /* @LOCLAMOD-END */ return Flag; } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.cpp new file mode 100644 index 0000000000..d39a60d41c --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.cpp @@ -0,0 +1,261 @@ +//=== MipsMCNaCl.cpp - Expansion of NaCl pseudo-instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "mips-mc-nacl" + +#include "MCTargetDesc/MipsBaseInfo.h" +#include "MCTargetDesc/MipsMCTargetDesc.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +/// Two helper functions for emitting the actual guard instructions + +static void EmitMask(MCStreamer &Out, + unsigned Addr, unsigned Mask) { + // and \Addr, \Addr, \Mask + MCInst MaskInst; + MaskInst.setOpcode(Mips::AND); + MaskInst.addOperand(MCOperand::CreateReg(Addr)); + MaskInst.addOperand(MCOperand::CreateReg(Addr)); + MaskInst.addOperand(MCOperand::CreateReg(Mask)); + Out.EmitInstruction(MaskInst); +} + +// This is ONLY used for sandboxing stack changes. +// The reason why SFI_NOP_IF_AT_BUNDLE_END gets handled here is that +// it must ensure that the two instructions are in the same bundle. 
+// It just so happens that the SFI_NOP_IF_AT_BUNDLE_END is always +// emitted in conjunction with a SFI_DATA_MASK +// +static void EmitDataMask(int I, MCInst Saved[], MCStreamer &Out) { + assert(I == 3 && + (Mips::SFI_NOP_IF_AT_BUNDLE_END == Saved[0].getOpcode()) && + (Mips::SFI_DATA_MASK == Saved[2].getOpcode()) && + "Unexpected SFI Pseudo while lowering"); + + unsigned Addr = Saved[2].getOperand(0).getReg(); + unsigned Mask = Saved[2].getOperand(2).getReg(); + assert((Mips::SP == Addr) && "Unexpected register at stack guard"); + + Out.EmitBundleLock(); + Out.EmitInstruction(Saved[1]); + EmitMask(Out, Addr, Mask); + Out.EmitBundleUnlock(); +} + +static void EmitDirectGuardCall(int I, MCInst Saved[], + MCStreamer &Out) { + // sfi_call_preamble ---> + // sfi_nops_to_force_slot2 + assert(I == 3 && (Mips::SFI_GUARD_CALL == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_CALL"); + Out.EmitBundleAlignEnd(); + Out.EmitBundleLock(); + Out.EmitInstruction(Saved[1]); + Out.EmitInstruction(Saved[2]); + Out.EmitBundleUnlock(); +} + +static void EmitIndirectGuardCall(int I, MCInst Saved[], + MCStreamer &Out) { + // sfi_indirect_call_preamble link ---> + // sfi_nops_to_force_slot1 + // sfi_code_mask \link \link \maskreg + assert(I == 3 && (Mips::SFI_GUARD_INDIRECT_CALL == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_INDIRECT_CALL"); + + unsigned Addr = Saved[0].getOperand(0).getReg(); + unsigned Mask = Saved[0].getOperand(2).getReg(); + + Out.EmitBundleAlignEnd(); + Out.EmitBundleLock(); + EmitMask(Out, Addr, Mask); + Out.EmitInstruction(Saved[1]); + Out.EmitInstruction(Saved[2]); + Out.EmitBundleUnlock(); +} + +static void EmitIndirectGuardJmp(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_indirect_jump_preamble link ---> + // sfi_nop_if_at_bundle_end + // sfi_code_mask \link \link \maskreg + assert(I == 2 && (Mips::SFI_GUARD_INDIRECT_JMP == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_INDIRECT_JMP"); + unsigned Addr = Saved[0].getOperand(0).getReg(); + unsigned Mask = Saved[0].getOperand(2).getReg(); + + Out.EmitBundleLock(); + EmitMask(Out, Addr, Mask); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitGuardReturn(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_return_preamble reg ---> + // sfi_nop_if_at_bundle_end + // sfi_code_mask \reg \reg \maskreg + assert(I == 2 && (Mips::SFI_GUARD_RETURN == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_RETURN"); + unsigned Reg = Saved[0].getOperand(0).getReg(); + unsigned Mask = Saved[0].getOperand(2).getReg(); + + Out.EmitBundleLock(); + EmitMask(Out, Reg, Mask); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +static void EmitGuardLoadOrStore(int I, MCInst Saved[], MCStreamer &Out) { + // sfi_load_store_preamble reg ---> + // sfi_nop_if_at_bundle_end + // sfi_data_mask \reg \reg \maskreg + assert(I == 2 && (Mips::SFI_GUARD_LOADSTORE == Saved[0].getOpcode()) && + "Unexpected SFI Pseudo while lowering SFI_GUARD_LOADSTORE"); + unsigned Reg = Saved[0].getOperand(0).getReg(); + unsigned Mask = Saved[0].getOperand(2).getReg(); + + Out.EmitBundleLock(); + EmitMask(Out, Reg, Mask); + Out.EmitInstruction(Saved[1]); + Out.EmitBundleUnlock(); +} + +namespace llvm { +// CustomExpandInstNaClMips - +// If Inst is a NaCl pseudo instruction, emits the substitute +// expansion to the MCStreamer and returns true. +// Otherwise, returns false. 
+// +// NOTE: Each time this function calls Out.EmitInstruction(), it will be +// called again recursively to rewrite the new instruction being emitted. +// Care must be taken to ensure that this does not result in an infinite +// loop. Also, global state must be managed carefully so that it is +// consistent during recursive calls. +// +// We need global state to keep track of the explicit prefix (PREFIX_*) +// instructions. Unfortunately, the assembly parser prefers to generate +// these instead of combined instructions. At this time, having only +// one explicit prefix is supported. + + +bool CustomExpandInstNaClMips(const MCInst &Inst, MCStreamer &Out) { + const int MaxSaved = 4; + static MCInst Saved[MaxSaved]; + static int SaveCount = 0; + static int I = 0; + // This routine only executes if RecurseGuard == 0 + static bool RecurseGuard = false; + + // If we are emitting to .s, just emit all pseudo-instructions directly. + if (Out.hasRawTextSupport()) { + return false; + } + + //No recursive calls allowed; + if (RecurseGuard) return false; + + unsigned Opc = Inst.getOpcode(); + + DEBUG(dbgs() << "CustomExpandInstNaClMips("; Inst.dump(); dbgs() << ")\n"); + + // Note: SFI_NOP_IF_AT_BUNDLE_END is only emitted directly as part of + // a stack guard in conjunction with a SFI_DATA_MASK + + // Logic: + // This is somewhat convoluted, but in the current model, the SFI + // guard pseudo instructions occur PRIOR to the actual instruction. + // So, the bundling/alignment operation has to refer to the FOLLOWING + // one or two instructions. + // + // When a SFI_* pseudo is detected, it is saved. Then, the saved SFI_* + // pseudo and the very next one or two instructions are used as arguments to + // the Emit*() functions in this file. This is the reason why we have a + // doublely nested switch here. First, to save the SFI_* pseudo, then to + // emit it and the next instruction + + // By default, we only need to save two or three instructions + + if ((I == 0) && (SaveCount == 0)) { + // Base State, no saved instructions. + // If the current instruction is a SFI instruction, set the SaveCount + // and fall through. + switch (Opc) { + default: + SaveCount = 0; // Nothing to do. + return false; // Handle this Inst elsewhere. + case Mips::SFI_NOP_IF_AT_BUNDLE_END: + case Mips::SFI_GUARD_CALL: + case Mips::SFI_GUARD_INDIRECT_CALL: + SaveCount = 3; + break; + case Mips::SFI_DATA_MASK: + SaveCount = 0; // Do nothing. + break; + case Mips::SFI_GUARD_INDIRECT_JMP: + case Mips::SFI_GUARD_RETURN: + case Mips::SFI_GUARD_LOADSTORE: + SaveCount = 2; + break; + } + } + + if (I < SaveCount) { + // Othewise, save the current Inst and return + Saved[I++] = Inst; + if (I < SaveCount) + return true; + // Else fall through to next stat + } + + if (SaveCount > 0) { + assert(I == SaveCount && "Bookeeping Error"); + SaveCount = 0; // Reset for next iteration + // The following calls may call Out.EmitInstruction() + // which must not again call CustomExpandInst ... 
+ // So set RecurseGuard = 1; + RecurseGuard = true; + + switch (Saved[0].getOpcode()) { + default: /* No action required */ break; + case Mips::SFI_NOP_IF_AT_BUNDLE_END: + EmitDataMask(I, Saved, Out); + break; + case Mips::SFI_DATA_MASK: + assert(0 && "Unexpected NOP_IF_AT_BUNDLE_END as a Saved Inst"); + break; + case Mips::SFI_GUARD_CALL: + EmitDirectGuardCall(I, Saved, Out); + break; + case Mips::SFI_GUARD_INDIRECT_CALL: + EmitIndirectGuardCall(I, Saved, Out); + break; + case Mips::SFI_GUARD_INDIRECT_JMP: + EmitIndirectGuardJmp(I, Saved, Out); + break; + case Mips::SFI_GUARD_RETURN: + EmitGuardReturn(I, Saved, Out); + break; + case Mips::SFI_GUARD_LOADSTORE: + EmitGuardLoadOrStore(I, Saved, Out); + break; + } + I = 0; // Reset I for next. + assert(RecurseGuard && "Illegal Depth"); + RecurseGuard = false; + return true; + } + return false; +} + +} // namespace llvm diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h new file mode 100644 index 0000000000..c90502ec33 --- /dev/null +++ b/lib/Target/Mips/MCTargetDesc/MipsMCNaCl.h @@ -0,0 +1,19 @@ +//===-- MipsMCNaCl.h - Prototype for CustomExpandInstNaClMips ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef MIPSMCNACL_H +#define MIPSMCNACL_H + +namespace llvm { + class MCInst; + class MCStreamer; + bool CustomExpandInstNaClMips(const MCInst &Inst, MCStreamer &Out); +} + +#endif diff --git a/lib/Target/Mips/Mips.h b/lib/Target/Mips/Mips.h index 2963f7e7fa..411030aaa1 100644 --- a/lib/Target/Mips/Mips.h +++ b/lib/Target/Mips/Mips.h @@ -18,6 +18,16 @@ #include "MCTargetDesc/MipsMCTargetDesc.h" #include "llvm/Target/TargetMachine.h" +/* @LOCALMOD-START */ +namespace llvm { + +namespace Mips { + extern unsigned LoadStoreStackMaskReg; + extern unsigned IndirectBranchMaskReg; +} +} // End llvm namespace +/* @LOCALMOD-END */ + namespace llvm { class MipsTargetMachine; class FunctionPass; @@ -28,6 +38,10 @@ namespace llvm { FunctionPass *createMipsJITCodeEmitterPass(MipsTargetMachine &TM, JITCodeEmitter &JCE); + // @LOCALMOD-START + FunctionPass *createMipsNaClRewritePass(); + // @LOCALMOD-END + } // end namespace llvm; #endif diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index bf2818d61d..9bb39a424c 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -212,13 +212,24 @@ const char *MipsAsmPrinter::getCurrentABIString() const { } void MipsAsmPrinter::EmitFunctionEntryLabel() { - if (OutStreamer.hasRawTextSupport()) { + // @LOCALMOD-START + // make sure function entry is aligned. We use XmagicX as our basis + // for alignment decisions (c.f. assembler sfi macros). 
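A small point that makes the clamp below easier to read: in this code base MF->getAlignment() and EmitAlignment() work in log2 units, so a minimum of 4 means a 1 << 4 = 16 byte boundary, one NaCl bundle. A trivial sketch of the arithmetic:

    // The clamp "if (alignment < 4) alignment = 4" is in log2 units:
    // EmitAlignment(4) pads to 1 << 4 == 16 bytes, one NaCl bundle.
    unsigned ClampToBundleAlign(unsigned Log2Align) {
      return Log2Align < 4 ? 4u : Log2Align;
    }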
+ int alignment = MF->getAlignment(); + if (alignment < 4) alignment = 4; + EmitAlignment(alignment); + if (Subtarget->isTargetNaCl() && OutStreamer.hasRawTextSupport()) { if (Subtarget->inMips16Mode()) OutStreamer.EmitRawText(StringRef("\t.set\tmips16")); else OutStreamer.EmitRawText(StringRef("\t.set\tnomips16")); // leave out until FSF available gas has micromips changes // OutStreamer.EmitRawText(StringRef("\t.set\tnomicromips")); + OutStreamer.EmitRawText(StringRef("\t.set XmagicX, .\n")); + } + // @LOCALMOD-END + + if (OutStreamer.hasRawTextSupport()) { OutStreamer.EmitRawText("\t.ent\t" + Twine(CurrentFnSym->getName())); } OutStreamer.EmitLabel(CurrentFnSym); @@ -519,6 +530,10 @@ printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O, O << Mips::MipsFCCToString((Mips::CondCode)MO.getImm()); } +// @LOCALMOD-START +extern void EmitMipsSFIHeaders(raw_ostream &O); +// @LOCALMOD-END + void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // FIXME: Use SwitchSection. @@ -540,7 +555,35 @@ void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) { // return to previous section if (OutStreamer.hasRawTextSupport()) OutStreamer.EmitRawText(StringRef("\t.previous")); + + // @LOCALMOD-START + if (Subtarget->isTargetNaCl() && OutStreamer.hasRawTextSupport()) { + std::string str; + raw_string_ostream OS(str); + EmitMipsSFIHeaders(OS); + OutStreamer.EmitRawText(StringRef(OS.str())); + } + // @LOCALMOD-END +} + +// @LOCALMOD-START +unsigned MipsAsmPrinter::GetTargetLabelAlign(const MachineInstr *MI) const { + if (Subtarget->isTargetNaCl()) { + switch (MI->getOpcode()) { + default: return 0; + // These labels may indicate an indirect entry point that is + // externally reachable and hence must be bundle aligned. + // Note: these labels appear to be always at basic block beginnings + // so it may be possible to simply set the MBB alignment. + // However, it is unclear whether this always holds. + case TargetOpcode::EH_LABEL: + case TargetOpcode::GC_LABEL: + return 4; + } + } + return 0; } +// @LOCALMOD-END MachineLocation MipsAsmPrinter::getDebugValueLocation(const MachineInstr *MI) const { diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index 94d8bfa105..efed6357a4 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -82,6 +82,10 @@ public: void EmitStartOfAsmFile(Module &M); virtual MachineLocation getDebugValueLocation(const MachineInstr *MI) const; void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS); + + // @LOCALMOD-START + virtual unsigned GetTargetLabelAlign(const MachineInstr *MI) const; + // @LOCALMOD-END }; } diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp index e3c8ed75cf..d014ba1792 100644 --- a/lib/Target/Mips/MipsDelaySlotFiller.cpp +++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp @@ -141,6 +141,11 @@ FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) { return new Filler(tm); } +// @LOCALMOD-START +extern bool IsDangerousLoad(const MachineInstr &MI, int *AddrIdx); +extern bool IsDangerousStore(const MachineInstr &MI, int *AddrIdx); +// @LOCALMOD-END + bool Filler::findDelayInstr(MachineBasicBlock &MBB, InstrIter slot, InstrIter &Filler) { @@ -160,11 +165,18 @@ bool Filler::findDelayInstr(MachineBasicBlock &MBB, // Convert to forward iterator. 
InstrIter FI(llvm::next(I).base()); + int Dummy; // @LOCALMOD if (I->hasUnmodeledSideEffects() || I->isInlineAsm() || I->isLabel() || FI == LastFiller || I->isPseudo() + // @LOCALMOD-START + // Don't put in delay slot instructions that could be masked. + || IsDangerousLoad(*FI, &Dummy) + || IsDangerousStore(*FI, &Dummy) + || FI->modifiesRegister(Mips::SP, TM.getRegisterInfo()) + // @LOCALMOD-END // // Should not allow: // ERET, DERET or WAIT, PAUSE. Need to add these to instruction diff --git a/lib/Target/Mips/MipsISelDAGToDAG.cpp b/lib/Target/Mips/MipsISelDAGToDAG.cpp index c5fca7f4b2..778fe34275 100644 --- a/lib/Target/Mips/MipsISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsISelDAGToDAG.cpp @@ -390,7 +390,7 @@ SelectAddr(SDNode *Parent, SDValue Addr, SDValue &Base, SDValue &Offset) { if (LS && (LS->getMemoryVT() == MVT::f32 || LS->getMemoryVT() == MVT::f64) && - Subtarget.hasMips32r2Or64()) + Subtarget.hasMips32r2Or64() && !Subtarget.isTargetNaCl()/*@LOCALMOD*/) return false; } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index e225b6c28e..32cf6c8be7 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -369,6 +369,13 @@ MipsTargetLowering(MipsTargetMachine &TM) setTruncStoreAction(MVT::i64, MVT::i32, Custom); } + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl()) { + setOperationAction(ISD::NACL_TP_TLS_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::NACL_TP_TDB_OFFSET, MVT::i32, Custom); + } + // @LOCALMOD-END + setTargetDAGCombine(ISD::ADDE); setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::SDIVREM); @@ -919,6 +926,10 @@ LowerOperation(SDValue Op, SelectionDAG &DAG) const case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); + // @LOCALMOD-BEGIN + case ISD::NACL_TP_TLS_OFFSET: return LowerNaClTpTlsOffset(Op, DAG); + case ISD::NACL_TP_TDB_OFFSET: return LowerNaClTpTdbOffset(Op, DAG); + // @LOCALMOD-END } return SDValue(); } @@ -1817,6 +1828,24 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, return DAG.getNode(ISD::ADD, dl, ValTy, Load, Lo); } +// @LOCALMOD-BEGIN + +// NaCl TLS setup / layout intrinsics. 
+// See: native_client/src/untrusted/nacl/tls_params.h +SDValue MipsTargetLowering::LowerNaClTpTlsOffset(SDValue Op, + SelectionDAG &DAG) const { + return DAG.getConstant(0, Op.getValueType().getSimpleVT()); +} + +SDValue MipsTargetLowering::LowerNaClTpTdbOffset(SDValue Op, + SelectionDAG &DAG) const { + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ISD::SUB, dl, Op.getValueType().getSimpleVT(), + DAG.getConstant(0, Op.getValueType().getSimpleVT()), + Op.getOperand(0)); +} +// @LOCALMOD-END + SDValue MipsTargetLowering:: LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { @@ -1831,6 +1860,38 @@ LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const TLSModel::Model model = getTargetMachine().getTLSModel(GV); + // @LOCALMOD-BEGIN + if (getTargetMachine().getSubtarget<MipsSubtarget>().isTargetNaCl()) { + SDVTList VTs = DAG.getVTList(MVT::i32); + SDValue TGAHi = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_TPREL_HI); + SDValue TGALo = DAG.getTargetGlobalAddress(GV, dl, MVT::i32, 0, + MipsII::MO_TPREL_LO); + SDValue Hi = DAG.getNode(MipsISD::Hi, dl, VTs, &TGAHi, 1); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, TGALo); + SDValue Offset = DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); + + unsigned PtrSize = PtrVT.getSizeInBits(); + IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize); + + SDValue TlsReadTp = DAG.getExternalSymbol("__nacl_read_tp", PtrVT); + + ArgListTy Args; + TargetLowering::CallLoweringInfo CLI(DAG.getEntryNode(), PtrTy, + false, false, false, false, 0, CallingConv::C, + /*isTailCall=*/false, /*doesNotRet=*/false, + /*isReturnValueUsed=*/true, + TlsReadTp, Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + + SDValue ThreadPointer = CallResult.first; + SDValue TPOffset = DAG.getConstant(0x7000, MVT::i32); + SDValue ThreadPointer2 = DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer, + TPOffset); + return DAG.getNode(ISD::ADD, dl, PtrVT, ThreadPointer2, Offset); + } + // @LOCALMOD-END + if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) { // General Dynamic and Local Dynamic TLS Model. unsigned Flag = (model == TLSModel::LocalDynamic) ? MipsII::MO_TLSLDM diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h index 43f97e89a7..77045c3162 100644 --- a/lib/Target/Mips/MipsISelLowering.h +++ b/lib/Target/Mips/MipsISelLowering.h @@ -304,6 +304,11 @@ namespace llvm { void writeVarArgRegs(std::vector<SDValue> &OutChains, const MipsCC &CC, SDValue Chain, DebugLoc DL, SelectionDAG &DAG) const; + // @LOCALMOD-BEGIN + SDValue LowerNaClTpTlsOffset(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerNaClTpTdbOffset(SDValue Op, SelectionDAG &DAG) const; + // @LOCALMOD-END + virtual SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 33ee020689..7844df9f40 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -282,23 +282,24 @@ let Predicates = [NotN64, NotMips64, HasStandardEncoding] in { } // Indexed loads and stores. 
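Returning to the @LOCALMOD TLS path in LowerGlobalTLSAddress above: spelled out, it computes a thread-local address in three steps, namely a call to the runtime helper for the thread pointer, the fixed MIPS TLS bias of 0x7000, and the %tprel_hi/%tprel_lo offset of the variable. A C-level sketch of the resulting computation (illustrative only; __nacl_read_tp is the helper symbol named in the code above, and its exact prototype is assumed here):

    extern "C" void *__nacl_read_tp();  // NaCl runtime helper, as referenced above

    // Address produced for a TLS variable with TP-relative offset TpRelOffset.
    static char *NaClTlsAddress(long TpRelOffset) {
      char *Tp = static_cast<char *>(__nacl_read_tp());  // plain call, no TLS reloc
      return Tp + 0x7000 + TpRelOffset;  // fixed MIPS bias, then the variable's offset
    }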
-let Predicates = [HasMips32r2Or64, HasStandardEncoding] in { +let Predicates = [HasMips32r2Or64, IsNotNaCl/*@LOCALMOD*/] in { def LWXC1 : FPIdxLoad<0x0, "lwxc1", FGR32, CPURegs, load>; def SWXC1 : FPIdxStore<0x8, "swxc1", FGR32, CPURegs, store>; } -let Predicates = [HasMips32r2, NotMips64, HasStandardEncoding] in { +let Predicates = [HasMips32r2, NotMips64, IsNotNaCl/*@LOCALMOD*/] in { def LDXC1 : FPIdxLoad<0x1, "ldxc1", AFGR64, CPURegs, load>; def SDXC1 : FPIdxStore<0x9, "sdxc1", AFGR64, CPURegs, store>; } -let Predicates = [HasMips64, NotN64, HasStandardEncoding], DecoderNamespace="Mips64" in { +let Predicates = [HasMips64, NotN64, IsNotNaCl/*@LOCALMOD*/], + DecoderNamespace="Mips64" in { def LDXC164 : FPIdxLoad<0x1, "ldxc1", FGR64, CPURegs, load>; def SDXC164 : FPIdxStore<0x9, "sdxc1", FGR64, CPURegs, store>; } // n64 -let Predicates = [IsN64, HasStandardEncoding], isCodeGenOnly=1 in { +let Predicates = [IsN64, IsNotNaCl/*@LOCALMOD*/], isCodeGenOnly=1 in { def LWXC1_P8 : FPIdxLoad<0x0, "lwxc1", FGR32, CPU64Regs, load>; def LDXC164_P8 : FPIdxLoad<0x1, "ldxc1", FGR64, CPU64Regs, load>; def SWXC1_P8 : FPIdxStore<0x8, "swxc1", FGR32, CPU64Regs, store>; diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index f16b5f9ee7..3142ac94b1 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -174,6 +174,8 @@ def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">, AssemblerPredicate<"FeatureMips32">; def HasStandardEncoding : Predicate<"Subtarget.hasStandardEncoding()">, AssemblerPredicate<"!FeatureMips16">; +def IsNaCl : Predicate<"Subtarget.isTargetNaCl()">; +def IsNotNaCl : Predicate<"!Subtarget.isTargetNaCl()">; class MipsPat<dag pattern, dag result> : Pat<pattern, result> { let Predicates = [HasStandardEncoding]; @@ -859,6 +861,37 @@ class SCBase<bits<6> Opc, string opstring, RegisterClass RC, Operand Mem> : // Pseudo instructions //===----------------------------------------------------------------------===// +// @LOCALMOD-START + +// Older Macro based SFI Model +def SFI_GUARD_LOADSTORE : +MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$src1, CPURegs:$src2), + "sfi_load_store_preamble\t$dst, $src1, $src2", []>; + +def SFI_GUARD_INDIRECT_CALL : +MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$src1, CPURegs:$src2), + "sfi_indirect_call_preamble\t$dst, $src1, $src2", []>; + +def SFI_GUARD_INDIRECT_JMP : +MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$src1, CPURegs:$src2), + "sfi_indirect_jump_preamble\t$dst, $src1, $src2", []>; + +def SFI_GUARD_CALL : +MipsPseudo<(outs), (ins), "sfi_call_preamble", []>; + +def SFI_GUARD_RETURN : +MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$src1, CPURegs:$src2), + "sfi_return_preamble\t$dst, $src1, $src2", []>; + +def SFI_NOP_IF_AT_BUNDLE_END : +MipsPseudo<(outs), (ins), "sfi_nop_if_at_bundle_end", []>; + +def SFI_DATA_MASK : +MipsPseudo<(outs CPURegs:$dst), (ins CPURegs:$src1, CPURegs:$src2), + "sfi_data_mask\t$dst, $src1, $src2", []>; + +// @LOCALMOD-END + // Return RA. 
let isReturn=1, isTerminator=1, hasDelaySlot=1, isBarrier=1, hasCtrlDep=1 in def RetRA : PseudoSE<(outs), (ins), "", [(MipsRet)]>; diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 5fa6339338..d8119ff75c 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -160,3 +160,4 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { } } + diff --git a/lib/Target/Mips/MipsNaClHeaders.cpp b/lib/Target/Mips/MipsNaClHeaders.cpp new file mode 100644 index 0000000000..375c287d67 --- /dev/null +++ b/lib/Target/Mips/MipsNaClHeaders.cpp @@ -0,0 +1,128 @@ +//===-- MipsNaClHeaders.cpp - Print SFI headers to an Mips .s file --------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the initial header string needed +// for the Native Client target in Mips assembly. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/raw_ostream.h" +#include "MipsNaClRewritePass.h" +#include <string> + +using namespace llvm; + +void EmitMipsSFIHeaders(raw_ostream &O) { + O << " # ========================================\n"; + O << "# Branch: " << FlagSfiBranch << "\n"; + O << "# Stack: " << FlagSfiStack << "\n"; + O << "# Store: " << FlagSfiStore << "\n"; + O << "# Load: " << FlagSfiLoad << "\n"; + + O << " # ========================================\n"; + // NOTE: this macro does bundle alignment as follows + // if current bundle pos is X emit pX data items of value "val" + // NOTE: that pos will be one of: 0,4,8,12 + // + O << + "\t.macro sfi_long_based_on_pos p0 p1 p2 p3 val\n" + "\t.set pos, (. 
- XmagicX) % 16\n" + "\t.fill (((\\p3<<12)|(\\p2<<8)|(\\p1<<4)|\\p0)>>pos) & 15, 4, \\val\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nop_if_at_bundle_end\n" + "\tsfi_long_based_on_pos 0 0 0 1 0x00000000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nops_to_force_slot3\n" + "\tsfi_long_based_on_pos 3 2 1 0 0x00000000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nops_to_force_slot2\n" + "\tsfi_long_based_on_pos 2 1 0 3 0x00000000\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_nops_to_force_slot1\n" + "\tsfi_long_based_on_pos 1 0 3 2 0x00000000\n" + "\t.endm\n" + "\n\n"; + + O << " # ========================================\n"; + O << + "\t.macro sfi_data_mask reg1 reg2 maskreg\n" + "\tand \\reg1, \\reg2, \\maskreg\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_code_mask reg1 reg2 maskreg\n" + "\tand \\reg1, \\reg2, \\maskreg\n" + "\t.endm\n" + "\n\n"; + + O << " # ========================================\n"; + if (FlagSfiBranch) { + O << + "\t.macro sfi_call_preamble\n" + "\tsfi_nops_to_force_slot2\n" + "\t.endm\n" + "\n\n"; + + O << + "\t.macro sfi_return_preamble reg1 reg2 maskreg\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_code_mask \\reg1, \\reg2, \\maskreg\n" + "\t.endm\n" + "\n\n"; + + // This is used just before "jr" + O << + "\t.macro sfi_indirect_jump_preamble reg1 reg2 maskreg\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_code_mask \\reg1, \\reg2, \\maskreg\n" + "\t.endm\n" + "\n\n"; + + // This is used just before "jalr" + O << + "\t.macro sfi_indirect_call_preamble reg1 reg2 maskreg\n" + "\tsfi_nops_to_force_slot1\n" + "\tsfi_code_mask \\reg1, \\reg2, \\maskreg\n" + "\t.endm\n" + "\n\n"; + + } + + if (FlagSfiStore) { + O << " # ========================================\n"; + + O << + "\t.macro sfi_load_store_preamble reg1 reg2 maskreg\n" + "\tsfi_nop_if_at_bundle_end\n" + "\tsfi_data_mask \\reg1, \\reg2 , \\maskreg\n" + "\t.endm\n" + "\n\n"; + } else { + O << + "\t.macro sfi_load_store_preamble reg1 reg2 maskreg\n" + "\t.endm\n" + "\n\n"; + } + + O << " # ========================================\n"; + O << "\t.text\n"; +} diff --git a/lib/Target/Mips/MipsNaClRewritePass.cpp b/lib/Target/Mips/MipsNaClRewritePass.cpp new file mode 100644 index 0000000000..f675e5663a --- /dev/null +++ b/lib/Target/Mips/MipsNaClRewritePass.cpp @@ -0,0 +1,333 @@ +//===-- MipsNaClRewritePass.cpp - Native Client Rewrite Pass -----*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Native Client Rewrite Pass +// This final pass inserts the sandboxing instructions needed to run inside +// the Native Client sandbox. Native Client requires certain software fault +// isolation (SFI) constructions to be put in place, to prevent escape from +// the sandbox. Native Client refuses to execute binaries without the correct +// SFI sequences. 
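The sfi_nops_to_force_slot* macros emitted by EmitMipsSFIHeaders above pack four nop counts into one constant and select a nibble by the current offset within the 16-byte bundle. Written out as ordinary arithmetic, a quick check of sfi_nops_to_force_slot2 (arguments 2 1 0 3) shows that the instruction following the padding always lands at byte offset 8 of a bundle:

    // Pos is (. - XmagicX) % 16, i.e. 0, 4, 8 or 12 within the current bundle.
    unsigned NopsToForceSlot2(unsigned Pos) {
      unsigned Packed = (3u << 12) | (0u << 8) | (1u << 4) | 2u;  // 0x3012, nibbles p3..p0
      return (Packed >> Pos) & 15u;  // number of 4-byte nop words to emit
    }
    // NopsToForceSlot2(0) == 2, (4) == 1, (8) == 0, (12) == 3;
    // in every case Pos + 4 * nops is congruent to 8 modulo 16.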
+// +// Potentially dangerous operations which are protected include: +// * Stores +// * Branches +// * Changes to SP +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "mips-sfi" +#include "Mips.h" +#include "MipsInstrInfo.h" +#include "MipsNaClRewritePass.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Function.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +unsigned Mips::IndirectBranchMaskReg = Mips::T6; +unsigned Mips::LoadStoreStackMaskReg = Mips::T7; + +namespace { + class MipsNaClRewritePass : public MachineFunctionPass { + public: + static char ID; + MipsNaClRewritePass() : MachineFunctionPass(ID) {} + + const MipsInstrInfo *TII; + const TargetRegisterInfo *TRI; + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "Mips Native Client Rewrite Pass"; + } + + private: + + bool SandboxLoadsInBlock(MachineBasicBlock &MBB); + bool SandboxStoresInBlock(MachineBasicBlock &MBB); + void SandboxLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr &MI, + int AddrIdx); + + bool SandboxBranchesInBlock(MachineBasicBlock &MBB); + bool SandboxStackChangesInBlock(MachineBasicBlock &MBB); + + void SandboxStackChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + void AlignAllJumpTargets(MachineFunction &MF); + }; + char MipsNaClRewritePass::ID = 0; +} + +static bool IsReturn(const MachineInstr &MI) { + return (MI.getOpcode() == Mips::RET); +} + +static bool IsIndirectJump(const MachineInstr &MI) { + return (MI.getOpcode() == Mips::JR); +} + +static bool IsIndirectCall(const MachineInstr &MI) { + return (MI.getOpcode() == Mips::JALR); +} + +static bool IsDirectCall(const MachineInstr &MI) { + return ((MI.getOpcode() == Mips::JAL) || (MI.getOpcode() == Mips::BGEZAL) + || (MI.getOpcode() == Mips::BLTZAL)); +; +} + +static bool IsStackMask(const MachineInstr &MI) { + return (MI.getOpcode() == Mips::SFI_DATA_MASK); +} + +static bool NeedSandboxStackChange(const MachineInstr &MI, + const TargetRegisterInfo *TRI) { + if (IsDirectCall(MI) || IsIndirectCall(MI)) { + // We check this first because method modifiesRegister + // returns true for calls. + return false; + } + return (MI.modifiesRegister(Mips::SP, TRI) && !IsStackMask(MI)); +} + +void MipsNaClRewritePass::SandboxStackChange(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Mips::SFI_NOP_IF_AT_BUNDLE_END)); + + // Get to next instr (one + to get the original, and one more + to get past). 
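// Illustrative sketch (not part of the original patch): after this function runs,
// a stack-pointer update such as "addiu $sp, $sp, -32" ends up bracketed so the
// update and its mask cannot be split across a bundle boundary; with $t7 holding
// LoadStoreStackMaskReg the emitted sequence is roughly:
//   sfi_nop_if_at_bundle_end        # SFI_NOP_IF_AT_BUNDLE_END, inserted above
//   addiu $sp, $sp, -32             # the original SP-modifying instruction
//   and   $sp, $sp, $t7             # SFI_DATA_MASK, inserted just below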
+ MachineBasicBlock::iterator MBBINext = (MBBI++); + MachineBasicBlock::iterator MBBINext2 = (MBBI++); + + BuildMI(MBB, MBBINext2, MI.getDebugLoc(), + TII->get(Mips::SFI_DATA_MASK), Mips::SP) + .addReg(Mips::SP) + .addReg(Mips::LoadStoreStackMaskReg); + return; +} + +bool MipsNaClRewritePass::SandboxStackChangesInBlock(MachineBasicBlock &MBB) { + bool Modified = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; ++MBBI) { + MachineInstr &MI = *MBBI; + if (NeedSandboxStackChange(MI, TRI)) { + SandboxStackChange(MBB, MBBI); + Modified = true; + } + } + return Modified; +} + +bool MipsNaClRewritePass::SandboxBranchesInBlock(MachineBasicBlock &MBB) { + bool Modified = false; + + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; ++MBBI) { + MachineInstr &MI = *MBBI; + + if (IsReturn(MI)) { + unsigned AddrReg = MI.getOperand(0).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Mips::SFI_GUARD_RETURN), AddrReg) + .addReg(AddrReg) + .addReg(Mips::IndirectBranchMaskReg); + Modified = true; + } else if (IsIndirectJump(MI)) { + unsigned AddrReg = MI.getOperand(0).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Mips::SFI_GUARD_INDIRECT_JMP), AddrReg) + .addReg(AddrReg) + .addReg(Mips::IndirectBranchMaskReg); + Modified = true; + } else if (IsDirectCall(MI)) { + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Mips::SFI_GUARD_CALL)); + Modified = true; + } else if (IsIndirectCall(MI)) { + unsigned AddrReg = MI.getOperand(0).getReg(); + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Mips::SFI_GUARD_INDIRECT_CALL), AddrReg) + .addReg(AddrReg) + .addReg(Mips::IndirectBranchMaskReg); + Modified = true; + } + } + + return Modified; +} + +/* + * Sandboxes a load or store instruction by inserting an appropriate mask + * operation before it. + */ +void MipsNaClRewritePass::SandboxLoadStore(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr &MI, + int AddrIdx) { + unsigned BaseReg = MI.getOperand(AddrIdx).getReg(); + + BuildMI(MBB, MBBI, MI.getDebugLoc(), + TII->get(Mips::SFI_GUARD_LOADSTORE), BaseReg) + .addReg(BaseReg) + .addReg(Mips::LoadStoreStackMaskReg); + return; +} + +bool IsDangerousLoad(const MachineInstr &MI, int *AddrIdx) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: return false; + + // Instructions with base address register in position 1 + case Mips::LB: + case Mips::LBu: + case Mips::LH: + case Mips::LHu: + case Mips::LW: + case Mips::LWC1: + case Mips::LDC1: + case Mips::LL: + case Mips::LWL: + case Mips::LWR: + *AddrIdx = 1; + break; + } + + if (MI.getOperand(*AddrIdx).getReg() == Mips::SP) { + // The contents of SP do not require masking. + return false; + } + + return true; +} + +bool IsDangerousStore(const MachineInstr &MI, int *AddrIdx) { + unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: return false; + + // Instructions with base address register in position 1 + case Mips::SB: + case Mips::SH: + case Mips::SW: + case Mips::SWC1: + case Mips::SDC1: + case Mips::SWL: + case Mips::SWR: + *AddrIdx = 1; + break; + + case Mips::SC: + *AddrIdx = 2; + break; + } + + if (MI.getOperand(*AddrIdx).getReg() == Mips::SP) { + // The contents of SP do not require masking. 
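// ($sp can be trusted here because every instruction that modifies it is itself
// re-masked by SandboxStackChangesInBlock, so it always points into the sandbox.)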
+ return false; + } + + return true; +} + +bool MipsNaClRewritePass::SandboxLoadsInBlock(MachineBasicBlock &MBB) { + bool Modified = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; + ++MBBI) { + MachineInstr &MI = *MBBI; + int AddrIdx; + + if (IsDangerousLoad(MI, &AddrIdx)) { + SandboxLoadStore(MBB, MBBI, MI, AddrIdx); + Modified = true; + } + } + return Modified; +} + +bool MipsNaClRewritePass::SandboxStoresInBlock(MachineBasicBlock &MBB) { + bool Modified = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); + MBBI != E; + ++MBBI) { + MachineInstr &MI = *MBBI; + int AddrIdx; + + if (IsDangerousStore(MI, &AddrIdx)) { + SandboxLoadStore(MBB, MBBI, MI, AddrIdx); + Modified = true; + } + } + return Modified; +} + +// Make sure all jump targets are aligned +void MipsNaClRewritePass::AlignAllJumpTargets(MachineFunction &MF) { + // JUMP TABLE TARGETS + MachineJumpTableInfo *jt_info = MF.getJumpTableInfo(); + if (jt_info) { + const std::vector<MachineJumpTableEntry> &JT = jt_info->getJumpTables(); + for (unsigned i=0; i < JT.size(); ++i) { + std::vector<MachineBasicBlock*> MBBs = JT[i].MBBs; + + for (unsigned j=0; j < MBBs.size(); ++j) { + MBBs[j]->setAlignment(4); + } + } + } + + for (MachineFunction::iterator I = MF.begin(), E = MF.end(); + I != E; ++I) { + MachineBasicBlock &MBB = *I; + if (MBB.hasAddressTaken()) + MBB.setAlignment(4); + } +} + +bool MipsNaClRewritePass::runOnMachineFunction(MachineFunction &MF) { + TII = static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo()); + TRI = MF.getTarget().getRegisterInfo(); + + bool Modified = false; + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); + MFI != E; + ++MFI) { + MachineBasicBlock &MBB = *MFI; + + if (FlagSfiLoad) + Modified |= SandboxLoadsInBlock(MBB); + if (FlagSfiStore) + Modified |= SandboxStoresInBlock(MBB); + if (FlagSfiBranch) + Modified |= SandboxBranchesInBlock(MBB); + if (FlagSfiStack) + Modified |= SandboxStackChangesInBlock(MBB); + } + + if (FlagSfiBranch) + AlignAllJumpTargets(MF); + + return Modified; +} + +/// createMipsNaClRewritePass - returns an instance of the NaClRewritePass. 
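/// (Hooked into code generation from MipsTargetMachine::addPreEmitPass() when the
/// subtarget reports isTargetNaCl(); see the MipsTargetMachine.cpp hunk below.)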
+FunctionPass *llvm::createMipsNaClRewritePass() { + return new MipsNaClRewritePass(); +} diff --git a/lib/Target/Mips/MipsNaClRewritePass.h b/lib/Target/Mips/MipsNaClRewritePass.h new file mode 100644 index 0000000000..4e729ec985 --- /dev/null +++ b/lib/Target/Mips/MipsNaClRewritePass.h @@ -0,0 +1,21 @@ +//===-- MipsNaClRewritePass.h - NaCl Sandboxing Pass ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +//===----------------------------------------------------------------------===// + +#ifndef TARGET_MIPSNACLREWRITEPASS_H +#define TARGET_MIPSNACLREWRITEPASS_H + +#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/Support/CommandLine.h" + +namespace llvm { + extern cl::opt<bool> FlagSfiLoad; + extern cl::opt<bool> FlagSfiStore; + extern cl::opt<bool> FlagSfiStack; + extern cl::opt<bool> FlagSfiBranch; +} + +#endif diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index d8e0dd436a..13893a1e31 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -81,7 +81,9 @@ MipsRegisterInfo::getCallPreservedMask(CallingConv::ID) const { BitVector MipsRegisterInfo:: getReservedRegs(const MachineFunction &MF) const { static const uint16_t ReservedCPURegs[] = { - Mips::ZERO, Mips::K0, Mips::K1, Mips::SP + Mips::ZERO, + Mips::T6, Mips::T7, Mips::T8, // @LOCALMOD: reserved for PNaCl use + Mips::K0, Mips::K1, Mips::SP }; static const uint16_t ReservedCPU64Regs[] = { diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 930af4dda1..1ff41ca358 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -33,6 +33,9 @@ MipsSubtarget::MipsSubtarget(const std::string &TT, const std::string &CPU, IsLinux(true), HasSEInReg(false), HasCondMov(false), HasMulDivAdd(false), HasMinMax(false), HasSwap(false), HasBitCount(false), InMips16Mode(false), HasDSP(false), HasDSPR2(false), IsAndroid(false) + // @LOCALMOD-START + , TargetTriple(TT) + // @LOCALMOD-END { std::string CPUName = CPU; if (CPUName.empty()) diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index ff69237ec2..6eeab5c351 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -100,6 +100,8 @@ protected: InstrItineraryData InstrItins; + Triple TargetTriple; // @LOCALMOD + public: virtual bool enablePostRAScheduler(CodeGenOpt::Level OptLevel, AntiDepBreakMode& Mode, @@ -152,6 +154,13 @@ public: bool hasMinMax() const { return HasMinMax; } bool hasSwap() const { return HasSwap; } bool hasBitCount() const { return HasBitCount; } + + // @LOCALMOD-BEGIN + bool isTargetNaCl() const { + return TargetTriple.getOS() == Triple::NativeClient; + } + // @LOCALMOD-END + }; } // End llvm namespace diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 983ee21941..0ed3277306 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -116,6 +116,14 @@ bool MipsPassConfig::addPreEmitPass() { if (TM.getSubtarget<MipsSubtarget>().hasStandardEncoding()) addPass(createMipsLongBranchPass(TM)); + + // @LOCALMOD-START + if (getMipsSubtarget().isTargetNaCl()) { + // This pass does all the heavy sfi lifting. 
+ addPass(createMipsNaClRewritePass()); + } + // @LOCALMOD-END + return true; } diff --git a/lib/Target/Mips/MipsTargetObjectFile.cpp b/lib/Target/Mips/MipsTargetObjectFile.cpp index 881908b82c..211e6867a7 100644 --- a/lib/Target/Mips/MipsTargetObjectFile.cpp +++ b/lib/Target/Mips/MipsTargetObjectFile.cpp @@ -38,6 +38,23 @@ void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){ ELF::SHF_WRITE |ELF::SHF_ALLOC, SectionKind::getBSS()); + // @LOCALMOD-BEGIN + // Without this the linker defined symbols __fini_array_start and + // __fini_array_end do not have useful values. c.f.: + // http://code.google.com/p/nativeclient/issues/detail?id=805 + if (TM.getSubtarget<MipsSubtarget>().isTargetNaCl()) { + StaticCtorSection = + getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); + StaticDtorSection = + getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); + } + // @LOCALMOD-END } // A address must be loaded from a small section if its size is less than the @@ -67,6 +84,12 @@ IsGlobalInSmallSection(const GlobalValue *GV, const TargetMachine &TM, if (!Subtarget.useSmallSection()) return false; + // @LOCALMOD-BEGIN + // Do not use small section for NaCl. + if (Subtarget.isTargetNaCl()) + return false; + // @LOCALMOD-BEGIN + // Only global variables, not functions. const GlobalVariable *GVA = dyn_cast<GlobalVariable>(GV); if (!GVA) diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index 382571982b..ffc1d9f0d1 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -27,6 +27,7 @@ using namespace llvm; namespace llvm { bool HasDivModLibcall; bool AsmVerbosityDefault(false); + bool TLSUseCall; // @LOCALMOD } static cl::opt<bool> @@ -37,6 +38,20 @@ static cl::opt<bool> FunctionSections("ffunction-sections", cl::desc("Emit functions into separate sections"), cl::init(false)); +// @LOCALMOD-BEGIN +// Use a function call to get the thread pointer for TLS accesses, +// instead of using inline code. 
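// A note on the pattern used below (a sketch based on the standard LLVM
// CommandLine API, not text from the patch): cl::opt<bool, true> together with
// cl::location(...) parses the flag into the external bool declared above, so
// target code elsewhere can test it without depending on this file:
//   extern bool TLSUseCall;                  // defined in namespace llvm above
//   if (TLSUseCall) { /* lower the TLS access via a helper call */ }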
+static cl::opt<bool, true> +EnableTLSUseCall("mtls-use-call", + cl::desc("Use a function call to get the thread pointer for TLS accesses."), + cl::location(TLSUseCall), + cl::init(false)); + +static cl::opt<bool> + ForceTLSNonPIC("force-tls-non-pic", + cl::desc("Force TLS to use non-PIC models"), + cl::init(false)); +// @LOCALMOD-END //--------------------------------------------------------------------------- // TargetMachine Class @@ -111,7 +126,8 @@ TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { bool isHidden = Var->hasHiddenVisibility(); TLSModel::Model Model; - if (isPIC && !isPIE) { + if (isPIC && !isPIE && + !ForceTLSNonPIC) { // @LOCALMOD if (isLocal || isHidden) Model = TLSModel::LocalDynamic; else diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index f4d03a602c..1b2ffb01ad 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -26,6 +26,7 @@ set(sources X86JITInfo.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp + X86NaClRewritePass.cpp X86RegisterInfo.cpp X86SelectionDAGInfo.cpp X86Subtarget.cpp diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 1c240e52a3..8be0c5e6d7 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -3,6 +3,7 @@ add_llvm_library(LLVMX86Desc X86MCTargetDesc.cpp X86MCAsmInfo.cpp X86MCCodeEmitter.cpp + X86MCNaCl.cpp # LOCALMOD X86MachObjectWriter.cpp X86ELFObjectWriter.cpp X86WinCOFFObjectWriter.cpp diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 467edadc7e..2c91c8c566 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -9,6 +9,7 @@ #include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86FixupKinds.h" +#include "MCTargetDesc/X86MCNaCl.h" // @LOCALMOD #include "llvm/MC/MCAsmBackend.h" #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCELFObjectWriter.h" @@ -337,8 +338,10 @@ namespace { class ELFX86AsmBackend : public X86AsmBackend { public: uint8_t OSABI; - ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU) - : X86AsmBackend(T, CPU), OSABI(_OSABI) { + Triple::OSType OSType; // @LOCALMOD: kept OSTYPE vs upstream. FIXME: remove. + ELFX86AsmBackend(const Target &T, uint8_t _OSABI, StringRef CPU, + Triple::OSType _OSType) + : X86AsmBackend(T, CPU), OSABI(_OSABI), OSType(_OSType) { HasReliableSymbolDifference = true; } @@ -346,12 +349,28 @@ public: const MCSectionELF &ES = static_cast<const MCSectionELF&>(Section); return ES.getFlags() & ELF::SHF_MERGE; } + + // @LOCALMOD-BEGIN + // FIXME! NaCl should inherit from ELFX86AsmBackend! + unsigned getBundleSize() const { + return OSType == Triple::NativeClient ? 
32 : 0; + } + + bool CustomExpandInst(const MCInst &Inst, MCStreamer &Out) const { + if (OSType == Triple::NativeClient) { + return CustomExpandInstNaClX86(Inst, Out); + } + return false; + } + // @LOCALMOD-END + }; class ELFX86_32AsmBackend : public ELFX86AsmBackend { public: - ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) - : ELFX86AsmBackend(T, OSABI, CPU) {} + ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU, + Triple::OSType OSType) // @LOCALMOD: kept OSType + : ELFX86AsmBackend(T, OSABI, CPU, OSType) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); @@ -360,8 +379,9 @@ public: class ELFX86_64AsmBackend : public ELFX86AsmBackend { public: - ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) - : ELFX86AsmBackend(T, OSABI, CPU) {} + ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU, + Triple::OSType OSType) // @LOCALMOD: kept OSType + : ELFX86AsmBackend(T, OSABI, CPU, OSType) {} MCObjectWriter *createObjectWriter(raw_ostream &OS) const { return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); @@ -459,7 +479,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T, StringRef TT, String return new WindowsX86AsmBackend(T, false, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - return new ELFX86_32AsmBackend(T, OSABI, CPU); + return new ELFX86_32AsmBackend(T, OSABI, CPU, TheTriple.getOS()); } MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, StringRef CPU) { @@ -472,5 +492,5 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T, StringRef TT, String return new WindowsX86AsmBackend(T, true, CPU); uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TheTriple.getOS()); - return new ELFX86_64AsmBackend(T, OSABI, CPU); + return new ELFX86_64AsmBackend(T, OSABI, CPU, TheTriple.getOS()); } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index 7ea1961dec..0ce4c126c2 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -291,6 +291,8 @@ namespace X86II { /// manual, this operand is described as pntr16:32 and pntr16:16 RawFrmImm16 = 44, + CustomFrm = 62, // @LOCALMOD + FormMask = 63, //===------------------------------------------------------------------===// @@ -542,6 +544,7 @@ namespace X86II { case X86II::MRMSrcReg: case X86II::RawFrmImm8: case X86II::RawFrmImm16: + case X86II::CustomFrm: // @LOCALMOD return -1; case X86II::MRMDestMem: return 0; diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 16488eb7ae..7706b9308e 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -76,8 +76,18 @@ X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple) void X86ELFMCAsmInfo::anchor() { } X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) { - if (T.getArch() == Triple::x86_64) - PointerSize = 8; + + // @LOCALMOD-BEGIN + if (T.getArch() == Triple::x86_64) { + if (T.getOS() == Triple::NativeClient) { + PointerSize = 4; + StackSlotSize = 8; + } else { + PointerSize = 8; + StackSlotSize = 8; + } + } + // @LOCALMOD-END AssemblerDialect = AsmWriterFlavor; diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 122204ae75..4c6036761a 100644 --- 
a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -846,7 +846,6 @@ void X86MCCodeEmitter::EmitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand, const MCInst &MI, const MCInstrDesc &Desc, raw_ostream &OS) const { - // Emit the lock opcode prefix as needed. if (TSFlags & X86II::LOCK) EmitByte(0xF0, CurByte, OS); @@ -1012,6 +1011,10 @@ EncodeInstruction(const MCInst &MI, raw_ostream &OS, llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!"); case X86II::Pseudo: llvm_unreachable("Pseudo instruction shouldn't be emitted"); + // @LOCALMOD-BEGIN + case X86II::CustomFrm: + assert(0 && "CustomFrm instruction shouldn't be emitted"); + // @LOCALMOD-END case X86II::RawFrm: EmitByte(BaseOpcode, CurByte, OS); break; diff --git a/lib/Target/X86/MCTargetDesc/X86MCNaCl.cpp b/lib/Target/X86/MCTargetDesc/X86MCNaCl.cpp new file mode 100644 index 0000000000..29d87ba2c6 --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86MCNaCl.cpp @@ -0,0 +1,700 @@ +//=== X86MCNaCl.cpp - Expansion of NaCl pseudo-instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "x86-sandboxing" + +#include "MCTargetDesc/X86MCTargetDesc.h" +#include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86MCNaCl.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCInstrInfo.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCInst.h" +#include "llvm/MC/MCStreamer.h" +#include "llvm/MC/MCContext.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +// This option makes it possible to overwrite the x86 jmp mask immediate. +// Setting it to -1 will effectively turn masking into a nop which will +// help with linking this code with non-sandboxed libs (at least for x86-32). +cl::opt<int> FlagSfiX86JmpMask("sfi-x86-jmp-mask", cl::init(-32)); + +cl::opt<bool> FlagUseZeroBasedSandbox("sfi-zero-based-sandbox", + cl::desc("Use a zero-based sandbox model" + " for the NaCl SFI."), + cl::init(false)); + +static unsigned PrefixSaved = 0; +static bool PrefixPass = false; + +// See the notes below where these functions are defined. +namespace { +unsigned getX86SubSuperRegister_(unsigned Reg, EVT VT, bool High=false); +unsigned DemoteRegTo32_(unsigned RegIn); +} // namespace + +static void EmitDirectCall(const MCOperand &Op, bool Is64Bit, + MCStreamer &Out) { + Out.EmitBundleAlignEnd(); + Out.EmitBundleLock(); + + MCInst CALLInst; + CALLInst.setOpcode(Is64Bit ? 
X86::CALL64pcrel32 : X86::CALLpcrel32); + CALLInst.addOperand(Op); + Out.EmitInstruction(CALLInst); + Out.EmitBundleUnlock(); +} + +static void EmitIndirectBranch(const MCOperand &Op, bool Is64Bit, bool IsCall, + MCStreamer &Out) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + const int JmpMask = FlagSfiX86JmpMask; + const unsigned Reg32 = Op.getReg(); + const unsigned Reg64 = getX86SubSuperRegister_(Reg32, MVT::i64); + + if (IsCall) + Out.EmitBundleAlignEnd(); + + Out.EmitBundleLock(); + + MCInst ANDInst; + ANDInst.setOpcode(X86::AND32ri8); + ANDInst.addOperand(MCOperand::CreateReg(Reg32)); + ANDInst.addOperand(MCOperand::CreateReg(Reg32)); + ANDInst.addOperand(MCOperand::CreateImm(JmpMask)); + Out.EmitInstruction(ANDInst); + + if (Is64Bit && !UseZeroBasedSandbox) { + MCInst InstADD; + InstADD.setOpcode(X86::ADD64rr); + InstADD.addOperand(MCOperand::CreateReg(Reg64)); + InstADD.addOperand(MCOperand::CreateReg(Reg64)); + InstADD.addOperand(MCOperand::CreateReg(X86::R15)); + Out.EmitInstruction(InstADD); + } + + if (IsCall) { + MCInst CALLInst; + CALLInst.setOpcode(Is64Bit ? X86::CALL64r : X86::CALL32r); + CALLInst.addOperand(MCOperand::CreateReg(Is64Bit ? Reg64 : Reg32)); + Out.EmitInstruction(CALLInst); + } else { + MCInst JMPInst; + JMPInst.setOpcode(Is64Bit ? X86::JMP64r : X86::JMP32r); + JMPInst.addOperand(MCOperand::CreateReg(Is64Bit ? Reg64 : Reg32)); + Out.EmitInstruction(JMPInst); + } + Out.EmitBundleUnlock(); +} + +static void EmitRet(const MCOperand *AmtOp, bool Is64Bit, MCStreamer &Out) { + MCInst POPInst; + POPInst.setOpcode(Is64Bit ? X86::POP64r : X86::POP32r); + POPInst.addOperand(MCOperand::CreateReg(Is64Bit ? X86::RCX : X86::ECX)); + Out.EmitInstruction(POPInst); + + if (AmtOp) { + assert(!Is64Bit); + MCInst ADDInst; + unsigned ADDReg = X86::ESP; + ADDInst.setOpcode(X86::ADD32ri); + ADDInst.addOperand(MCOperand::CreateReg(ADDReg)); + ADDInst.addOperand(MCOperand::CreateReg(ADDReg)); + ADDInst.addOperand(*AmtOp); + Out.EmitInstruction(ADDInst); + } + + MCInst JMPInst; + JMPInst.setOpcode(Is64Bit ? X86::NACL_JMP64r : X86::NACL_JMP32r); + JMPInst.addOperand(MCOperand::CreateReg(X86::ECX)); + Out.EmitInstruction(JMPInst); +} + +static void EmitTrap(bool Is64Bit, MCStreamer &Out) { + // Rewrite to: + // X86-32: mov $0, 0 + // X86-64: mov $0, (%r15) + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + unsigned BaseReg = Is64Bit && !UseZeroBasedSandbox ? X86::R15 : 0; + + MCInst Tmp; + Tmp.setOpcode(X86::MOV32mi); + Tmp.addOperand(MCOperand::CreateReg(BaseReg)); // BaseReg + Tmp.addOperand(MCOperand::CreateImm(1)); // Scale + Tmp.addOperand(MCOperand::CreateReg(0)); // IndexReg + Tmp.addOperand(MCOperand::CreateImm(0)); // Offset + Tmp.addOperand(MCOperand::CreateReg(0)); // SegmentReg + Tmp.addOperand(MCOperand::CreateImm(0)); // Value + + Out.EmitInstruction(Tmp); +} + +// Fix a register after being truncated to 32-bits. 
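// (Editorial sketch: once an operation has written only the low 32 bits of a
// register such as %esp or %ebp, the full 64-bit value must be re-formed as
// %r15 + low32. EmitRegFix below does this with a single lea, presumably chosen
// over an add because lea leaves EFLAGS untouched.)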
+static void EmitRegFix(unsigned Reg64, MCStreamer &Out) { + // lea (%rsp, %r15, 1), %rsp + // We do not need to add the R15 base for the zero-based sandbox model + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + if (!UseZeroBasedSandbox) { + MCInst Tmp; + Tmp.setOpcode(X86::LEA64r); + Tmp.addOperand(MCOperand::CreateReg(Reg64)); // DestReg + Tmp.addOperand(MCOperand::CreateReg(Reg64)); // BaseReg + Tmp.addOperand(MCOperand::CreateImm(1)); // Scale + Tmp.addOperand(MCOperand::CreateReg(X86::R15)); // IndexReg + Tmp.addOperand(MCOperand::CreateImm(0)); // Offset + Tmp.addOperand(MCOperand::CreateReg(0)); // SegmentReg + Out.EmitInstruction(Tmp); + } +} + +static void EmitSPArith(unsigned Opc, const MCOperand &ImmOp, + MCStreamer &Out) { + Out.EmitBundleLock(); + + MCInst Tmp; + Tmp.setOpcode(Opc); + Tmp.addOperand(MCOperand::CreateReg(X86::RSP)); + Tmp.addOperand(MCOperand::CreateReg(X86::RSP)); + Tmp.addOperand(ImmOp); + Out.EmitInstruction(Tmp); + + EmitRegFix(X86::RSP, Out); + Out.EmitBundleUnlock(); +} + +static void EmitSPAdj(const MCOperand &ImmOp, MCStreamer &Out) { + Out.EmitBundleLock(); + + MCInst Tmp; + Tmp.setOpcode(X86::LEA64_32r); + Tmp.addOperand(MCOperand::CreateReg(X86::RSP)); // DestReg + Tmp.addOperand(MCOperand::CreateReg(X86::RBP)); // BaseReg + Tmp.addOperand(MCOperand::CreateImm(1)); // Scale + Tmp.addOperand(MCOperand::CreateReg(0)); // IndexReg + Tmp.addOperand(ImmOp); // Offset + Tmp.addOperand(MCOperand::CreateReg(0)); // SegmentReg + Out.EmitInstruction(Tmp); + + EmitRegFix(X86::RSP, Out); + Out.EmitBundleUnlock(); +} + +static void EmitPrefix(unsigned Opc, MCStreamer &Out) { + assert(PrefixSaved == 0); + assert(PrefixPass == false); + + MCInst PrefixInst; + PrefixInst.setOpcode(Opc); + PrefixPass = true; + Out.EmitInstruction(PrefixInst); + + assert(PrefixSaved == 0); + assert(PrefixPass == false); +} + +static void EmitMoveRegReg(bool Is64Bit, unsigned ToReg, + unsigned FromReg, MCStreamer &Out) { + MCInst Move; + Move.setOpcode(Is64Bit ? X86::MOV64rr : X86::MOV32rr); + Move.addOperand(MCOperand::CreateReg(ToReg)); + Move.addOperand(MCOperand::CreateReg(FromReg)); + Out.EmitInstruction(Move); +} + +static void EmitRegTruncate(unsigned Reg64, MCStreamer &Out) { + unsigned Reg32 = getX86SubSuperRegister_(Reg64, MVT::i32); + EmitMoveRegReg(false, Reg32, Reg32, Out); +} + +static void HandleMemoryRefTruncation(MCInst *Inst, unsigned IndexOpPosition, + MCStreamer &Out) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + unsigned IndexReg = Inst->getOperand(IndexOpPosition).getReg(); + if (UseZeroBasedSandbox) { + // With the zero-based sandbox, we use a 32-bit register on the index + Inst->getOperand(IndexOpPosition).setReg(DemoteRegTo32_(IndexReg)); + } else { + EmitRegTruncate(IndexReg, Out); + } +} + +static void ShortenMemoryRef(MCInst *Inst, unsigned IndexOpPosition) { + unsigned ImmOpPosition = IndexOpPosition - 1; + unsigned BaseOpPosition = IndexOpPosition - 2; + unsigned IndexReg = Inst->getOperand(IndexOpPosition).getReg(); + // For the SIB byte, if the scale is 1 and the base is 0, then + // an equivalent setup moves index to base, and index to 0. The + // equivalent setup is optimized to remove the SIB byte in + // X86MCCodeEmitter.cpp. 
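// Concrete example (illustrative): a reference like "movl (,%rax,1), %ecx"
// (base 0, index %rax, scale 1) is rewritten to the equivalent "movl (%rax), %ecx",
// which the encoder can then emit without a SIB byte, giving a shorter encoding.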
+ if (Inst->getOperand(ImmOpPosition).getImm() == 1 && + Inst->getOperand(BaseOpPosition).getReg() == 0) { + Inst->getOperand(BaseOpPosition).setReg(IndexReg); + Inst->getOperand(IndexOpPosition).setReg(0); + } +} + +static void EmitLoad(bool Is64Bit, + unsigned DestReg, + unsigned BaseReg, + unsigned Scale, + unsigned IndexReg, + unsigned Offset, + unsigned SegmentReg, + MCStreamer &Out) { + // Load DestReg from address BaseReg + Scale * IndexReg + Offset + MCInst Load; + Load.setOpcode(Is64Bit ? X86::MOV64rm : X86::MOV32rm); + Load.addOperand(MCOperand::CreateReg(DestReg)); + Load.addOperand(MCOperand::CreateReg(BaseReg)); + Load.addOperand(MCOperand::CreateImm(Scale)); + Load.addOperand(MCOperand::CreateReg(IndexReg)); + Load.addOperand(MCOperand::CreateImm(Offset)); + Load.addOperand(MCOperand::CreateReg(SegmentReg)); + Out.EmitInstruction(Load); +} + +static bool SandboxMemoryRef(MCInst *Inst, + unsigned *IndexOpPosition) { + for (unsigned i = 0, last = Inst->getNumOperands(); i < last; i++) { + if (!Inst->getOperand(i).isReg() || + Inst->getOperand(i).getReg() != X86::PSEUDO_NACL_SEG) { + continue; + } + // Return the index register that will need to be truncated. + // The order of operands on a memory reference is always: + // (BaseReg, ScaleImm, IndexReg, DisplacementImm, SegmentReg), + // So if we found a match for a segment register value, we know that + // the index register is exactly two operands prior. + *IndexOpPosition = i - 2; + + // Remove the PSEUDO_NACL_SEG annotation. + Inst->getOperand(i).setReg(0); + return true; + } + return false; +} + +static void EmitTLSAddr32(const MCInst &Inst, MCStreamer &Out) { + Out.EmitBundleAlignEnd(); + Out.EmitBundleLock(); + + MCInst LeaInst; + LeaInst.setOpcode(X86::LEA32r); + LeaInst.addOperand(MCOperand::CreateReg(X86::EAX)); // DestReg + LeaInst.addOperand(Inst.getOperand(0)); // BaseReg + LeaInst.addOperand(Inst.getOperand(1)); // Scale + LeaInst.addOperand(Inst.getOperand(2)); // IndexReg + LeaInst.addOperand(Inst.getOperand(3)); // Offset + LeaInst.addOperand(Inst.getOperand(4)); // SegmentReg + Out.EmitInstruction(LeaInst); + + MCInst CALLInst; + CALLInst.setOpcode(X86::CALLpcrel32); + MCContext &context = Out.getContext(); + const MCSymbolRefExpr *expr = + MCSymbolRefExpr::Create( + context.GetOrCreateSymbol(StringRef("___tls_get_addr")), + MCSymbolRefExpr::VK_PLT, context); + CALLInst.addOperand(MCOperand::CreateExpr(expr)); + Out.EmitInstruction(CALLInst); + Out.EmitBundleUnlock(); +} + + +static void EmitREST(const MCInst &Inst, unsigned Reg32, + bool IsMem, MCStreamer &Out) { + unsigned Reg64 = getX86SubSuperRegister_(Reg32, MVT::i64); + Out.EmitBundleLock(); + if (!IsMem) { + EmitMoveRegReg(false, Reg32, Inst.getOperand(0).getReg(), Out); + } else { + unsigned IndexOpPosition; + MCInst SandboxedInst = Inst; + if (SandboxMemoryRef(&SandboxedInst, &IndexOpPosition)) { + HandleMemoryRefTruncation(&SandboxedInst, IndexOpPosition, Out); + ShortenMemoryRef(&SandboxedInst, IndexOpPosition); + } + EmitLoad(false, + Reg32, + SandboxedInst.getOperand(0).getReg(), // BaseReg + SandboxedInst.getOperand(1).getImm(), // Scale + SandboxedInst.getOperand(2).getReg(), // IndexReg + SandboxedInst.getOperand(3).getImm(), // Offset + SandboxedInst.getOperand(4).getReg(), // SegmentReg + Out); + } + + EmitRegFix(Reg64, Out); + Out.EmitBundleUnlock(); +} + + +namespace llvm { +// CustomExpandInstNaClX86 - +// If Inst is a NaCl pseudo instruction, emits the substitute +// expansion to the MCStreamer and returns true. 
+// Otherwise, returns false. +// +// NOTE: Each time this function calls Out.EmitInstruction(), it will be +// called again recursively to rewrite the new instruction being emitted. +// Care must be taken to ensure that this does not result in an infinite +// loop. Also, global state must be managed carefully so that it is +// consistent during recursive calls. +// +// We need global state to keep track of the explicit prefix (PREFIX_*) +// instructions. Unfortunately, the assembly parser prefers to generate +// these instead of combined instructions. At this time, having only +// one explicit prefix is supported. +bool CustomExpandInstNaClX86(const MCInst &Inst, MCStreamer &Out) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + // If we are emitting to .s, just emit all pseudo-instructions directly. + if (Out.hasRawTextSupport()) { + return false; + } + unsigned Opc = Inst.getOpcode(); + DEBUG(dbgs() << "CustomExpandInstNaClX86("; Inst.dump(); dbgs() << ")\n"); + switch (Opc) { + case X86::LOCK_PREFIX: + case X86::REP_PREFIX: + case X86::REPNE_PREFIX: + case X86::REX64_PREFIX: + // Ugly hack because LLVM AsmParser is not smart enough to combine + // prefixes back into the instruction they modify. + if (PrefixPass) { + PrefixPass = false; + PrefixSaved = 0; + return false; + } + assert(PrefixSaved == 0); + PrefixSaved = Opc; + return true; + case X86::NACL_TRAP32: + assert(PrefixSaved == 0); + EmitTrap(false, Out); + return true; + case X86::NACL_TRAP64: + assert(PrefixSaved == 0); + EmitTrap(true, Out); + return true; + case X86::NACL_CALL32d: + assert(PrefixSaved == 0); + EmitDirectCall(Inst.getOperand(0), false, Out); + return true; + case X86::NACL_CALL64d: + assert(PrefixSaved == 0); + EmitDirectCall(Inst.getOperand(0), true, Out); + return true; + case X86::NACL_CALL32r: + assert(PrefixSaved == 0); + EmitIndirectBranch(Inst.getOperand(0), false, true, Out); + return true; + case X86::NACL_CALL64r: + assert(PrefixSaved == 0); + EmitIndirectBranch(Inst.getOperand(0), true, true, Out); + return true; + case X86::NACL_JMP32r: + assert(PrefixSaved == 0); + EmitIndirectBranch(Inst.getOperand(0), false, false, Out); + return true; + case X86::NACL_TLS_addr32: + assert(PrefixSaved == 0); + EmitTLSAddr32(Inst, Out); + return true; + case X86::NACL_JMP64r: + case X86::NACL_JMP64z: + assert(PrefixSaved == 0); + EmitIndirectBranch(Inst.getOperand(0), true, false, Out); + return true; + case X86::NACL_RET32: + assert(PrefixSaved == 0); + EmitRet(NULL, false, Out); + return true; + case X86::NACL_RET64: + assert(PrefixSaved == 0); + EmitRet(NULL, true, Out); + return true; + case X86::NACL_RETI32: + assert(PrefixSaved == 0); + EmitRet(&Inst.getOperand(0), false, Out); + return true; + case X86::NACL_ASPi8: + assert(PrefixSaved == 0); + EmitSPArith(X86::ADD32ri8, Inst.getOperand(0), Out); + return true; + case X86::NACL_ASPi32: + assert(PrefixSaved == 0); + EmitSPArith(X86::ADD32ri, Inst.getOperand(0), Out); + return true; + case X86::NACL_SSPi8: + assert(PrefixSaved == 0); + EmitSPArith(X86::SUB32ri8, Inst.getOperand(0), Out); + return true; + case X86::NACL_SSPi32: + assert(PrefixSaved == 0); + EmitSPArith(X86::SUB32ri, Inst.getOperand(0), Out); + return true; + case X86::NACL_ANDSPi32: + assert(PrefixSaved == 0); + EmitSPArith(X86::AND32ri, Inst.getOperand(0), Out); + return true; + case X86::NACL_SPADJi32: + assert(PrefixSaved == 0); + EmitSPAdj(Inst.getOperand(0), Out); + return true; + case X86::NACL_RESTBPm: + assert(PrefixSaved == 0); + EmitREST(Inst, X86::EBP, true, Out); + 
return true; + case X86::NACL_RESTBPr: + case X86::NACL_RESTBPrz: + assert(PrefixSaved == 0); + EmitREST(Inst, X86::EBP, false, Out); + return true; + case X86::NACL_RESTSPm: + assert(PrefixSaved == 0); + EmitREST(Inst, X86::ESP, true, Out); + return true; + case X86::NACL_RESTSPr: + case X86::NACL_RESTSPrz: + assert(PrefixSaved == 0); + EmitREST(Inst, X86::ESP, false, Out); + return true; + } + + unsigned IndexOpPosition; + MCInst SandboxedInst = Inst; + if (SandboxMemoryRef(&SandboxedInst, &IndexOpPosition)) { + unsigned PrefixLocal = PrefixSaved; + PrefixSaved = 0; + + if (PrefixLocal || !UseZeroBasedSandbox) + Out.EmitBundleLock(); + + HandleMemoryRefTruncation(&SandboxedInst, IndexOpPosition, Out); + ShortenMemoryRef(&SandboxedInst, IndexOpPosition); + + if (PrefixLocal) + EmitPrefix(PrefixLocal, Out); + Out.EmitInstruction(SandboxedInst); + + if (PrefixLocal || !UseZeroBasedSandbox) + Out.EmitBundleUnlock(); + return true; + } + + if (PrefixSaved) { + unsigned PrefixLocal = PrefixSaved; + PrefixSaved = 0; + EmitPrefix(PrefixLocal, Out); + } + return false; +} + +} // namespace llvm + + + + +// @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +// +// This is an exact copy of getX86SubSuperRegister from X86RegisterInfo.h +// We cannot use the original because it is part of libLLVMX86CodeGen, +// which cannot be a dependency of this module (libLLVMX86Desc). +// +// However, in all likelyhood, the real getX86SubSuperRegister will +// eventually be moved to MCTargetDesc, and then this copy can be +// removed. + +namespace { +unsigned getX86SubSuperRegister_(unsigned Reg, EVT VT, bool High) { + switch (VT.getSimpleVT().SimpleTy) { + default: return Reg; + case MVT::i8: + if (High) { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AH; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DH; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CH; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BH; + } + } else { + switch (Reg) { + default: return 0; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AL; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DL; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CL; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BL; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SIL; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DIL; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BPL; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SPL; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8B; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9B; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10B; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11B; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12B; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13B; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14B; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15B; + 
} + } + case MVT::i16: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::AX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::DX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::CX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::BX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::SI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::DI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::BP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::SP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8W; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9W; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10W; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11W; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12W; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13W; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14W; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15W; + } + case MVT::i32: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::EAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::EDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::ECX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::EBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::ESI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::EDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::EBP; + case X86::SPL: case X86::SP: case X86::ESP: case X86::RSP: + return X86::ESP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8D; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9D; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10D; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11D; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12D; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13D; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14D; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15D; + } + case MVT::i64: + switch (Reg) { + default: return Reg; + case X86::AH: case X86::AL: case X86::AX: case X86::EAX: case X86::RAX: + return X86::RAX; + case X86::DH: case X86::DL: case X86::DX: case X86::EDX: case X86::RDX: + return X86::RDX; + case X86::CH: case X86::CL: case X86::CX: case X86::ECX: case X86::RCX: + return X86::RCX; + case X86::BH: case X86::BL: case X86::BX: case X86::EBX: case X86::RBX: + return X86::RBX; + case X86::SIL: case X86::SI: case X86::ESI: case X86::RSI: + return X86::RSI; + case X86::DIL: case X86::DI: case X86::EDI: case X86::RDI: + return X86::RDI; + case X86::BPL: case X86::BP: case X86::EBP: case X86::RBP: + return X86::RBP; + case X86::SPL: case X86::SP: case 
X86::ESP: case X86::RSP: + return X86::RSP; + case X86::R8B: case X86::R8W: case X86::R8D: case X86::R8: + return X86::R8; + case X86::R9B: case X86::R9W: case X86::R9D: case X86::R9: + return X86::R9; + case X86::R10B: case X86::R10W: case X86::R10D: case X86::R10: + return X86::R10; + case X86::R11B: case X86::R11W: case X86::R11D: case X86::R11: + return X86::R11; + case X86::R12B: case X86::R12W: case X86::R12D: case X86::R12: + return X86::R12; + case X86::R13B: case X86::R13W: case X86::R13D: case X86::R13: + return X86::R13; + case X86::R14B: case X86::R14W: case X86::R14D: case X86::R14: + return X86::R14; + case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: + return X86::R15; + } + } + + return Reg; +} + +// This is a copy of DemoteRegTo32 from X86NaClRewritePass.cpp. +// We cannot use the original because it uses part of libLLVMX86CodeGen, +// which cannot be a dependency of this module (libLLVMX86Desc). +// Note that this function calls getX86SubSuperRegister_, which is +// also a copied function for the same reason. + +unsigned DemoteRegTo32_(unsigned RegIn) { + if (RegIn == 0) + return 0; + unsigned RegOut = getX86SubSuperRegister_(RegIn, MVT::i32, false); + assert(RegOut != 0); + return RegOut; +} +} //namespace +// @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ diff --git a/lib/Target/X86/MCTargetDesc/X86MCNaCl.h b/lib/Target/X86/MCTargetDesc/X86MCNaCl.h new file mode 100644 index 0000000000..01b400d4d9 --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86MCNaCl.h @@ -0,0 +1,19 @@ +//===-- X86MCNaCl.h - Prototype for CustomExpandInstNaClX86 ---*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef X86MCNACL_H +#define X86MCNACL_H + +namespace llvm { + class MCInst; + class MCStreamer; + bool CustomExpandInstNaClX86(const MCInst &Inst, MCStreamer &Out); +} + +#endif diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index dce5b4d2b0..cbdfeaedbe 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -47,6 +47,11 @@ FunctionPass *createCleanupLocalDynamicTLSPass(); /// FunctionPass *createX86FloatingPointStackifierPass(); +// @LOCALMOD-BEGIN - Creates a pass to make instructions follow NaCl SFI rules. +FunctionPass* createX86NaClRewritePass(); +FunctionPass* createX86NaClRewriteFinalPass(); +// @LOCALMOD-END + /// createX86IssueVZeroUpperPass - This pass inserts AVX vzeroupper instructions /// before each call to avoid transition penalty between functions encoded with /// AVX and SSE. diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index fdd712520b..9a63060c90 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -71,6 +71,35 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { return false; } +// @LOCALMOD-BEGIN +bool X86AsmPrinter::UseReadOnlyJumpTables() const { + return Subtarget->isTargetNaCl(); +} + +unsigned X86AsmPrinter::GetTargetBasicBlockAlign() const { + if (Subtarget->isTargetNaCl()) + return 5; + return 0; +} + +unsigned X86AsmPrinter::GetTargetLabelAlign(const MachineInstr *MI) const { + if (Subtarget->isTargetNaCl()) { + switch (MI->getOpcode()) { + default: return 0; + // These labels may indicate an indirect entry point that is + // externally reachable and hence must be bundle aligned. 
+ // Note: these labels appear to be always at basic block beginnings + // so it may be possible to simply set the MBB alignment. + // However, it is unclear whether this always holds. + case TargetOpcode::EH_LABEL: + case TargetOpcode::GC_LABEL: + return 5; + } + } + return 0; +} +// @LOCALMOD-END + /// printSymbolOperand - Print a raw symbol reference operand. This handles /// jump tables, constant pools, global address and external symbols, all of /// which print to a label with various suffixes for relocation types etc. diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index 61eb14e036..b166a531e1 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -46,6 +46,12 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { virtual void EmitInstruction(const MachineInstr *MI) LLVM_OVERRIDE; + virtual bool UseReadOnlyJumpTables() const; // @LOCALMOD + + virtual unsigned GetTargetBasicBlockAlign() const; // @LOCLAMOD + + virtual unsigned GetTargetLabelAlign(const MachineInstr *MI) const;//@LOCALMOD + void printSymbolOperand(const MachineOperand &MO, raw_ostream &O); // These methods are used by the tablegen'erated instruction printer. diff --git a/lib/Target/X86/X86CodeEmitter.cpp b/lib/Target/X86/X86CodeEmitter.cpp index 44db563818..ee6408b403 100644 --- a/lib/Target/X86/X86CodeEmitter.cpp +++ b/lib/Target/X86/X86CodeEmitter.cpp @@ -12,7 +12,7 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "x86-emitter" +#define DEBUG_TYPE "jit" #include "X86InstrInfo.h" #include "X86JITInfo.h" #include "X86Subtarget.h" @@ -34,6 +34,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetOptions.h" +#include "llvm/Target/TargetOpcodes.h" // @LOCALMOD using namespace llvm; STATISTIC(NumEmitted, "Number of machine instructions emitted"); @@ -1120,6 +1121,28 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, unsigned Opcode = Desc->Opcode; + // @LOCALMOD-START + if (TM.getSubtargetImpl()->isTargetNaCl()) { + switch (Opcode) { + case TargetOpcode::BUNDLE_LOCK: + MCE.beginBundleLock(); + return; + case TargetOpcode::BUNDLE_UNLOCK: + MCE.endBundleLock(); + return; + case TargetOpcode::BUNDLE_ALIGN_START: + MCE.alignToBundleBeginning(); + return; + case TargetOpcode::BUNDLE_ALIGN_END: + MCE.alignToBundleEnd(); + return; + } + // In addition to groups of instructions, each instruction must itself be + // bundle-locked because they are emitted with multiple calls into MCE + MCE.beginBundleLock(); + } + // @LOCALMOD-END + // If this is a two-address instruction, skip one of the register operands. unsigned NumOps = Desc->getNumOperands(); unsigned CurOp = 0; @@ -1479,5 +1502,11 @@ void Emitter<CodeEmitter>::emitInstruction(MachineInstr &MI, llvm_unreachable(0); } + // @LOCALMOD-START + if (TM.getSubtargetImpl()->isTargetNaCl()) { + MCE.endBundleLock(); + } + // @LOCALMOD-END + MCE.processDebugLoc(MI.getDebugLoc(), false); } diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index d4627c74cb..ad652366ad 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -580,6 +580,20 @@ bool X86FastISel::X86SelectAddress(const Value *V, X86AddressMode &AM) { // If all else fails, try to materialize the value in a register. 
if (!AM.GV || !Subtarget->isPICStyleRIPRel()) { + // @LOCALMOD-START + if (Subtarget->isTargetNaCl()) { + // We can materialize into a memory address only if + // no registers have been defined (and hence, we + // aren't modifying an existing memory reference). + if ((AM.Base.Reg == 0) && (AM.IndexReg == 0)) { + // Put into index register so that the NaCl rewrite pass will + // convert this to a 64-bit address. + AM.IndexReg = getRegForValue(V); + return AM.IndexReg != 0; + } + return false; + } + // @LOCALMOD-END if (AM.Base.Reg == 0) { AM.Base.Reg = getRegForValue(V); return AM.Base.Reg != 0; @@ -818,9 +832,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { unsigned Reg = X86MFInfo->getSRetReturnReg(); assert(Reg && "SRetReturnReg should have been set in LowerFormalArguments()!"); + // @LOCALMOD-BEGIN -- Ensure that the register classes match. + // At this point, SRetReturnReg is EDI, because PointerTy() for NaCl + // is i32. We then copy to EAX instead of RAX. Alternatively, we could + // have zero-extended EDI to RDI then copy to RAX, but this has a smaller + // encoding (2 bytes vs 3 bytes). + unsigned CopyTo = Subtarget->has64BitPointers() ? X86::RAX : X86::EAX; BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(TargetOpcode::COPY), - X86::RAX).addReg(Reg); - MRI.addLiveOut(X86::RAX); + CopyTo).addReg(Reg); + MRI.addLiveOut(CopyTo); + // @LOCALMOD-END } // Now emit the RET. @@ -1832,10 +1853,21 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { if (CalleeOp) { // Register-indirect call. unsigned CallOpc; - if (Subtarget->is64Bit()) - CallOpc = X86::CALL64r; - else - CallOpc = X86::CALL32r; + // @LOCALMOD-BEGIN + if (Subtarget->is64Bit()) { + if (Subtarget->isTargetNaCl()) { + CallOpc = X86::NACL_CG_CALL64r; + } else { + CallOpc = X86::CALL64r; + } + } else { + if (Subtarget->isTargetNaCl()) { + CallOpc = X86::NACL_CG_CALL32r; + } else { + CallOpc = X86::CALL32r; + } + } + // @LOCALMOD-END MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(CallOpc)) .addReg(CalleeOp); @@ -1843,10 +1875,21 @@ bool X86FastISel::DoSelectCall(const Instruction *I, const char *MemIntName) { // Direct call. assert(GV && "Not a direct call"); unsigned CallOpc; - if (Subtarget->is64Bit()) - CallOpc = X86::CALL64pcrel32; - else - CallOpc = X86::CALLpcrel32; + // @LOCALMOD-BEGIN + if (Subtarget->is64Bit()) { + if (Subtarget->isTargetNaCl()) { + CallOpc = X86::NACL_CG_CALL64pcrel32; + } else { + CallOpc = X86::CALL64pcrel32; + } + } else { + if (Subtarget->isTargetNaCl()) { + CallOpc = X86::NACL_CG_CALLpcrel32; + } else { + CallOpc = X86::CALLpcrel32; + } + } + // @LOCALMOD-END // See if we need any target-specific flags on the GV operand. 
unsigned char OpFlags = 0; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index 369589d469..5bfb5054b0 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -114,6 +114,8 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, case X86::TCRETURNmi: case X86::TCRETURNdi64: case X86::TCRETURNri64: + case X86::NACL_CG_TCRETURNdi64: // @LOCALMOD + case X86::NACL_CG_TCRETURNri64: // @LOCALMOD case X86::TCRETURNmi64: case X86::EH_RETURN: case X86::EH_RETURN64: { @@ -994,6 +996,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, case X86::TCRETURNdi64: case X86::TCRETURNri64: case X86::TCRETURNmi64: + case X86::NACL_CG_TCRETURNdi64: // @LOCALMOD + case X86::NACL_CG_TCRETURNri64: // @LOCALMOD case X86::EH_RETURN: case X86::EH_RETURN64: break; // These are ok @@ -1086,6 +1090,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } else if (RetOpcode == X86::TCRETURNri || RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNri64 || RetOpcode == X86::TCRETURNdi64 || + RetOpcode == X86::NACL_CG_TCRETURNri64 || // @LOCALMOD + RetOpcode == X86::NACL_CG_TCRETURNdi64 || // @LOCALMOD RetOpcode == X86::TCRETURNmi64) { bool isMem = RetOpcode == X86::TCRETURNmi || RetOpcode == X86::TCRETURNmi64; // Tail call return: adjust the stack pointer and jump to callee. @@ -1111,10 +1117,22 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } // Jump to label or value in register. - if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64) { + if (RetOpcode == X86::TCRETURNdi || RetOpcode == X86::TCRETURNdi64 || + RetOpcode == X86::NACL_CG_TCRETURNdi64) { // @LOCALMOD + // @LOCALMOD-BEGIN + unsigned TailJmpOpc; + switch (RetOpcode) { + case X86::TCRETURNdi : TailJmpOpc = X86::TAILJMPd; break; + case X86::TCRETURNdi64: TailJmpOpc = X86::TAILJMPd64; break; + case X86::NACL_CG_TCRETURNdi64: + TailJmpOpc = X86::NACL_CG_TAILJMPd64; + break; + default: llvm_unreachable("Unexpected return opcode"); + } + // @LOCALMOD-END MachineInstrBuilder MIB = - BuildMI(MBB, MBBI, DL, TII.get((RetOpcode == X86::TCRETURNdi) - ? X86::TAILJMPd : X86::TAILJMPd64)); + BuildMI(MBB, MBBI, DL, TII.get(TailJmpOpc)); // @LOCALMOD + if (JumpTarget.isGlobal()) MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset(), JumpTarget.getTargetFlags()); @@ -1132,6 +1150,11 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } else if (RetOpcode == X86::TCRETURNri64) { BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr64)). addReg(JumpTarget.getReg(), RegState::Kill); +// @LOCALMOD-BEGIN + } else if (RetOpcode == X86::NACL_CG_TCRETURNri64) { + BuildMI(MBB, MBBI, DL, TII.get(X86::NACL_CG_TAILJMPr64)). + addReg(JumpTarget.getReg(), RegState::Kill); +// @LOCALMOD-END } else { BuildMI(MBB, MBBI, DL, TII.get(X86::TAILJMPr)). addReg(JumpTarget.getReg(), RegState::Kill); diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index dc515dc39c..d46c41f508 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -29,7 +29,8 @@ public: explicit X86FrameLowering(const X86TargetMachine &tm, const X86Subtarget &sti) : TargetFrameLowering(StackGrowsDown, sti.getStackAlignment(), - (sti.is64Bit() ? -8 : -4)), + (sti.is64Bit() ? -8 : -4), + 1, (sti.is64Bit() ? 
8 : 4)), // @LOCALMOD TM(tm), STI(sti) { } diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 99f557417b..42134256e3 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -41,6 +41,7 @@ using namespace llvm; STATISTIC(NumLoadMoved, "Number of loads moved below TokenFactor"); + //===----------------------------------------------------------------------===// // Pattern Matcher Implementation //===----------------------------------------------------------------------===// @@ -214,6 +215,10 @@ namespace { SDValue &Index, SDValue &Disp, SDValue &Segment, SDValue &NodeWithChain); + // @LOCALMOD-BEGIN + void LegalizeAddressingModeForNaCl(SDValue N, X86ISelAddressMode &AM); + // @LOCALMOD-END + bool TryFoldLoad(SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, @@ -231,8 +236,9 @@ namespace { inline void getAddressOperands(X86ISelAddressMode &AM, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { + EVT MemOpVT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; // @LOCALMOD Base = (AM.BaseType == X86ISelAddressMode::FrameIndexBase) ? - CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, TLI.getPointerTy()) : + CurDAG->getTargetFrameIndex(AM.Base_FrameIndex, MemOpVT) : // @LOCALMOD AM.Base_Reg; Scale = getI8Imm(AM.Scale); Index = AM.IndexReg; @@ -292,6 +298,15 @@ namespace { const X86InstrInfo *getInstrInfo() { return getTargetMachine().getInstrInfo(); } + + // @LOCALMOD-START + bool selectingMemOp; + bool RestrictUseOfBaseReg() { + return selectingMemOp && Subtarget->isTargetNaCl64(); + } + // @LOCALMOD-END + + }; } @@ -442,6 +457,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { SDNode *N = I++; // Preincrement iterator to avoid invalidation issues. if (OptLevel != CodeGenOpt::None && + !Subtarget->isTargetNaCl() && // @LOCALMOD: We can't fold load/call (N->getOpcode() == X86ISD::CALL || (N->getOpcode() == X86ISD::TC_RETURN && // Only does this if load can be foled into TC_RETURN. @@ -598,6 +614,14 @@ bool X86DAGToDAGISel::FoldOffsetIntoAddress(uint64_t Offset, bool X86DAGToDAGISel::MatchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){ SDValue Address = N->getOperand(1); + // @LOCALMOD-START + // Disable this tls access optimization in Native Client, since + // gs:0 (or fs:0 on X86-64) does not exactly contain its own address. + if (Subtarget->isTargetNaCl()) { + return true; + } + // @LOCALMOD-END + // load gs:0 -> GS segment register. // load fs:0 -> FS segment register. // @@ -722,6 +746,8 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { if (MatchAddressRecursively(N, AM, 0)) return true; + + if (!RestrictUseOfBaseReg()) { // @LOCALMOD // Post-processing: Convert lea(,%reg,2) to lea(%reg,%reg), which has // a smaller encoding and avoids a scaled-index. if (AM.Scale == 2 && @@ -730,7 +756,8 @@ bool X86DAGToDAGISel::MatchAddress(SDValue N, X86ISelAddressMode &AM) { AM.Base_Reg = AM.IndexReg; AM.Scale = 1; } - + } // @LOCALMOD + // Post-processing: Convert foo to foo(%rip), even in non-PIC mode, // because it has a smaller encoding. // TODO: Which other code models can use this? 
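(Editorial note, an illustrative sketch rather than part of the patch: the
RestrictUseOfBaseReg() guards above keep the base-register slot of a memory
operand free when selecting NaCl x86-64 memory operands. The later rewrite needs
that slot for the sandbox base in %r15 and can only zero-extend the index, so a
plain 32-bit pointer dereference becomes roughly:
    movl %esi, %esi              # clear the upper 32 bits of the untrusted index
    movl (%r15,%rsi,1), %eax     # %r15 supplies the sandbox base
)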
@@ -1077,6 +1104,8 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, // FALL THROUGH case ISD::MUL: case X86ISD::MUL_IMM: + // @LOCALMOD + if (!RestrictUseOfBaseReg()) { // X*[3,5,9] -> X+X*[2,4,8] if (AM.BaseType == X86ISelAddressMode::RegBase && AM.Base_Reg.getNode() == 0 && @@ -1109,6 +1138,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return false; } } + } // @LOCALMOD break; case ISD::SUB: { @@ -1195,6 +1225,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, return false; AM = Backup; + if (!RestrictUseOfBaseReg()) { // @LOCALMOD // If we couldn't fold both operands into the address at the same time, // see if we can just put each operand into a register and fold at least // the add. @@ -1207,6 +1238,7 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, AM.Scale = 1; return false; } + } // @LOCALMOD N = Handle.getValue(); break; } @@ -1266,7 +1298,15 @@ bool X86DAGToDAGISel::MatchAddressRecursively(SDValue N, X86ISelAddressMode &AM, /// MatchAddressBase - Helper for MatchAddress. Add the specified node to the /// specified addressing mode without any further recursion. bool X86DAGToDAGISel::MatchAddressBase(SDValue N, X86ISelAddressMode &AM) { - // Is the base register already occupied? + if (RestrictUseOfBaseReg()) { // @LOCALMOD + if (AM.IndexReg.getNode() == 0) { + AM.IndexReg = N; + AM.Scale = 1; + return false; + } + return true; + } // @LOCALMOD +// Is the base register already occupied? if (AM.BaseType != X86ISelAddressMode::RegBase || AM.Base_Reg.getNode()) { // If so, check to see if the scale index register is set. if (AM.IndexReg.getNode() == 0) { @@ -1296,6 +1336,8 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; + // @LOCALMOD + selectingMemOp = true; if (Parent && // This list of opcodes are all the nodes that have an "addr:$ptr" operand @@ -1317,7 +1359,14 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, if (MatchAddress(N, AM)) return false; - EVT VT = N.getValueType(); + // @LOCALMOD-START + if (Subtarget->isTargetNaCl64()) { + LegalizeAddressingModeForNaCl(N, AM); + } + // @LOCALMOD-END + + EVT VT = Subtarget->is64Bit() ? MVT::i64 : MVT::i32; // @LOCALMOD + if (AM.BaseType == X86ISelAddressMode::RegBase) { if (!AM.Base_Reg.getNode()) AM.Base_Reg = CurDAG->getRegister(0, VT); @@ -1327,6 +1376,32 @@ bool X86DAGToDAGISel::SelectAddr(SDNode *Parent, SDValue N, SDValue &Base, AM.IndexReg = CurDAG->getRegister(0, VT); getAddressOperands(AM, Base, Scale, Index, Disp, Segment); + + // @LOCALMOD-BEGIN + // For Native Client 64-bit, zero-extend 32-bit pointers + // to 64-bits for memory operations. Most of the time, this + // won't generate any additional instructions because the backend + // knows that operations on 32-bit registers implicitly zero-extends. + // If we don't do this, there are a few corner cases where LLVM might + // assume the upper bits won't be modified or used, but since we + // always clear the upper bits, this is not a good assumption. 
+ // http://code.google.com/p/nativeclient/issues/detail?id=1564 + if (Subtarget->isTargetNaCl64()) { + assert(Base.getValueType() == MVT::i64 && "Unexpected base operand size"); + + if (Index.getValueType() != MVT::i64) { + Index = CurDAG->getZExtOrTrunc(Index, Index.getDebugLoc(), MVT::i64); + // Insert the new node into the topological ordering. + if (Parent && + (Index->getNodeId() == -1 || + Index->getNodeId() > Parent->getNodeId())) { + CurDAG->RepositionNode(Parent, Index.getNode()); + Index->setNodeId(Parent->getNodeId()); + } + } + } + // @LOCALMOD-END + return true; } @@ -1389,6 +1464,8 @@ bool X86DAGToDAGISel::SelectLEAAddr(SDValue N, SDValue Copy = AM.Segment; SDValue T = CurDAG->getRegister(0, MVT::i32); AM.Segment = T; + // @LOCALMOD + selectingMemOp = false; if (MatchAddress(N, AM)) return false; assert (T == AM.Segment); @@ -1452,7 +1529,8 @@ bool X86DAGToDAGISel::SelectTLSADDRAddr(SDValue N, SDValue &Base, AM.Base_Reg = CurDAG->getRegister(0, N.getValueType()); AM.SymbolFlags = GA->getTargetFlags(); - if (N.getValueType() == MVT::i32) { + if (N.getValueType() == MVT::i32 && + !Subtarget->isTargetNaCl64()) { // @LOCALMOD AM.Scale = 1; AM.IndexReg = CurDAG->getRegister(X86::EBX, MVT::i32); } else { @@ -1477,6 +1555,141 @@ bool X86DAGToDAGISel::TryFoldLoad(SDNode *P, SDValue N, N.getOperand(1), Base, Scale, Index, Disp, Segment); } +// @LOCALMOD-BEGIN +// LegalizeAddressingModeForNaCl - NaCl specific addressing fixes. This ensures +// two addressing mode invariants. +// +// case 1. Addressing using only a displacement (constant address references) +// is only legal when the displacement is positive. This is because, when +// later we replace +// movl 0xffffffff, %eax +// by +// movl 0xffffffff(%r15), %eax +// the displacement becomes a negative offset from %r15, making this a +// reference to the guard region below %r15 rather than to %r15 + 4GB - 1, +// as the programmer expected. To handle these cases we pull negative +// displacements out whenever there is no base or index register in the +// addressing mode. I.e., the above becomes +// movl $0xffffffff, %ebx +// movl %rbx, %rbx +// movl (%r15, %rbx, 1), %eax +// +// case 2. Because NaCl needs to zero the top 32-bits of the index, we can't +// allow the index register to be negative. However, if we are using a base +// frame index, global address or the constant pool, and AM.Disp > 0, then +// negative values of "index" may be expected to legally occur. +// To avoid this, we fold the displacement (and scale) back into the +// index. This results in a LEA before the current instruction. +// Unfortunately, this may add a requirement for an additional register. +// +// For example, this sandboxed code is broken if %eax is negative: +// +// movl %eax,%eax +// incl -30(%rbp,%rax,4) +// +// Instead, we now generate: +// leal -30(%rbp,%rax,4), %tmp +// movl %tmp,%tmp +// incl (%r15,%tmp,1) +// +// TODO(espindola): This might not be complete since the matcher can select +// any dag node to go in the index. This is also not how the rest of the +// matcher logic works, if the matcher selects something, it must be +// valid and not depend on further patching. A more desirable fix is +// probably to update the matching code to avoid assigning a register +// to a value that we cannot prove is positive. +void X86DAGToDAGISel::LegalizeAddressingModeForNaCl(SDValue N, + X86ISelAddressMode &AM) { + + + // RIP-relative addressing is always fine. 
+ if (AM.isRIPRelative()) + return; + + DebugLoc dl = N->getDebugLoc(); + // Case 1 above: + if (!AM.hasBaseOrIndexReg() && !AM.hasSymbolicDisplacement() && AM.Disp < 0) { + SDValue Imm = CurDAG->getTargetConstant(AM.Disp, MVT::i32); + SDValue MovNode = + SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, MVT::i32, Imm), 0); + AM.IndexReg = MovNode; + AM.Disp = 0; + InsertDAGNode(*CurDAG, N, MovNode); + return; + } + + // MatchAddress wants to use the base register when there's only + // one register and no scale. We need to use the index register instead. + if (AM.BaseType == X86ISelAddressMode::RegBase && + AM.Base_Reg.getNode() && + !AM.IndexReg.getNode()) { + AM.IndexReg = AM.Base_Reg; + AM.setBaseReg(SDValue()); + } + + // Case 2 above comprises two sub-cases: + // sub-case 1: Prevent negative indexes + bool NeedsFixing1 = + (AM.BaseType == X86ISelAddressMode::FrameIndexBase || AM.GV || AM.CP) && + AM.IndexReg.getNode() && + AM.Disp > 0; + + // sub-case 2: Both index and base registers are being used + bool NeedsFixing2 = + (AM.BaseType == X86ISelAddressMode::RegBase) && + AM.Base_Reg.getNode() && + AM.IndexReg.getNode(); + + if (!NeedsFixing1 && !NeedsFixing2) + return; + + static const unsigned LogTable[] = { ~0, 0, 1, ~0, 2, ~0, ~0, ~0, 3 }; + assert(AM.Scale < sizeof(LogTable)/sizeof(LogTable[0])); + unsigned ScaleLog = LogTable[AM.Scale]; + assert(ScaleLog <= 3); + SmallVector<SDNode*, 8> NewNodes; + + SDValue NewIndex = AM.IndexReg; + if (ScaleLog > 0) { + SDValue ShlCount = CurDAG->getConstant(ScaleLog, MVT::i8); + NewNodes.push_back(ShlCount.getNode()); + SDValue ShlNode = CurDAG->getNode(ISD::SHL, dl, N.getValueType(), + NewIndex, ShlCount); + NewNodes.push_back(ShlNode.getNode()); + NewIndex = ShlNode; + } + if (AM.Disp > 0) { + SDValue DispNode = CurDAG->getConstant(AM.Disp, N.getValueType()); + NewNodes.push_back(DispNode.getNode()); + + SDValue AddNode = CurDAG->getNode(ISD::ADD, dl, N.getValueType(), + NewIndex, DispNode); + NewNodes.push_back(AddNode.getNode()); + NewIndex = AddNode; + } + + if (NeedsFixing2) { + SDValue AddBase = CurDAG->getNode(ISD::ADD, dl, N.getValueType(), + NewIndex, AM.Base_Reg); + NewNodes.push_back(AddBase.getNode()); + NewIndex = AddBase; + AM.setBaseReg(SDValue()); + } + AM.Disp = 0; + AM.Scale = 1; + AM.IndexReg = NewIndex; + + // Insert the new nodes into the topological ordering. + for (unsigned i=0; i < NewNodes.size(); i++) { + if (NewNodes[i]->getNodeId() == -1 || + NewNodes[i]->getNodeId() > N.getNode()->getNodeId()) { + CurDAG->RepositionNode(N.getNode(), NewNodes[i]); + NewNodes[i]->setNodeId(N.getNode()->getNodeId()); + } + } +} +// @LOCALMOD-END + /// getGlobalBaseReg - Return an SDNode that returns the value of /// the global base register. Output instructions required to /// initialize the global base register, if necessary. 
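The long @LOCALMOD comment above states the two NaCl x86-64 addressing invariants in terms of rewritten assembly. The same rule can be stated arithmetically: every memory access must resolve to the sandbox base in %r15 plus a value that is computed in 32 bits and then zero-extended, which is why negative displacements and possibly-negative indexes are folded back into the 32-bit computation first. Below is a minimal standalone sketch of that rule; the helper name nacl_effective_address is purely illustrative and not an LLVM API.

#include <cassert>
#include <cstdint>

// Model of the NaCl x86-64 effective-address rule enforced above:
// the untrusted part of the address is computed in 32 bits and
// zero-extended, then added to the sandbox base held in %r15.
static uint64_t nacl_effective_address(uint64_t r15, uint32_t base,
                                       uint32_t index, uint32_t scale,
                                       int32_t disp) {
  uint32_t untrusted = base + index * scale + static_cast<uint32_t>(disp);
  return r15 + static_cast<uint64_t>(untrusted);  // always a zero-extension
}

int main() {
  const uint64_t r15 = 0x7f0000000000ULL;  // hypothetical sandbox base
  // Case 1 from the comment: a "negative" displacement such as 0xffffffff
  // must resolve to r15 + 4GiB - 1, never to a negative offset below r15.
  assert(nacl_effective_address(r15, 0, 0, 1, -1) == r15 + 0xffffffffULL);
  return 0;
}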
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 1f729e3133..5610bb5ba3 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -146,6 +146,12 @@ static TargetLoweringObjectFile *createTLOF(X86TargetMachine &TM) { if (Subtarget->isTargetLinux()) return new X86LinuxTargetObjectFile(); + + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl()) + return new TargetLoweringObjectFileNaCl(); + // @LOCALMOD-END + if (Subtarget->isTargetELF()) return new TargetLoweringObjectFileELF(); if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) @@ -158,6 +164,9 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) Subtarget = &TM.getSubtarget<X86Subtarget>(); X86ScalarSSEf64 = Subtarget->hasSSE2(); X86ScalarSSEf32 = Subtarget->hasSSE1(); + // @LOCALMOD-START + X86StackPtr = Subtarget->has64BitPointers() ? X86::RSP : X86::ESP; + // @LOCALMOD-END RegInfo = TM.getRegisterInfo(); TD = getDataLayout(); @@ -179,7 +188,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setSchedulingPreference(Sched::ILP); else setSchedulingPreference(Sched::RegPressure); - setStackPointerRegisterToSaveRestore(RegInfo->getStackRegister()); + setStackPointerRegisterToSaveRestore(X86StackPtr); // @LOCALMOD // Bypass i32 with i8 on Atom when compiling with O2 if (Subtarget->hasSlowDivide() && TM.getOptLevel() >= CodeGenOpt::Default) @@ -542,7 +551,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::EHSELECTION, MVT::i64, Expand); setOperationAction(ISD::EXCEPTIONADDR, MVT::i32, Expand); setOperationAction(ISD::EHSELECTION, MVT::i32, Expand); - if (Subtarget->is64Bit()) { + if (Subtarget->has64BitPointers()) { setExceptionPointerRegister(X86::RAX); setExceptionSelectorRegister(X86::RDX); } else { @@ -573,13 +582,16 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand); if (Subtarget->isTargetCOFF() && !Subtarget->isTargetEnvMacho()) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + setOperationAction(ISD::DYNAMIC_STACKALLOC, + Subtarget->has64BitPointers() ? // @LOCALMOD MVT::i64 : MVT::i32, Custom); else if (TM.Options.EnableSegmentedStacks) - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + setOperationAction(ISD::DYNAMIC_STACKALLOC, + Subtarget->has64BitPointers() ? // @LOCALMOD MVT::i64 : MVT::i32, Custom); else - setOperationAction(ISD::DYNAMIC_STACKALLOC, Subtarget->is64Bit() ? + setOperationAction(ISD::DYNAMIC_STACKALLOC, + Subtarget->has64BitPointers() ? 
// @LOCALMOD MVT::i64 : MVT::i32, Expand); if (!TM.Options.UseSoftFloat && X86ScalarSSEf64) { @@ -1273,6 +1285,14 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setTargetDAGCombine(ISD::MUL); setTargetDAGCombine(ISD::XOR); + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl()) { + setOperationAction(ISD::NACL_TP_TLS_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::NACL_TP_TDB_OFFSET, MVT::i32, Custom); + setOperationAction(ISD::NACL_TARGET_ARCH, MVT::i32, Custom); + } + // @LOCALMOD-END + computeRegisterProperties(); // On Darwin, -Os means optimize for size without hurting performance, @@ -1617,7 +1637,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, "SRetReturnReg should have been set in LowerFormalArguments()."); SDValue Val = DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy()); - Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); + // @LOCALMOD-START + if (Subtarget->isTargetNaCl()) { + // NaCl 64 uses 32-bit pointers, so there might be some zero-ext needed. + SDValue Zext = DAG.getZExtOrTrunc(Val, dl, MVT::i64); + Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Zext, Flag); + } else { + Chain = DAG.getCopyToReg(Chain, dl, X86::RAX, Val, Flag); + } + // @LOCALMOD-END + Flag = Chain.getValue(1); // RAX now acts like a return value. @@ -1981,7 +2010,9 @@ X86TargetLowering::LowerFormalArguments(SDValue Chain, X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); unsigned Reg = FuncInfo->getSRetReturnReg(); if (!Reg) { - Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(MVT::i64)); + // @LOCALMOD + Reg = MF.getRegInfo().createVirtualRegister( + getRegClassFor(getPointerTy())); FuncInfo->setSRetReturnReg(Reg); } SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[0]); @@ -2350,7 +2381,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } else if (!IsSibcall && (!isTailCall || isByVal)) { assert(VA.isMemLoc()); if (StackPtr.getNode() == 0) - StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + StackPtr = DAG.getCopyFromReg(Chain, dl, X86StackPtr, // @LOCALMOD getPointerTy()); MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg, dl, DAG, VA, Flags)); @@ -2440,7 +2471,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Source = DAG.getIntPtrConstant(VA.getLocMemOffset()); if (StackPtr.getNode() == 0) StackPtr = DAG.getCopyFromReg(Chain, dl, - RegInfo->getStackRegister(), + X86StackPtr, // @LOCALMOD getPointerTy()); Source = DAG.getNode(ISD::ADD, dl, getPointerTy(), StackPtr, Source); @@ -3049,7 +3080,8 @@ SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const { FuncInfo->setRAIndex(ReturnAddrIndex); } - return DAG.getFrameIndex(ReturnAddrIndex, getPointerTy()); + return DAG.getFrameIndex(ReturnAddrIndex, // @LOCALMOD + Subtarget->is64Bit() ? 
MVT::i64 : MVT::i32); } @@ -7563,7 +7595,8 @@ X86TargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const { static SDValue GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, SDValue *InFlag, const EVT PtrVT, unsigned ReturnReg, - unsigned char OperandFlags, bool LocalDynamic = false) { + unsigned char OperandFlags, + unsigned Opcode = X86ISD::TLSADDR) { // @LOCALMOD MachineFrameInfo *MFI = DAG.getMachineFunction().getFrameInfo(); SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue); DebugLoc dl = GA->getDebugLoc(); @@ -7571,16 +7604,12 @@ GetTLSADDR(SelectionDAG &DAG, SDValue Chain, GlobalAddressSDNode *GA, GA->getValueType(0), GA->getOffset(), OperandFlags); - - X86ISD::NodeType CallType = LocalDynamic ? X86ISD::TLSBASEADDR - : X86ISD::TLSADDR; - if (InFlag) { SDValue Ops[] = { Chain, TGA, *InFlag }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 3); + Chain = DAG.getNode(Opcode, dl, NodeTys, Ops, 3); // @LOCALMOD } else { SDValue Ops[] = { Chain, TGA }; - Chain = DAG.getNode(CallType, dl, NodeTys, Ops, 2); + Chain = DAG.getNode(Opcode, dl, NodeTys, Ops, 2); // @LOCALMOD } // TLSADDR will be codegen'ed as call. Inform MFI that function has calls. @@ -7612,6 +7641,52 @@ LowerToTLSGeneralDynamicModel64(GlobalAddressSDNode *GA, SelectionDAG &DAG, X86::RAX, X86II::MO_TLSGD); } +// Lower ISD::GlobalTLSAddress using the "initial exec" or "local exec" model. +static SDValue +LowerToTLSExecCall(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT, TLSModel::Model model, bool is64Bit) { + + // See: http://code.google.com/p/nativeclient/issues/detail?id=1685 + unsigned char TargetFlag; + unsigned Opcode; + if (model == TLSModel::LocalExec) { + TargetFlag = is64Bit ? X86II::MO_TPOFF : X86II::MO_NTPOFF; + Opcode = X86ISD::TLSADDR_LE; + } else if (model == TLSModel::InitialExec) { + TargetFlag = is64Bit ? X86II::MO_GOTTPOFF : X86II::MO_INDNTPOFF; + Opcode = X86ISD::TLSADDR_IE; + } else { + llvm_unreachable("Unknown TLS model"); + } + + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, + X86::EAX, // PtrVT is 32-bit. + TargetFlag, Opcode); +} + +// @LOCALMOD-START +// Lower TLS accesses to a function call, rather than use segment registers. +// Lower ISD::GlobalTLSAddress for NaCl 64 bit. +static SDValue +LowerToTLSNaCl64(GlobalAddressSDNode *GA, SelectionDAG &DAG, + const EVT PtrVT, TLSModel::Model model) { + + // See: http://code.google.com/p/nativeclient/issues/detail?id=1685 + unsigned char TargetFlag; + unsigned Opcode; + if (model == TLSModel::GeneralDynamic || model == TLSModel::LocalDynamic) { + TargetFlag = X86II::MO_TLSGD; + Opcode = X86ISD::TLSADDR; + } else { + return LowerToTLSExecCall(GA, DAG, PtrVT, model, true); + } + + return GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, + X86::EAX, // PtrVT is 32-bit. 
+ TargetFlag, Opcode); +} +// @LOCALMOD-END + static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SelectionDAG &DAG, const EVT PtrVT, @@ -7626,14 +7701,16 @@ static SDValue LowerToTLSLocalDynamicModel(GlobalAddressSDNode *GA, SDValue Base; if (is64Bit) { Base = GetTLSADDR(DAG, DAG.getEntryNode(), GA, NULL, PtrVT, X86::RAX, - X86II::MO_TLSLD, /*LocalDynamic=*/true); + X86II::MO_TLSLD, + /*Opcode=*/X86ISD::TLSBASEADDR); // @LOCALMOD } else { SDValue InFlag; SDValue Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, X86::EBX, DAG.getNode(X86ISD::GlobalBaseReg, DebugLoc(), PtrVT), InFlag); InFlag = Chain.getValue(1); Base = GetTLSADDR(DAG, Chain, GA, &InFlag, PtrVT, X86::EAX, - X86II::MO_TLSLDM, /*LocalDynamic=*/true); + X86II::MO_TLSLDM, + /*Opcode=*/X86ISD::TLSBASEADDR); // @LOCALMOD } // Note: the CleanupLocalDynamicTLSPass will remove redundant computations @@ -7717,6 +7794,11 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { if (Subtarget->isTargetELF()) { TLSModel::Model model = getTargetMachine().getTLSModel(GV); + // @LOCALMOD-START + if (Subtarget->isTargetNaCl64()) + return LowerToTLSNaCl64(GA, DAG, getPointerTy(), model); + // @LOCALMOD-END + switch (model) { case TLSModel::GeneralDynamic: if (Subtarget->is64Bit()) @@ -7727,9 +7809,16 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const { Subtarget->is64Bit()); case TLSModel::InitialExec: case TLSModel::LocalExec: - return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, + // @LOCALMOD-START + if (llvm::TLSUseCall) { + return LowerToTLSExecCall(GA, DAG, getPointerTy(), model, + Subtarget->is64Bit()); + } else { + return LowerToTLSExecModel(GA, DAG, getPointerTy(), model, Subtarget->is64Bit(), getTargetMachine().getRelocationModel() == Reloc::PIC_); + } + // @LOCALMOD-END } llvm_unreachable("Unknown TLS model."); } @@ -8667,13 +8756,31 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, break; } + // @LOCALMOD-BEGIN + // This function only peeks at the data dependencies of the DAG to find + // an arith op that also defines EFLAGS. However, function calls may + // clobber EFLAGS and the data dependencies do not show that. + // When that occurs, EFLAGS must be copied via PUSHF and POPF. + // The problem is that NaCl does not allow PUSHF and POPF. + // We could try to detect such clobbers for NaCl, but for now, we + // keep this code simple, and bail out for NaCl. A further + // PeepholeOptimizer pass can do a similar optimization + // (see optimizeCompareInstr in X86InstrInfo.cpp), so it's not *so* + // bad. This function also converts "add op, -1" to DEC, which can + // help fold load/stores: + // (store m, (add (load m), -1)) -> (dec m) + // So we lose out on that. + // BUG=http://code.google.com/p/nativeclient/issues/detail?id=2711 + bool ConservativeForNaCl = Subtarget->isTargetNaCl(); + // See if we can use the EFLAGS value from the operand instead of // doing a separate TEST. TEST always sets OF and CF to 0, so unless // we prove that the arithmetic won't overflow, we can't use OF or CF. - if (Op.getResNo() != 0 || NeedOF || NeedCF) + if (Op.getResNo() != 0 || NeedOF || NeedCF || ConservativeForNaCl) // Emit a CMP with 0, which is the TEST pattern. return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op, DAG.getConstant(0, Op.getValueType())); + // @LOCALMOD-END unsigned Opcode = 0; unsigned NumOperands = 0; @@ -8903,6 +9010,10 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp, /// if it's possible. 
SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, DebugLoc dl, SelectionDAG &DAG) const { + // @LOCALMOD: NaCl validator rejects BT, BTS, and BTC. + if (Subtarget->isTargetNaCl()) + return SDValue(); + SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -9734,14 +9845,14 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Size = Op.getOperand(1); // FIXME: Ensure alignment here - bool Is64Bit = Subtarget->is64Bit(); - EVT SPTy = Is64Bit ? MVT::i64 : MVT::i32; + bool Has64BitPointers = Subtarget->has64BitPointers(); // @LOCALMOD + EVT SPTy = Has64BitPointers ? MVT::i64 : MVT::i32; // @LOCALMOD if (getTargetMachine().Options.EnableSegmentedStacks) { MachineFunction &MF = DAG.getMachineFunction(); MachineRegisterInfo &MRI = MF.getRegInfo(); - if (Is64Bit) { + if (Subtarget->is64Bit()) { // @LOCALMOD // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. const Function *F = MF.getFunction(); @@ -9754,7 +9865,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, } const TargetRegisterClass *AddrRegClass = - getRegClassFor(Subtarget->is64Bit() ? MVT::i64:MVT::i32); + getRegClassFor(Has64BitPointers ? MVT::i64:MVT::i32); // @LOCALMOD unsigned Vreg = MRI.createVirtualRegister(AddrRegClass); Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size); SDValue Value = DAG.getNode(X86ISD::SEG_ALLOCA, dl, SPTy, Chain, @@ -9763,7 +9874,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, return DAG.getMergeValues(Ops1, 2, dl); } else { SDValue Flag; - unsigned Reg = (Subtarget->is64Bit() ? X86::RAX : X86::EAX); + unsigned Reg = (Has64BitPointers ? X86::RAX : X86::EAX); // @LOCALMOD Chain = DAG.getCopyToReg(Chain, dl, Reg, Size, Flag); Flag = Chain.getValue(1); @@ -9772,7 +9883,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, Chain = DAG.getNode(X86ISD::WIN_ALLOCA, dl, NodeTys, Chain, Flag); Flag = Chain.getValue(1); - Chain = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(), + Chain = DAG.getCopyFromReg(Chain, dl, X86StackPtr, // @LOCALMOD SPTy).getValue(1); SDValue Ops1[2] = { Chain.getValue(0), Chain }; @@ -9801,6 +9912,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // fp_offset (48 - 48 + 8 * 16) // overflow_arg_area (point to parameters coming in memory). // reg_save_area + unsigned PointerSize = TD->getPointerSize(0); // @LOCALMOD SmallVector<SDValue, 8> MemOps; SDValue FIN = Op.getOperand(1); // Store gp_offset @@ -9823,7 +9935,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), FIN, DAG.getIntPtrConstant(4)); SDValue OVFIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), - getPointerTy()); + getPointerTy()); // @LOCALMOD Store = DAG.getStore(Op.getOperand(0), DL, OVFIN, FIN, MachinePointerInfo(SV, 8), false, false, 0); @@ -9831,11 +9943,12 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // Store ptr to reg_save_area. 
FIN = DAG.getNode(ISD::ADD, DL, getPointerTy(), - FIN, DAG.getIntPtrConstant(8)); + FIN, DAG.getIntPtrConstant(PointerSize)); // @LOCALMOD SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(), - getPointerTy()); + getPointerTy()); // @LOCALMOD Store = DAG.getStore(Op.getOperand(0), DL, RSFIN, FIN, - MachinePointerInfo(SV, 16), false, false, 0); + MachinePointerInfo(SV, 8+PointerSize), // @LOCALMOD + false, false, 0); MemOps.push_back(Store); return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, &MemOps[0], MemOps.size()); @@ -9845,7 +9958,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Subtarget->is64Bit() && "LowerVAARG only handles 64-bit va_arg!"); assert((Subtarget->isTargetLinux() || - Subtarget->isTargetDarwin()) && + Subtarget->isTargetDarwin() || + Subtarget->isTargetNaCl()) && // @LOCALMOD "Unhandled target in LowerVAARG"); assert(Op.getNode()->getNumOperands() == 4); SDValue Chain = Op.getOperand(0); @@ -9920,11 +10034,56 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget *Subtarget, DebugLoc DL = Op.getDebugLoc(); return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, - DAG.getIntPtrConstant(24), 8, /*isVolatile*/false, + // @LOCALMOD-START + // Size is actually 8 + 2 * pointer size and align + // is the pointer ABI alignment but we don't have a + // pointer to TD in this static function + DAG.getIntPtrConstant(Subtarget->has64BitPointers() ? + 24 : 16), + Subtarget->has64BitPointers() ? 8 : 4, + /*isVolatile*/false, + // @LOCALMOD-END false, MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV)); } +////////////////////////////////////////////////////////////////////// +// NaCl TLS setup / layout intrinsics. +// See: native_client/src/untrusted/stubs/tls_params.h +SDValue X86TargetLowering::LowerNaClTpTlsOffset(SDValue Op, + SelectionDAG &DAG) const { + // ssize_t __nacl_tp_tls_offset (size_t tls_size) { + // return -tls_size; + // } + DebugLoc dl = Op.getDebugLoc(); + return DAG.getNode(ISD::SUB, dl, Op.getValueType().getSimpleVT(), + DAG.getConstant(0, Op.getValueType().getSimpleVT()), + Op.getOperand(0)); +} + +SDValue X86TargetLowering::LowerNaClTpTdbOffset(SDValue Op, + SelectionDAG &DAG) const { + // ssize_t __nacl_tp_tdb_offset (size_t tdb_size) { + // return 0; + // } + return DAG.getConstant(0, Op.getValueType().getSimpleVT()); +} + +SDValue +X86TargetLowering::LowerNaClTargetArch(SDValue Op, SelectionDAG &DAG) const { + // int __nacl_target_arch () { + // return (is_64_bit ? + // PnaclTargetArchitectureX86_64 : + // PnaclTargetArchitectureX86_32); + // } + return DAG.getConstant((Subtarget->is64Bit() ? + PnaclTargetArchitectureX86_64 : + PnaclTargetArchitectureX86_32), + Op.getValueType().getSimpleVT()); +} + +////////////////////////////////////////////////////////////////////// + // getTargetVShiftNOde - Handle vector element shifts where the shift amount // may or may not be a constant. Takes immediate version of shift as input. static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, @@ -9970,11 +10129,37 @@ static SDValue getTargetVShiftNode(unsigned Opc, DebugLoc dl, EVT VT, return DAG.getNode(Opc, dl, VT, SrcOp, ShAmt); } -static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); switch (IntNo) { default: return SDValue(); // Don't custom lower most intrinsics. 
+ + // @LOCALMOD-BEGIN + case Intrinsic::nacl_read_tp: { + EVT PtrVT = DAG.getTargetLoweringInfo().getPointerTy(); + if (Subtarget->is64Bit() || llvm::TLSUseCall) { + // Call __nacl_read_tp() to get the thread pointer. + unsigned PtrSize = PtrVT.getSizeInBits(); + IntegerType *PtrTy = Type::getIntNTy(*DAG.getContext(), PtrSize); + SDValue ReadTpFunction = DAG.getExternalSymbol("__nacl_read_tp", PtrVT); + ArgListTy Args; + TargetLowering::CallLoweringInfo CLI( + DAG.getEntryNode(), PtrTy, + false, false, false, false, 0, CallingConv::C, + /*isTailCall=*/false, /*doesNotRet=*/false, + /*isReturnValueUsed=*/true, + ReadTpFunction, Args, DAG, dl); + std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI); + return CallResult.first; + } else { + // Get %gs:0, which contains the thread pointer on x86-32. + return DAG.getNode(X86ISD::THREAD_POINTER_FROM_GS, dl, PtrVT); + } + } + // @LOCALMOD-END + // Comparison intrinsics. case Intrinsic::x86_sse_comieq_ss: case Intrinsic::x86_sse_comilt_ss: @@ -10570,7 +10755,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); DebugLoc dl = Op.getDebugLoc(); // FIXME probably not meaningful unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue(); - unsigned FrameReg = Subtarget->is64Bit() ? X86::RBP : X86::EBP; + unsigned FrameReg = Subtarget->has64BitPointers() ? X86::RBP : X86::EBP; // @LOCALMOD SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, VT); while (Depth--) FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr, @@ -10590,10 +10775,13 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const { SDValue Handler = Op.getOperand(2); DebugLoc dl = Op.getDebugLoc(); + // @LOCALMOD-START + bool Has64BitPointers = Subtarget->has64BitPointers(); SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, - Subtarget->is64Bit() ? X86::RBP : X86::EBP, + Has64BitPointers ? X86::RBP : X86::EBP, getPointerTy()); - unsigned StoreAddrReg = (Subtarget->is64Bit() ? X86::RCX : X86::ECX); + unsigned StoreAddrReg = (Has64BitPointers ? 
X86::RCX : X86::ECX); + // @LOCALMOD-END SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, getPointerTy(), Frame, DAG.getIntPtrConstant(RegInfo->getSlotSize())); @@ -11674,6 +11862,11 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::SUBE: return LowerADDC_ADDE_SUBC_SUBE(Op, DAG); case ISD::ADD: return LowerADD(Op, DAG); case ISD::SUB: return LowerSUB(Op, DAG); + // @LOCALMOD-BEGIN + case ISD::NACL_TP_TLS_OFFSET: return LowerNaClTpTlsOffset(Op, DAG); + case ISD::NACL_TP_TDB_OFFSET: return LowerNaClTpTdbOffset(Op, DAG); + case ISD::NACL_TARGET_ARCH: return LowerNaClTargetArch(Op, DAG); + // @LOCALMOD-END } } @@ -11957,6 +12150,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FRCP: return "X86ISD::FRCP"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR"; + case X86ISD::TLSADDR_LE: return "X86ISD::TLSADDR_LE"; // @LOCALMOD + case X86ISD::TLSADDR_IE: return "X86ISD::TLSADDR_IE"; // @LOCALMOD case X86ISD::TLSCALL: return "X86ISD::TLSCALL"; case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP"; case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP"; @@ -12967,9 +13162,11 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineInstr::mmo_iterator MMOEnd = MI->memoperands_end(); // Machine Information + bool IsNaCl = Subtarget->isTargetNaCl(); // @LOCALMOD const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); - const TargetRegisterClass *AddrRegClass = getRegClassFor(MVT::i64); + const TargetRegisterClass *AddrRegClass = + getRegClassFor(getPointerTy()); // @LOCALMOD const TargetRegisterClass *OffsetRegClass = getRegClassFor(MVT::i32); DebugLoc DL = MI->getDebugLoc(); @@ -12997,7 +13194,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( MachineBasicBlock *overflowMBB; MachineBasicBlock *offsetMBB; MachineBasicBlock *endMBB; - + unsigned OffsetDestReg = 0; // Argument address computed by offsetMBB unsigned OverflowDestReg = 0; // Argument address computed by overflowMBB unsigned OffsetReg = 0; @@ -13078,29 +13275,39 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( } // In offsetMBB, emit code to use the reg_save_area. + unsigned Opc; // @LOCALMOD if (offsetMBB) { assert(OffsetReg != 0); // Read the reg_save_area address. unsigned RegSaveReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::MOV64rm), RegSaveReg) + Opc = IsNaCl ? X86::MOV32rm : X86::MOV64rm; // @LOCALMOD + BuildMI(offsetMBB, DL, TII->get(Opc), RegSaveReg) // @LOCALMOD .addOperand(Base) .addOperand(Scale) .addOperand(Index) - .addDisp(Disp, 16) + .addDisp(Disp, 8+TD->getPointerSize(0)) // @LOCALMOD .addOperand(Segment) .setMemRefs(MMOBegin, MMOEnd); // Zero-extend the offset - unsigned OffsetReg64 = MRI.createVirtualRegister(AddrRegClass); - BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetReg64) - .addImm(0) - .addReg(OffsetReg) - .addImm(X86::sub_32bit); + // @LOCALMOD-BEGIN + unsigned OffsetRegExt; + if (IsNaCl) { + OffsetRegExt = OffsetReg; + } else { + OffsetRegExt = MRI.createVirtualRegister(AddrRegClass); + BuildMI(offsetMBB, DL, TII->get(X86::SUBREG_TO_REG), OffsetRegExt) + .addImm(0) + .addReg(OffsetReg) + .addImm(X86::sub_32bit); + } + // @LOCALMOD-END // Add the offset to the reg_save_area to get the final address. - BuildMI(offsetMBB, DL, TII->get(X86::ADD64rr), OffsetDestReg) - .addReg(OffsetReg64) + Opc = IsNaCl ? 
X86::ADD32rr : X86::ADD64rr; // @LOCALMOD + BuildMI(offsetMBB, DL, TII->get(Opc), OffsetDestReg) + .addReg(OffsetRegExt) // @LOCALMOD .addReg(RegSaveReg); // Compute the offset for the next argument @@ -13130,7 +13337,8 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( // Load the overflow_area address into a register. unsigned OverflowAddrReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI(overflowMBB, DL, TII->get(X86::MOV64rm), OverflowAddrReg) + Opc = IsNaCl ? X86::MOV32rm : X86::MOV64rm; // @LOCALMOD + BuildMI(overflowMBB, DL, TII->get(Opc), OverflowAddrReg) .addOperand(Base) .addOperand(Scale) .addOperand(Index) @@ -13146,11 +13354,13 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( unsigned TmpReg = MRI.createVirtualRegister(AddrRegClass); // aligned_addr = (addr + (align-1)) & ~(align-1) - BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg) + Opc = IsNaCl ? X86::ADD32ri : X86::ADD64ri32; // @LOCALMOD + BuildMI(overflowMBB, DL, TII->get(Opc), TmpReg) .addReg(OverflowAddrReg) .addImm(Align-1); - BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg) + Opc = IsNaCl ? X86::AND32ri : X86::AND64ri32; // @LOCALMOD + BuildMI(overflowMBB, DL, TII->get(Opc), OverflowDestReg) .addReg(TmpReg) .addImm(~(uint64_t)(Align-1)); } else { @@ -13161,12 +13371,14 @@ X86TargetLowering::EmitVAARG64WithCustomInserter( // Compute the next overflow address after this argument. // (the overflow address should be kept 8-byte aligned) unsigned NextAddrReg = MRI.createVirtualRegister(AddrRegClass); - BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), NextAddrReg) + Opc = IsNaCl ? X86::ADD32ri : X86::ADD64ri32; // @LOCALMOD + BuildMI(overflowMBB, DL, TII->get(Opc), NextAddrReg) .addReg(OverflowDestReg) .addImm(ArgSizeA8); // Store the new overflow address. - BuildMI(overflowMBB, DL, TII->get(X86::MOV64mr)) + Opc = IsNaCl ? X86::MOV32mr : X86::MOV64mr; // @LOCALMOD + BuildMI(overflowMBB, DL, TII->get(Opc)) .addOperand(Base) .addOperand(Scale) .addOperand(Index) @@ -13541,6 +13753,25 @@ X86TargetLowering::EmitLoweredWinAlloca(MachineInstr *MI, return BB; } +// @LOCALMOD-BEGIN +MachineBasicBlock * +X86TargetLowering::EmitLoweredThreadPointerFromGs(MachineInstr *MI, + MachineBasicBlock *BB) const { + const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); + DebugLoc DL = MI->getDebugLoc(); + // This generates "movl %gs:0, %DEST", which fetches the thread + // pointer on x86-32. 
+ BuildMI(*BB, MI, DL, TII->get(X86::MOV32rm), MI->getOperand(0).getReg()) + .addReg(/*Base=*/0) + .addImm(/*Scale=*/1) + .addReg(/*IndexReg=*/0) + .addImm(/*Disp=*/0) + .addReg(/*Segment=*/X86::GS); + MI->eraseFromParent(); + return BB; +} +// @LOCALMOD-END + MachineBasicBlock * X86TargetLowering::EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const { @@ -13816,6 +14047,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitLoweredSegAlloca(MI, BB, false); case X86::SEG_ALLOCA_64: return EmitLoweredSegAlloca(MI, BB, true); + // @LOCALMOD-BEGIN + case X86::THREAD_POINTER_FROM_GS: + return EmitLoweredThreadPointerFromGs(MI, BB); + // @LOCALMOD-END case X86::TLSCall_32: case X86::TLSCall_64: return EmitLoweredTLSCall(MI, BB); @@ -14012,6 +14247,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI, return EmitVAStartSaveXMMRegsWithCustomInserter(MI, BB); case X86::VAARG_64: + case X86::NACL_CG_VAARG_64: return EmitVAARG64WithCustomInserter(MI, BB); case X86::EH_SjLj_SetJmp32: @@ -15698,6 +15934,12 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, } unsigned Bits = VT.getSizeInBits(); + // @LOCALMOD-START + // Due to a limitation in NaCl's 32-bit validator, + // 16-bit shld instructions are illegal in 32-bit NaCl. + if (Subtarget->isTargetNaCl() && !Subtarget->is64Bit() && Bits == 16) + return SDValue(); + // @LOCALMOD-END if (ShAmt1.getOpcode() == ISD::SUB) { SDValue Sum = ShAmt1.getOperand(0); if (ConstantSDNode *SumC = dyn_cast<ConstantSDNode>(Sum)) { @@ -17801,4 +18043,3 @@ unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode, return VectorTargetTransformImpl::getCastInstrCost(Opcode, Dst, Src); } - diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 465c6036ad..b6e8960f76 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -214,6 +214,16 @@ namespace llvm { // TLSBASEADDR - Thread Local Storage. A call to get the start address // of the TLS block for the current module. TLSBASEADDR, + // @LOCALMOD-BEGIN + // TLSADDR_LE - Thread Local Storage. (Local Exec Model) + TLSADDR_LE, + + // TLSADDR_IE - Thread Local Storage. (Initial Exec Model) + TLSADDR_IE, + + // THREAD_POINTER_FROM_GS - Read thread pointer from %gs:0 on x86-32. + THREAD_POINTER_FROM_GS, + // @LOCALMOD-END // TLSCALL - Thread Local Storage. When calling to an OS provided // thunk at the address from an earlier relocation. @@ -465,6 +475,7 @@ namespace llvm { //===--------------------------------------------------------------------===// // X86TargetLowering - X86 Implementation of the TargetLowering interface class X86TargetLowering : public TargetLowering { + public: explicit X86TargetLowering(X86TargetMachine &TM); @@ -718,6 +729,9 @@ namespace llvm { const X86Subtarget *Subtarget; const X86RegisterInfo *RegInfo; const DataLayout *TD; + // @LOCALMOD - This is essentially a revert of r167104 + /// X86StackPtr - X86 physical register used as stack ptr. + unsigned X86StackPtr; /// X86ScalarSSEf32, X86ScalarSSEf64 - Select between SSE or x87 /// floating point ops. 
@@ -819,6 +833,7 @@ namespace llvm { SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRAME_TO_ARGS_OFFSET(SDValue Op, SelectionDAG &DAG) const; @@ -831,11 +846,18 @@ namespace llvm { SDValue LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG) const; + // @LOCALMOD-BEGIN + SDValue LowerNaClTpTlsOffset(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerNaClTpTdbOffset(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerNaClTargetArch(SDValue Op, SelectionDAG &DAG) const; + // @LOCALMOD-END + + // Utility functions to help LowerVECTOR_SHUFFLE & LowerBUILD_VECTOR SDValue LowerVectorBroadcast(SDValue Op, SelectionDAG &DAG) const; SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG) const; SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) const; - + SDValue LowerVectorAllZeroTest(SDValue Op, SelectionDAG &DAG) const; SDValue lowerVectorIntExtend(SDValue Op, SelectionDAG &DAG) const; @@ -903,6 +925,12 @@ namespace llvm { MachineBasicBlock *BB, bool Is64Bit) const; + // @LOCALMOD-BEGIN + MachineBasicBlock *EmitLoweredThreadPointerFromGs( + MachineInstr *MI, + MachineBasicBlock *BB) const; + // @LOCALMOD-END + MachineBasicBlock *EmitLoweredTLSCall(MachineInstr *MI, MachineBasicBlock *BB) const; diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index f790611b8f..f580b76d95 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -32,8 +32,9 @@ def LEA64_32r : I<0x8D, MRMSrcMem, [(set GR32:$dst, lea32addr:$src)], IIC_LEA>, Requires<[In64BitMode]>; +// @LOCALMOD (lea64mem) let isReMaterializable = 1 in -def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), +def LEA64r : RI<0x8D, MRMSrcMem, (outs GR64:$dst), (ins lea64mem:$src), "lea{q}\t{$src|$dst}, {$dst|$src}", [(set GR64:$dst, lea64addr:$src)], IIC_LEA>; diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 9e6f27988f..a24ddf6f99 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -92,8 +92,8 @@ def VAARG_64 : I<0, Pseudo, "#VAARG_64 $dst, $ap, $size, $mode, $align", [(set GR64:$dst, (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), - (implicit EFLAGS)]>; - + (implicit EFLAGS)]>, + Requires<[IsNotNaCl]>; // Dynamic stack allocation yields a _chkstk or _alloca call for all Windows // targets. These calls are needed to probe the stack when allocating more than // 4k bytes in one go. 
Touching the stack at 4K increments is necessary to @@ -399,7 +399,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, - Requires<[In32BitMode]>; + Requires<[In32BitMode, IsNotNaCl]>; def TLS_base_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_base_addr32", [(X86tlsbaseaddr tls32baseaddr:$sym)]>, @@ -425,6 +425,16 @@ def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), Requires<[In64BitMode]>; } +// @LOCALMOD-BEGIN +// NaCl TLS support +let usesCustomInserter = 1 in { + def THREAD_POINTER_FROM_GS : + I<0, Pseudo, (outs GR32:$dst), (ins), + "# get thread pointer from %gs:0", + [(set GR32:$dst, (X86thread_pointer_from_gs))]>; +} +// @LOCALMOD-END + // Darwin TLS Support // For i386, the address of the thunk is passed on the stack, on return the // address of the variable is in %eax. %ecx is trashed during the function @@ -993,9 +1003,9 @@ def : Pat<(load (i64 (X86Wrapper tglobaltlsaddr :$dst))), // Direct PC relative function call for small code model. 32-bit displacement // sign extended to 64-bit. def : Pat<(X86call (i64 tglobaladdr:$dst)), - (CALL64pcrel32 tglobaladdr:$dst)>; + (CALL64pcrel32 tglobaladdr:$dst)>, Requires<[IsNotNaCl]>; def : Pat<(X86call (i64 texternalsym:$dst)), - (CALL64pcrel32 texternalsym:$dst)>; + (CALL64pcrel32 texternalsym:$dst)>, Requires<[IsNotNaCl]>; // Tailcall stuff. The TCRETURN instructions execute after the epilog, so they // can never use callee-saved registers. That is the purpose of the GR64_TC @@ -1024,7 +1034,7 @@ def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), // callee-saved register. def : Pat<(X86tcret (load addr:$dst), imm:$off), (TCRETURNmi addr:$dst, imm:$off)>, - Requires<[In32BitMode, IsNotPIC]>; + Requires<[In32BitMode, IsNotPIC, IsNotNaCl]>; def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), (TCRETURNdi texternalsym:$dst, imm:$off)>, @@ -1036,29 +1046,29 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off), (TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; // Don't fold loads into X86tcret requiring more than 6 regs. // There wouldn't be enough scratch registers for base+index. def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off), (TCRETURNmi64 addr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off), (TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; def : Pat<(X86tcret (i64 texternalsym:$dst), imm:$off), (TCRETURNdi64 texternalsym:$dst, imm:$off)>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; // Normal calls, with various flavors of addresses. def : Pat<(X86call (i32 tglobaladdr:$dst)), - (CALLpcrel32 tglobaladdr:$dst)>; + (CALLpcrel32 tglobaladdr:$dst)>, Requires<[IsNotNaCl]>; def : Pat<(X86call (i32 texternalsym:$dst)), - (CALLpcrel32 texternalsym:$dst)>; + (CALLpcrel32 texternalsym:$dst)>, Requires<[IsNotNaCl]>; def : Pat<(X86call (i32 imm:$dst)), - (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr]>; + (CALLpcrel32 imm:$dst)>, Requires<[CallImmAddr, IsNotNaCl]>; // Comparisons. 
@@ -1483,19 +1493,19 @@ def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), (MOV8mr_NOREX addr:$dst, (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), - sub_8bit_hi))>; + sub_8bit_hi))>, Requires<[IsNotNaCl]>; def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), (MOV8mr_NOREX addr:$dst, (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), sub_8bit_hi))>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), (MOV8mr_NOREX addr:$dst, (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit_hi))>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; // (shl x, 1) ==> (add x, x) diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index bfe954114c..5dd04aad69 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -112,7 +112,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { def JMP32r : I<0xFF, MRM4r, (outs), (ins GR32:$dst), "jmp{l}\t{*}$dst", [(brind GR32:$dst)], IIC_JMP_REG>, Requires<[In32BitMode]>; def JMP32m : I<0xFF, MRM4m, (outs), (ins i32mem:$dst), "jmp{l}\t{*}$dst", - [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, Requires<[In32BitMode]>; + [(brind (loadi32 addr:$dst))], IIC_JMP_MEM>, Requires<[In32BitMode,IsNotNaCl]>; def JMP64r : I<0xFF, MRM4r, (outs), (ins GR64:$dst), "jmp{q}\t{*}$dst", [(brind GR64:$dst)], IIC_JMP_REG>, Requires<[In64BitMode]>; @@ -126,7 +126,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { (ins i32imm:$off, i16imm:$seg), "ljmp{l}\t{$seg, $off|$off, $seg}", [], IIC_JMP_FAR_PTR>; def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaque80mem:$dst), - "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>; + "ljmp{q}\t{*}$dst", [], IIC_JMP_FAR_MEM>, Requires<[IsNotNaCl]>; def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaque32mem:$dst), "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize; @@ -155,10 +155,10 @@ let isCall = 1 in "call{l}\t$dst", [], IIC_CALL_RI>, Requires<[In32BitMode]>; def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst), "call{l}\t{*}$dst", [(X86call GR32:$dst)], IIC_CALL_RI>, - Requires<[In32BitMode]>; + Requires<[In32BitMode,IsNotNaCl]>; // @LOCALMOD def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst), "call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))], IIC_CALL_MEM>, - Requires<[In32BitMode]>; + Requires<[In32BitMode,IsNotNaCl]>; // @LOCALMOD def FARCALL16i : Iseg16<0x9A, RawFrmImm16, (outs), (ins i16imm:$off, i16imm:$seg), @@ -178,9 +178,20 @@ let isCall = 1 in let isAsmParserOnly = 1 in def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm, (outs), (ins i16imm_pcrel:$dst), - "callw\t$dst", []>, OpSize; + "callw\t$dst", []>, OpSize, + Requires<[IsNotNaCl]>; // @LOCALMOD } +// @LOCALMOD-BEGIN +// These CodeGen patterns are normally part of the declaration above. +// However, we need to be able to disable these patterns for NaCl +// without disabling the the instruction itself. (so we can use the +// instruction in assembly input) +def : Pat<(X86call GR32:$dst), + (CALL32r GR32:$dst)>, Requires<[IsNotNaCl]>; +def : Pat<(X86call (loadi32 addr:$dst)), + (CALL32m addr:$dst)>, Requires<[IsNotNaCl]>; +// @LOCALMOD-END // Tail call stuff. @@ -205,7 +216,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, "", [], IIC_JMP_REG>; // FIXME: Remove encoding when JIT is dead. 
let mayLoad = 1 in def TAILJMPm : I<0xFF, MRM4m, (outs), (ins i32mem_TC:$dst), - "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{l}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>, Requires<[IsNotNaCl]>; // @LOCALMOD } @@ -223,18 +234,18 @@ let isCall = 1, Uses = [RSP] in { def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), "call{q}\t$dst", [], IIC_CALL_RI>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; // @LOCALMOD def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst), "call{q}\t{*}$dst", [(X86call GR64:$dst)], IIC_CALL_RI>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; // @LOCALMOD def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst), "call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))], IIC_CALL_MEM>, - Requires<[In64BitMode]>; + Requires<[In64BitMode, IsNotNaCl]>; // @LOCALMOD def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaque80mem:$dst), - "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>; + "lcall{q}\t{*}$dst", [], IIC_CALL_FAR_MEM>, Requires<[IsNotNaCl]>; // @LOCALMOD } let isCall = 1, isCodeGenOnly = 1 in @@ -269,5 +280,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, let mayLoad = 1 in def TAILJMPm64 : I<0xFF, MRM4m, (outs), (ins i64mem_TC:$dst), - "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>; + "jmp{q}\t{*}$dst # TAILCALL", [], IIC_JMP_MEM>, + Requires<[IsNotNaCl]>; // @LOCALMOD } diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index 268e9fc9c0..7309942880 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -53,6 +53,7 @@ def MRM_DC : Format<53>; def MRM_DD : Format<54>; def MRM_DE : Format<55>; def MRM_DF : Format<56>; +def CustomFrm : Format<62>; // @LOCALMOD // ImmType - This specifies the immediate type used by an instruction. This is // part of the ad-hoc solution used to emit machine instruction encodings by our diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 5a99ff004d..0267fdd860 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -276,12 +276,17 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) Flags | TB_INDEX_0 | TB_FOLDED_LOAD | TB_FOLDED_STORE); } + // @LOCALMOD-BEGIN + unsigned NoForwardForNaCl = + tm.getSubtarget<X86Subtarget>().isTargetNaCl() ? 
TB_NO_FORWARD : 0; + // @LOCALMOD-END + static const X86OpTblEntry OpTbl0[] = { { X86::BT16ri8, X86::BT16mi8, TB_FOLDED_LOAD }, { X86::BT32ri8, X86::BT32mi8, TB_FOLDED_LOAD }, { X86::BT64ri8, X86::BT64mi8, TB_FOLDED_LOAD }, - { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD }, - { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD }, + { X86::CALL32r, X86::CALL32m, TB_FOLDED_LOAD | NoForwardForNaCl }, + { X86::CALL64r, X86::CALL64m, TB_FOLDED_LOAD | NoForwardForNaCl }, { X86::CMP16ri, X86::CMP16mi, TB_FOLDED_LOAD }, { X86::CMP16ri8, X86::CMP16mi8, TB_FOLDED_LOAD }, { X86::CMP16rr, X86::CMP16mr, TB_FOLDED_LOAD }, @@ -308,8 +313,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::IMUL32r, X86::IMUL32m, TB_FOLDED_LOAD }, { X86::IMUL64r, X86::IMUL64m, TB_FOLDED_LOAD }, { X86::IMUL8r, X86::IMUL8m, TB_FOLDED_LOAD }, - { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD }, - { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD }, + { X86::JMP32r, X86::JMP32m, TB_FOLDED_LOAD | NoForwardForNaCl }, + { X86::JMP64r, X86::JMP64m, TB_FOLDED_LOAD | NoForwardForNaCl }, { X86::MOV16ri, X86::MOV16mi, TB_FOLDED_STORE }, { X86::MOV16rr, X86::MOV16mr, TB_FOLDED_STORE }, { X86::MOV32ri, X86::MOV32mi, TB_FOLDED_STORE }, @@ -348,8 +353,8 @@ X86InstrInfo::X86InstrInfo(X86TargetMachine &tm) { X86::SETOr, X86::SETOm, TB_FOLDED_STORE }, { X86::SETPr, X86::SETPm, TB_FOLDED_STORE }, { X86::SETSr, X86::SETSm, TB_FOLDED_STORE }, - { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD }, - { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, + { X86::TAILJMPr, X86::TAILJMPm, TB_FOLDED_LOAD | NoForwardForNaCl }, + { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD | NoForwardForNaCl }, { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, @@ -2869,6 +2874,7 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB, DEBUG(dbgs() << "Cannot copy " << RI.getName(SrcReg) << " to " << RI.getName(DestReg) << '\n'); + MBB.dump(); llvm_unreachable("Cannot emit physreg copy instruction"); } diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 650fa95d7f..cec4625135 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -103,6 +103,10 @@ def SDT_X86TLSBASEADDR : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_X86TLSCALL : SDTypeProfile<0, 1, [SDTCisInt<0>]>; +// @LOCALMOD-BEGIN +def SDT_X86ThreadPointer : SDTypeProfile<1, 0, [SDTCisPtrTy<0>]>; +// @LOCALMOD-END + def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_X86WIN_FTOL : SDTypeProfile<0, 1, [SDTCisFP<0>]>; @@ -213,6 +217,17 @@ def X86tlsaddr : SDNode<"X86ISD::TLSADDR", SDT_X86TLSADDR, def X86tlsbaseaddr : SDNode<"X86ISD::TLSBASEADDR", SDT_X86TLSBASEADDR, [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; +// @LOCALMOD-BEGIN +def X86tlsaddr_le : SDNode<"X86ISD::TLSADDR_LE", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86tlsaddr_ie : SDNode<"X86ISD::TLSADDR_IE", SDT_X86TLSADDR, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>; + +def X86thread_pointer_from_gs : + SDNode<"X86ISD::THREAD_POINTER_FROM_GS", SDT_X86ThreadPointer>; +// @LOCALMOD-END + def X86ehret : SDNode<"X86ISD::EH_RETURN", SDT_X86EHRET, [SDNPHasChain]>; @@ -518,6 +533,13 @@ def i64i8imm : Operand<i64> { let OperandType = "OPERAND_IMMEDIATE"; } +// @LOCALMOD +def lea64mem : Operand<i64> { + let PrintMethod = "printi64mem"; + let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, i8imm); + let ParserMatchClass = X86MemAsmOperand; 
+} + def lea64_32mem : Operand<i32> { let PrintMethod = "printi32mem"; let AsmOperandLowerMethod = "lower_lea64_32mem"; @@ -533,7 +555,8 @@ def lea64_32mem : Operand<i32> { // Define X86 specific addressing mode. def addr : ComplexPattern<iPTR, 5, "SelectAddr", [], [SDNPWantParent]>; def lea32addr : ComplexPattern<i32, 5, "SelectLEAAddr", - [add, sub, mul, X86mul_imm, shl, or, frameindex], + [add, sub, mul, X86mul_imm, shl, or, frameindex, + X86WrapperRIP], // @LOCALMOD []>; def tls32addr : ComplexPattern<i32, 5, "SelectTLSADDRAddr", [tglobaltlsaddr], []>; @@ -599,7 +622,7 @@ def In64BitMode : Predicate<"Subtarget->is64Bit()">, AssemblerPredicate<"Mode64Bit">; def IsWin64 : Predicate<"Subtarget->isTargetWin64()">; def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">; -def NotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; +def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">; def SmallCode : Predicate<"TM.getCodeModel() == CodeModel::Small">; def KernelCode : Predicate<"TM.getCodeModel() == CodeModel::Kernel">; def FarData : Predicate<"TM.getCodeModel() != CodeModel::Small &&" @@ -1682,6 +1705,12 @@ let Predicates = [HasBMI2] in { //===----------------------------------------------------------------------===// include "X86InstrArithmetic.td" + +//===----------------------------------------------------------------------===// +// NaCl support (@LOCALMOD) +//===----------------------------------------------------------------------===// + +include "X86InstrNaCl.td" include "X86InstrCMovSetCC.td" include "X86InstrExtension.td" include "X86InstrControl.td" diff --git a/lib/Target/X86/X86InstrNaCl.td b/lib/Target/X86/X86InstrNaCl.td new file mode 100644 index 0000000000..8a7eebecd7 --- /dev/null +++ b/lib/Target/X86/X86InstrNaCl.td @@ -0,0 +1,357 @@ +//====- X86InstrNaCl.td - Describe NaCl Instructions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the modifications to the X86 instruction set needed for +// Native Client code generation. +// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// NaCl specific DAG Nodes. +// + +//===----------------------------------------------------------------------===// +// +// Native Client Pseudo-Instructions +// +// These instructions implement the Native Client pseudo-instructions, such +// as nacljmp and naclasp. +// +// TableGen and MC consider these to be "real" instructions. They can be +// parsed by the AsmParser and emitted by the AsmStreamer as if they +// were just regular instructions. They are not marked "Pseudo" because +// this would imply isCodeGenOnly=1, which would stop them from being +// parsed by the assembler. +// +// These instructions cannot be encoded (written into an object file) by the +// MCCodeEmitter. Instead, during direct object emission, they get lowered to +// a sequence of streamer emits. (see X86InstrNaCl.cpp) +// +// These instructions should not be used in CodeGen. They have no pattern +// and lack CodeGen metadata. Instead, the X86NaClRewritePass should +// generate these instructions after CodeGen is finished. 
+// +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// 32-bit Native Client Pseudo Instructions +//===----------------------------------------------------------------------===// + +class NaClPI32<dag outs, dag ins, string asm> + : I<0, CustomFrm, outs, ins, asm, []>, Requires<[IsNaCl, In32BitMode]>; + +let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1, isAsmParserOnly = 1 in { + def NACL_TRAP32 : NaClPI32<(outs), (ins), "nacltrap">; +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, FPForm = SpecialFP, isAsmParserOnly = 1 in { + def NACL_RET32 : NaClPI32<(outs), (ins), "naclret">; + def NACL_RETI32 : NaClPI32<(outs), (ins i16imm:$amt), "naclreti\t$amt">; +} + +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1, + isAsmParserOnly = 1 in { + def NACL_JMP32r : NaClPI32<(outs), (ins GR32:$dst), "nacljmp\t$dst">; +} + +let isCall = 1, isAsmParserOnly = 1 in { + def NACL_CALL32d : NaClPI32<(outs), (ins i32imm_pcrel:$dst), + "naclcall\t$dst">; + def NACL_CALL32r : NaClPI32<(outs), (ins GR32:$dst), + "naclcall\t$dst">; +} + +// nacltlsaddr32 gets rewritten to: +// .bundle_align_end +// .bundle_lock +// leal\t$sym@TLSGD, %eax +// call\t___tls_get_addr@PLT +// .bundle_unlock +// (The linker expects the leal+call sequence to be directly adjacent) +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP], + isAsmParserOnly = 1 in +def NACL_TLS_addr32 : NaClPI32<(outs), (ins i32mem:$sym), + "nacltlsaddr32\t$sym">; + +//===----------------------------------------------------------------------===// +// 64-bit Native Client Pseudo Instructions +//===----------------------------------------------------------------------===// + +class NaClPI64<dag outs, dag ins, string asm> + : I<0, CustomFrm, outs, ins, asm, []>, Requires<[IsNaCl, In64BitMode]>; + +let isTerminator = 1, isBarrier = 1, hasCtrlDep = 1, isAsmParserOnly = 1 in { + def NACL_TRAP64 : NaClPI64<(outs), (ins), "nacltrap">; +} + +let isTerminator = 1, isReturn = 1, isBarrier = 1, + hasCtrlDep = 1, FPForm = SpecialFP, isAsmParserOnly = 1 in { + def NACL_RET64 : NaClPI64<(outs), (ins), "naclret">; +} + +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1, + isAsmParserOnly = 1 in { + def NACL_JMP64r : NaClPI64<(outs), (ins GR32:$dst, GR64:$rZP), + "nacljmp\t{$dst, $rZP|$rZP, $dst}">; + def NACL_JMP64z : NaClPI64<(outs), (ins GR32:$dst), + "nacljmp\t$dst">; +} + + +let isCall = 1, isAsmParserOnly = 1 in { + def NACL_CALL64d : NaClPI64<(outs), (ins i32imm_pcrel:$dst), + "naclcall\t$dst">; + def NACL_CALL64r : NaClPI64<(outs), (ins GR32:$dst, GR64:$rZP), + "naclcall\t$dst,$rZP">; +} + +let Defs = [RSP, EFLAGS], Uses = [RSP], isAsmParserOnly = 1 in { + def NACL_ASPi8 : NaClPI64<(outs), (ins i64i8imm:$off, GR64:$rZP), + "naclasp{q}\t{$off, $rZP|$rZP, $off}">; + + def NACL_ASPi32: NaClPI64<(outs), (ins i64i32imm:$off, GR64:$rZP), + "naclasp{q}\t{$off, $rZP|$rZP, $off}">; + + def NACL_SSPi8 : NaClPI64<(outs), (ins i64i8imm:$off, GR64:$rZP), + "naclssp{q}\t{$off, $rZP|$rZP, $off}">; + + def NACL_SSPi32: NaClPI64<(outs), (ins i64i32imm:$off, GR64:$rZP), + "naclssp{q}\t{$off, $rZP|$rZP, $off}">; + + def NACL_ANDSPi32: NaClPI64<(outs), (ins i64i32imm:$off, GR64:$rZP), + "naclandsp{q}\t{$off, 
$rZP|$rZP, $off}">; +} + +let Defs = [RSP], Uses = [RBP], isAsmParserOnly = 1 in { + def NACL_SPADJi32 : NaClPI64<(outs), (ins i64i32imm:$off, GR64:$rZP), + "naclspadj\t{$off, $rZP|$rZP, $off}">; +} + +let Defs = [RSP], isAsmParserOnly = 1 in { + def NACL_RESTSPr : NaClPI64<(outs), (ins GR32:$src, GR64:$rZP), + "naclrestsp_noflags\t{$src, $rZP|$rZP, $src}">; + def NACL_RESTSPm : NaClPI64<(outs), (ins i32mem:$src, GR64:$rZP), + "naclrestsp_noflags\t{$src, $rZP|$rZP, $src}">; + def NACL_RESTSPrz : NaClPI64<(outs), (ins GR32:$src), + "naclrestsp_noflags\t$src">; +} + +def : MnemonicAlias<"naclrestsp", "naclrestsp_noflags">; + +let Defs = [RBP], isAsmParserOnly = 1 in { + def NACL_RESTBPr : NaClPI64<(outs), (ins GR32:$src, GR64:$rZP), + "naclrestbp\t{$src, $rZP|$rZP, $src}">; + def NACL_RESTBPm : NaClPI64<(outs), (ins i32mem:$src, GR64:$rZP), + "naclrestbp\t{$src, $rZP|$rZP, $src}">; + def NACL_RESTBPrz : NaClPI64<(outs), (ins GR32:$src), + "naclrestbp\t$src">; +} + +//===----------------------------------------------------------------------===// +// +// Code Generator Instructions (isCodeGenOnly == 1) +// +// These instructions exists to make CodeGen work with Native Client's +// modifications. +// +// Many of these instructions exist because of limitations in CodeGen +// or TableGen, and may become unnecessary in the future. +//===----------------------------------------------------------------------===// + + +//===----------------------------------------------------------------------===// +// +// CodeGen 32-bit +// +//===----------------------------------------------------------------------===// + + +// To avoid a naming conflict between call/naclcall, we have to +// disable the real CALLpcrel32 and CALL32r instructions when targeting +// for NaCl. Thus, they need to be produced here. + +let isCall = 1 in + // All calls clobber the non-callee saved registers. ESP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. + let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in { + + def NACL_CG_CALLpcrel32 : I<0, Pseudo, + (outs), (ins i32imm_pcrel:$dst), + "naclcall\t$dst", []>, + Requires<[IsNaCl, In32BitMode]>; + def NACL_CG_CALL32r : I<0, Pseudo, + (outs), (ins GR32:$dst), + "naclcall\t$dst", [(X86call GR32:$dst)]>, + Requires<[IsNaCl, In32BitMode]>; +} + +// Normal calls, with various flavors of addresses. +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (NACL_CG_CALLpcrel32 tglobaladdr:$dst)>, + Requires<[IsNaCl, In32BitMode]>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (NACL_CG_CALLpcrel32 texternalsym:$dst)>, + Requires<[IsNaCl, In32BitMode]>; +def : Pat<(X86call (i32 imm:$dst)), + (NACL_CG_CALLpcrel32 imm:$dst)>, + Requires<[IsNaCl, In32BitMode, CallImmAddr]>; + +//===----------------------------------------------------------------------===// +// +// CodeGen 64-bit +// +//===----------------------------------------------------------------------===// + + +// Because pointers are 32-bit on X86-64 Native Client, we need to +// produce new versions of the JMP64/CALL64 instructions which can accept +// addresses which are i32 instead of i64. 
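
Concretely, the expanded form only ever adds a zero-extended, bundle-masked 32-bit value to the %r15 sandbox base (or to zero under the zero-based sandbox model controlled by FlagUseZeroBasedSandbox later in this patch), so an i32 operand carries all the information needed. A minimal sketch of that effective-target computation, with the 0xffffffe0 mask (32-byte bundles) stated as an assumption and the helper name purely illustrative:

#include <cstdint>

// Sketch only: the effective target of an expanded NaCl x86-64 indirect
// jump or call.  The pseudo takes a 32-bit register; the expansion masks it
// to a bundle boundary (the 32-bit AND also zero-extends it) and rebases it
// off %r15, so a 64-bit operand is never needed.
static inline uint64_t SandboxedTarget(uint64_t R15Base, uint32_t Reg32,
                                       bool UseZeroBasedSandbox) {
  uint32_t Masked = Reg32 & 0xffffffe0u;              // andl $-32, %reg32
  uint64_t Base = UseZeroBasedSandbox ? 0 : R15Base;  // addq %r15, %reg64
  return Base + Masked;                                // jmpq/callq *%reg64
}
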
+ +let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { + def NACL_CG_JMP64r : I<0, Pseudo, (outs), (ins GR32:$dst), + "nacljmp\t$dst", + [(brind GR32:$dst)]>, + Requires<[IsNaCl, In64BitMode]>; +} + +let isCall = 1 in + // All calls clobber the non-callee saved registers. RSP is marked as + // a use to prevent stack-pointer assignments that appear immediately + // before calls from potentially appearing dead. Uses for argument + // registers are added manually. + let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { + + def NACL_CG_CALL64pcrel32 : I<0, Pseudo, (outs), + (ins i32imm_pcrel:$dst), + "naclcall\t$dst", []>, + Requires<[IsNaCl, In64BitMode]>; + + def NACL_CG_CALL64r : I<0, Pseudo, (outs), (ins GR32:$dst), + "naclcall\t$dst,%r15", + [(X86call GR32:$dst)]>, + Requires<[IsNaCl, In64BitMode]>; +} + +def : Pat<(X86call (i32 tglobaladdr:$dst)), + (NACL_CG_CALL64pcrel32 tglobaladdr:$dst)>, + Requires<[IsNaCl, In64BitMode]>; +def : Pat<(X86call (i32 texternalsym:$dst)), + (NACL_CG_CALL64pcrel32 texternalsym:$dst)>, + Requires<[IsNaCl, In64BitMode]>; + +// Tail calls +// Also needed due to the i64 / i32 pointer problem. +let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, + isCodeGenOnly = 1 in + let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { + + def NACL_CG_TCRETURNdi64 : I<0, Pseudo, (outs), + (ins i32imm_pcrel:$dst, i32imm:$offset), + "#TC_RETURN $dst $offset", []>, + Requires<[IsNaCl, In64BitMode]>; + def NACL_CG_TCRETURNri64 : I<0, Pseudo, (outs), + (ins GR32_TC_64:$dst, i32imm:$offset), + "#TC_RETURN $dst $offset", []>, + Requires<[IsNaCl, In64BitMode]>; + + def NACL_CG_TAILJMPd64 : I<0, Pseudo, (outs), + (ins i32imm_pcrel:$dst), + "jmp\t$dst # TAILCALL", []>, + Requires<[IsNaCl, In64BitMode]>; + def NACL_CG_TAILJMPr64 : I<0, Pseudo, (outs), + (ins GR32_TC_64:$dst), + "nacljmp\t$dst,%r15 # TAILCALL", []>, + Requires<[IsNaCl, In64BitMode]>; +} + +def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off), + (NACL_CG_TCRETURNdi64 tglobaladdr:$dst, imm:$off)>, + Requires<[IsNaCl, In64BitMode]>; + +def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off), + (NACL_CG_TCRETURNdi64 texternalsym:$dst, imm:$off)>, + Requires<[IsNaCl, In64BitMode]>; + +def : Pat<(X86tcret GR32_TC_64:$dst, imm:$off), + (NACL_CG_TCRETURNri64 GR32_TC_64:$dst, imm:$off)>, + Requires<[IsNaCl, In64BitMode]>; + +// ELF TLS Support + +let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [ESP] in +def NACL_CG_TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), + ".bundle_align_end" + ".bundle_lock" + "leal\t$sym, %eax; " + "call\t___tls_get_addr@PLT" + ".bundle_unlock", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[In32BitMode, IsNaCl]>; + +// These are lowered in X86NaClRewritePass. 
+let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, + FP0, FP1, FP2, FP3, FP4, FP5, FP6, ST0, ST1, + MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, + XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, + XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], + Uses = [RSP] in { +def NACL_CG_GD_TLS_addr64 : I<0, Pseudo, (outs), (ins i32mem:$sym), "", + [(X86tlsaddr tls32addr:$sym)]>, + Requires<[IsNaCl, In64BitMode]>; +def NACL_CG_LE_TLS_addr64 : I<0, Pseudo, (outs), (ins i32mem:$sym), "", + [(X86tlsaddr_le tls32addr:$sym)]>, + Requires<[IsNaCl, In64BitMode]>; +def NACL_CG_IE_TLS_addr64 : I<0, Pseudo, (outs), (ins i32mem:$sym), "", + [(X86tlsaddr_ie tls32addr:$sym)]>, + Requires<[IsNaCl, In64BitMode]>; +// For mtls-use-call. +def NACL_CG_LE_TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "", + [(X86tlsaddr_le tls32addr:$sym)]>, + Requires<[IsNaCl, In32BitMode]>; +def NACL_CG_IE_TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "", + [(X86tlsaddr_ie tls32addr:$sym)]>, + Requires<[IsNaCl, In32BitMode]>; +} + +let usesCustomInserter = 1, Defs = [EFLAGS] in +def NACL_CG_VAARG_64 : I<0, Pseudo, + (outs GR32:$dst), + (ins i8mem:$ap, i32imm:$size, i8imm:$mode, i32imm:$align), + "#NACL_VAARG_64 $dst, $ap, $size, $mode, $align", + [(set GR32:$dst, + (X86vaarg64 addr:$ap, imm:$size, imm:$mode, imm:$align)), + (implicit EFLAGS)]>, + Requires<[IsNaCl, In64BitMode]>; diff --git a/lib/Target/X86/X86JITInfo.cpp b/lib/Target/X86/X86JITInfo.cpp index 764aa5d4f2..4b528f6153 100644 --- a/lib/Target/X86/X86JITInfo.cpp +++ b/lib/Target/X86/X86JITInfo.cpp @@ -18,6 +18,8 @@ #include "X86TargetMachine.h" #include "llvm/Function.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h"//TODO(dschuff):don't forget to remove these +#include "llvm/Support/Disassembler.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Valgrind.h" #include <cstdlib> @@ -82,7 +84,7 @@ static TargetJITInfo::JITCompilerFn JITCompilerFunction; // Provide a wrapper for X86CompilationCallback2 that saves non-traditional // callee saved registers, for the fastcc calling convention. extern "C" { -#if defined(X86_64_JIT) +#if defined(X86_64_JIT) && !defined(__native_client__) # ifndef _MSC_VER // No need to save EAX/EDX for X86-64. void X86CompilationCallback(void); @@ -230,7 +232,11 @@ extern "C" { "popl %ebp\n" CFI(".cfi_adjust_cfa_offset -4\n") CFI(".cfi_restore %ebp\n") +#if defined(__native_client__) // @LOCALMOD-BEGIN + "popl %ecx; nacljmp %ecx\n" +#else "ret\n" +#endif // @LOCALMOD-END CFI(".cfi_endproc\n") SIZE(X86CompilationCallback) ); @@ -295,7 +301,11 @@ extern "C" { "popl %ebp\n" CFI(".cfi_adjust_cfa_offset -4\n") CFI(".cfi_restore %ebp\n") +#if defined(__native_client__) // @LOCALMOD-BEGIN + "popl %ecx; nacljmp %ecx\n" +#else "ret\n" +#endif // @LOCALMOD-END CFI(".cfi_endproc\n") SIZE(X86CompilationCallback_SSE) ); @@ -469,7 +479,14 @@ TargetJITInfo::StubLayout X86JITInfo::getStubLayout() { // The 32-bit stub contains a 5-byte call|jmp. // If the stub is a call to the compilation callback, an extra byte is added // to mark it as a stub. +#ifdef __native_client__ + // NaCl call targets must be bundle-aligned. 
In the case of stubs with + // CALLs, the calls do not need to be aligned to the end of the bundle + // because there is no return + StubLayout Result = {32, 32};//TODO(dschuff): use named constant here +#else StubLayout Result = {14, 4}; +#endif return Result; } @@ -498,6 +515,9 @@ void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, JCE.emitByte(0xE9); JCE.emitWordLE((intptr_t)Target-JCE.getCurrentPCValue()-4); #endif + DEBUG(dbgs() <<"emitted stub: "<< sys::disassembleBuffer( + (uint8_t *)Result,JCE.getCurrentPCValue()-(uintptr_t)Result, + (intptr_t)Result)); return Result; } @@ -519,6 +539,9 @@ void *X86JITInfo::emitFunctionStub(const Function* F, void *Target, // initialize the buffer with garbage, which means it may follow a // noreturn function call, confusing X86CompilationCallback2. PR 4929. JCE.emitByte(0xCE); // Interrupt - Just a marker identifying the stub! + DEBUG(dbgs() <<"emitted stub: "<< sys::disassembleBuffer( + (uint8_t *)Result,JCE.getCurrentPCValue()-(uintptr_t)Result, + (intptr_t)Result)); return Result; } diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index cfd68f74b7..20bc85e65f 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -703,7 +703,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // Emit the call. MCSymbol *PICBase = MF->getPICBaseSymbol(); - TmpInst.setOpcode(X86::CALLpcrel32); + // @LOCALMOD-BEGIN + // For NaCl, the call should be aligned to the end of a bundle. Since the + // call is at the end of the bundle, there should be no padding between + // the call and the next instruction (the label should still make sense). + TmpInst.setOpcode(getSubtarget().isTargetNaCl() ? + X86::NACL_CALL32d : X86::CALLpcrel32); + // @LOCALMOD-END // FIXME: We would like an efficient form for this, so we don't have to do a // lot of extra uniquing. TmpInst.addOperand(MCOperand::CreateExpr(MCSymbolRefExpr::Create(PICBase, diff --git a/lib/Target/X86/X86NaClJITInfo.cpp b/lib/Target/X86/X86NaClJITInfo.cpp new file mode 100644 index 0000000000..e5ccbf960d --- /dev/null +++ b/lib/Target/X86/X86NaClJITInfo.cpp @@ -0,0 +1,393 @@ +//===-- X86JITInfo.cpp - Implement the JIT interfaces for the X86 target --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// +// +// This file implements the JIT interfaces for the X86 target on Native Client +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "jit" +#include "X86NaClJITInfo.h" +#include "X86Relocations.h" +#include "X86Subtarget.h" +#include "X86TargetMachine.h" +#include <cstdlib> +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Disassembler.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/Valgrind.h" +#ifdef __native_client__ +#include <nacl/nacl_dyncode.h> +#endif + +using namespace llvm; + +extern cl::opt<int> FlagSfiX86JmpMask; + +// Determine the platform we're running on +#if defined (__x86_64__) || defined (_M_AMD64) || defined (_M_X64) +# define X86_64_JIT +#elif defined(__i386__) || defined(i386) || defined(_M_IX86) +# define X86_32_JIT +#elif defined(__pnacl__) +#warning "PNaCl does not yet have JIT support" +#else +#error "Should not be building X86NaClJITInfo on non-x86" +// TODO(dschuff): make this work under pnacl self-build? +#endif + +// Get the ASMPREFIX for the current host. This is often '_'. +#ifndef __USER_LABEL_PREFIX__ +#define __USER_LABEL_PREFIX__ +#endif +#define GETASMPREFIX2(X) #X +#define GETASMPREFIX(X) GETASMPREFIX2(X) +#define ASMPREFIX GETASMPREFIX(__USER_LABEL_PREFIX__) + +# define SIZE(sym) ".size " #sym ", . - " #sym "\n" +# define TYPE_FUNCTION(sym) ".type " #sym ", @function\n" + +void X86NaClJITInfo::replaceMachineCodeForFunction(void *Old, void *New) { + // We don't know the original instruction boundaries, so we replace the + // whole bundle. + uint8_t buf[kBundleSize]; + buf[0] = 0xE9; // Emit JMP opcode. + intptr_t OldAddr = ((uintptr_t)Old + 1); + uint32_t NewOffset = (intptr_t)New - OldAddr - 4;// PC-relative offset of new + *((uint32_t*)(buf + 1)) = NewOffset; + memcpy(buf + 5, getNopSequence(kBundleSize - 5), kBundleSize - 5); + +#ifdef __native_client__ + if(nacl_dyncode_create(Old, buf, kBundleSize)) { + report_fatal_error("machine code replacement failed"); + } +#endif + + // X86 doesn't need to invalidate the processor cache, so just invalidate + // Valgrind's cache directly. + sys::ValgrindDiscardTranslations(Old, 5); +} + +/// JITCompilerFunction - This contains the address of the JIT function used to +/// compile a function lazily. +static TargetJITInfo::JITCompilerFn JITCompilerFunction; + +extern "C" { +#if defined(X86_64_JIT) || defined(__pnacl__) || !defined(__native_client__) +void X86NaClCompilationCallback(void) { +//TODO(dschuff): implement for X86-64 +} +void X86NaClCompilationCallback_fastcc(void) { +//TODO(dschuff): implement for X86-64 +} +#else +// Chrome system requirements include PIII, So SSE is present. +// For now this is the same as X86CompilationCallback_SSE +// In the future we could emit this rather than defining it with asm, for +// compatibility with pnacl self-build +// Also omit CFI junk (which is #defined away) + +// The difference between the 2 wrapper variants is that the first returns +// through ecx and the 2nd returns through eax. The fastcc calling convention +// uses ecx to pass arguments, and the C calling convention uses eax to pass +// arguments with the 'inreg' attribute, so we make sure not to clobber it. +// Returning through eax for fastcc and ecx for C clobbers the 'nest' parameter +// breaking nested functions (which are not supported by clang in any case). 
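
The wrapper actually used for a given function is chosen when its lazy-compilation stub is emitted; a sketch of that dispatch follows (the real logic lives in emitFunctionStub further down; PickCompilationCallback is a hypothetical helper, not part of the patch):

#include "llvm/Function.h"  // include path as used elsewhere in this patch

using namespace llvm;

// Sketch of the dispatch performed by emitFunctionStub below: fastcc
// functions get the wrapper that returns through %eax, leaving %ecx (the
// fastcc argument register) untouched; all other functions get the default
// wrapper, which returns through %ecx and so preserves %eax (used for
// 'inreg' arguments under the C calling convention).
static void *PickCompilationCallback(const Function *F, void *DefaultCallback,
                                     void *FastccCallback) {
  return F->getCallingConv() == CallingConv::Fast ? FastccCallback
                                                  : DefaultCallback;
}
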
+ +void X86NaClCompilationCallback(void); +asm( + ".text\n" + ".align 32\n" + ".globl " ASMPREFIX "X86NaClCompilationCallback\n" + TYPE_FUNCTION(X86NaClCompilationCallback) + ASMPREFIX "X86NaClCompilationCallback:\n" + "pushl %ebp\n" + "movl %esp, %ebp\n" // Standard prologue + "pushl %eax\n" + "pushl %edx\n" // Save EAX/EDX/ECX + "pushl %ecx\n" + "andl $-16, %esp\n" // Align ESP on 16-byte boundary + // Save all XMM arg registers + "subl $64, %esp\n" + // FIXME: provide frame move information for xmm registers. + // This can be tricky, because CFA register is ebp (unaligned) + // and we need to produce offsets relative to it. + "movaps %xmm0, (%esp)\n" + "movaps %xmm1, 16(%esp)\n" + "movaps %xmm2, 32(%esp)\n" + "movaps %xmm3, 48(%esp)\n" + "subl $16, %esp\n" + "movl 4(%ebp), %eax\n" // Pass prev frame and return address + "movl %eax, 4(%esp)\n" + "movl %ebp, (%esp)\n" + "call " ASMPREFIX "X86NaClCompilationCallback2\n" + "addl $16, %esp\n" + "movaps 48(%esp), %xmm3\n" + "movaps 32(%esp), %xmm2\n" + "movaps 16(%esp), %xmm1\n" + "movaps (%esp), %xmm0\n" + "movl %ebp, %esp\n" // Restore ESP + "subl $12, %esp\n" + "popl %ecx\n" + "popl %edx\n" + "popl %eax\n" + "popl %ebp\n" + "popl %ecx\n" + "nacljmp %ecx\n" + SIZE(X86NaClCompilationCallback) +); + + + +void X86NaClCompilationCallback_fastcc(void); +asm( + ".text\n" + ".align 32\n" + ".globl " ASMPREFIX "X86NaClCompilationCallback_fastcc\n" + TYPE_FUNCTION(X86NaClCompilationCallback_fastcc) + ASMPREFIX "X86NaClCompilationCallback_fastcc:\n" + "pushl %ebp\n" + "movl %esp, %ebp\n" // Standard prologue + "pushl %eax\n" + "pushl %edx\n" // Save EAX/EDX/ECX + "pushl %ecx\n" + "andl $-16, %esp\n" // Align ESP on 16-byte boundary + // Save all XMM arg registers + "subl $64, %esp\n" + // FIXME: provide frame move information for xmm registers. + // This can be tricky, because CFA register is ebp (unaligned) + // and we need to produce offsets relative to it. + "movaps %xmm0, (%esp)\n" + "movaps %xmm1, 16(%esp)\n" + "movaps %xmm2, 32(%esp)\n" + "movaps %xmm3, 48(%esp)\n" + "subl $16, %esp\n" + "movl 4(%ebp), %eax\n" // Pass prev frame and return address + "movl %eax, 4(%esp)\n" + "movl %ebp, (%esp)\n" + "call " ASMPREFIX "X86NaClCompilationCallback2\n" + "addl $16, %esp\n" + "movaps 48(%esp), %xmm3\n" + "movaps 32(%esp), %xmm2\n" + "movaps 16(%esp), %xmm1\n" + "movaps (%esp), %xmm0\n" + "movl %ebp, %esp\n" // Restore ESP + "subl $12, %esp\n" + "popl %ecx\n" + "popl %edx\n" + "popl %eax\n" + "popl %ebp\n" + "popl %eax\n" + "nacljmp %eax\n" + SIZE(X86NaClCompilationCallback_fastcc) +); +#endif + +/// X86CompilationCallback2 - This is the target-specific function invoked by the +/// function stub when we did not know the real target of a call. This function +/// must locate the start of the stub or call site and pass it into the JIT +/// compiler function. + +// A stub has the following format: +// | Jump opcode (1 byte) | Jump target +22 bytes | 3 bytes of NOPs +// | 18 bytes of NOPs | 1 halt | Call opcode (1 byte) | call target +// The jump targets the call at the end of the bundle, which targets the +// compilation callback. Once the compilation callback JITed the target +// function it replaces the first 8 bytes of the stub in a single atomic +// operation, retargeting the jump at the JITed function. 
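
A self-contained sketch of that stub layout and of the 8-byte retargeting, mirroring emitFunctionStub and X86NaClCompilationCallback2 below (illustrative only, not part of the patch; the two helper names are hypothetical):

#include <cstdint>
#include <cstring>

// Lay out the 32-byte lazy stub exactly as described above.
// Offsets: 0   E9 <+22>   jmp to the call at offset 27
//          5   90 x3      nops (keeps an instruction boundary at byte 8)
//          8   90 x18     nops
//          26  F4         hlt marker ("came from a stub")
//          27  E8 <rel32>  call to the compilation callback
static void BuildLazyStub(uint8_t Stub[32], uintptr_t StubAddr,
                          uintptr_t CallbackAddr) {
  Stub[0] = 0xE9;
  int32_t JmpDisp = 22;                        // lands on the call at +27
  std::memcpy(Stub + 1, &JmpDisp, 4);
  std::memset(Stub + 5, 0x90, 21);             // 3 + 18 nops
  Stub[26] = 0xF4;                             // hlt
  Stub[27] = 0xE8;
  int32_t CallDisp = (int32_t)(CallbackAddr - (StubAddr + 32));
  std::memcpy(Stub + 28, &CallDisp, 4);
}

// After the callee has been JITed, only the first 8 bytes are rewritten
// (atomically, via nacl_dyncode_modify) so the jump now targets the
// compiled function; bytes 5..7 remain nops.
static void RetargetLazyStub(uint8_t First8[8], uintptr_t StubAddr,
                             uintptr_t JITedAddr) {
  int32_t NewDisp = (int32_t)(JITedAddr - (StubAddr + 5));
  std::memcpy(First8 + 1, &NewDisp, 4);
}
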
+ +static uint8_t *BundleRewriteBuffer; + +static void LLVM_ATTRIBUTE_USED +X86NaClCompilationCallback2(intptr_t *StackPtr, intptr_t RetAddr) { + // Get the return address from where the call instruction left it + intptr_t *RetAddrLoc = &StackPtr[1]; + assert(*RetAddrLoc == RetAddr && + "Could not find return address on the stack!"); + + // TODO: take a lock here. figure out whether it has to be the JIT lock or + // can be our own lock (or however we handle thread safety) +#if 0 + DEBUG(dbgs() << "In callback! Addr=" << (void*)RetAddr + << " ESP=" << (void*)StackPtr << "\n"); +#endif + + intptr_t StubStart = RetAddr - 32; + // This probably isn't necessary. I believe the corresponding code in + // X86JITInfo is vestigial, and AFAICT no non-stub calls to the compilation + // callback are generated anywhere. Still it doesn't hurt as a sanity check + bool isStub = *((unsigned char*)StubStart) == 0xE9 && + *((int32_t*)(StubStart + 1)) == 22 && + *((unsigned char*)(StubStart + 26)) == 0xF4; + + assert(isStub && "NaCl doesn't support rewriting non-stub callsites yet"); + + // Backtrack so RetAddr points inside the stub (so JITResolver can find + // which function to compile) + RetAddr -= 4; + + intptr_t NewVal = (intptr_t)JITCompilerFunction((void*)RetAddr); + + // Rewrite the stub's call target, so that we don't end up here every time we + // execute the call. + + // Get the first 8 bytes of the stub + memcpy(BundleRewriteBuffer, (void *)(StubStart), 8); + // Point the jump at the newly-JITed code + *((intptr_t *)(BundleRewriteBuffer + 1)) = NewVal - (StubStart + 5); + + // Copy the new code +#ifdef __native_client__ + if(nacl_dyncode_modify((void *)StubStart, BundleRewriteBuffer, 8)) { + report_fatal_error("dyncode_modify failed"); + } +#endif + // TODO: release the lock + + // Change our return address to execute the new jump + *RetAddrLoc = StubStart; +} + +} + +const int X86NaClJITInfo::kBundleSize; + +TargetJITInfo::LazyResolverFn +X86NaClJITInfo::getLazyResolverFunction(JITCompilerFn F) { + JITCompilerFunction = F; + return X86NaClCompilationCallback; +} + +X86NaClJITInfo::X86NaClJITInfo(X86TargetMachine &tm) : X86JITInfo(tm) { + // FIXME: does LLVM have some way of doing static initialization? 
+#ifndef __pnacl__ + if(posix_memalign((void **)&BundleRewriteBuffer, kBundleSize, kBundleSize)) + report_fatal_error("Could not allocate aligned memory"); +#else + BundleRewriteBuffer = NULL; +#endif + + NopString = new uint8_t[kBundleSize]; + for (int i = 0; i < kBundleSize; i++) NopString[i] = 0x90; + X86Hlt.ins = new uint8_t[1]; + X86Hlt.ins[0] = 0xf4; + X86Hlt.len = 1; +} + +X86NaClJITInfo::~X86NaClJITInfo() { + delete [] NopString; + delete [] X86Hlt.ins; +} + +TargetJITInfo::StubLayout X86NaClJITInfo::getStubLayout() { + // NaCl stubs must be full bundles because calls still have to be aligned + // even if they don't return + StubLayout Result = {kBundleSize, kBundleSize}; + return Result; +} + + +void *X86NaClJITInfo::emitFunctionStub(const Function* F, void *Target, + JITCodeEmitter &JCE) { + bool TargetsCC = Target == (void *)(intptr_t)X86NaClCompilationCallback; + + // If we target the compilation callback, swap it for a different one for + // functions using the fastcc calling convention + if(TargetsCC && F->getCallingConv() == CallingConv::Fast) { + Target = (void *)(intptr_t)X86NaClCompilationCallback_fastcc; + } + + void *Result = (void *)JCE.getCurrentPCValue(); + assert(RoundUpToAlignment((uintptr_t)Result, kBundleSize) == (uintptr_t)Result + && "Unaligned function stub"); + if (!TargetsCC) { + // Jump to the target + JCE.emitByte(0xE9); + JCE.emitWordLE((intptr_t)Target - JCE.getCurrentPCValue() - 4); + // Fill with Nops. + emitNopPadding(JCE, 27); + } else { + // Jump over 22 bytes + JCE.emitByte(0xE9); + JCE.emitWordLE(22); + // emit 3-bytes of nop to ensure an instruction boundary at 8 bytes + emitNopPadding(JCE, 3); + // emit 18 bytes of nop + emitNopPadding(JCE, 18); + // emit 1 byte of halt. This helps CompilationCallback tell whether + // we came from a stub or not + JCE.emitByte(X86Hlt.ins[0]); + // emit a call to the compilation callback + JCE.emitByte(0xE8); + JCE.emitWordLE((intptr_t)Target - JCE.getCurrentPCValue() - 4); + } + return Result; +} + +// Relocations are the same as in X86, but the address being written +// not the same as the address that the offset is relative to (see comment on +// setRelocationBuffer in X86NaClJITInfo.h +void X86NaClJITInfo::relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase) { + for (unsigned i = 0; i != NumRelocs; ++i, ++MR) { + void *RelocPos = RelocationBuffer + MR->getMachineCodeOffset(); + void *RelocTargetPos = (char*)Function + MR->getMachineCodeOffset(); + intptr_t ResultPtr = (intptr_t)MR->getResultPointer(); + switch ((X86::RelocationType)MR->getRelocationType()) { + case X86::reloc_pcrel_word: { + // PC relative relocation, add the relocated value to the value already in + // memory, after we adjust it for where the PC is. + ResultPtr = ResultPtr -(intptr_t)RelocTargetPos - 4 - MR->getConstantVal(); + *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + } + case X86::reloc_picrel_word: { + // PIC base relative relocation, add the relocated value to the value + // already in memory, after we adjust it for where the PIC base is. + ResultPtr = ResultPtr - ((intptr_t)Function + MR->getConstantVal()); + *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + } + case X86::reloc_absolute_word: + case X86::reloc_absolute_word_sext: + // Absolute relocation, just add the relocated value to the value already + // in memory. 
+ *((unsigned*)RelocPos) += (unsigned)ResultPtr; + break; + case X86::reloc_absolute_dword: + *((intptr_t*)RelocPos) += ResultPtr; + break; + } + } +} + +const uint8_t *X86NaClJITInfo::getNopSequence(size_t len) const { + // TODO(dschuff): use more efficient NOPs. + // Update emitNopPadding when it happens + assert((int)len <= kBundleSize && + "Nop sequence can't be more than bundle size"); + return NopString; +} + +void X86NaClJITInfo::emitNopPadding(JITCodeEmitter &JCE, size_t len) { + for (size_t i = 0; i < len; i++) JCE.emitByte(NopString[i]); +} + +const TargetJITInfo::HaltInstruction *X86NaClJITInfo::getHalt() const { + return &X86Hlt; +} + +int X86NaClJITInfo::getBundleSize() const { + return kBundleSize; +} + +int32_t X86NaClJITInfo::getJumpMask() const { + return FlagSfiX86JmpMask; +} diff --git a/lib/Target/X86/X86NaClJITInfo.h b/lib/Target/X86/X86NaClJITInfo.h new file mode 100644 index 0000000000..9416efeff1 --- /dev/null +++ b/lib/Target/X86/X86NaClJITInfo.h @@ -0,0 +1,75 @@ +//=- X86NaClJITInfo.h - X86 implementation of the JIT interface --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the X86 implementation of the TargetJITInfo class for +// Native Client +// +//===----------------------------------------------------------------------===// + +#ifndef X86NACLJITINFO_H +#define X86NACLJITINFO_H + +#include "X86JITInfo.h" +#include "llvm/Function.h" +#include "llvm/CodeGen/JITCodeEmitter.h" +#include "llvm/Target/TargetJITInfo.h" + +namespace llvm { + class X86NaClJITInfo : public X86JITInfo { + void emitNopPadding(JITCodeEmitter &JCE, size_t len); + const X86Subtarget *Subtarget; + uintptr_t PICBase; + uint8_t *NopString; + HaltInstruction X86Hlt; + uint8_t *RelocationBuffer; + public: + static const int kBundleSize = 32; + explicit X86NaClJITInfo(X86TargetMachine &tm); + virtual ~X86NaClJITInfo(); + + virtual void replaceMachineCodeForFunction(void *Old, void *New); + + // getStubLayout - Returns the size and alignment of the largest call stub + // on X86 NaCl. + virtual StubLayout getStubLayout(); + + // Note: the emission and functions MUST NOT touch the target memory + virtual void *emitFunctionStub(const Function* F, void *Target, + JITCodeEmitter &JCE); + /// getLazyResolverFunction - Expose the lazy resolver to the JIT. + virtual LazyResolverFn getLazyResolverFunction(JITCompilerFn); + /// relocate - Before the JIT can run a block of code that has been emitted, + /// it must rewrite the code to contain the actual addresses of any + /// referenced global symbols. + virtual void relocate(void *Function, MachineRelocation *MR, + unsigned NumRelocs, unsigned char* GOTBase); + + virtual char* allocateThreadLocalMemory(size_t size) { + //TODO(dschuff) Implement TLS or decide whether X86 TLS works + assert(0 && "This target does not implement thread local storage!"); + return 0; + } + /// Return a string containing a sequence of NOPs which is valid for + /// the given length + virtual const uint8_t *getNopSequence(size_t len) const; + virtual const HaltInstruction *getHalt() const; + virtual int getBundleSize() const; + virtual int getJumpMask() const; + /// Relocations cannot happen in-place in NaCl because we can't write to + /// code. 
This function takes a pointer to where the code has been emitted, + /// before it is copied to the code region. The subsequent call to + /// relocate takes pointers to the target code location, but rewrites the + /// code in the relocation buffer rather than at the target + virtual void setRelocationBuffer(unsigned char * BufferBegin) { + RelocationBuffer = BufferBegin; + } + }; +} + +#endif diff --git a/lib/Target/X86/X86NaClRewriteFinalPass.cpp b/lib/Target/X86/X86NaClRewriteFinalPass.cpp new file mode 100644 index 0000000000..b6276dc583 --- /dev/null +++ b/lib/Target/X86/X86NaClRewriteFinalPass.cpp @@ -0,0 +1,232 @@ +//=== X86NaClRewriteFinalPass.cpp - Expand NaCl pseudo-instructions --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass expands NaCl pseudo-instructions into real instructions. +// This duplicates much of the functionality found in X86MCNaCl.cpp but is +// needed for non-MC JIT, which doesn't use MC. It expands pseudo instructions +// into bundle-locked groups by emitting a BUNDLE_LOCK marker, +// followed by the instructions, followed by a BUNDLE_UNLOCK marker. +// The Code Emitter needs to ensure the alignment as it emits. Additionallly, +// this pass needs to be run last, or the user at least needs to ensure that +// subsequent passes do not reorder or remove any bundled groups. +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "x86-jit-sandboxing" +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Function.h" + +using namespace llvm; + +extern cl::opt<int> FlagSfiX86JmpMask; + +namespace { + class X86NaClRewriteFinalPass : public MachineFunctionPass { + public: + static char ID; + X86NaClRewriteFinalPass() : MachineFunctionPass(ID), + kJumpMask(FlagSfiX86JmpMask) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "NaCl Pseudo-instruction expansion"; + } + + private: + const int kJumpMask; + const TargetMachine *TM; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + bool Is64Bit; + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + + void TraceLog(const char *fun, + const MachineBasicBlock &MBB, + const MachineBasicBlock::iterator MBBI) const; + + void RewriteIndirectJump(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool Is64Bit, + bool IsCall); + void RewriteDirectCall(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool Is64Bit); + bool ApplyCommonRewrites(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + + }; + + char X86NaClRewriteFinalPass::ID = 0; +} + +void X86NaClRewriteFinalPass::RewriteIndirectJump(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool Is64Bit, + bool IsCall) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + DEBUG(dbgs() << "rewrite indirect jump " << MBB); + + unsigned reg32 = MI.getOperand(0).getReg(); + unsigned reg64 = getX86SubSuperRegister(reg32, MVT::i64); + + if (IsCall) + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::BUNDLE_ALIGN_END)); + + BuildMI(MBB, MBBI, DL, 
TII->get(TargetOpcode::BUNDLE_LOCK)); + + BuildMI(MBB, MBBI, DL, TII->get(X86::AND32ri8)) + .addReg(reg32) + .addReg(reg32) + //.addOperand(MI.getOperand(0))//correct flags, but might be 64bit reg + .addImm(kJumpMask); + + if (Is64Bit) { + BuildMI(MBB, MBBI, DL, TII->get(X86::ADD64rr)) + .addReg(reg64) + .addReg(reg64) + .addReg(X86::R15); + } + + if (IsCall) { + BuildMI(MBB, MBBI, DL, TII->get(Is64Bit ? X86::CALL64r : X86::CALL32r)) + .addReg(Is64Bit ? reg64 : reg32); + } else { + BuildMI(MBB, MBBI, DL, TII->get(Is64Bit ? X86::JMP64r : X86::JMP32r)) + .addReg(Is64Bit ? reg64 : reg32); + } + + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::BUNDLE_UNLOCK)); + MI.eraseFromParent(); + + DEBUG(dbgs() << "done rewrite indirect jump " << MBB); +} + +void X86NaClRewriteFinalPass::RewriteDirectCall(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + bool Is64Bit) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + DEBUG(dbgs() << "rewrite direct call " << MBB); + const MachineOperand &MO = MI.getOperand(0); + // rewrite calls to immediates as indirect calls. + if (MO.isImm()) { + DEBUG(dbgs() << " is immediate " << MO); + // First, rewrite as a move imm->reg + indirect call sequence, + BuildMI(MBB, MBBI, DL, TII->get(X86::MOV32ri)) + .addReg(X86::ECX) + .addOperand(MO); + BuildMI(MBB, MBBI, DL, TII->get(Is64Bit ? X86::CALL64r : X86::CALL32r)) + .addReg(X86::ECX); + // Then use RewriteIndirectJump to sandbox it + MachineBasicBlock::iterator I = MBBI; + --I; // I now points at the call instruction + MI.eraseFromParent(); + return RewriteIndirectJump(MBB, I, Is64Bit, true); + } + + BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::BUNDLE_ALIGN_END)); + + BuildMI(MBB, MBBI, DL, + TII->get(Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32)) + .addOperand(MI.getOperand(0)); + + MI.eraseFromParent(); +} + +bool X86NaClRewriteFinalPass::ApplyCommonRewrites(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + unsigned Opcode = MI.getOpcode(); + switch(Opcode) { + case X86::NACL_CALL32d: + RewriteDirectCall(MBB, MBBI, false); + break; + case X86::NACL_CALL64d: + RewriteDirectCall(MBB, MBBI, true); + break; + case X86::NACL_CALL32r: + RewriteIndirectJump(MBB, MBBI, false, true); + return true; + case X86::NACL_CALL64r: + RewriteIndirectJump(MBB, MBBI, true, true); + return true; + case X86::NACL_JMP32r: + RewriteIndirectJump(MBB, MBBI, false, false); + return true; + case X86::NACL_JMP64r: + RewriteIndirectJump(MBB, MBBI, true, false); + return true; + case X86::NACL_TRAP32: + case X86::NACL_TRAP64: + case X86::NACL_ASPi8: + case X86::NACL_ASPi32: + case X86::NACL_SSPi8: + case X86::NACL_SSPi32: + case X86::NACL_SPADJi32: + case X86::NACL_RESTBPm: + case X86::NACL_RESTBPr: + case X86::NACL_RESTSPm: + case X86::NACL_RESTSPr: + dbgs() << "inst, opcode not handled: " << MI << Opcode; + assert(false && "NaCl Pseudo-inst not handled"); + case X86::NACL_RET32: + case X86::NACL_RET64: + case X86::NACL_RETI32: + assert(false && "Should not get RETs here"); + } + return false; +} + +bool X86NaClRewriteFinalPass::runOnMachineFunction(MachineFunction &MF) { + bool modified = false; + TM = &MF.getTarget(); + TII = TM->getInstrInfo(); + TRI = TM->getRegisterInfo(); + const X86Subtarget *subtarget = &TM->getSubtarget<X86Subtarget>(); + assert(subtarget->isTargetNaCl() && "Target in NaClRewriteFinal is not NaCl"); + + DEBUG(dbgs() << "*************** NaCl Rewrite Final ***************\n"); + DEBUG(dbgs() << " funcnum " << MF.getFunctionNumber() << " " + << 
MF.getFunction()->getName() << "\n"); + + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); + MFI != E; ++MFI) { + modified |= runOnMachineBasicBlock(*MFI); + } + + DEBUG(dbgs() << "************* NaCl Rewrite Final Done *************\n"); + return modified; +} + +bool X86NaClRewriteFinalPass::runOnMachineBasicBlock(MachineBasicBlock &MBB) { + bool modified = false; + for (MachineBasicBlock::iterator MBBI = MBB.begin(), NextMBBI = MBBI; + MBBI != MBB.end(); MBBI = NextMBBI) { + ++NextMBBI; + if (ApplyCommonRewrites(MBB, MBBI)) { + modified = true; + } + } + return modified; +} + +// return an instance of the pass +namespace llvm { + FunctionPass *createX86NaClRewriteFinalPass() { + return new X86NaClRewriteFinalPass(); + } +} diff --git a/lib/Target/X86/X86NaClRewritePass.cpp b/lib/Target/X86/X86NaClRewritePass.cpp new file mode 100644 index 0000000000..7310dcd77a --- /dev/null +++ b/lib/Target/X86/X86NaClRewritePass.cpp @@ -0,0 +1,762 @@ +//=== X86NaClRewritePAss.cpp - Rewrite instructions for NaCl SFI --*- C++ -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains a pass that ensures stores and loads and stack/frame +// pointer addresses are within the NaCl sandbox (for x86-64). +// It also ensures that indirect control flow follows NaCl requirments. +// +// The other major portion of rewriting for NaCl is done in X86InstrNaCl.cpp, +// which is responsible for expanding the NaCl-specific operations introduced +// here and also the intrinsic functions to support setjmp, etc. +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "x86-sandboxing" + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; + +extern cl::opt<bool> FlagUseZeroBasedSandbox; +cl::opt<bool> FlagRestrictR15("sfi-restrict-r15", + cl::desc("Restrict use of %r15. 
This flag can" + " be turned off for the zero-based" + " sandbox model."), + cl::init(true)); + +namespace { + class X86NaClRewritePass : public MachineFunctionPass { + public: + static char ID; + X86NaClRewritePass() : MachineFunctionPass(ID) {} + + virtual bool runOnMachineFunction(MachineFunction &Fn); + + virtual const char *getPassName() const { + return "NaCl Rewrites"; + } + + private: + + const TargetMachine *TM; + const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; + const X86Subtarget *Subtarget; + bool Is64Bit; + + bool runOnMachineBasicBlock(MachineBasicBlock &MBB); + + void TraceLog(const char *func, + const MachineBasicBlock &MBB, + const MachineBasicBlock::iterator MBBI) const; + + bool ApplyRewrites(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + bool ApplyStackSFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + + bool ApplyMemorySFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + + bool ApplyFrameSFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + + bool ApplyControlSFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); + + void PassLightWeightValidator(MachineBasicBlock &MBB); + bool AlignJumpTableTargets(MachineFunction &MF); + }; + + char X86NaClRewritePass::ID = 0; + +} + +static void DumpInstructionVerbose(const MachineInstr &MI); + +static bool IsPushPop(MachineInstr &MI) { + const unsigned Opcode = MI.getOpcode(); + switch (Opcode) { + default: + return false; + case X86::PUSH64r: + case X86::POP64r: + return true; + } +} + +static bool IsStore(MachineInstr &MI) { + return MI.getDesc().mayStore(); +} + +static bool IsLoad(MachineInstr &MI) { + return MI.getDesc().mayLoad(); +} + +static bool IsFrameChange(MachineInstr &MI) { + return MI.modifiesRegister(X86::EBP, NULL) || + MI.modifiesRegister(X86::RBP, NULL); +} + +static bool IsStackChange(MachineInstr &MI) { + return MI.modifiesRegister(X86::ESP, NULL) || + MI.modifiesRegister(X86::RSP, NULL); +} + + +static bool HasControlFlow(const MachineInstr &MI) { + return MI.getDesc().isBranch() || + MI.getDesc().isCall() || + MI.getDesc().isReturn() || + MI.getDesc().isTerminator() || + MI.getDesc().isBarrier(); +} + +static bool IsDirectBranch(const MachineInstr &MI) { + return MI.getDesc().isBranch() && + !MI.getDesc().isIndirectBranch(); +} + +static bool IsRegAbsolute(unsigned Reg) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + const bool RestrictR15 = FlagRestrictR15; + assert(UseZeroBasedSandbox || RestrictR15); + return (Reg == X86::RSP || Reg == X86::RBP || + (Reg == X86::R15 && RestrictR15)); +} + +static bool FindMemoryOperand(const MachineInstr &MI, unsigned* index) { + int NumFound = 0; + unsigned MemOp = 0; + for (unsigned i = 0; i < MI.getNumOperands(); ) { + if (isMem(&MI, i)) { + NumFound++; + MemOp = i; + i += X86::AddrNumOperands; + } else { + i++; + } + } + + // Intrinsics and other functions can have mayLoad and mayStore to reflect + // the side effects of those functions. This function is used to find + // explicit memory references in the instruction, of which there are none. 
+ if (NumFound == 0) + return false; + + if (NumFound > 1) + llvm_unreachable("Too many memory operands in instruction!"); + + *index = MemOp; + return true; +} + +static unsigned PromoteRegTo64(unsigned RegIn) { + if (RegIn == 0) + return 0; + unsigned RegOut = getX86SubSuperRegister(RegIn, MVT::i64, false); + assert(RegOut != 0); + return RegOut; +} + +static unsigned DemoteRegTo32(unsigned RegIn) { + if (RegIn == 0) + return 0; + unsigned RegOut = getX86SubSuperRegister(RegIn, MVT::i32, false); + assert(RegOut != 0); + return RegOut; +} + + +// +// True if this MI restores RSP from RBP with a slight adjustment offset. +// +static bool MatchesSPAdj(const MachineInstr &MI) { + assert (MI.getOpcode() == X86::LEA64r && "Call to MatchesSPAdj w/ non LEA"); + const MachineOperand &DestReg = MI.getOperand(0); + const MachineOperand &BaseReg = MI.getOperand(1); + const MachineOperand &Scale = MI.getOperand(2); + const MachineOperand &IndexReg = MI.getOperand(3); + const MachineOperand &Offset = MI.getOperand(4); + return (DestReg.isReg() && DestReg.getReg() == X86::RSP && + BaseReg.isReg() && BaseReg.getReg() == X86::RBP && + Scale.getImm() == 1 && + IndexReg.isReg() && IndexReg.getReg() == 0 && + Offset.isImm()); +} + +void +X86NaClRewritePass::TraceLog(const char *func, + const MachineBasicBlock &MBB, + const MachineBasicBlock::iterator MBBI) const { + DEBUG(dbgs() << "@" << func << "(" << MBB.getName() << ", " << (*MBBI) << ")\n"); +} + +bool X86NaClRewritePass::ApplyStackSFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + TraceLog("ApplyStackSFI", MBB, MBBI); + assert(Is64Bit); + MachineInstr &MI = *MBBI; + + if (!IsStackChange(MI)) + return false; + + if (IsPushPop(MI)) + return false; + + if (MI.getDesc().isCall()) + return false; + + unsigned Opc = MI.getOpcode(); + DebugLoc DL = MI.getDebugLoc(); + unsigned DestReg = MI.getOperand(0).getReg(); + assert(DestReg == X86::ESP || DestReg == X86::RSP); + + unsigned NewOpc = 0; + switch (Opc) { + case X86::ADD64ri8 : NewOpc = X86::NACL_ASPi8; break; + case X86::ADD64ri32: NewOpc = X86::NACL_ASPi32; break; + case X86::SUB64ri8 : NewOpc = X86::NACL_SSPi8; break; + case X86::SUB64ri32: NewOpc = X86::NACL_SSPi32; break; + case X86::AND64ri32: NewOpc = X86::NACL_ANDSPi32; break; + } + if (NewOpc) { + BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) + .addImm(MI.getOperand(2).getImm()) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + MI.eraseFromParent(); + return true; + } + + // Promote "MOV ESP, EBP" to a 64-bit move + if (Opc == X86::MOV32rr && MI.getOperand(1).getReg() == X86::EBP) { + MI.getOperand(0).setReg(X86::RSP); + MI.getOperand(1).setReg(X86::RBP); + MI.setDesc(TII->get(X86::MOV64rr)); + Opc = X86::MOV64rr; + } + + // "MOV RBP, RSP" is already safe + if (Opc == X86::MOV64rr && MI.getOperand(1).getReg() == X86::RBP) { + return true; + } + + // Promote 32-bit lea to 64-bit lea (does this ever happen?) 
+ assert(Opc != X86::LEA32r && "Invalid opcode in 64-bit mode!"); + if (Opc == X86::LEA64_32r) { + unsigned DestReg = MI.getOperand(0).getReg(); + unsigned BaseReg = MI.getOperand(1).getReg(); + unsigned Scale = MI.getOperand(2).getImm(); + unsigned IndexReg = MI.getOperand(3).getReg(); + assert(DestReg == X86::ESP); + assert(Scale == 1); + assert(BaseReg == X86::EBP); + assert(IndexReg == 0); + MI.getOperand(0).setReg(X86::RSP); + MI.getOperand(1).setReg(X86::RBP); + MI.setDesc(TII->get(X86::LEA64r)); + Opc = X86::LEA64r; + } + + if (Opc == X86::LEA64r && MatchesSPAdj(MI)) { + const MachineOperand &Offset = MI.getOperand(4); + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_SPADJi32)) + .addImm(Offset.getImm()) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + MI.eraseFromParent(); + return true; + } + + if (Opc == X86::MOV32rr || Opc == X86::MOV64rr) { + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_RESTSPr)) + .addReg(DemoteRegTo32(MI.getOperand(1).getReg())) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + MI.eraseFromParent(); + return true; + } + + if (Opc == X86::MOV32rm) { + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_RESTSPm)) + .addOperand(MI.getOperand(1)) // Base + .addOperand(MI.getOperand(2)) // Scale + .addOperand(MI.getOperand(3)) // Index + .addOperand(MI.getOperand(4)) // Offset + .addOperand(MI.getOperand(5)) // Segment + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + MI.eraseFromParent(); + return true; + } + + DumpInstructionVerbose(MI); + llvm_unreachable("Unhandled Stack SFI"); +} + +bool X86NaClRewritePass::ApplyFrameSFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + TraceLog("ApplyFrameSFI", MBB, MBBI); + assert(Is64Bit); + MachineInstr &MI = *MBBI; + + if (!IsFrameChange(MI)) + return false; + + unsigned Opc = MI.getOpcode(); + DebugLoc DL = MI.getDebugLoc(); + + // Handle moves to RBP + if (Opc == X86::MOV64rr) { + assert(MI.getOperand(0).getReg() == X86::RBP); + unsigned SrcReg = MI.getOperand(1).getReg(); + + // MOV RBP, RSP is already safe + if (SrcReg == X86::RSP) + return false; + + // Rewrite: mov %rbp, %rX + // To: naclrestbp %eX, %rZP + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_RESTBPr)) + .addReg(DemoteRegTo32(SrcReg)) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); // rZP + MI.eraseFromParent(); + return true; + } + + // Handle memory moves to RBP + if (Opc == X86::MOV64rm) { + assert(MI.getOperand(0).getReg() == X86::RBP); + + // Zero-based sandbox model uses address clipping + if (UseZeroBasedSandbox) + return false; + + // Rewrite: mov %rbp, (...) + // To: naclrestbp (...), %rZP + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_RESTBPm)) + .addOperand(MI.getOperand(1)) // Base + .addOperand(MI.getOperand(2)) // Scale + .addOperand(MI.getOperand(3)) // Index + .addOperand(MI.getOperand(4)) // Offset + .addOperand(MI.getOperand(5)) // Segment + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); // rZP + MI.eraseFromParent(); + return true; + } + + // Popping onto RBP + // Rewrite to: + // naclrestbp (%rsp), %rZP + // naclasp $8, %rZP + // + // TODO(pdox): Consider rewriting to this instead: + // .bundle_lock + // pop %rbp + // mov %ebp,%ebp + // add %rZP, %rbp + // .bundle_unlock + if (Opc == X86::POP64r) { + assert(MI.getOperand(0).getReg() == X86::RBP); + + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_RESTBPm)) + .addReg(X86::RSP) // Base + .addImm(1) // Scale + .addReg(0) // Index + .addImm(0) // Offset + .addReg(0) // Segment + .addReg(UseZeroBasedSandbox ? 
0 : X86::R15); // rZP + + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_ASPi8)) + .addImm(8) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + + MI.eraseFromParent(); + return true; + } + + DumpInstructionVerbose(MI); + llvm_unreachable("Unhandled Frame SFI"); +} + +bool X86NaClRewritePass::ApplyControlSFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + TraceLog("ApplyControlSFI", MBB, MBBI); + MachineInstr &MI = *MBBI; + + if (!HasControlFlow(MI)) + return false; + + // Direct branches are OK + if (IsDirectBranch(MI)) + return false; + + DebugLoc DL = MI.getDebugLoc(); + unsigned Opc = MI.getOpcode(); + + // Rewrite indirect jump/call instructions + unsigned NewOpc = 0; + switch (Opc) { + // 32-bit + case X86::JMP32r : NewOpc = X86::NACL_JMP32r; break; + case X86::TAILJMPr : NewOpc = X86::NACL_JMP32r; break; + case X86::NACL_CG_CALL32r : NewOpc = X86::NACL_CALL32r; break; + // 64-bit + case X86::NACL_CG_JMP64r : NewOpc = X86::NACL_JMP64r; break; + case X86::NACL_CG_CALL64r : NewOpc = X86::NACL_CALL64r; break; + case X86::NACL_CG_TAILJMPr64 : NewOpc = X86::NACL_JMP64r; break; + } + if (NewOpc) { + MachineInstrBuilder NewMI = + BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) + .addOperand(MI.getOperand(0)); + if (Is64Bit) { + NewMI.addReg(UseZeroBasedSandbox ? 0 : X86::R15); + } + MI.eraseFromParent(); + return true; + } + + // EH_RETURN has a single argment which is not actually used directly. + // The argument gives the location where to reposition the stack pointer + // before returning. EmitPrologue takes care of that repositioning. + // So EH_RETURN just ultimately emits a plain "ret". + // RETI returns and pops some number of bytes from the stack. + if (Opc == X86::RET || Opc == X86::EH_RETURN || Opc == X86::EH_RETURN64 || + Opc == X86::RETI) { + // To maintain compatibility with nacl-as, for now we don't emit naclret. + // MI.setDesc(TII->get(Is64Bit ? X86::NACL_RET64 : X86::NACL_RET32)); + if (Is64Bit) { + BuildMI(MBB, MBBI, DL, TII->get(X86::POP64r), X86::RCX); + if (Opc == X86::RETI) { + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_ASPi32)) + .addOperand(MI.getOperand(0)) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + } + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_JMP64r)) + .addReg(X86::ECX) + .addReg(UseZeroBasedSandbox ? 0 : X86::R15); + } else { + BuildMI(MBB, MBBI, DL, TII->get(X86::POP32r), X86::ECX); + if (Opc == X86::RETI) { + BuildMI(MBB, MBBI, DL, TII->get(X86::ADD32ri), X86::ESP) + .addReg(X86::ESP) + .addOperand(MI.getOperand(0)); + } + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_JMP32r)) + .addReg(X86::ECX); + } + MI.eraseFromParent(); + return true; + } + + // Rewrite trap + if (Opc == X86::TRAP) { + // To maintain compatibility with nacl-as, for now we don't emit nacltrap. + // MI.setDesc(TII->get(Is64Bit ? X86::NACL_TRAP64 : X86::NACL_TRAP32)); + BuildMI(MBB, MBBI, DL, TII->get(X86::MOV32mi)) + .addReg(Is64Bit && !UseZeroBasedSandbox ? 
X86::R15 : 0) // Base + .addImm(1) // Scale + .addReg(0) // Index + .addImm(0) // Offset + .addReg(0) // Segment + .addImm(0); // Value + MI.eraseFromParent(); + return true; + } + + DumpInstructionVerbose(MI); + llvm_unreachable("Unhandled Control SFI"); +} + +// +// Sandboxes loads and stores (64-bit only) +// +bool X86NaClRewritePass::ApplyMemorySFI(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + TraceLog("ApplyMemorySFI", MBB, MBBI); + assert(Is64Bit); + MachineInstr &MI = *MBBI; + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + + if (!IsLoad(MI) && !IsStore(MI)) + return false; + + if (IsPushPop(MI)) + return false; + + unsigned MemOp; + if (!FindMemoryOperand(MI, &MemOp)) + return false; + assert(isMem(&MI, MemOp)); + MachineOperand &BaseReg = MI.getOperand(MemOp + 0); + MachineOperand &Scale = MI.getOperand(MemOp + 1); + MachineOperand &IndexReg = MI.getOperand(MemOp + 2); + //MachineOperand &Disp = MI.getOperand(MemOp + 3); + MachineOperand &SegmentReg = MI.getOperand(MemOp + 4); + + // RIP-relative addressing is safe. + if (BaseReg.getReg() == X86::RIP) + return false; + + // Make sure the base and index are 64-bit registers. + IndexReg.setReg(PromoteRegTo64(IndexReg.getReg())); + BaseReg.setReg(PromoteRegTo64(BaseReg.getReg())); + assert(IndexReg.getSubReg() == 0); + assert(BaseReg.getSubReg() == 0); + + bool AbsoluteBase = IsRegAbsolute(BaseReg.getReg()); + bool AbsoluteIndex = IsRegAbsolute(IndexReg.getReg()); + unsigned AddrReg = 0; + + if (AbsoluteBase && AbsoluteIndex) { + llvm_unreachable("Unexpected absolute register pair"); + } else if (AbsoluteBase) { + AddrReg = IndexReg.getReg(); + } else if (AbsoluteIndex) { + assert(!BaseReg.getReg() && "Unexpected base register"); + assert(Scale.getImm() == 1); + AddrReg = 0; + } else { + if (!BaseReg.getReg()) { + // No base, fill in relative. + BaseReg.setReg(UseZeroBasedSandbox ? 0 : X86::R15); + AddrReg = IndexReg.getReg(); + } else if (!UseZeroBasedSandbox) { + // Switch base and index registers if index register is undefined. + // That is do conversions like "mov d(%r,0,0) -> mov d(%r15, %r, 1)". + assert (!IndexReg.getReg() + && "Unexpected index and base register"); + IndexReg.setReg(BaseReg.getReg()); + Scale.setImm(1); + BaseReg.setReg(X86::R15); + AddrReg = IndexReg.getReg(); + } else { + llvm_unreachable( + "Unexpected index and base register"); + } + } + + if (AddrReg) { + assert(!SegmentReg.getReg() && "Unexpected segment register"); + SegmentReg.setReg(X86::PSEUDO_NACL_SEG); + return true; + } + + return false; +} + +bool X86NaClRewritePass::ApplyRewrites(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI) { + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + unsigned Opc = MI.getOpcode(); + + // These direct jumps need their opcode rewritten + // and variable operands removed. 
+ unsigned NewOpc = 0; + switch (Opc) { + case X86::NACL_CG_CALLpcrel32 : NewOpc = X86::NACL_CALL32d; break; + case X86::TAILJMPd : NewOpc = X86::JMP_4; break; + case X86::NACL_CG_TAILJMPd64 : NewOpc = X86::JMP_4; break; + case X86::NACL_CG_CALL64pcrel32: NewOpc = X86::NACL_CALL64d; break; + } + if (NewOpc) { + BuildMI(MBB, MBBI, DL, TII->get(NewOpc)) + .addOperand(MI.getOperand(0)); + MI.eraseFromParent(); + return true; + } + + if (Opc == X86::NACL_CG_TLS_addr32) { + // Rewrite to nacltlsaddr32 + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_TLS_addr32)) + .addOperand(MI.getOperand(0)) // Base + .addOperand(MI.getOperand(1)) // Scale + .addOperand(MI.getOperand(2)) // Index + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, X86II::MO_TLSGD) + .addOperand(MI.getOperand(4)); // Segment + MI.eraseFromParent(); + return true; + } + + // General Dynamic NaCl TLS model + // http://code.google.com/p/nativeclient/issues/detail?id=1685 + if (Opc == X86::NACL_CG_GD_TLS_addr64) { + + // Rewrite to: + // leaq $sym@TLSGD(%rip), %rdi + // naclcall __tls_get_addr@PLT + BuildMI(MBB, MBBI, DL, TII->get(X86::LEA64r), X86::RDI) + .addReg(X86::RIP) // Base + .addImm(1) // Scale + .addReg(0) // Index + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, + MI.getOperand(3).getTargetFlags()) + .addReg(0); // Segment + BuildMI(MBB, MBBI, DL, TII->get(X86::NACL_CALL64d)) + .addExternalSymbol("__tls_get_addr", X86II::MO_PLT); + MI.eraseFromParent(); + return true; + } + + // Local Exec NaCl TLS Model + if (Opc == X86::NACL_CG_LE_TLS_addr64 || + Opc == X86::NACL_CG_LE_TLS_addr32) { + unsigned CallOpc, LeaOpc, Reg; + // Rewrite to: + // naclcall __nacl_read_tp@PLT + // lea $sym@flag(,%reg), %reg + if (Opc == X86::NACL_CG_LE_TLS_addr64) { + CallOpc = X86::NACL_CALL64d; + LeaOpc = X86::LEA64r; + Reg = X86::RAX; + } else { + CallOpc = X86::NACL_CALL32d; + LeaOpc = X86::LEA32r; + Reg = X86::EAX; + } + BuildMI(MBB, MBBI, DL, TII->get(CallOpc)) + .addExternalSymbol("__nacl_read_tp", X86II::MO_PLT); + BuildMI(MBB, MBBI, DL, TII->get(LeaOpc), Reg) + .addReg(0) // Base + .addImm(1) // Scale + .addReg(Reg) // Index + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, + MI.getOperand(3).getTargetFlags()) + .addReg(0); // Segment + MI.eraseFromParent(); + return true; + } + + // Initial Exec NaCl TLS Model + if (Opc == X86::NACL_CG_IE_TLS_addr64 || + Opc == X86::NACL_CG_IE_TLS_addr32) { + unsigned CallOpc, AddOpc, Base, Reg; + // Rewrite to: + // naclcall __nacl_read_tp@PLT + // addq sym@flag(%base), %reg + if (Opc == X86::NACL_CG_IE_TLS_addr64) { + CallOpc = X86::NACL_CALL64d; + AddOpc = X86::ADD64rm; + Base = X86::RIP; + Reg = X86::RAX; + } else { + CallOpc = X86::NACL_CALL32d; + AddOpc = X86::ADD32rm; + Base = MI.getOperand(3).getTargetFlags() == X86II::MO_INDNTPOFF ? + 0 : X86::EBX; // EBX for GOTNTPOFF. 
+ Reg = X86::EAX; + } + BuildMI(MBB, MBBI, DL, TII->get(CallOpc)) + .addExternalSymbol("__nacl_read_tp", X86II::MO_PLT); + BuildMI(MBB, MBBI, DL, TII->get(AddOpc), Reg) + .addReg(Reg) + .addReg(Base) + .addImm(1) // Scale + .addReg(0) // Index + .addGlobalAddress(MI.getOperand(3).getGlobal(), 0, + MI.getOperand(3).getTargetFlags()) + .addReg(0); // Segment + MI.eraseFromParent(); + return true; + } + + return false; +} + +bool X86NaClRewritePass::AlignJumpTableTargets(MachineFunction &MF) { + bool Modified = true; + + MF.setAlignment(5); // log2, 32 = 2^5 + + MachineJumpTableInfo *JTI = MF.getJumpTableInfo(); + if (JTI != NULL) { + const std::vector<MachineJumpTableEntry> &JT = JTI->getJumpTables(); + for (unsigned i = 0; i < JT.size(); ++i) { + const std::vector<MachineBasicBlock*> &MBBs = JT[i].MBBs; + for (unsigned j = 0; j < MBBs.size(); ++j) { + MBBs[j]->setAlignment(5); + Modified |= true; + } + } + } + return Modified; +} + +bool X86NaClRewritePass::runOnMachineFunction(MachineFunction &MF) { + bool Modified = false; + + TM = &MF.getTarget(); + TII = TM->getInstrInfo(); + TRI = TM->getRegisterInfo(); + Subtarget = &TM->getSubtarget<X86Subtarget>(); + Is64Bit = Subtarget->is64Bit(); + + assert(Subtarget->isTargetNaCl() && "Unexpected target in NaClRewritePass!"); + + DEBUG(dbgs() << "*************** NaCl Rewrite Pass ***************\n"); + for (MachineFunction::iterator MFI = MF.begin(), E = MF.end(); + MFI != E; + ++MFI) { + Modified |= runOnMachineBasicBlock(*MFI); + } + Modified |= AlignJumpTableTargets(MF); + DEBUG(dbgs() << "*************** NaCl Rewrite DONE ***************\n"); + return Modified; +} + +bool X86NaClRewritePass::runOnMachineBasicBlock(MachineBasicBlock &MBB) { + bool Modified = false; + if (MBB.hasAddressTaken()) { + //FIXME: use a symbolic constant or get this value from some configuration + MBB.setAlignment(5); + Modified = true; + } + for (MachineBasicBlock::iterator MBBI = MBB.begin(), NextMBBI = MBBI; + MBBI != MBB.end(); MBBI = NextMBBI) { + ++NextMBBI; + // When one of these methods makes a change, + // it returns true, skipping the others. + if (ApplyRewrites(MBB, MBBI) || + (Is64Bit && ApplyStackSFI(MBB, MBBI)) || + (Is64Bit && ApplyMemorySFI(MBB, MBBI)) || + (Is64Bit && ApplyFrameSFI(MBB, MBBI)) || + ApplyControlSFI(MBB, MBBI)) { + Modified = true; + } + } + return Modified; +} + +static void DumpInstructionVerbose(const MachineInstr &MI) { + dbgs() << MI; + dbgs() << MI.getNumOperands() << " operands:" << "\n"; + for (unsigned i = 0; i < MI.getNumOperands(); ++i) { + const MachineOperand& op = MI.getOperand(i); + dbgs() << " " << i << "(" << op.getType() << "):" << op << "\n"; + } + dbgs() << "\n"; +} + +/// createX86NaClRewritePassPass - returns an instance of the pass. 
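+/// Typical use (as wired up in X86TargetMachine.cpp later in this change):
+///   addPass(createX86NaClRewritePass());
+/// inside addPreEmitPass(), so the rewrite runs just before code emission.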
+namespace llvm { + FunctionPass* createX86NaClRewritePass() { + return new X86NaClRewritePass(); + } +} diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 73ac747742..9054345d35 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -54,6 +54,11 @@ cl::opt<bool> EnableBasePointer("x86-use-base-pointer", cl::Hidden, cl::init(true), cl::desc("Enable use of a base pointer for complex stack frames")); +// @LOCALMOD-BEGIN +extern cl::opt<bool> FlagUseZeroBasedSandbox; +extern cl::opt<bool> FlagRestrictR15; +// @LOCALMOD-END + X86RegisterInfo::X86RegisterInfo(X86TargetMachine &tm, const TargetInstrInfo &tii) : X86GenRegisterInfo(tm.getSubtarget<X86Subtarget>().is64Bit() @@ -365,6 +370,25 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { } } + // @LOCALMOD-START + const X86Subtarget& Subtarget = MF.getTarget().getSubtarget<X86Subtarget>(); + const bool UseZeroBasedSandbox = FlagUseZeroBasedSandbox; + const bool RestrictR15 = FlagRestrictR15; + assert(UseZeroBasedSandbox || RestrictR15); + if (Subtarget.isTargetNaCl64()) { + if (RestrictR15) { + Reserved.set(X86::R15); + Reserved.set(X86::R15D); + Reserved.set(X86::R15W); + Reserved.set(X86::R15B); + } + Reserved.set(X86::RBP); + Reserved.set(X86::EBP); + Reserved.set(X86::BP); + Reserved.set(X86::BPL); + } + // @LOCALMOD-END + return Reserved; } @@ -726,6 +750,9 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, return X86::R14D; case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: return X86::R15D; + // @LOCALMOD. TODO: possibly revert this after LEA .td fixes + case X86::EIP: case X86::RIP: + return X86::EIP; } case MVT::i64: // For 64-bit mode if we've requested a "high" register and the @@ -778,6 +805,9 @@ unsigned getX86SubSuperRegister(unsigned Reg, MVT::SimpleValueType VT, return X86::R14; case X86::R15B: case X86::R15W: case X86::R15D: case X86::R15: return X86::R15; + // @LOCALMOD. 
TODO: possibly revert this after LEA .td fixes + case X86::EIP: case X86::RIP: + return X86::RIP; } } } diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index be6282a643..f3bfe9b328 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -270,6 +270,9 @@ def CR15 : X86Reg<"cr15", 15>; // Pseudo index registers def EIZ : X86Reg<"eiz", 4>; def RIZ : X86Reg<"riz", 4>; + +def PSEUDO_NACL_SEG : X86Reg<"nacl", 4>; // @LOCALMOD + //===----------------------------------------------------------------------===// @@ -336,6 +339,10 @@ def GR16_ABCD : RegisterClass<"X86", [i16], 16, (add AX, CX, DX, BX)>; def GR32_ABCD : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, EBX)>; def GR64_ABCD : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RBX)>; def GR32_TC : RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX)>; +// @LOCALMOD-START +def GR32_TC_64: RegisterClass<"X86", [i32], 32, (add EAX, ECX, EDX, ESI, EDI, + R8D, R9D, R11D)>; +// @LOCALMOD-END def GR64_TC : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, RSI, RDI, R8, R9, R11, RIP)>; def GR64_TCW64 : RegisterClass<"X86", [i64], 64, (add RAX, RCX, RDX, diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index 723e50cc18..a102935b4b 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -35,6 +35,14 @@ X86SelectionDAGInfo::EmitTargetCodeForMemset(SelectionDAG &DAG, DebugLoc dl, MachinePointerInfo DstPtrInfo) const { ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl()) { + // TODO: Can we allow this optimization for Native Client? + // At the very least, pointer size needs to be fixed below. + return SDValue(); + } + // @LOCALMOD-END + // If to a segment-relative address space, use the default lowering. if (DstPtrInfo.getAddrSpace() >= 256) return SDValue(); @@ -190,6 +198,13 @@ X86SelectionDAGInfo::EmitTargetCodeForMemcpy(SelectionDAG &DAG, DebugLoc dl, if (!AlwaysInline && SizeVal > Subtarget->getMaxInlineSizeThreshold()) return SDValue(); + // @LOCALMOD-BEGIN + if (Subtarget->isTargetNaCl()) { + // TODO(pdox): Allow use of the NaCl pseudo-instruction for REP MOV + return SDValue(); + } + // @LOCALMOD-END + /// If not DWORD aligned, it is more efficient to call the library. However /// if calling the library is not allowed (AlwaysInline), then soldier on as /// the code generated here is better than the long load-store sequence we diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index d1ed680287..0132f81410 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -160,7 +160,15 @@ const char *X86Subtarget::getBZeroEntry() const { bool X86Subtarget::IsLegalToCallImmediateAddr(const TargetMachine &TM) const { if (In64BitMode) return false; - return isTargetELF() || TM.getRelocationModel() == Reloc::Static; + // @LOCALMOD-BEGIN + // BUG= http://code.google.com/p/nativeclient/issues/detail?id=2367 + // For NaCl dynamic linking we do not want to generate a text relocation to + // an absolute address in PIC mode. Such a situation arises from + // test/CodeGen/X86/call-imm.ll with the default implementation. + // For other platforms we retain the default behavior. 
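+  // Put differently: on NaCl a call to an immediate address is only legal
+  // when the relocation model is static, so PIC code reaches the callee
+  // through a symbol or a register instead of an absolute address.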
+ return (isTargetELF() && !isTargetNaCl()) || + TM.getRelocationModel() == Reloc::Static; + // @LOCALMOD-END } void X86Subtarget::AutoDetectSubtargetFeatures() { @@ -416,10 +424,11 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU, "64-bit code requested on a subtarget that doesn't support it!"); // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both - // 32 and 64 bit) and for all 64-bit targets. + // 32 and 64 bit), NaCl and for all 64-bit targets. if (StackAlignOverride) stackAlignment = StackAlignOverride; else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || + isTargetNaCl() || // @LOCALMOD In64BitMode) stackAlignment = 16; } diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 8bf4cc77f7..0f8cab52f2 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -192,6 +192,9 @@ public: bool is64Bit() const { return In64BitMode; } + // @LOCALMOD + bool has64BitPointers() const { return is64Bit() && !isTargetNaCl(); } + PICStyles::Style getPICStyle() const { return PICStyle; } void setPICStyle(PICStyles::Style Style) { PICStyle = Style; } diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 158f9dc066..59c037f296 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -43,6 +43,8 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, getSubtargetImpl()->isTargetWindows()) ? "e-p:32:32-f64:64:64-i64:64:64-f80:32:32-f128:128:128-" "n8:16:32-S32" : + getSubtargetImpl()->isTargetNaCl() ? // @LOCALMOD + "e-p:32:32-s:32-f64:64:64-f32:32:32-f80:128:128-i64:64:64-n8:16:32-S128" : "e-p:32:32-f64:32:64-i64:32:64-f80:32:32-f128:128:128-" "n8:16:32-S128"), InstrInfo(*this), @@ -60,7 +62,10 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) : X86TargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, true), - DL("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" + DL(getSubtargetImpl()->isTargetNaCl() ? 
// @LOCALMOD + "e-p:32:32-s:64-f64:64:64-f32:32:32-f80:128:128-i64:64:64-" + "n8:16:32:64-S128" : + "e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" "n8:16:32:64-S128"), InstrInfo(*this), TSInfo(*this), @@ -192,11 +197,25 @@ bool X86PassConfig::addPreEmitPass() { ShouldPrint = true; } + // @LOCALMOD-START + if (getX86Subtarget().isTargetNaCl()) { + addPass(createX86NaClRewritePass()); + ShouldPrint = true; + } + // @LOCALMOD-END + return ShouldPrint; } bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE) { + // @LOCALMOD-START + // Add this pass here instead of as a PreEmitPass because this function is + // only called in JIT mode + if (Subtarget.isTargetNaCl()) { + PM.add(createX86NaClRewriteFinalPass()); + } + // @LOCALMOD-END PM.add(createX86JITCodeEmitterPass(*this, JCE)); return false; diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index 12311a1abf..967ce95d10 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -19,6 +19,9 @@ #include "X86ISelLowering.h" #include "X86FrameLowering.h" #include "X86JITInfo.h" +#ifdef __native_client__ +#include "X86NaClJITInfo.h" +#endif #include "X86SelectionDAGInfo.h" #include "X86Subtarget.h" #include "llvm/Target/TargetMachine.h" @@ -80,7 +83,11 @@ class X86_32TargetMachine : public X86TargetMachine { X86InstrInfo InstrInfo; X86SelectionDAGInfo TSInfo; X86TargetLowering TLInfo; +#ifdef __native_client__ + X86NaClJITInfo JITInfo; +#else X86JITInfo JITInfo; +#endif ScalarTargetTransformImpl STTI; X86VectorTargetTransformInfo VTTI; public: diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 92aee0dd3f..4f39d68d40 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -9,6 +9,7 @@ #include "X86TargetObjectFile.h" #include "X86TargetMachine.h" +#include "X86Subtarget.h" // @LOCALMOD #include "llvm/ADT/StringExtras.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" @@ -51,3 +52,30 @@ X86LinuxTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM) { TargetLoweringObjectFileELF::Initialize(Ctx, TM); InitializeELF(TM.Options.UseInitArray); } + +// @LOCALMOD-START +// NOTE: this was largely lifted from +// lib/Target/ARM/ARMTargetObjectFile.cpp +// +// The default is .ctors/.dtors while the arm backend uses +// .init_array/.fini_array +// +// Without this the linker defined symbols __fini_array_start and +// __fini_array_end do not have useful values. 
c.f.: +// http://code.google.com/p/nativeclient/issues/detail?id=805 +void TargetLoweringObjectFileNaCl::Initialize(MCContext &Ctx, + const TargetMachine &TM) { + TargetLoweringObjectFileELF::Initialize(Ctx, TM); + + StaticCtorSection = + getContext().getELFSection(".init_array", ELF::SHT_INIT_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); + StaticDtorSection = + getContext().getELFSection(".fini_array", ELF::SHT_FINI_ARRAY, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getDataRel()); +} +// @LOCALMOD-END diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index 2d320c594c..5fac48e57a 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -38,6 +38,13 @@ namespace llvm { virtual void Initialize(MCContext &Ctx, const TargetMachine &TM); }; + // @LOCALMOD-BEGIN + class TargetLoweringObjectFileNaCl : public TargetLoweringObjectFileELF { + public: + virtual void Initialize(MCContext &ctx, const TargetMachine &TM); + }; + // @LOCALMOD-END + } // end namespace llvm #endif diff --git a/lib/Transforms/CMakeLists.txt b/lib/Transforms/CMakeLists.txt index de1353e6c1..9fa690971a 100644 --- a/lib/Transforms/CMakeLists.txt +++ b/lib/Transforms/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(Scalar) add_subdirectory(IPO) add_subdirectory(Vectorize) add_subdirectory(Hello) +add_subdirectory(NaCl) diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index 6716deb9e4..05aefeff9f 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -58,6 +58,15 @@ namespace { continue; if (I->getName() == "llvm.global_ctors") continue; + // @LOCALMOD-BEGIN - this is likely upstreamable + // Note: there will likely be more cases once this + // is exercises more thorougly. + if (I->getName() == "llvm.global_dtors") + continue; + // not observed yet + if (I->hasExternalWeakLinkage()) + continue; + // @LOCALMOD-END } bool Local = I->hasLocalLinkage(); @@ -78,8 +87,15 @@ namespace { if (!Delete) { if (I->hasAvailableExternallyLinkage()) continue; + // @LOCALMOD-BEGIN - this is likely upstreamable + // Note: there will likely be more cases once this + // is exercises more thorougly. + // observed for pthread_cancel + if (I->hasExternalWeakLinkage()) + continue; + // @LOCALMOD-END } - + bool Local = I->hasLocalLinkage(); if (Local) I->setVisibility(GlobalValue::HiddenVisibility); diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 5ad6f9111c..4f4c388a92 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1146,8 +1146,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) { // If we are removing arguments to the function, emit an obnoxious warning. if (FT->getNumParams() < NumActualArgs) { if (!FT->isVarArg()) { - errs() << "WARNING: While resolving call to function '" - << Callee->getName() << "' arguments were dropped!\n"; + if (Callee->getName() != "main") { // @LOCALMOD + errs() << "WARNING: While resolving call to function '" + << Callee->getName() << "' arguments were dropped!\n"; + } } else { // Add all of the arguments in their promoted form to the arg list. 
for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) { diff --git a/lib/Transforms/LLVMBuild.txt b/lib/Transforms/LLVMBuild.txt index f7bca064c7..001ba5d232 100644 --- a/lib/Transforms/LLVMBuild.txt +++ b/lib/Transforms/LLVMBuild.txt @@ -16,7 +16,7 @@ ;===------------------------------------------------------------------------===; [common] -subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize +subdirectories = IPO InstCombine Instrumentation Scalar Utils Vectorize NaCl [component_0] type = Group diff --git a/lib/Transforms/Makefile b/lib/Transforms/Makefile index 8b1df92fa2..ae03ff32c5 100644 --- a/lib/Transforms/Makefile +++ b/lib/Transforms/Makefile @@ -8,7 +8,11 @@ ##===----------------------------------------------------------------------===## LEVEL = ../.. -PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello +PARALLEL_DIRS = Utils Instrumentation Scalar InstCombine IPO Vectorize Hello NaCl + +ifeq ($(NACL_SANDBOX),1) + PARALLEL_DIRS := $(filter-out Hello, $(PARALLEL_DIRS)) +endif include $(LEVEL)/Makefile.config diff --git a/lib/Transforms/NaCl/CMakeLists.txt b/lib/Transforms/NaCl/CMakeLists.txt new file mode 100644 index 0000000000..d634ad9655 --- /dev/null +++ b/lib/Transforms/NaCl/CMakeLists.txt @@ -0,0 +1,5 @@ +add_llvm_library(LLVMTransformsNaCl + ExpandCtors.cpp + ) + +add_dependencies(LLVMTransformsNaCl intrinsics_gen) diff --git a/lib/Transforms/NaCl/ExpandCtors.cpp b/lib/Transforms/NaCl/ExpandCtors.cpp new file mode 100644 index 0000000000..6b8130e4fb --- /dev/null +++ b/lib/Transforms/NaCl/ExpandCtors.cpp @@ -0,0 +1,145 @@ +//===- ExpandCtors.cpp - Convert ctors/dtors to concrete arrays -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass converts LLVM's special symbols llvm.global_ctors and +// llvm.global_dtors to concrete arrays, __init_array_start/end and +// __fini_array_start/end, that are usable by a C library. +// +// This pass sorts the contents of global_ctors/dtors according to the +// priority values they contain and removes the priority values. +// +//===----------------------------------------------------------------------===// + +#include <vector> + +#include "llvm/Constants.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Instructions.h" +#include "llvm/Module.h" +#include "llvm/Pass.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/NaCl.h" +#include "llvm/TypeBuilder.h" + +using namespace llvm; + +namespace { + struct ExpandCtors : public ModulePass { + static char ID; // Pass identification, replacement for typeid + ExpandCtors() : ModulePass(ID) { + initializeExpandCtorsPass(*PassRegistry::getPassRegistry()); + } + + virtual bool runOnModule(Module &M); + }; +} + +char ExpandCtors::ID = 0; +INITIALIZE_PASS(ExpandCtors, "nacl-expand-ctors", + "Hook up constructor and destructor arrays to libc", + false, false) + +static void setGlobalVariableValue(Module &M, const char *Name, + Constant *Value) { + GlobalVariable *Var = M.getNamedGlobal(Name); + if (!Var) { + // This warning can happen in a program that does not use a libc + // and so does not call the functions in __init_array_start or + // __fini_array_end. Such a program might be linked with + // "-nostdlib". 
+ errs() << "Warning: Variable " << Name << " not referenced\n"; + } else { + if (Var->hasInitializer()) { + report_fatal_error(std::string("Variable ") + Name + + " already has an initializer"); + } + Var->replaceAllUsesWith(ConstantExpr::getBitCast(Value, Var->getType())); + Var->eraseFromParent(); + } +} + +struct FuncArrayEntry { + uint64_t priority; + Constant *func; +}; + +static bool compareEntries(FuncArrayEntry Entry1, FuncArrayEntry Entry2) { + return Entry1.priority < Entry2.priority; +} + +static void defineFuncArray(Module &M, const char *LlvmArrayName, + const char *StartSymbol, + const char *EndSymbol) { + std::vector<Constant*> Funcs; + + GlobalVariable *Array = M.getNamedGlobal(LlvmArrayName); + if (Array) { + if (Array->hasInitializer() && !Array->getInitializer()->isNullValue()) { + ConstantArray *InitList = cast<ConstantArray>(Array->getInitializer()); + std::vector<FuncArrayEntry> FuncsToSort; + for (unsigned Index = 0; Index < InitList->getNumOperands(); ++Index) { + ConstantStruct *CS = cast<ConstantStruct>(InitList->getOperand(Index)); + FuncArrayEntry Entry; + Entry.priority = cast<ConstantInt>(CS->getOperand(0))->getZExtValue(); + Entry.func = CS->getOperand(1); + FuncsToSort.push_back(Entry); + } + + std::sort(FuncsToSort.begin(), FuncsToSort.end(), compareEntries); + for (std::vector<FuncArrayEntry>::iterator Iter = FuncsToSort.begin(); + Iter != FuncsToSort.end(); + ++Iter) { + Funcs.push_back(Iter->func); + } + } + // No code should be referencing global_ctors/global_dtors, + // because this symbol is internal to LLVM. + Array->eraseFromParent(); + } + + Type *FuncTy = FunctionType::get(Type::getVoidTy(M.getContext()), false); + Type *FuncPtrTy = FuncTy->getPointerTo(); + ArrayType *ArrayTy = ArrayType::get(FuncPtrTy, Funcs.size()); + GlobalVariable *NewArray = + new GlobalVariable(M, ArrayTy, /* isConstant= */ true, + GlobalValue::InternalLinkage, + ConstantArray::get(ArrayTy, Funcs)); + setGlobalVariableValue(M, StartSymbol, NewArray); + // We do this last so that LLVM gives NewArray the name + // "__{init,fini}_array_start" without adding any suffixes to + // disambiguate from the original GlobalVariable's name. This is + // not essential -- it just makes the output easier to understand + // when looking at symbols for debugging. + NewArray->setName(StartSymbol); + + // We replace "__{init,fini}_array_end" with the address of the end + // of NewArray. This removes the name "__{init,fini}_array_end" + // from the output, which is not ideal for debugging. Ideally we + // would convert "__{init,fini}_array_end" to being a GlobalAlias + // that points to the end of the array. However, unfortunately LLVM + // does not generate correct code when a GlobalAlias contains a + // GetElementPtr ConstantExpr. 
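+  // Roughly, in IR terms (illustrative, for N collected functions):
+  //   @__{init,fini}_array_start = internal constant [N x void ()*] [...]
+  // and __{init,fini}_array_end is replaced by the constant expression
+  //   getelementptr ([N x void ()*]* @__{init,fini}_array_start, i32 1)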
+ Constant *NewArrayEnd = + ConstantExpr::getGetElementPtr(NewArray, + ConstantInt::get(M.getContext(), + APInt(32, 1))); + setGlobalVariableValue(M, EndSymbol, NewArrayEnd); +} + +bool ExpandCtors::runOnModule(Module &M) { + defineFuncArray(M, "llvm.global_ctors", + "__init_array_start", "__init_array_end"); + defineFuncArray(M, "llvm.global_dtors", + "__fini_array_start", "__fini_array_end"); + return true; +} + +ModulePass *llvm::createExpandCtorsPass() { + return new ExpandCtors(); +} diff --git a/lib/Transforms/NaCl/LLVMBuild.txt b/lib/Transforms/NaCl/LLVMBuild.txt new file mode 100644 index 0000000000..2f1522b3e5 --- /dev/null +++ b/lib/Transforms/NaCl/LLVMBuild.txt @@ -0,0 +1,23 @@ +;===- ./lib/Transforms/NaCl/LLVMBuild.txt ----------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = NaCl +parent = Transforms +library_name = NaCl +required_libraries = Core diff --git a/lib/Transforms/NaCl/Makefile b/lib/Transforms/NaCl/Makefile new file mode 100644 index 0000000000..ecf8db6eae --- /dev/null +++ b/lib/Transforms/NaCl/Makefile @@ -0,0 +1,15 @@ +##===- lib/Transforms/NaCl/Makefile-------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../../.. +LIBRARYNAME = LLVMTransformsNaCl +BUILD_ARCHIVE = 1 + +include $(LEVEL)/Makefile.common + diff --git a/lib/Transforms/Scalar/CMakeLists.txt b/lib/Transforms/Scalar/CMakeLists.txt index b3fc6e338c..06ef4b4a9b 100644 --- a/lib/Transforms/Scalar/CMakeLists.txt +++ b/lib/Transforms/Scalar/CMakeLists.txt @@ -32,6 +32,7 @@ add_llvm_library(LLVMScalarOpts SimplifyLibCalls.cpp Sink.cpp TailRecursionElimination.cpp + NaClCcRewrite.cpp ) add_dependencies(LLVMScalarOpts intrinsics_gen) diff --git a/lib/Transforms/Scalar/NaClCcRewrite.cpp b/lib/Transforms/Scalar/NaClCcRewrite.cpp new file mode 100644 index 0000000000..5eace7f39d --- /dev/null +++ b/lib/Transforms/Scalar/NaClCcRewrite.cpp @@ -0,0 +1,1053 @@ +//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements calling convention rewrite for Native Client to ensure +// compatibility between pnacl and gcc generated code when calling +// ppapi interface functions. 
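+// For example (illustrative, based on the x86-64 byval rule below): a
+// struct PP_Var passed by value is rewritten from a byval pointer argument
+// into two i64 register arguments (rule "s(iis(d))" => "ll"), matching how
+// gcc passes the same struct on x86-64.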
+//===----------------------------------------------------------------------===//
+
+
+// Major TODOs:
+// * dealing with vararg
+//   (We should exclude all vararg functions and calls to them from rewrites)
+
+#define DEBUG_TYPE "naclcc"
+
+#include "llvm/Argument.h"
+#include "llvm/Attributes.h"
+#include "llvm/Constant.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instruction.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Transforms/Scalar.h"
+
+#include <vector>
+
+using namespace llvm;
+
+namespace llvm {
+
+cl::opt<bool> FlagEnableCcRewrite(
+  "nacl-cc-rewrite",
+  cl::desc("enable NaCl CC rewrite"));
+}
+
+namespace {
+
+// This represents a rule for rewriting types
+struct TypeRewriteRule {
+  const char* src;   // type pattern we are trying to match
+  const char* dst;   // replacement type
+  const char* name;  // name of the rule for diagnosis
+};
+
+// Note: all rules must be well-formed
+// * parentheses must match
+// * TODO: add verification for this
+
+// Legend:
+// s(): struct (also used for unions)
+// c:   char (= 8 bit int) (only allowed for src)
+// i:   32 bit int
+// l:   64 bit int
+// f:   32 bit float
+// d:   64 bit float (= double)
+// p:   untyped pointer (only allowed for src)
+// P(): typed pointer (currently not used, only allowed for src)
+// F:   generic function type (only allowed for src)
+
+// The X8664 Rewrite rules are also subject to
+// register constraints, c.f.: section 3.2.3
+// http://www.x86-64.org/documentation/abi.pdf
+// (roughly) for X8664: up to 2 regs per struct can be used for struct passing
+// and up to 2 regs for struct returns
+// The rewrite rules are straightforward except for: s(iis(d)) => ll
+// which would be straightforward if the frontend had lowered the union inside
+// of PP_Var to s(l) instead of s(d), yielding: s(iis(l)) => ll
+TypeRewriteRule ByvalRulesX8664[] = {
+  {"s(iis(d))", "ll", "PP_Var"},
+  {"s(pp)",     "l",  "PP_ArrayOutput"},
+  {"s(ppi)",    "li", "PP_CompletionCallback"},
+  {0, 0, 0},
+};
+
+TypeRewriteRule SretRulesX8664[] = {
+  // Note: for srets, multireg returns are modeled as struct returns
+  {"s(iis(d))", "s(ll)", "PP_Var"},
+  {"s(ff)",     "d",     "PP_FloatPoint"},
+  {"s(ii)",     "l",     "PP_Point"},
+  {"s(pp)",     "l",     "PP_ArrayOutput"},
+  {0, 0, 0},
+};
+
+// for ARM: up to 4 regs can be used for struct passing
+// and up to 2 float regs for struct returns
+TypeRewriteRule ByvalRulesARM[] = {
+  {"s(iis(d))", "ll",  "PP_Var"},
+  {"s(ppi)",    "iii", "PP_CompletionCallback"},
+  {"s(pp)",     "ii",  "PP_ArrayOutput"},
+  {0, 0, 0},
+};
+
+TypeRewriteRule SretRulesARM[] = {
+  // Note: for srets, multireg returns are modeled as struct returns
+  {"s(ff)", "s(ff)", "PP_FloatPoint"},
+  {0, 0, 0},
+};
+
+// Helper class to model Register Usage as required by
+// the x86-64 calling conventions
+class RegUse {
+  uint32_t n_int_;
+  uint32_t n_float_;
+
+ public:
+  RegUse(uint32_t n_int=0, uint32_t n_float=0) :
+    n_int_(n_int), n_float_(n_float) {}
+
+  static RegUse OneIntReg() { return RegUse(1, 0); }
+  static RegUse OnePointerReg() { return RegUse(1, 0); }
+  static RegUse OneFloatReg() { return RegUse(0, 1); }
+
+  RegUse operator+(RegUse other) const {
+    return RegUse(n_int_ + other.n_int_, n_float_ + other.n_float_); }
+  RegUse operator-(RegUse other) const {
+    return RegUse(n_int_ - other.n_int_, n_float_ - other.n_float_); }
+  bool operator==(RegUse other) const {
+    return n_int_ == other.n_int_ && n_float_ == other.n_float_; }
+  bool operator!=(RegUse other) const {
+    return n_int_ != other.n_int_ || n_float_ != other.n_float_; }
+  bool operator<=(RegUse other) const {
+    return n_int_ <= other.n_int_ && n_float_ <= other.n_float_; }
+  bool operator<(RegUse other) const {
+    return n_int_ < other.n_int_ && n_float_ < other.n_float_; }
+  bool operator>=(RegUse other) const {
+    return n_int_ >= other.n_int_ && n_float_ >= other.n_float_; }
+  bool operator>(RegUse other) const {
+    return n_int_ > other.n_int_ && n_float_ > other.n_float_; }
+  RegUse& operator+=(const RegUse& other) {
+    n_int_ += other.n_int_; n_float_ += other.n_float_; return *this;}
+  RegUse& operator-=(const RegUse& other) {
+    n_int_ -= other.n_int_; n_float_ -= other.n_float_; return *this;}
+
+  friend raw_ostream& operator<<(raw_ostream &O, const RegUse& reg);
+};
+
+raw_ostream& operator<<(raw_ostream &O, const RegUse& reg) {
+  O << "(" << reg.n_int_ << ", " << reg.n_float_ << ")";
+  return O;
+}
+
+// TODO: Find a better way to determine the architecture
+const TypeRewriteRule* GetByvalRewriteRulesForTarget(
+    const TargetLowering* tli) {
+  if (!FlagEnableCcRewrite) return 0;
+
+  const TargetMachine &m = tli->getTargetMachine();
+  const StringRef triple = m.getTargetTriple();
+
+  if (0 == triple.find("x86_64")) return ByvalRulesX8664;
+  if (0 == triple.find("i686")) return 0;
+  if (0 == triple.find("armv7a")) return ByvalRulesARM;
+
+  llvm_unreachable("Unknown arch");
+  return 0;
+}
+
+// TODO: Find a better way to determine the architecture
+const TypeRewriteRule* GetSretRewriteRulesForTarget(
+    const TargetLowering* tli) {
+  if (!FlagEnableCcRewrite) return 0;
+
+  const TargetMachine &m = tli->getTargetMachine();
+  const StringRef triple = m.getTargetTriple();
+
+  if (0 == triple.find("x86_64")) return SretRulesX8664;
+  if (0 == triple.find("i686")) return 0;
+  if (0 == triple.find("armv7a")) return SretRulesARM;
+
+  llvm_unreachable("Unknown arch");
+  return 0;
+}
+
+// TODO: Find a better way to determine the architecture
+// Describes the number of registers available for function
+// argument passing which may affect rewrite decisions on
+// some platforms.
+RegUse GetAvailableRegsForTarget(
+    const TargetLowering* tli) {
+  if (!FlagEnableCcRewrite) return RegUse(0, 0);
+
+  const TargetMachine &m = tli->getTargetMachine();
+  const StringRef triple = m.getTargetTriple();
+
+  // integer: RDI, RSI, RDX, RCX, R8, R9
+  // float: XMM0, ..., XMM7
+  if (0 == triple.find("x86_64")) return RegUse(6, 8);
+  // unused
+  if (0 == triple.find("i686")) return RegUse(0, 0);
+  // no constraints enforced here - the backend handles all the details
+  uint32_t max = std::numeric_limits<uint32_t>::max();
+  if (0 == triple.find("armv7a")) return RegUse(max, max);
+
+  llvm_unreachable("Unknown arch");
+  return 0;
+}
+
+// This class represents a bitcode rewrite pass which ensures
+// that all ppapi interfaces are calling convention compatible
+// with gcc. This pass is architecture dependent.
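+// Sketch of the effect on a function signature (names are illustrative):
+//   before: define void @f(%struct.PP_Var* byval %v)
+//   after:  define void @f(i64 %v_split, i64 %v_split1)
+// Call sites are rewritten to match by bitcasting the callee to the new
+// function pointer type.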
+struct NaClCcRewrite : public FunctionPass { + static char ID; // Pass identification, replacement for typeid + const TypeRewriteRule* SretRewriteRules; + const TypeRewriteRule* ByvalRewriteRules; + const RegUse AvailableRegs; + + explicit NaClCcRewrite(const TargetLowering *tli = 0) + : FunctionPass(ID), + SretRewriteRules(GetSretRewriteRulesForTarget(tli)), + ByvalRewriteRules(GetByvalRewriteRulesForTarget(tli)), + AvailableRegs(GetAvailableRegsForTarget(tli)) { + initializeNaClCcRewritePass(*PassRegistry::getPassRegistry()); + } + + // main pass entry point + bool runOnFunction(Function &F); + + private: + void RewriteCallsite(Instruction* call, LLVMContext& C); + void RewriteFunctionPrologAndEpilog(Function& F); +}; + +char NaClCcRewrite::ID = 0; + +// This is only used for dst side of rules +Type* GetElementaryType(char c, LLVMContext& C) { + switch (c) { + case 'i': + return Type::getInt32Ty(C); + case 'l': + return Type::getInt64Ty(C); + case 'd': + return Type::getDoubleTy(C); + case 'f': + return Type::getFloatTy(C); + default: + dbgs() << c << "\n"; + llvm_unreachable("Unknown type specifier"); + return 0; + } +} + +// This is only used for the dst side of a rule +int GetElementaryTypeWidth(char c) { + switch (c) { + case 'i': + case 'f': + return 4; + case 'l': + case 'd': + return 8; + default: + llvm_unreachable("Unknown type specifier"); + return 0; + } +} + +// Check whether a type matches the *src* side pattern of a rewrite rule. +// Note that the pattern parameter is updated during the recursion +bool HasRewriteType(const Type* type, const char*& pattern) { + switch (*pattern++) { + case '\0': + return false; + case ')': + return false; + case 's': // struct and union are currently no distinguished + { + if (*pattern++ != '(') llvm_unreachable("malformed type pattern"); + if (!type->isStructTy()) return false; + // check struct members + const StructType* st = cast<StructType>(type); + for (StructType::element_iterator it = st->element_begin(), + end = st->element_end(); + it != end; + ++it) { + if (!HasRewriteType(*it, pattern)) return false; + } + // ensure we reached the end + int c = *pattern++; + return c == ')'; + } + break; + case 'c': + return type->isIntegerTy(8); + case 'i': + return type->isIntegerTy(32); + case 'l': + return type->isIntegerTy(64); + case 'd': + return type->isDoubleTy(); + case 'f': + return type->isFloatTy(); + case 'F': + return type->isFunctionTy(); + case 'p': // untyped pointer + return type->isPointerTy(); + case 'P': // typed pointer + { + if (*pattern++ != '(') llvm_unreachable("malformed type pattern"); + if (!type->isPointerTy()) return false; + Type* pointee = dyn_cast<PointerType>(type)->getElementType(); + if (!HasRewriteType(pointee, pattern)) return false; + int c = *pattern++; + return c == ')'; + } + default: + llvm_unreachable("Unknown type specifier"); + return false; + } +} + +RegUse RegUseForRewriteRule(const TypeRewriteRule* rule) { + const char* pattern = std::string("C") == rule->dst ? 
rule->src : rule->dst; + RegUse result(0, 0); + while (char c = *pattern++) { + // Note, we only support a subset here, complex types (s, P) + // would require more work + switch (c) { + case 'i': + case 'l': + result += RegUse::OneIntReg(); + break; + case 'd': + case 'f': + result += RegUse::OneFloatReg(); + break; + default: + dbgs() << c << "\n"; + llvm_unreachable("unexpected return type"); + } + } + return result; +} + +// Note, this only has to be accurate for x86-64 and is intentionally +// quite strict so that we know when to add support for new types. +// Ideally, unexpected types would be flagged by a bitcode checker. +RegUse RegUseForType(const Type* t) { + if (t->isPointerTy()) { + return RegUse::OnePointerReg(); + } else if (t->isFloatTy() || t->isDoubleTy()) { + return RegUse::OneFloatReg(); + } else if (t->isIntegerTy()) { + const IntegerType* it = dyn_cast<const IntegerType>(t); + unsigned width = it->getBitWidth(); + // x86-64 assumption here - use "register info" to make this better + if (width <= 64) return RegUse::OneIntReg(); + } + + dbgs() << *const_cast<Type*>(t) << "\n"; + llvm_unreachable("unexpected type in RegUseForType"); +} + +// Match a type against a set of rewrite rules. +// Return the matching rule, if any. +const TypeRewriteRule* MatchRewriteRules( + const Type* type, const TypeRewriteRule* rules) { + if (rules == 0) return 0; + for (; rules->name != 0; ++rules) { + const char* pattern = rules->src; + if (HasRewriteType(type, pattern)) return rules; + } + return 0; +} + +// Same as MatchRewriteRules but "dereference" type first. +const TypeRewriteRule* MatchRewriteRulesPointee(const Type* t, + const TypeRewriteRule* Rules) { + // sret and byval are both modelled as pointers + const PointerType* pointer = dyn_cast<PointerType>(t); + if (pointer == 0) return 0; + + return MatchRewriteRules(pointer->getElementType(), Rules); +} + +// Note, the attributes are not part of the type but are stored +// with the CallInst and/or the Function (if any) +Type* CreateFunctionPointerType(Type* result_type, + std::vector<Type*>& arguments) { + FunctionType* ft = FunctionType::get(result_type, + arguments, + false); + return PointerType::getUnqual(ft); +} + +// Determines whether a function body needs a rewrite +bool FunctionNeedsRewrite(const Function* fun, + const TypeRewriteRule* ByvalRewriteRules, + const TypeRewriteRule* SretRewriteRules, + RegUse available) { + // TODO: can this be detected on indirect callsites as well. 
+ // if we skip the rewrite for the function body + // we also need to skip it at the callsites + // if (F.isVarArg()) return false; + + // Vectors and Arrays are not supported for compatibility + for (Function::const_arg_iterator AI = fun->arg_begin(), AE = fun->arg_end(); + AI != AE; + ++AI) { + const Type* t = AI->getType(); + if (isa<VectorType>(t) || isa<ArrayType>(t)) return false; + } + + for (Function::const_arg_iterator AI = fun->arg_begin(), AE = fun->arg_end(); + AI != AE; + ++AI) { + const Argument& a = *AI; + const Type* t = a.getType(); + // byval and srets are modelled as pointers (to structs) + if (t->isPointerTy()) { + Type* pointee = dyn_cast<PointerType>(t)->getElementType(); + + if (ByvalRewriteRules && a.hasByValAttr()) { + const TypeRewriteRule* rule = + MatchRewriteRules(pointee, ByvalRewriteRules); + if (rule != 0 && RegUseForRewriteRule(rule) <= available) { + return true; + } + } else if (SretRewriteRules && a.hasStructRetAttr()) { + if (0 != MatchRewriteRules(pointee, SretRewriteRules)) { + return true; + } + } + } + available -= RegUseForType(t); + } + return false; +} + +// Used for sret rewrites to determine the new function result type +Type* GetNewReturnType(Type* type, + const TypeRewriteRule* rule, + LLVMContext& C) { + if (std::string("l") == rule->dst || + std::string("d") == rule->dst) { + return GetElementaryType(rule->dst[0], C); + } else if (rule->dst[0] == 's') { + const char* cp = rule->dst + 2; // skip 's(' + std::vector<Type*> fields; + while (*cp != ')') { + fields.push_back(GetElementaryType(*cp, C)); + ++cp; + } + return StructType::get(C, fields, false /* isPacked */); + } else { + dbgs() << *type << " " << rule->name << "\n"; + llvm_unreachable("unexpected return type"); + return 0; + } +} + +// Rewrite sret parameter while rewriting a function +Type* RewriteFunctionSret(Function& F, + Value* orig_val, + const TypeRewriteRule* rule) { + LLVMContext& C = F.getContext(); + BasicBlock& entry = F.getEntryBlock(); + Instruction* before = &(entry.front()); + Type* old_type = orig_val->getType(); + Type* old_pointee = dyn_cast<PointerType>(old_type)->getElementType(); + Type* new_type = GetNewReturnType(old_type, rule, C); + // create a temporary to hold the return value as we no longer pass + // in the pointer + AllocaInst* tmp_ret = new AllocaInst(old_pointee, "result", before); + orig_val->replaceAllUsesWith(tmp_ret); + CastInst* cast_ret = CastInst::CreatePointerCast( + tmp_ret, + PointerType::getUnqual(new_type), + "byval_cast", + before); + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); + II != IE; + /* see below */) { + Instruction* inst = II; + // we do decontructive magic below, so advance the iterator here + // (this is still a little iffy) + ++II; + ReturnInst* ret = dyn_cast<ReturnInst>(inst); + if (ret) { + if (ret->getReturnValue() != 0) + llvm_unreachable("expected a void return"); + // load the return value from temporary + Value *ret_val = new LoadInst(cast_ret, "load_result", ret); + // return that loaded value and delete the return instruction + ReturnInst::Create(C, ret_val, ret); + ret->eraseFromParent(); + } + } + } + return new_type; +} + +// Rewrite one byval function parameter while rewriting a function +void FixFunctionByvalsParameter(Function& F, + std::vector<Argument*>& new_arguments, + std::vector<Attributes>& new_attributes, + Value* byval, + const TypeRewriteRule* rule) { + LLVMContext& C = F.getContext(); + BasicBlock& entry = 
F.getEntryBlock(); + Instruction* before = &(entry.front()); + Twine prefix = byval->getName() + "_split"; + Type* t = byval->getType(); + Type* pointee = dyn_cast<PointerType>(t)->getElementType(); + AllocaInst* tmp_param = new AllocaInst(pointee, prefix + "_param", before); + byval->replaceAllUsesWith(tmp_param); + // convert byval poiner to char pointer + Value* base = CastInst::CreatePointerCast( + tmp_param, PointerType::getInt8PtrTy(C), prefix + "_base", before); + + int width = 0; + const char* pattern = rule->dst; + for (int offset = 0; *pattern; ++pattern, offset += width) { + width = GetElementaryTypeWidth(*pattern); + Type* t = GetElementaryType(*pattern, C); + Argument* arg = new Argument(t, prefix, &F); + Type* pt = PointerType::getUnqual(t); + // the code below generates something like: + // <CHAR-PTR> = getelementptr i8* <BASE>, i32 <OFFSET-FROM-BASE> + // <PTR> = bitcast i8* <CHAR-PTR> to <TYPE>* + // store <ARG> <TYPE>* <ELEM-PTR> + ConstantInt* baseOffset = ConstantInt::get(Type::getInt32Ty(C), offset); + Value *v; + v = GetElementPtrInst::Create(base, baseOffset, prefix + "_base_add", before); + v = CastInst::CreatePointerCast(v, pt, prefix + "_cast", before); + v = new StoreInst(arg, v, before); + + new_arguments.push_back(arg); + new_attributes.push_back(Attributes()); + } +} + +// Change function signature to reflect all the rewrites. +// This includes function type/signature and attributes. +void UpdateFunctionSignature(Function &F, + Type* new_result_type, + std::vector<Argument*>& new_arguments, + std::vector<Attributes>& new_attributes) { + DEBUG(dbgs() << "PHASE PROTOTYPE UPDATE\n"); + if (new_result_type) { + DEBUG(dbgs() << "NEW RESULT TYPE: " << *new_result_type << "\n"); + } + // Update function type + FunctionType* old_fun_type = F.getFunctionType(); + std::vector<Type*> new_types; + for (size_t i = 0; i < new_arguments.size(); ++i) { + new_types.push_back(new_arguments[i]->getType()); + } + + FunctionType* new_fun_type = FunctionType::get( + new_result_type ? 
new_result_type : old_fun_type->getReturnType(), + new_types, + false); + F.setType(PointerType::getUnqual(new_fun_type)); + + Function::ArgumentListType& args = F.getArgumentList(); + DEBUG(dbgs() << "PHASE ARGUMENT DEL " << args.size() << "\n"); + while (args.size()) { + Argument* arg = args.begin(); + DEBUG(dbgs() << "DEL " << arg->getArgNo() << " " << arg->getName() << "\n"); + args.remove(args.begin()); + } + + DEBUG(dbgs() << "PHASE ARGUMENT ADD " << new_arguments.size() << "\n"); + for (size_t i = 0; i < new_arguments.size(); ++i) { + Argument* arg = new_arguments[i]; + DEBUG(dbgs() << "ADD " << i << " " << arg->getName() << "\n"); + args.push_back(arg); + } + + DEBUG(dbgs() << "PHASE ATTRIBUTES UPDATE\n"); + std::vector<AttributeWithIndex> new_attributes_vec; + for (size_t i = 0; i < new_attributes.size(); ++i) { + Attributes attr = new_attributes[i]; + if (attr.hasAttributes()) { + new_attributes_vec.push_back(AttributeWithIndex::get(i + 1, attr)); + } + } + Attributes fattr = F.getAttributes().getFnAttributes(); + if (fattr.hasAttributes()) + new_attributes_vec.push_back(AttributeWithIndex::get(~0, fattr)); + F.setAttributes(AttrListPtr::get(new_attributes_vec)); +} + + +void ExtractFunctionArgsAndAttributes(Function& F, + std::vector<Argument*>& old_arguments, + std::vector<Attributes>& old_attributes) { + for (Function::arg_iterator ai = F.arg_begin(), + end = F.arg_end(); + ai != end; + ++ai) { + old_arguments.push_back(ai); + } + + for (size_t i = 0; i < old_arguments.size(); ++i) { + // index zero is for return value attributes + old_attributes.push_back(F.getParamAttributes(i + 1)); + } +} + +// Apply byval or sret rewrites to function body. +void NaClCcRewrite::RewriteFunctionPrologAndEpilog(Function& F) { + + DEBUG(dbgs() << "\nFUNCTION-REWRITE\n"); + + DEBUG(dbgs() << "FUNCTION BEFORE "); + DEBUG(dbgs() << F); + DEBUG(dbgs() << "\n"); + + std::vector<Argument*> new_arguments; + std::vector<Attributes> new_attributes; + std::vector<Argument*> old_arguments; + std::vector<Attributes> old_attributes; + + + // make a copy of everything first as create Argument adds them to the list + ExtractFunctionArgsAndAttributes(F, old_arguments, old_attributes); + + // A non-zero new_result_type indicates an sret rewrite + Type* new_result_type = 0; + + // only the first arg can be "sret" + if (old_attributes.size() > 0 && old_attributes[0].hasAttribute(Attributes::StructRet)) { + const TypeRewriteRule* sret_rule = + MatchRewriteRulesPointee(old_arguments[0]->getType(), SretRewriteRules); + if (sret_rule) { + Argument* arg = old_arguments[0]; + DEBUG(dbgs() << "REWRITING SRET " + << " arg " << arg->getName() << " " << sret_rule->name << "\n"); + new_result_type = RewriteFunctionSret(F, arg, sret_rule); + old_arguments.erase(old_arguments.begin()); + old_attributes.erase(old_attributes.begin()); + } + } + + // now deal with the byval arguments + RegUse available = AvailableRegs; + for (size_t i = 0; i < old_arguments.size(); ++i) { + Argument* arg = old_arguments[i]; + Type* t = arg->getType(); + Attributes attr = old_attributes[i]; + if (attr.hasAttribute(Attributes::ByVal)) { + const TypeRewriteRule* rule = + MatchRewriteRulesPointee(t, ByvalRewriteRules); + if (rule != 0 && RegUseForRewriteRule(rule) <= available) { + DEBUG(dbgs() << "REWRITING BYVAL " + << *t << " arg " << arg->getName() << " " << rule->name << "\n"); + FixFunctionByvalsParameter(F, + new_arguments, + new_attributes, + arg, + rule); + available -= RegUseForRewriteRule(rule); + continue; + } + } + + // fall through 
case - no rewrite is happening + new_arguments.push_back(arg); + new_attributes.push_back(attr); + available -= RegUseForType(t); + } + + UpdateFunctionSignature(F, new_result_type, new_arguments, new_attributes); + + DEBUG(dbgs() << "FUNCTION AFTER "); + DEBUG(dbgs() << F); + DEBUG(dbgs() << "\n"); +} + +// used for T in {CallInst, InvokeInst} +// TODO(robertm): try unifying this code with FunctionNeedsRewrite() +template<class T> bool CallNeedsRewrite( + const Instruction* inst, + const TypeRewriteRule* ByvalRewriteRules, + const TypeRewriteRule* SretRewriteRules, + RegUse available) { + + const T* call = cast<T>(inst); + // skip non parameter operands at the end + size_t num_params = call->getNumOperands() - (isa<CallInst>(inst) ? 1 : 3); + + // Vectors and Arrays are not supported for compatibility + for (size_t i = 0; i < num_params; ++i) { + Type* t = call->getOperand(i)->getType(); + if (isa<VectorType>(t) || isa<ArrayType>(t)) return false; + } + + for (size_t i = 0; i < num_params; ++i) { + Type* t = call->getOperand(i)->getType(); + // byval and srets are modelled as pointers (to structs) + if (t->isPointerTy()) { + Type* pointee = dyn_cast<PointerType>(t)->getElementType(); + + // param zero is for the return value + if (ByvalRewriteRules && call->paramHasAttr(i + 1, Attributes::ByVal)) { + const TypeRewriteRule* rule = + MatchRewriteRules(pointee, ByvalRewriteRules); + if (rule != 0 && RegUseForRewriteRule(rule) <= available) { + return true; + } + } else if (SretRewriteRules && + call->paramHasAttr(i + 1, Attributes::StructRet)) { + if (0 != MatchRewriteRules(pointee, SretRewriteRules)) { + return true; + } + } + } + available -= RegUseForType(t); + } + return false; +} + +// This code will load the fields of the byval ptr into scalar variables +// which will then be used as argument when we rewrite the actual call +// instruction. +void PrependCompensationForByvals(std::vector<Value*>& new_operands, + std::vector<Attributes>& new_attributes, + Instruction* call, + Value* byval, + const TypeRewriteRule* rule, + LLVMContext& C) { + // convert byval poiner to char pointer + Value* base = CastInst::CreatePointerCast( + byval, PointerType::getInt8PtrTy(C), "byval_base", call); + + int width = 0; + const char* pattern = rule->dst; + for (int offset = 0; *pattern; ++pattern, offset += width) { + width = GetElementaryTypeWidth(*pattern); + Type* t = GetElementaryType(*pattern, C); + Type* pt = PointerType::getUnqual(t); + // the code below generates something like: + // <CHAR-PTR> = getelementptr i8* <BASE>, i32 <OFFSET-FROM-BASE> + // <PTR> = bitcast i8* <CHAR-PTR> to i32* + // <SCALAR> = load i32* <ELEM-PTR> + ConstantInt* baseOffset = ConstantInt::get(Type::getInt32Ty(C), offset); + Value* v; + v = GetElementPtrInst::Create(base, baseOffset, "byval_base_add", call); + v = CastInst::CreatePointerCast(v, pt, "byval_cast", call); + v = new LoadInst(v, "byval_extract", call); + + new_operands.push_back(v); + new_attributes.push_back(Attributes()); + } +} + +// Note: this will only be called if we expect a rewrite to occur +void CallsiteFixupSrets(Instruction* call, + Value* sret, + Type* new_type, + const TypeRewriteRule* rule) { + const char* pattern = rule->dst; + Instruction* next; + if (isa<CallInst>(call)) { + next = call->getNextNode(); + } else if (isa<InvokeInst>(call)) { + // if this scheme turns out to be too simplistic (i.e. asserts fire) + // we need to introduce a new basic block for the compensation code. 
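+    // Descriptive note: the store that copies the returned aggregate back
+    // into the original sret slot is placed before the first non-PHI
+    // instruction of the normal destination, which is only safe while that
+    // block has a single predecessor (checked below).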
+ BasicBlock* normal = dyn_cast<InvokeInst>(call)->getNormalDest(); + if (!normal->getSinglePredecessor()) { + llvm_unreachable("unexpected invoke normal bb"); + } + next = normal->getFirstNonPHI(); + } else { + llvm_unreachable("unexpected call instruction"); + } + + if (next == 0) { + llvm_unreachable("unexpected missing next instruction"); + } + + if (pattern[0] == 's' || + std::string("l") == pattern || + std::string("d") == pattern) { + Type* pt = PointerType::getUnqual(new_type); + Value* cast = CastInst::CreatePointerCast(sret, pt, "cast", next); + new StoreInst(call, cast, next); + } else { + dbgs() << rule->name << "\n"; + llvm_unreachable("unexpected return type at fix up"); + } +} + +void ExtractOperandsAndAttributesFromCallInst( + CallInst* call, + std::vector<Value*>& operands, + std::vector<Attributes>& attributes) { + + AttrListPtr PAL = call->getAttributes(); + // last operand is: function + for (size_t i = 0; i < call->getNumOperands() - 1; ++i) { + operands.push_back(call->getArgOperand(i)); + // index zero is for return value attributes + attributes.push_back(PAL.getParamAttributes(i + 1)); + } +} + +// Note: this differs from the one above in the loop bounds +void ExtractOperandsAndAttributesFromeInvokeInst( + InvokeInst* call, + std::vector<Value*>& operands, + std::vector<Attributes>& attributes) { + AttrListPtr PAL = call->getAttributes(); + // last three operands are: function, bb-normal, bb-exception + for (size_t i = 0; i < call->getNumOperands() - 3; ++i) { + operands.push_back(call->getArgOperand(i)); + // index zero is for return value attributes + attributes.push_back(PAL.getParamAttributes(i + 1)); + } +} + + +Instruction* ReplaceCallInst(CallInst* call, + Type* function_pointer, + std::vector<Value*>& new_operands, + std::vector<Attributes>& new_attributes) { + Value* v = CastInst::CreatePointerCast( + call->getCalledValue(), function_pointer, "fp_cast", call); + CallInst* new_call = CallInst::Create(v, new_operands, "", call); + // NOTE: tail calls may be ruled out but byval/sret, should we assert this? + // TODO: did wid forget to clone anything else? 
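+  // Descriptive note: the properties carried over explicitly are the
+  // tail-call flag, the calling convention, and the per-argument
+  // attributes; everything else belongs to the freshly built CallInst.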
+ new_call->setTailCall(call->isTailCall()); + new_call->setCallingConv(call->getCallingConv()); + for (size_t i = 0; i < new_attributes.size(); ++i) { + // index zero is for return value attributes + new_call->addAttribute(i + 1, new_attributes[i]); + } + return new_call; +} + +Instruction* ReplaceInvokeInst(InvokeInst* call, + Type* function_pointer, + std::vector<Value*>& new_operands, + std::vector<Attributes>& new_attributes) { + Value* v = CastInst::CreatePointerCast( + call->getCalledValue(), function_pointer, "fp_cast", call); + InvokeInst* new_call = InvokeInst::Create(v, + call->getNormalDest(), + call->getUnwindDest(), + new_operands, + "", + call); + for (size_t i = 0; i < new_attributes.size(); ++i) { + // index zero is for return value attributes + new_call->addAttribute(i + 1, new_attributes[i]); + } + return new_call; +} + + +void NaClCcRewrite::RewriteCallsite(Instruction* call, LLVMContext& C) { + BasicBlock* BB = call->getParent(); + + DEBUG(dbgs() << "\nCALLSITE-REWRITE\n"); + DEBUG(dbgs() << "CALLSITE BB BEFORE " << *BB); + DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << *call << "\n"); + if (isa<InvokeInst>(call)) { + DEBUG(dbgs() << "\n" << *(dyn_cast<InvokeInst>(call)->getNormalDest())); + } + + // new_result(_type) is only relevent if an sret is rewritten + // whish is indicated by sret_rule != 0 + const TypeRewriteRule* sret_rule = 0; + Type* new_result_type = call->getType(); + // This is the sret which was originally passed in as the first arg. + // After the rewrite we simply copy the function result into it. + Value* new_result = 0; + + std::vector<Value*> old_operands; + std::vector<Attributes> old_attributes; + if (isa<CallInst>(call)) { + ExtractOperandsAndAttributesFromCallInst( + cast<CallInst>(call), old_operands, old_attributes); + } else if (isa<InvokeInst>(call)) { + ExtractOperandsAndAttributesFromeInvokeInst( + cast<InvokeInst>(call), old_operands, old_attributes); + } else { + llvm_unreachable("Unexpected instruction type"); + } + + // handle sret (just the book-keeping, 'new_result' is dealt with below) + // only the first arg can be "sret" + if (old_attributes[0].hasAttribute(Attributes::StructRet)) { + sret_rule = MatchRewriteRulesPointee( + old_operands[0]->getType(), SretRewriteRules); + if (sret_rule) { + new_result_type = + GetNewReturnType(old_operands[0]->getType(), sret_rule, C); + new_result = old_operands[0]; + old_operands.erase(old_operands.begin()); + old_attributes.erase(old_attributes.begin()); + } + } + + // handle byval + std::vector<Value*> new_operands; + std::vector<Attributes> new_attributes; + RegUse available = AvailableRegs; + + for (size_t i = 0; i < old_operands.size(); ++i) { + Value *operand = old_operands[i]; + Type* t = operand->getType(); + Attributes attr = old_attributes[i]; + + if (attr.hasAttribute(Attributes::ByVal)) { + const TypeRewriteRule* rule = + MatchRewriteRulesPointee(t, ByvalRewriteRules); + if (rule != 0 && RegUseForRewriteRule(rule) <= available) { + DEBUG(dbgs() << "REWRITING BYVAL " + << *t << " arg " << i << " " << rule->name << "\n"); + PrependCompensationForByvals(new_operands, + new_attributes, + call, + operand, + rule, + C); + available -= RegUseForRewriteRule(rule); + continue; + } + } + + // fall through case - no rewrite is happening + new_operands.push_back(operand); + new_attributes.push_back(attr); + available -= RegUseForType(t); + } + + // Note, this code is tricky. + // Initially we used a much more elaborate scheme introducing + // new function declarations for direct calls. 
+ // This simpler scheme, however, works for both direct and + // indirect calls + // We transform (here the direct case): + // call void @result_PP_FloatPoint(%struct.PP_FloatPoint* sret %sret) + // into + // %fp_cast = bitcast void (%struct.PP_FloatPoint*)* + // @result_PP_FloatPoint to %struct.PP_FloatPoint ()* + // %result = call %struct.PP_FloatPoint %fp_cast() + // + std::vector<Type*> new_arg_types; + for (size_t i = 0; i < new_operands.size(); ++i) { + new_arg_types.push_back(new_operands[i]->getType()); + } + + DEBUG(dbgs() << "REWRITE CALL INSTRUCTION\n"); + Instruction* new_call = 0; + if (isa<CallInst>(call)) { + new_call = ReplaceCallInst( + cast<CallInst>(call), + CreateFunctionPointerType(new_result_type, new_arg_types), + new_operands, + new_attributes); + } else if (isa<InvokeInst>(call)) { + new_call = ReplaceInvokeInst( + cast<InvokeInst>(call), + CreateFunctionPointerType(new_result_type, new_arg_types), + new_operands, + new_attributes); + } else { + llvm_unreachable("Unexpected instruction type"); + } + + // We prepended the new call, now get rid of the old one. + // If we did not change the return type, there may be consumers + // of the result which must be redirected. + if (!sret_rule) { + call->replaceAllUsesWith(new_call); + } + call->eraseFromParent(); + + // Add compensation codes for srets if necessary + if (sret_rule) { + DEBUG(dbgs() << "REWRITING SRET " << sret_rule->name << "\n"); + CallsiteFixupSrets(new_call, new_result, new_result_type, sret_rule); + } + + DEBUG(dbgs() << "CALLSITE BB AFTER" << *BB); + DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << *new_call << "\n"); + if (isa<InvokeInst>(call)) { + DEBUG(dbgs() << "\n" << *(dyn_cast<InvokeInst>(call)->getNormalDest())); + } +} + +bool NaClCcRewrite::runOnFunction(Function &F) { + // No rules - no action + if (ByvalRewriteRules == 0 && SretRewriteRules == 0) return false; + + bool Changed = false; + + if (FunctionNeedsRewrite(&F, ByvalRewriteRules, SretRewriteRules, AvailableRegs)) { + DEBUG(dbgs() << "FUNCTION NEEDS REWRITE " << F.getName() << "\n"); + RewriteFunctionPrologAndEpilog(F); + Changed = true; + } + + // Find all the calls and invokes in F and rewrite them if necessary + for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { + for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); + II != IE; + /* II updated below */) { + Instruction* inst = II; + // we do decontructive magic below, so advance the iterator here + // (this is still a little iffy) + ++II; + if (isa<InvokeInst>(inst) || isa<CallInst>(inst)) { + // skip calls to llvm.dbg.declare, etc. 
+ if (isa<IntrinsicInst>(inst)) continue; + + if (isa<CallInst>(inst) && + !CallNeedsRewrite<CallInst> + (inst, ByvalRewriteRules, SretRewriteRules, AvailableRegs)) continue; + + if (isa<InvokeInst>(inst) && + !CallNeedsRewrite<InvokeInst> + (inst, ByvalRewriteRules, SretRewriteRules, AvailableRegs)) continue; + + RewriteCallsite(inst, F.getContext()); + Changed = true; + } + } + } + return Changed; +} + +} // end anonymous namespace + + +INITIALIZE_PASS(NaClCcRewrite, "naclcc", "NaCl CC Rewriter", false, false) + +FunctionPass *llvm::createNaClCcRewritePass(const TargetLowering *tli) { + return new NaClCcRewrite(tli); +} diff --git a/lib/VMCore/Globals.cpp b/lib/VMCore/Globals.cpp index c428b889c3..ad7a872b1f 100644 --- a/lib/VMCore/Globals.cpp +++ b/lib/VMCore/Globals.cpp @@ -45,6 +45,116 @@ void GlobalValue::destroyConstant() { llvm_unreachable("You can't GV->destroyConstant()!"); } +// @LOCALMOD-BEGIN + +// Extract the version information from GV. +static void ExtractVersion(const GlobalValue *GV, + StringRef *Name, + StringRef *Ver, + bool *IsDefault) { + // The version information is stored in the GlobalValue's name, e.g.: + // + // GV Name Name Ver IsDefault + // ------------------------------------ + // foo@@V1 --> foo V1 true + // bar@V2 --> bar V2 false + // baz --> baz false + + StringRef GVName = GV->getName(); + size_t atpos = GVName.find("@"); + if (atpos == StringRef::npos) { + *Name = GVName; + *Ver = ""; + *IsDefault = false; + return; + } + *Name = GVName.substr(0, atpos); + ++atpos; + if (atpos < GVName.size() && GVName[atpos] == '@') { + *IsDefault = true; + ++atpos; + } else { + *IsDefault = false; + } + *Ver = GVName.substr(atpos); +} + +// Set the version information on GV. +static void SetVersion(Module *M, + GlobalValue *GV, + StringRef Ver, + bool IsDefault) { + StringRef Name; + StringRef PrevVersion; + bool PrevIsDefault; + ExtractVersion(GV, &Name, &PrevVersion, &PrevIsDefault); + + // If this symbol already has a version, make sure it matches. + if (!PrevVersion.empty()) { + if (!PrevVersion.equals(Ver) || PrevIsDefault != IsDefault) { + llvm_unreachable("Trying to override symbol version info!"); + } + return; + } + // If there's no version to set, there's nothing to do. + if (Ver.empty()) + return; + + // Make sure the versioned symbol name doesn't already exist. + std::string NewName = Name.str() + (IsDefault ? "@@" : "@") + Ver.str(); + if (M->getNamedValue(NewName)) { + // It may make sense to do this as long as one of the globals being + // merged is only a declaration. But since this situation seems to be + // a corner case, for now it is unimplemented. + llvm_unreachable("Merging unversioned global into " + "existing versioned global is unimplemented"); + } + GV->setName(NewName); +} + +StringRef GlobalValue::getUnversionedName() const { + StringRef Name; + StringRef Ver; + bool IsDefaultVersion; + ExtractVersion(this, &Name, &Ver, &IsDefaultVersion); + return Name; +} + +StringRef GlobalValue::getVersion() const { + StringRef Name; + StringRef Ver; + bool IsDefaultVersion; + ExtractVersion(this, &Name, &Ver, &IsDefaultVersion); + return Ver; +} + +bool GlobalValue::isDefaultVersion() const { + StringRef Name; + StringRef Ver; + bool IsDefaultVersion; + ExtractVersion(this, &Name, &Ver, &IsDefaultVersion); + // It is an error to call this function on an unversioned symbol. + assert(!Ver.empty()); + return IsDefaultVersion; +} + +void GlobalValue::setVersionDef(StringRef Version, bool IsDefault) { + // This call only makes sense for definitions. 
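+ // (Illustrative example, not from the original change: on an unversioned
+ // definition @foo, setVersionDef("V1", true) renames it to "foo@@V1" and
+ // setVersionDef("V1", false) to "foo@V1", following the encoding described
+ // in ExtractVersion above; "V1" is just a placeholder version string.)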
+ assert(!isDeclaration()); + SetVersion(Parent, this, Version, IsDefault); +} + +void GlobalValue::setNeeded(StringRef Version, StringRef DynFile) { + // This call makes sense on declarations or + // available-externally definitions. + // TODO(pdox): If this is a definition, should we turn it + // into a declaration here? + assert(isDeclaration() || hasAvailableExternallyLinkage()); + SetVersion(Parent, this, Version, false); + Parent->addNeededRecord(DynFile, this); +} +// @LOCALMOD-END + /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a GlobalValue) from the GlobalValue Src to this one. void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { diff --git a/lib/VMCore/Module.cpp b/lib/VMCore/Module.cpp index 5b5176b3c7..a6e335c10c 100644 --- a/lib/VMCore/Module.cpp +++ b/lib/VMCore/Module.cpp @@ -22,6 +22,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Support/LeakDetector.h" +#include "llvm/Support/ErrorHandling.h" // @LOCALMOD #include "SymbolTableListTraitsImpl.h" #include <algorithm> #include <cstdarg> @@ -467,3 +468,181 @@ void Module::removeLibrary(StringRef Lib) { return; } } + + +// @LOCALMOD-BEGIN +// TODO(pdox): +// If possible, use actual bitcode records instead of NamedMetadata. +// This is contingent upon whether we can get these changes upstreamed +// immediately, to avoid creating incompatibilities in the bitcode format. + +static std::string +ModuleMetaGet(const Module *module, StringRef MetaName) { + NamedMDNode *node = module->getNamedMetadata(MetaName); + if (node == NULL) + return ""; + assert(node->getNumOperands() == 1); + MDNode *subnode = node->getOperand(0); + assert(subnode->getNumOperands() == 1); + MDString *value = dyn_cast<MDString>(subnode->getOperand(0)); + assert(value != NULL); + return value->getString(); +} + +static void +ModuleMetaSet(Module *module, StringRef MetaName, StringRef ValueStr) { + NamedMDNode *node = module->getNamedMetadata(MetaName); + if (node) + module->eraseNamedMetadata(node); + node = module->getOrInsertNamedMetadata(MetaName); + MDString *value = MDString::get(module->getContext(), ValueStr); + node->addOperand(MDNode::get(module->getContext(), + makeArrayRef(static_cast<Value*>(value)))); +} + +const std::string &Module::getSOName() const { + if (ModuleSOName == "") + ModuleSOName.assign(ModuleMetaGet(this, "SOName")); + return ModuleSOName; +} + +void Module::setSOName(StringRef Name) { + ModuleMetaSet(this, "SOName", Name); + ModuleSOName = Name; +} + +void Module::setOutputFormat(Module::OutputFormat F) { + const char *formatStr; + switch (F) { + case ObjectOutputFormat: formatStr = "object"; break; + case SharedOutputFormat: formatStr = "shared"; break; + case ExecutableOutputFormat: formatStr = "executable"; break; + default: + llvm_unreachable("Unrecognized output format in setOutputFormat()"); + } + ModuleMetaSet(this, "OutputFormat", formatStr); +} + +Module::OutputFormat Module::getOutputFormat() const { + std::string formatStr = ModuleMetaGet(this, "OutputFormat"); + if (formatStr == "" || formatStr == "object") + return ObjectOutputFormat; + else if (formatStr == "shared") + return SharedOutputFormat; + else if (formatStr == "executable") + return ExecutableOutputFormat; + llvm_unreachable("Invalid module compile type in getOutputFormat()"); +} + +void +Module::wrapSymbol(StringRef symName) { + std::string wrapSymName("__wrap_"); + wrapSymName += symName; + + std::string realSymName("__real_"); + realSymName += symName; + + GlobalValue 
*SymGV = getNamedValue(symName); + GlobalValue *WrapGV = getNamedValue(wrapSymName); + GlobalValue *RealGV = getNamedValue(realSymName); + + // Replace uses of "sym" with __wrap_sym. + if (SymGV) { + if (!WrapGV) + WrapGV = cast<GlobalValue>(getOrInsertGlobal(wrapSymName, + SymGV->getType())); + SymGV->replaceAllUsesWith(ConstantExpr::getBitCast(WrapGV, + SymGV->getType())); + } + + // Replace uses of "__real_sym" with "sym". + if (RealGV) { + if (!SymGV) + SymGV = cast<GlobalValue>(getOrInsertGlobal(symName, RealGV->getType())); + RealGV->replaceAllUsesWith(ConstantExpr::getBitCast(SymGV, + RealGV->getType())); + } +} + +// The metadata key prefix for NeededRecords. +static const char *NeededPrefix = "NeededRecord_"; + +void +Module::dumpMeta(raw_ostream &OS) const { + OS << "OutputFormat: "; + switch (getOutputFormat()) { + case Module::ObjectOutputFormat: OS << "object"; break; + case Module::SharedOutputFormat: OS << "shared"; break; + case Module::ExecutableOutputFormat: OS << "executable"; break; + } + OS << "\n"; + OS << "SOName: " << getSOName() << "\n"; + for (Module::lib_iterator L = lib_begin(), + E = lib_end(); + L != E; ++L) { + OS << "NeedsLibrary: " << (*L) << "\n"; + } + std::vector<NeededRecord> NList; + getNeededRecords(&NList); + for (unsigned i = 0; i < NList.size(); ++i) { + const NeededRecord &NR = NList[i]; + OS << StringRef(NeededPrefix) << NR.DynFile << ": "; + for (unsigned j = 0; j < NR.Symbols.size(); ++j) { + if (j != 0) + OS << " "; + OS << NR.Symbols[j]; + } + OS << "\n"; + } +} + +void Module::addNeededRecord(StringRef DynFile, GlobalValue *GV) { + if (DynFile.empty()) { + // We never resolved this symbol, even after linking. + // This should only happen in a shared object. + // It is safe to ignore this symbol, and let the dynamic loader + // figure out where it comes from. + return; + } + std::string Key = NeededPrefix; + Key += DynFile; + // Get the node for this file. + NamedMDNode *Node = getOrInsertNamedMetadata(Key); + // Add this global value's name to the list. + MDString *value = MDString::get(getContext(), GV->getName()); + Node->addOperand(MDNode::get(getContext(), + makeArrayRef(static_cast<Value*>(value)))); +} + +// Get the NeededRecord for SOName. +// Returns an empty NeededRecord if there was no metadata found. +static void getNeededRecordFor(const Module *M, + StringRef SOName, + Module::NeededRecord *NR) { + NR->DynFile = SOName; + NR->Symbols.clear(); + + std::string Key = NeededPrefix; + Key += SOName; + NamedMDNode *Node = M->getNamedMetadata(Key); + if (!Node) + return; + + for (unsigned k = 0; k < Node->getNumOperands(); ++k) { + // Insert the symbol name. + const MDString *SymName = + dyn_cast<MDString>(Node->getOperand(k)->getOperand(0)); + NR->Symbols.push_back(SymName->getString()); + } +} + +// Place the complete list of needed records in NeededOut. +void Module::getNeededRecords(std::vector<NeededRecord> *NeededOut) const { + // Iterate through the libraries needed, grabbing each NeededRecord. + for (lib_iterator I = lib_begin(), E = lib_end(); I != E; ++I) { + NeededRecord NR; + getNeededRecordFor(this, *I, &NR); + NeededOut->push_back(NR); + } +} +// @LOCALMOD-END
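+// (Illustrative sketch, not part of the original change: with this scheme a
+// module-level string such as the SOName is stored as a one-operand named
+// metadata node, roughly
+//   !SOName = !{!0}
+//   !0 = metadata !{metadata !"libfoo.so"}
+// and each NeededRecord_<file> node carries one MDString operand per needed
+// symbol; "libfoo.so" is a made-up name used only for illustration.)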
\ No newline at end of file diff --git a/lib/Wrap/LLVMBuild.txt b/lib/Wrap/LLVMBuild.txt new file mode 100644 index 0000000000..8750711338 --- /dev/null +++ b/lib/Wrap/LLVMBuild.txt @@ -0,0 +1,21 @@ +;===- ./lib/Wrap/LLVMBuild.txt ------------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = Wrap +parent = Libraries diff --git a/lib/Wrap/Makefile b/lib/Wrap/Makefile new file mode 100644 index 0000000000..79aa2b3531 --- /dev/null +++ b/lib/Wrap/Makefile @@ -0,0 +1,14 @@ +##===- lib/Linker/Makefile ---------------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../.. +LIBRARYNAME = LLVMWrap +BUILD_ARCHIVE := 1 + +include $(LEVEL)/Makefile.common diff --git a/lib/Wrap/bitcode_wrapperer.cpp b/lib/Wrap/bitcode_wrapperer.cpp new file mode 100644 index 0000000000..eeb2825793 --- /dev/null +++ b/lib/Wrap/bitcode_wrapperer.cpp @@ -0,0 +1,355 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +#include "llvm/Wrap/bitcode_wrapperer.h" + +#include <stdio.h> +#include <sys/stat.h> + +using std::vector; + +// The number of bytes in a 32 bit integer. +static const uint32_t kWordSize = 4; + +// Number of LLVM-defined fixed fields in the header. +static const uint32_t kLLVMFields = 4; + +// Total number of fixed fields in the header. +static const uint32_t kFixedFields = 7; + +// The magic number that must exist for bitcode wrappers. +static const uint32_t kWrapperMagicNumber = 0x0B17C0DE; + +// The version number associated with a wrapper file. +// Note: llvm currently only allows the value 0. When this changes, +// we should consider making this a command line option. +static const uint32_t kLLVMVersionNumber = 0; + +// Fields defined by Android bitcode header. +static const uint32_t kAndroidHeaderVersion = 0; +static const uint32_t kAndroidTargetAPI = 0; +static const uint32_t kAndroidDefaultCompilerVersion = 0; +static const uint32_t kAndroidDefaultOptimizationLevel = 3; + +// PNaCl bitcode version number. +static const uint32_t kPnaclBitcodeVersion = 0; + +// Max size for variable fields. Currently only used for writing them +// out to files (the parsing works for arbitrary sizes). 
+static const size_t kMaxVariableFieldSize = 256; + +BitcodeWrapperer::BitcodeWrapperer(WrapperInput* infile, WrapperOutput* outfile) + : infile_(infile), + outfile_(outfile), + buffer_size_(0), + cursor_(0), + infile_at_eof_(false), + infile_bc_offset_(0), + wrapper_bc_offset_(0), + wrapper_bc_size_(0), + android_header_version_(kAndroidHeaderVersion), + android_target_api_(kAndroidTargetAPI), + pnacl_bc_version_(0), + error_(false) { + buffer_.resize(kBitcodeWrappererBufferSize); + if (IsInputBitcodeWrapper()) { + ParseWrapperHeader(); + } else if (IsInputBitcodeFile()) { + wrapper_bc_offset_ = kWordSize * kFixedFields; + wrapper_bc_size_ = GetInFileSize(); + } else { + fprintf(stderr, "Error: input file is not a bitcode file.\n"); + error_ = true; + } +} + +BitcodeWrapperer::~BitcodeWrapperer() { + for(size_t i = 0; i < variable_field_data_.size(); i++) { + delete [] variable_field_data_[i]; + } +} + + +void BitcodeWrapperer::ClearBuffer() { + buffer_size_ = 0; + cursor_ = 0; + infile_at_eof_ = false; +} + +bool BitcodeWrapperer::Seek(uint32_t pos) { + if (infile_ != NULL && infile_->Seek(pos)) { + ClearBuffer(); + return true; + } + return false; +} + +bool BitcodeWrapperer::CanReadWord() { + if (GetBufferUnreadBytes() < kWordSize) { + FillBuffer(); + return GetBufferUnreadBytes() >= kWordSize; + } else { + return true; + } +} + +void BitcodeWrapperer::FillBuffer() { + if (cursor_ > 0) { + // Before filling, move any remaining bytes to the + // front of the buffer. This allows us to assume + // that after the call to FillBuffer, readable + // text is contiguous. + if (cursor_ < buffer_size_) { + size_t i = 0; + while (cursor_ < buffer_size_) { + buffer_[i++] = buffer_[cursor_++]; + } + cursor_ = 0; + buffer_size_ = i; + } + } else { + // Assume the buffer contents have been used, + // and we want to completely refill it. + buffer_size_ = 0; + } + + // If we don't have an input, we can't refill the buffer at all. + if (infile_ == NULL) { + return; + } + + // Now fill in remaining space. + size_t needed = buffer_.size() - buffer_size_; + + while (buffer_.size() > buffer_size_) { + int actually_read = infile_->Read(&buffer_[buffer_size_], needed); + if (infile_->AtEof()) { + infile_at_eof_ = true; + } + if (actually_read) { + buffer_size_ += actually_read; + needed -= actually_read; + } else if (infile_at_eof_) { + break; + } + } +} + +bool BitcodeWrapperer::ReadWord(uint32_t& word) { + if (!CanReadWord()) return false; + word = (((uint32_t) BufferLookahead(0)) << 0) + | (((uint32_t) BufferLookahead(1)) << 8) + | (((uint32_t) BufferLookahead(2)) << 16) + | (((uint32_t) BufferLookahead(3)) << 24); + cursor_ += kWordSize; + return true; +} + +bool BitcodeWrapperer::WriteWord(uint32_t value) { + uint8_t buffer[kWordSize]; + buffer[3] = (value >> 24) & 0xFF; + buffer[2] = (value >> 16) & 0xFF; + buffer[1] = (value >> 8) & 0xFF; + buffer[0] = (value >> 0) & 0xFF; + return outfile_->Write(buffer, kWordSize); +} + +bool BitcodeWrapperer::WriteVariableFields() { + // This buffer may have to be bigger if we start using the fields + // for larger things. 
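+ // (Rough sketch of the serialized layout, assuming the format implemented
+ // by BCHeaderField::Write: two 32-bit fixed subfields, the field ID and the
+ // data length, followed by the raw field data, so GetTotalSize() is
+ // 2 * sizeof(FixedSubfield) plus the data size.)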
+ uint8_t buffer[kMaxVariableFieldSize]; + for (vector<BCHeaderField>::iterator it = header_fields_.begin(); + it != header_fields_.end(); ++it) { + if (!it->Write(buffer, kMaxVariableFieldSize) || + !outfile_->Write(buffer, it->GetTotalSize())) { + return false; + } + } + return true; +} + +bool BitcodeWrapperer::ParseWrapperHeader() { + // Make sure LLVM-defined fields have been parsed + if (!IsInputBitcodeWrapper()) return false; + // Check the android/pnacl fields + if (!ReadWord(android_header_version_) || + !ReadWord(android_target_api_) || !ReadWord(pnacl_bc_version_)) { + fprintf(stderr, "Error: file not long enough to contain header\n"); + return false; + } + if (pnacl_bc_version_ != kPnaclBitcodeVersion) { + fprintf(stderr, "Error: bad PNaCl Bitcode version\n"); + return false; + } + int field_data_total = wrapper_bc_offset_ - kWordSize * kFixedFields; + if (field_data_total > 0) { + // Read in the variable fields. We need to allocate space for the data. + int field_data_read = 0; + + while (field_data_read < field_data_total) { + FillBuffer(); + size_t buffer_needed = BCHeaderField::GetDataSizeFromSerialized( + &buffer_[cursor_]); + if (buffer_needed > buffer_.size()) { + buffer_.resize(buffer_needed + + sizeof(BCHeaderField::FixedSubfield) * 2); + FillBuffer(); + } + variable_field_data_.push_back(new uint8_t[buffer_needed]); + + BCHeaderField field(BCHeaderField::kInvalid, 0, + variable_field_data_.back()); + field.Read(&buffer_[cursor_], buffer_size_); + header_fields_.push_back(field); + size_t field_size = field.GetTotalSize(); + cursor_ += field_size; + field_data_read += field_size; + if (field_data_read > field_data_total) { + // We read too much data, the header is corrupted + fprintf(stderr, "Error: raw bitcode offset inconsistent with " + "variable field data\n"); + return false; + } + } + Seek(0); + } + return true; +} + +bool BitcodeWrapperer::IsInputBitcodeWrapper() { + ResetCursor(); + // First make sure that there are enough words (LLVM header) + // to peek at. + if (GetBufferUnreadBytes() < kLLVMFields * kWordSize) { + FillBuffer(); + if (GetBufferUnreadBytes() < kLLVMFields * kWordSize) return false; + } + + // Now make sure the magic number is right. + uint32_t first_word; + if ((!ReadWord(first_word)) || + (kWrapperMagicNumber != first_word)) return false; + + // Make sure the version is right. + uint32_t second_word; + if ((!ReadWord(second_word)) || + (kLLVMVersionNumber != second_word)) return false; + + // Make sure that the offset and size (for llvm) is defined. + uint32_t bc_offset; + uint32_t bc_size; + if (ReadWord(bc_offset) && + ReadWord(bc_size)) { + // Before returning, save the extracted values. + wrapper_bc_offset_ = bc_offset; + infile_bc_offset_ = bc_offset; + wrapper_bc_size_ = bc_size; + return true; + } + // If reached, unable to read wrapped header. + return false; +} + +bool BitcodeWrapperer::IsInputBitcodeFile() { + ResetCursor(); + // First make sure that there are four bytes to peek at. + if (GetBufferUnreadBytes() < kWordSize) { + FillBuffer(); + if (GetBufferUnreadBytes() < kWordSize) return false; + } + // If reached, Check if first 4 bytes match bitcode + // file magic number. + return (BufferLookahead(0) == 'B') && + (BufferLookahead(1) == 'C') && + (BufferLookahead(2) == 0xc0) && + (BufferLookahead(3) == 0xde); +} + +bool BitcodeWrapperer::BufferCopyInToOut(uint32_t size) { + while (size > 0) { + // Be sure buffer is non-empty before writing. 
+ if (0 == buffer_size_) { + FillBuffer(); + if (0 == buffer_size_) { + return false; + } + } + // copy the buffer to the output file. + size_t block = (buffer_size_ < size) ? buffer_size_ : size; + if (!outfile_->Write(&buffer_[cursor_], block)) return false; + size -= block; + buffer_size_ = 0; + } + // Be sure that there isn't more bytes on the input stream. + FillBuffer(); + return buffer_size_ == 0; +} + +void BitcodeWrapperer::AddHeaderField(BCHeaderField* field) { + vector<BCHeaderField>::iterator it = header_fields_.begin(); + for (; it != header_fields_.end(); ++it) { + // If this field is the same as an existing one, overwrite it. + if (it->getID() == field->getID()) { + wrapper_bc_offset_ += (field->GetTotalSize() - it->GetTotalSize()); + *it = *field; + break; + } + } + if (it == header_fields_.end()) { // there was no match, add a new field + header_fields_.push_back(*field); + wrapper_bc_offset_ += field->GetTotalSize(); + } +} + +bool BitcodeWrapperer::WriteBitcodeWrapperHeader() { + return + // Note: This writes out the 4 word header required by llvm wrapped + // bitcode. + WriteWord(kWrapperMagicNumber) && + WriteWord(kLLVMVersionNumber) && + WriteWord(wrapper_bc_offset_) && + WriteWord(wrapper_bc_size_) && + // 2 fixed fields defined by Android + WriteWord(android_header_version_) && + WriteWord(android_target_api_) && + // PNaClBitcode version + WriteWord(kPnaclBitcodeVersion) && + // Common variable-length fields + WriteVariableFields(); +} + +void BitcodeWrapperer::PrintWrapperHeader() { + if (error_) { + fprintf(stderr, "Error condition exists: the following" + "data may not be reliable\n"); + } + fprintf(stderr, "Wrapper magic:\t\t%x\n", kWrapperMagicNumber); + fprintf(stderr, "LLVM Bitcode version:\t%d\n", kLLVMVersionNumber); + fprintf(stderr, "Raw bitcode offset:\t%d\n", wrapper_bc_offset_); + fprintf(stderr, "Raw bitcode size:\t%d\n", wrapper_bc_size_); + fprintf(stderr, "Android header version:\t%d\n", android_header_version_); + fprintf(stderr, "Android target API:\t%d\n", android_target_api_); + fprintf(stderr, "PNaCl bitcode version:\t%d\n", kPnaclBitcodeVersion); + for (size_t i = 0; i < header_fields_.size(); i++) header_fields_[i].Print(); +} + +bool BitcodeWrapperer::GenerateWrappedBitcodeFile() { + if (!error_ && + WriteBitcodeWrapperHeader() && + Seek(infile_bc_offset_) && + BufferCopyInToOut(wrapper_bc_size_)) { + off_t dangling = wrapper_bc_size_ & 3; + if (dangling) { + return outfile_->Write((const uint8_t*) "\0\0\0\0", 4 - dangling); + } + return true; + } + return false; +} + +bool BitcodeWrapperer::GenerateRawBitcodeFile() { + return !error_ && Seek(infile_bc_offset_) && + BufferCopyInToOut(wrapper_bc_size_); +} diff --git a/lib/Wrap/file_wrapper_input.cpp b/lib/Wrap/file_wrapper_input.cpp new file mode 100644 index 0000000000..fc592e0246 --- /dev/null +++ b/lib/Wrap/file_wrapper_input.cpp @@ -0,0 +1,53 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. 
+ */ + +#include <sys/stat.h> +#include <stdlib.h> + +#include "llvm/Wrap/file_wrapper_input.h" + +FileWrapperInput::FileWrapperInput(const std::string& name) : + _name(name), _at_eof(false), _size_found(false), _size(0) { + _file = fopen(name.c_str(), "rb"); + if (NULL == _file) { + fprintf(stderr, "Unable to open: %s\n", name.c_str()); + exit(1); + } +} + +FileWrapperInput::~FileWrapperInput() { + fclose(_file); +} + +size_t FileWrapperInput::Read(uint8_t* buffer, size_t wanted) { + size_t found = fread((char*) buffer, 1, wanted, _file); + if (feof(_file) || ferror(_file)) { + _at_eof = true; + } + return found; +} + +bool FileWrapperInput::AtEof() { + return _at_eof; +} + +off_t FileWrapperInput::Size() { + if (_size_found) return _size; + struct stat st; + if (0 == stat(_name.c_str(), &st)) { + _size_found = true; + _size = st.st_size; + return _size; + } else { + fprintf(stderr, "Unable to compute file size: %s\n", _name.c_str()); + exit(1); + } + // NOT REACHABLE. + return 0; +} + +bool FileWrapperInput::Seek(uint32_t pos) { + return 0 == fseek(_file, (long) pos, SEEK_SET); +} diff --git a/lib/Wrap/file_wrapper_output.cpp b/lib/Wrap/file_wrapper_output.cpp new file mode 100644 index 0000000000..f9f126868d --- /dev/null +++ b/lib/Wrap/file_wrapper_output.cpp @@ -0,0 +1,37 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ + +#include "llvm/Wrap/file_wrapper_output.h" +#include <stdlib.h> + + +FileWrapperOutput::FileWrapperOutput(const std::string& name) + : _name(name) { + _file = fopen(name.c_str(), "wb"); + if (NULL == _file) { + fprintf(stderr, "Unable to open: %s\n", name.c_str()); + exit(1); + } +} + +FileWrapperOutput::~FileWrapperOutput() { + fclose(_file); +} + +bool FileWrapperOutput::Write(uint8_t byte) { + return EOF != fputc(byte, _file); +} + +bool FileWrapperOutput::Write(const uint8_t* buffer, size_t buffer_size) { + if (!buffer) { + return false; + } + + if (buffer_size > 0) { + return buffer_size == fwrite(buffer, 1, buffer_size, _file); + } else { + return true; + } +} diff --git a/lib/Wrap/wrapper_output.cpp b/lib/Wrap/wrapper_output.cpp new file mode 100644 index 0000000000..493f29efa8 --- /dev/null +++ b/lib/Wrap/wrapper_output.cpp @@ -0,0 +1,9 @@ +#include "llvm/Wrap/wrapper_output.h" + +bool WrapperOutput::Write(const uint8_t* buffer, size_t buffer_size) { + // Default implementation that uses the byte write routine. + for (size_t i = 0; i < buffer_size; ++i) { + if (!Write(buffer[i])) return false; + } + return true; +} diff --git a/projects/sample/autoconf/config.sub b/projects/sample/autoconf/config.sub index 9942491533..8f5793aef3 100755 --- a/projects/sample/autoconf/config.sub +++ b/projects/sample/autoconf/config.sub @@ -132,6 +132,10 @@ case $maybe_os in os=-$maybe_os basic_machine=`echo $1 | sed 's/^\(.*\)-\([^-]*-[^-]*\)$/\1/'` ;; + nacl) + os=-nacl + basic_machine=pnacl-unknown + ;; *) basic_machine=`echo $1 | sed 's/-[^-]*$//'` if [ $basic_machine != $1 ] @@ -347,6 +351,8 @@ case $basic_machine in i*86 | x86_64) basic_machine=$basic_machine-pc ;; + pnacl-*) + ;; # Object if more than one company name word. 
*-*-*) echo Invalid configuration \`$1\': machine \`$basic_machine\' not recognized 1>&2 @@ -1364,6 +1370,8 @@ case $os in ;; esac ;; + -nacl) + ;; -nto-qnx*) ;; -nto*) diff --git a/projects/sample/configure b/projects/sample/configure index cfbb6c6922..3baa1a7e16 100755 --- a/projects/sample/configure +++ b/projects/sample/configure @@ -3686,6 +3686,11 @@ else llvm_cv_no_link_all_option="-Wl,--no-whole-archive" llvm_cv_os_type="GNU" llvm_cv_platform_type="Unix" ;; + *-*-nacl*) + llvm_cv_link_all_option="-Wl,--whole-archive" + llvm_cv_no_link_all_option="-Wl,--no-whole-archive" + llvm_cv_os_type="NativeClient" + llvm_cv_platform_type="Unix" ;; *-*-solaris*) llvm_cv_link_all_option="-Wl,-z,allextract" llvm_cv_no_link_all_option="-Wl,-z,defaultextract" diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index cdfaf7f4c1..4800743a5f 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s ; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX +; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort -mtriple=x86_64-none-nacl | FileCheck %s --check-prefix=NACL64 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-apple-darwin10.0.0" @@ -301,6 +302,11 @@ define void @test23(i8* noalias sret %result) { ; CHECK: call ; CHECK: movq %rdi, %rax ; CHECK: ret +; NACL64: test23: +; NACL64: call +; NACL64: movl %edi, %eax +; NACL64: popq %rcx +; NACL64: nacljmp %ecx, %r15 } declare i8* @foo23() diff --git a/test/NaCl/ARM/lit.local.cfg b/test/NaCl/ARM/lit.local.cfg new file mode 100644 index 0000000000..1f10377867 --- /dev/null +++ b/test/NaCl/ARM/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.s'] + +targets = set(config.root.targets_to_build.split()) +if not 'ARM' in targets: + config.unsupported = True + diff --git a/test/NaCl/ARM/nacl-read-tp-intrinsic.ll b/test/NaCl/ARM/nacl-read-tp-intrinsic.ll new file mode 100644 index 0000000000..1050b902ed --- /dev/null +++ b/test/NaCl/ARM/nacl-read-tp-intrinsic.ll @@ -0,0 +1,20 @@ + +; RUN: llc -mtriple=armv7-unknown-nacl -sfi-store -filetype=asm %s -o - \ +; RUN: | FileCheck -check-prefix=ARM %s + +; RUN: llc -mtriple=armv7-unknown-nacl -sfi-store -filetype=asm -mtls-use-call %s -o - \ +; RUN: | FileCheck -check-prefix=ARM_IRT %s + + +declare i8* @llvm.nacl.read.tp() + +define i8* @get_thread_pointer() { + %tp = call i8* @llvm.nacl.read.tp() + ret i8* %tp +} + +; ARM: get_thread_pointer: +; ARM: ldr r0, [r9] + +; ARM_IRT: get_thread_pointer: +; ARM_IRT: bl __aeabi_read_tp diff --git a/test/NaCl/ARM/neon-vst1-sandboxing.ll b/test/NaCl/ARM/neon-vst1-sandboxing.ll new file mode 100644 index 0000000000..8fd580bb49 --- /dev/null +++ b/test/NaCl/ARM/neon-vst1-sandboxing.ll @@ -0,0 +1,116 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @vst1i8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst1.v8i8(i8* %A, <8 x i8> %tmp1, i32 16) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.8 {{{d[0-9]+}}}, [r0, :64] + ret void +} + +define void @vst1i16(i16* %A, <4 x i16>* %B) nounwind { 
+ %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst1.v4i16(i8* %tmp0, <4 x i16> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.16 {{{d[0-9]+}}}, [r0] + ret void +} + +define void @vst1i32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst1.v2i32(i8* %tmp0, <2 x i32> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.32 {{{d[0-9]+}}}, [r0] + ret void +} + +define void @vst1f(float* %A, <2 x float>* %B) nounwind { + %tmp0 = bitcast float* %A to i8* + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.32 {{{d[0-9]+}}}, [r0] + ret void +} + +define void @vst1i64(i64* %A, <1 x i64>* %B) nounwind { + %tmp0 = bitcast i64* %A to i8* + %tmp1 = load <1 x i64>* %B + call void @llvm.arm.neon.vst1.v1i64(i8* %tmp0, <1 x i64> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.64 {{{d[0-9]+}}}, [r0] + ret void +} + +define void @vst1Qi8(i8* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %B + call void @llvm.arm.neon.vst1.v16i8(i8* %A, <16 x i8> %tmp1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.8 {{{d[0-9]+, d[0-9]+}}}, [r0, :64] + ret void +} + +define void @vst1Qi16(i16* %A, <8 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <8 x i16>* %B + call void @llvm.arm.neon.vst1.v8i16(i8* %tmp0, <8 x i16> %tmp1, i32 32) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.16 {{{d[0-9]+, d[0-9]+}}}, [r0, :128] + ret void +} + +define void @vst1Qi32(i32* %A, <4 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <4 x i32>* %B + call void @llvm.arm.neon.vst1.v4i32(i8* %tmp0, <4 x i32> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0] + ret void +} + +define void @vst1Qf(float* %A, <4 x float>* %B) nounwind { + %tmp0 = bitcast float* %A to i8* + %tmp1 = load <4 x float>* %B + call void @llvm.arm.neon.vst1.v4f32(i8* %tmp0, <4 x float> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0] + ret void +} + +define void @vst1Qi64(i64* %A, <2 x i64>* %B) nounwind { + %tmp0 = bitcast i64* %A to i8* + %tmp1 = load <2 x i64>* %B + call void @llvm.arm.neon.vst1.v2i64(i8* %tmp0, <2 x i64> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0] + ret void +} + +;Check for a post-increment updating store. +define void @vst1f_update(float** %ptr, <2 x float>* %B) nounwind { + %A = load float** %ptr + %tmp0 = bitcast float* %A to i8* + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst1.v2f32(i8* %tmp0, <2 x float> %tmp1, i32 1) +; CHECK: bic r1, r1, #3221225472 +; CHECK-NEXT: vst1.32 {{{d[0-9]+}}}, [r1]! 
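+; (Note: 3221225472 is 0xC0000000, so the expected "bic" clears the top two
+; bits of the store address; this is the masking that -sfi-store inserts to
+; keep data accesses inside the NaCl sandbox.)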
+ %tmp2 = getelementptr float* %A, i32 2 + store float* %tmp2, float** %ptr + ret void +} + +declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32) nounwind + +declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32) nounwind +declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32) nounwind + diff --git a/test/NaCl/ARM/neon-vst2-sandboxing.ll b/test/NaCl/ARM/neon-vst2-sandboxing.ll new file mode 100644 index 0000000000..e87373c174 --- /dev/null +++ b/test/NaCl/ARM/neon-vst2-sandboxing.ll @@ -0,0 +1,95 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @vst2i8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.8 {{{d[0-9]+, d[0-9]+}}}, [r0, :64] + ret void +} + +define void @vst2i16(i16* %A, <4 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst2.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 32) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.16 {{{d[0-9]+, d[0-9]+}}}, [r0, :128] + ret void +} + +define void @vst2i32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst2.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.32 {{{d[0-9]+, d[0-9]+}}}, [r0] + ret void +} + +define void @vst2f(float* %A, <2 x float>* %B) nounwind { + %tmp0 = bitcast float* %A to i8* + %tmp1 = load <2 x float>* %B + call void @llvm.arm.neon.vst2.v2f32(i8* %tmp0, <2 x float> %tmp1, <2 x float> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.32 {{{d[0-9]+, d[0-9]+}}}, [r0] + ret void +} + +define void @vst2Qi8(i8* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %B + call void @llvm.arm.neon.vst2.v16i8(i8* %A, <16 x i8> %tmp1, <16 x i8> %tmp1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.8 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0, :64] + ret void +} + +define void @vst2Qi16(i16* %A, <8 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <8 x i16>* %B + call void @llvm.arm.neon.vst2.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 16) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.16 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0, :128] + ret void +} + +define void @vst2Qi32(i32* %A, <4 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <4 x i32>* %B + call void @llvm.arm.neon.vst2.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 64) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.32 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0, :256] + ret void +} + +define void @vst2Qf(float* %A, <4 x float>* %B) nounwind { + %tmp0 = bitcast float* %A to i8* + %tmp1 = load <4 x float>* %B + call void @llvm.arm.neon.vst2.v4f32(i8* %tmp0, <4 x float> 
%tmp1, <4 x float> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.32 {{{d[0-9]+, d[0-9]+, d[0-9]+, d[0-9]+}}}, [r0] + ret void +} + +;Check for a post-increment updating store with register increment. +define void @vst2i8_update(i8** %ptr, <8 x i8>* %B, i32 %inc) nounwind { + %A = load i8** %ptr + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst2.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 4) +; CHECK: bic r1, r1, #3221225472 +; CHECK-NEXT: vst2.8 {{{d[0-9]+, d[0-9]+}}}, [r1], r2 + %tmp2 = getelementptr i8* %A, i32 %inc + store i8* %tmp2, i8** %ptr + ret void +} + +declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32) nounwind +declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32) nounwind + +declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32) nounwind diff --git a/test/NaCl/ARM/neon-vst3-sandboxing.ll b/test/NaCl/ARM/neon-vst3-sandboxing.ll new file mode 100644 index 0000000000..b496c0c592 --- /dev/null +++ b/test/NaCl/ARM/neon-vst3-sandboxing.ll @@ -0,0 +1,48 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @vst3i8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst3.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 32) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst3.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :64] + ret void +} + +define void @vst3i16(i16* %A, <4 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst3.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + ret void +} + +define void @vst3i32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst3.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst3.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0] + ret void +} + +;Check for a post-increment updating store. +define void @vst3Qi16_update(i16** %ptr, <8 x i16>* %B) nounwind { + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <8 x i16>* %B + call void @llvm.arm.neon.vst3.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 1) +; CHECK: bic r1, r1, #3221225472 +; CHECK-NEXT: vst3.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! 
+ %tmp2 = getelementptr i16* %A, i32 24 + store i16* %tmp2, i16** %ptr + ret void +} + +declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32) nounwind + +declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind diff --git a/test/NaCl/ARM/neon-vst4-sandboxing.ll b/test/NaCl/ARM/neon-vst4-sandboxing.ll new file mode 100644 index 0000000000..032f194231 --- /dev/null +++ b/test/NaCl/ARM/neon-vst4-sandboxing.ll @@ -0,0 +1,53 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @vst4i8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst4.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.8 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :64] + ret void +} + +define void @vst4i16(i16* %A, <4 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst4.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 16) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.16 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :128] + ret void +} + +define void @vst4i32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst4.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 32) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r0, :256] + ret void +} + +;Check for a post-increment updating store. +define void @vst4Qf_update(float** %ptr, <4 x float>* %B) nounwind { + %A = load float** %ptr + %tmp0 = bitcast float* %A to i8* + %tmp1 = load <4 x float>* %B + call void @llvm.arm.neon.vst4.v4f32(i8* %tmp0, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, <4 x float> %tmp1, i32 1) +; CHECK: bic r1, r1, #3221225472 +; CHECK-NEXT: vst4.32 {d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}, d{{[0-9]+}}}, [r1]! 
+ %tmp2 = getelementptr float* %A, i32 16 + store float* %tmp2, float** %ptr + ret void +} + +declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32) nounwind +declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32) nounwind + +declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32) nounwind +declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32) nounwind +declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32) nounwind +declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32) nounwind + diff --git a/test/NaCl/ARM/neon-vstlane-sandboxing.ll b/test/NaCl/ARM/neon-vstlane-sandboxing.ll new file mode 100644 index 0000000000..5b4dc63a14 --- /dev/null +++ b/test/NaCl/ARM/neon-vstlane-sandboxing.ll @@ -0,0 +1,196 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -mattr=+neon -sfi-store -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @vst1lanei8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + %tmp2 = extractelement <8 x i8> %tmp1, i32 3 + store i8 %tmp2, i8* %A, align 8 +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.8 {d{{[0-9]+}}[3]}, [r0] + ret void +} + +define void @vst1lanei16(i16* %A, <4 x i16>* %B) nounwind { + %tmp1 = load <4 x i16>* %B + %tmp2 = extractelement <4 x i16> %tmp1, i32 2 + store i16 %tmp2, i16* %A, align 8 +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.16 {d{{[0-9]+}}[2]}, [r0, :16] + ret void +} + +define void @vst1lanei32(i32* %A, <2 x i32>* %B) nounwind { + %tmp1 = load <2 x i32>* %B + %tmp2 = extractelement <2 x i32> %tmp1, i32 1 + store i32 %tmp2, i32* %A, align 8 +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.32 {d{{[0-9]+}}[1]}, [r0, :32] + ret void +} + +define void @vst1laneQi8(i8* %A, <16 x i8>* %B) nounwind { + %tmp1 = load <16 x i8>* %B + %tmp2 = extractelement <16 x i8> %tmp1, i32 9 + store i8 %tmp2, i8* %A, align 8 +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.8 {d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst1laneQi16(i16* %A, <8 x i16>* %B) nounwind { + %tmp1 = load <8 x i16>* %B + %tmp2 = extractelement <8 x i16> %tmp1, i32 5 + store i16 %tmp2, i16* %A, align 8 +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst1.16 {d{{[0-9]+}}[1]}, [r0, :16] + ret void +} + +define void @vst2lanei8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst2lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 4) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.8 {d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0, :16] + ret void +} + +define void @vst2lanei16(i16* %A, <4 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.16 {d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0, :32] + ret void +} + +define void @vst2lanei32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void 
@llvm.arm.neon.vst2lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.32 {d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst2laneQi16(i16* %A, <8 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <8 x i16>* %B + call void @llvm.arm.neon.vst2lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 5, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.16 {d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst2laneQi32(i32* %A, <4 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <4 x i32>* %B + call void @llvm.arm.neon.vst2lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 16) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst2.32 {d{{[0-9]+}}[0], d{{[0-9]+}}[0]}, [r0, :64] + ret void +} + +define void @vst3lanei8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst3lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst3.8 {d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst3lanei16(i16* %A, <4 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst3lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst3.16 {d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst3lanei32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst3lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst3.32 {d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst4lanei8(i8* %A, <8 x i8>* %B) nounwind { + %tmp1 = load <8 x i8>* %B + call void @llvm.arm.neon.vst4lane.v8i8(i8* %A, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, <8 x i8> %tmp1, i32 1, i32 8) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.8 {d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0, :32] + ret void +} + +define void @vst4lanei16(i16* %A, <4 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst4lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.16 {d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0] + ret void +} + +define void @vst4lanei32(i32* %A, <2 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <2 x i32>* %B + call void @llvm.arm.neon.vst4lane.v2i32(i8* %tmp0, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, <2 x i32> %tmp1, i32 1, i32 16) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.32 {d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r0, :128] + ret void +} + +define void @vst4laneQi16(i16* %A, <8 x i16>* %B) nounwind { + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <8 x i16>* %B + call void @llvm.arm.neon.vst4lane.v8i16(i8* %tmp0, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, <8 x i16> %tmp1, i32 7, i32 16) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.16 {d{{[0-9]+}}[3], d{{[0-9]+}}[3], d{{[0-9]+}}[3], d{{[0-9]+}}[3]}, [r0, :64] + ret void +} + +define void 
@vst4laneQi32(i32* %A, <4 x i32>* %B) nounwind { + %tmp0 = bitcast i32* %A to i8* + %tmp1 = load <4 x i32>* %B + call void @llvm.arm.neon.vst4lane.v4i32(i8* %tmp0, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, <4 x i32> %tmp1, i32 2, i32 1) +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vst4.32 {d{{[0-9]+}}[0], d{{[0-9]+}}[0], d{{[0-9]+}}[0], d{{[0-9]+}}[0]}, [r0] + ret void +} + +declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) nounwind +declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind +declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32) nounwind +declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32) nounwind + +;Check for a post-increment updating store with register increment. 
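+; (The CHECK lines below expect the sandboxing bic to target the same base
+; register that the updating "[r1], r2" addressing form writes back.)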
+define void @vst2lanei16_update(i16** %ptr, <4 x i16>* %B, i32 %inc) nounwind { + %A = load i16** %ptr + %tmp0 = bitcast i16* %A to i8* + %tmp1 = load <4 x i16>* %B + call void @llvm.arm.neon.vst2lane.v4i16(i8* %tmp0, <4 x i16> %tmp1, <4 x i16> %tmp1, i32 1, i32 2) +; CHECK: bic r1, r1, #3221225472 +; CHECK-NEXT: vst2.16 {d{{[0-9]+}}[1], d{{[0-9]+}}[1]}, [r1], r2 + %tmp2 = getelementptr i16* %A, i32 %inc + store i16* %tmp2, i16** %ptr + ret void +} diff --git a/test/NaCl/ARM/simple-load-store_sandboxing1.ll b/test/NaCl/ARM/simple-load-store_sandboxing1.ll new file mode 100644 index 0000000000..417bb1f389 --- /dev/null +++ b/test/NaCl/ARM/simple-load-store_sandboxing1.ll @@ -0,0 +1,27 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -sfi-store -sfi-load -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @foo(i32* %input, i32* %output) nounwind { +entry: + %input.addr = alloca i32*, align 4 + %output.addr = alloca i32*, align 4 + store i32* %input, i32** %input.addr, align 4 + store i32* %output, i32** %output.addr, align 4 + %0 = load i32** %input.addr, align 4 + %1 = load i32* %0, align 4 + +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: ldr r0, [r0] + + %add = add nsw i32 %1, 4 + %2 = load i32** %output.addr, align 4 + store i32 %add, i32* %2, align 4 + +; CHECK: bic r1, r1, #3221225472 +; CHECK-NEXT: str r0, [r1] + + ret void +} + + + diff --git a/test/NaCl/ARM/sp-arithmetic-sandboxing1.ll b/test/NaCl/ARM/sp-arithmetic-sandboxing1.ll new file mode 100644 index 0000000000..a8b3cf1c16 --- /dev/null +++ b/test/NaCl/ARM/sp-arithmetic-sandboxing1.ll @@ -0,0 +1,28 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -sfi-store -sfi-load -sfi-stack -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @foo(i32* %input, i32* %output) nounwind { +entry: + %input.addr = alloca i32*, align 4 + %output.addr = alloca i32*, align 4 + %temp = alloca i32, align 4 + +; CHECK: sub sp, sp +; CHECK-NEXT: bic sp, sp, #3221225472 + + store i32* %input, i32** %input.addr, align 4 + store i32* %output, i32** %output.addr, align 4 + %0 = load i32** %input.addr, align 4 + %arrayidx = getelementptr inbounds i32* %0, i32 1 + %1 = load i32* %arrayidx, align 4 + store i32 %1, i32* %temp, align 4 + %2 = load i32* %temp, align 4 + %3 = load i32** %output.addr, align 4 + %arrayidx1 = getelementptr inbounds i32* %3, i32 0 + store i32 %2, i32* %arrayidx1, align 4 + +; CHECK: add sp, sp +; CHECK-NEXT: bic sp, sp, #3221225472 + + ret void +} diff --git a/test/NaCl/ARM/vstr-sandboxing1.ll b/test/NaCl/ARM/vstr-sandboxing1.ll new file mode 100644 index 0000000000..6646cbc717 --- /dev/null +++ b/test/NaCl/ARM/vstr-sandboxing1.ll @@ -0,0 +1,13 @@ +; RUN: llc -mtriple=armv7-unknown-nacl -sfi-store -filetype=obj %s -o - \ +; RUN: | llvm-objdump -disassemble -triple armv7 - | FileCheck %s + +define void @test_vstr_sandbox(<8 x i8>* %ptr) nounwind { + %1 = insertelement <8 x i8> undef, i8 -128, i32 0 + %2 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer + store <8 x i8> %2, <8 x i8>* %ptr, align 8 +; CHECK: bic r0, r0, #3221225472 +; CHECK-NEXT: vstr {{[0-9a-z]+}}, [r0] + + ret void +} + diff --git a/test/NaCl/X86/lit.local.cfg b/test/NaCl/X86/lit.local.cfg new file mode 100644 index 0000000000..56bf008595 --- /dev/null +++ b/test/NaCl/X86/lit.local.cfg @@ -0,0 +1,6 @@ +config.suffixes = ['.ll', '.s'] + +targets = set(config.root.targets_to_build.split()) +if not 'X86' in targets: + config.unsupported = True + diff 
--git a/test/NaCl/X86/nacl-read-tp-intrinsic.ll b/test/NaCl/X86/nacl-read-tp-intrinsic.ll new file mode 100644 index 0000000000..2779f1b1e1 --- /dev/null +++ b/test/NaCl/X86/nacl-read-tp-intrinsic.ll @@ -0,0 +1,44 @@ + +; RUN: llc -mtriple=i386-unknown-nacl -filetype=asm %s -o - \ +; RUN: | FileCheck -check-prefix=X32 %s + +; RUN: llc -mtriple=i386-unknown-nacl -filetype=asm -mtls-use-call %s -o - \ +; RUN: | FileCheck -check-prefix=USE_CALL %s + +; RUN: llc -mtriple=x86_64-unknown-nacl -filetype=asm %s -o - \ +; RUN: | FileCheck -check-prefix=USE_CALL %s + +; "-mtls-use-call" should not make any difference on x86-64. +; RUN: llc -mtriple=x86_64-unknown-nacl -filetype=asm -mtls-use-call %s -o - \ +; RUN: | FileCheck -check-prefix=USE_CALL %s + + +declare i8* @llvm.nacl.read.tp() + +define i8* @get_thread_pointer() { + %tp = call i8* @llvm.nacl.read.tp() + ret i8* %tp +} + +; X32: get_thread_pointer: +; X32: movl %gs:0, %eax + +; USE_CALL: get_thread_pointer: +; USE_CALL: naclcall __nacl_read_tp + + +; Make sure that we do not generate: +; movl $1000, %eax +; addl %gs:0, %eax +; The x86-32 NaCl validator only accepts %gs with "mov", not with +; "add". Note that we had to use a large immediate to trigger the bug +; and generate the code above. +define i8* @get_thread_pointer_add() { + %tp = call i8* @llvm.nacl.read.tp() + %result = getelementptr i8* %tp, i32 1000 + ret i8* %result +} + +; X32: get_thread_pointer_add: +; X32: movl %gs:0, %eax +; X32: addl $1000, %eax diff --git a/test/Transforms/NaCl/expand-ctors-empty.ll b/test/Transforms/NaCl/expand-ctors-empty.ll new file mode 100644 index 0000000000..4368270765 --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors-empty.ll @@ -0,0 +1,11 @@ +; Currently we do not define __{init,fini}_array_end as named aliases. +; RUN: opt < %s -nacl-expand-ctors -S | not grep __init_array_end +; RUN: opt < %s -nacl-expand-ctors -S | not grep __fini_array_end + +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s + +; If llvm.global_ctors is not present, it is treated as if it is an +; empty array, and __{init,fini}_array_start are defined anyway. + +; CHECK: @__init_array_start = internal constant [0 x void ()*] zeroinitializer +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer diff --git a/test/Transforms/NaCl/expand-ctors-zeroinit.ll b/test/Transforms/NaCl/expand-ctors-zeroinit.ll new file mode 100644 index 0000000000..d02741f0b5 --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors-zeroinit.ll @@ -0,0 +1,16 @@ +; Currently we do not define __{init,fini}_array_end as named aliases. +; RUN: opt < %s -nacl-expand-ctors -S | not grep __init_array_end +; RUN: opt < %s -nacl-expand-ctors -S | not grep __fini_array_end + +; We expect this symbol to be removed: +; RUN: opt < %s -nacl-expand-ctors -S | not grep llvm.global_ctors + +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s + +; If llvm.global_ctors is zeroinitializer, it should be treated the +; same as an empty array. 
+ +@llvm.global_ctors = appending global [0 x { i32, void ()* }] zeroinitializer + +; CHECK: @__init_array_start = internal constant [0 x void ()*] zeroinitializer +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer diff --git a/test/Transforms/NaCl/expand-ctors.ll b/test/Transforms/NaCl/expand-ctors.ll new file mode 100644 index 0000000000..7f202618e7 --- /dev/null +++ b/test/Transforms/NaCl/expand-ctors.ll @@ -0,0 +1,36 @@ +; We expect these symbol names to be removed: +; RUN: opt < %s -nacl-expand-ctors -S | not grep llvm.global_ctors +; RUN: opt < %s -nacl-expand-ctors -S | not grep __init_array_end +; RUN: opt < %s -nacl-expand-ctors -S | not grep __fini_array_end + +; RUN: opt < %s -nacl-expand-ctors -S | FileCheck %s + +@llvm.global_ctors = appending global [3 x { i32, void ()* }] + [{ i32, void ()* } { i32 300, void ()* @init_func_A }, + { i32, void ()* } { i32 100, void ()* @init_func_B }, + { i32, void ()* } { i32 200, void ()* @init_func_C }] + +@__init_array_start = extern_weak global [0 x void ()*] +@__init_array_end = extern_weak global [0 x void ()*] + +; CHECK: @__init_array_start = internal constant [3 x void ()*] [void ()* @init_func_B, void ()* @init_func_C, void ()* @init_func_A] +; CHECK: @__fini_array_start = internal constant [0 x void ()*] zeroinitializer + +define void @init_func_A() { ret void } +define void @init_func_B() { ret void } +define void @init_func_C() { ret void } + +define [0 x void ()*]* @get_array_start() { + ret [0 x void ()*]* @__init_array_start; +} +; CHECK: @get_array_start() +; CHECK: ret {{.*}} @__init_array_start + +define [0 x void ()*]* @get_array_end() { + ret [0 x void ()*]* @__init_array_end; +} + +; @get_array_end() is converted to use a GetElementPtr that returns +; the end of the generated array: +; CHECK: @get_array_end() +; CHECK: ret {{.*}} bitcast ([3 x void ()*]* getelementptr inbounds ([3 x void ()*]* @__init_array_start, i32 1) diff --git a/tools/Makefile b/tools/Makefile index a29e49f0a1..17e8380677 100644 --- a/tools/Makefile +++ b/tools/Makefile @@ -34,7 +34,7 @@ PARALLEL_DIRS := opt llvm-as llvm-dis \ bugpoint llvm-bcanalyzer \ llvm-diff macho-dump llvm-objdump llvm-readobj \ llvm-rtdyld llvm-dwarfdump llvm-cov \ - llvm-size llvm-stress llvm-mcmarkup + llvm-size llvm-stress llvm-mcmarkup bc-wrap pso-stub # Let users override the set of tools to build from the command line. ifdef ONLY_TOOLS diff --git a/tools/bc-wrap/LLVMBuild.txt b/tools/bc-wrap/LLVMBuild.txt new file mode 100644 index 0000000000..b515fc04b9 --- /dev/null +++ b/tools/bc-wrap/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./tools/llc/LLVMBuild.txt --------------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
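expand-ctors.ll above expects init_func_B, init_func_C, init_func_A in that order because entries are emitted by ascending constructor priority (100, 200, 300). A small C++ sketch of that ordering rule; the names come from the test, the sort itself is illustrative.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Ctor {
  int Priority;
  const char *Name;
};

static bool ByPriority(const Ctor &L, const Ctor &R) {
  return L.Priority < R.Priority;
}

int main() {
  Ctor A = {300, "init_func_A"};
  Ctor B = {100, "init_func_B"};
  Ctor C = {200, "init_func_C"};
  std::vector<Ctor> Ctors;
  Ctors.push_back(A);
  Ctors.push_back(B);
  Ctors.push_back(C);
  std::stable_sort(Ctors.begin(), Ctors.end(), ByPriority);
  for (size_t i = 0; i < Ctors.size(); ++i)
    std::printf("%s\n", Ctors[i].Name);  // init_func_B, init_func_C, init_func_A
  return 0;
}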
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = bc-wrap +parent = Tools +required_libraries = Wrap all-targets diff --git a/tools/bc-wrap/Makefile b/tools/bc-wrap/Makefile new file mode 100644 index 0000000000..dccff2ecde --- /dev/null +++ b/tools/bc-wrap/Makefile @@ -0,0 +1,20 @@ +#===- tools/bc-wrap/Makefile -----------------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL = ../.. +TOOLNAME = bc-wrap + +# Include this here so we can get the configuration of the targets +# that have been configured for construction. We have to do this +# early so we can set up LINK_COMPONENTS before including Makefile.rules +include $(LEVEL)/Makefile.config + +LINK_COMPONENTS := $(TARGETS_TO_BUILD) Wrap + +include $(LLVM_SRC_ROOT)/Makefile.rules diff --git a/tools/bc-wrap/bc_wrap.cpp b/tools/bc-wrap/bc_wrap.cpp new file mode 100644 index 0000000000..5311f714ee --- /dev/null +++ b/tools/bc-wrap/bc_wrap.cpp @@ -0,0 +1,123 @@ +/* Copyright 2012 The Native Client Authors. All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + */ +/* + * Utility to wrap a .bc file, using LLVM standard+ custom headers. + */ + +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Wrap/bitcode_wrapperer.h" +#include "llvm/Wrap/file_wrapper_input.h" +#include "llvm/Wrap/file_wrapper_output.h" + +#include <ctype.h> +#include <string.h> + +using namespace llvm; + +static cl::opt<std::string> +InputFilename(cl::Positional, cl::desc("<input file>"), cl::Required); + +static cl::opt<std::string> +OutputFilename("o", cl::desc("<output file>")); + +static cl::opt<bool> UnwrapFlag("u", + cl::desc("unwrap rather than wrap the file"), + cl::init(false)); + +static cl::opt<bool> VerboseFlag("v", + cl::desc("print verbose header information"), + cl::init(false)); + +static cl::opt<bool> DryRunFlag("n", + cl::desc("Dry run (implies -v)"), + cl::init(false)); + +// Accept the hash on the command line to avoid having to include sha1 +// library with the LLVM code +static cl::opt<std::string> BitcodeHash("hash", + cl::desc("Hash of bitcode (ignored if -u is given)")); + +const int kMaxBinaryHashLen = 32; + +// Convert ASCII hex hash to binary hash. return buffer and length. +// The caller must free the returned buffer. 
+static uint8_t* ParseBitcodeHash(int* len) { + if (BitcodeHash.size() > kMaxBinaryHashLen * 2 || + BitcodeHash.size() % 2) return NULL; + *len = BitcodeHash.size() / 2; + uint8_t* buf = new uint8_t[*len]; + const char* arg = BitcodeHash.data(); + for (size_t i = 0; i < BitcodeHash.size() / 2; i++) { + unsigned int r; // glibc has %hhx but it's nonstandard + if (!isxdigit(*(arg + 2 * i + 1)) || // sscanf ignores trailing junk + !sscanf(arg + 2 * i, "%2x", &r) || + r > std::numeric_limits<uint8_t>::max()) { + delete [] buf; + return NULL; + } + buf[i] = static_cast<uint8_t>(r); + } + return buf; +} + +int main(const int argc, const char* argv[]) { + bool success = true; + cl::ParseCommandLineOptions(argc, argv, "bitcode wrapper/unwrapper\n"); + if (OutputFilename == "") { + // Default to input file = output file. The cl lib doesn't seem to + // directly support initializing one opt from another. + OutputFilename = InputFilename; + } + if (DryRunFlag) VerboseFlag = true; + sys::fs::file_status outfile_status; + std::string outfile_temp; + outfile_temp = std::string(OutputFilename) + ".temp"; + if (UnwrapFlag) { + FileWrapperInput inbc(InputFilename); + FileWrapperOutput outbc(outfile_temp); + BitcodeWrapperer wrapperer(&inbc, &outbc); + if (wrapperer.IsInputBitcodeWrapper()) { + if (VerboseFlag) { + fprintf(stderr, "Headers read from infile:\n"); + wrapperer.PrintWrapperHeader(); + } + if (DryRunFlag) + return 0; + success = wrapperer.GenerateRawBitcodeFile(); + } + } else { + FileWrapperInput inbc(InputFilename); + FileWrapperOutput outbc(outfile_temp); + BitcodeWrapperer wrapperer(&inbc, &outbc); + if (BitcodeHash.size()) { + // SHA-2 hash is 256 bit + int hash_len; + uint8_t* buf = ParseBitcodeHash(&hash_len); + if (!buf) { + fprintf(stderr, "Bitcode hash must be a hex string <= 64 chars.\n"); + exit(1); + } + BCHeaderField hash(BCHeaderField::kBitcodeHash, hash_len, buf); + wrapperer.AddHeaderField(&hash); + } + if (VerboseFlag) { + fprintf(stderr, "Headers generated:\n"); + wrapperer.PrintWrapperHeader(); + } + if (DryRunFlag) + return 0; + success = wrapperer.GenerateWrappedBitcodeFile(); + } + error_code ec; + if ((ec = sys::fs::rename(outfile_temp, OutputFilename))) { + fprintf(stderr, "Could not rename temporary: %s\n", ec.message().c_str()); + success = false; + } + if (success) return 0; + fprintf(stderr, "error: Unable to generate a proper %s bitcode file!\n", + (UnwrapFlag ? "unwrapped" : "wrapped")); + return 1; +} diff --git a/tools/gold/Makefile b/tools/gold/Makefile index 496e31cc39..31812e1f8c 100644 --- a/tools/gold/Makefile +++ b/tools/gold/Makefile @@ -14,6 +14,10 @@ LINK_LIBS_IN_SHARED := 1 SHARED_LIBRARY := 1 LOADABLE_MODULE := 1 +# @LOCALMOD: this forces to appear -lLTO *after* the object file +# on the linkline. This is necessary for linking on ubuntu precise. 
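The -hash option above accepts an even-length hex string of at most 64 characters, i.e. up to 32 binary bytes such as a SHA-256. A simplified, stand-alone sketch of the same hex-to-bytes conversion (the tool itself pairs sscanf with an isxdigit check to reject trailing junk); the helper name is hypothetical.

#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <string>
#include <vector>

// Convert "deadbeef..." to raw bytes; reject odd lengths, overlong input,
// and non-hex characters, mirroring the checks in ParseBitcodeHash above.
static bool HexToBytes(const std::string &Hex, std::vector<unsigned char> *Out) {
  if (Hex.size() % 2 != 0 || Hex.size() > 64)
    return false;
  for (size_t i = 0; i < Hex.size(); i += 2) {
    if (!isxdigit((unsigned char)Hex[i]) || !isxdigit((unsigned char)Hex[i + 1]))
      return false;
    char Byte[3] = { Hex[i], Hex[i + 1], '\0' };
    Out->push_back((unsigned char)std::strtoul(Byte, 0, 16));
  }
  return true;
}

int main() {
  std::vector<unsigned char> Bytes;
  if (HexToBytes("deadbeef", &Bytes))
    std::printf("%u bytes, first = 0x%02x\n", (unsigned)Bytes.size(),
                (unsigned)Bytes[0]);  // 4 bytes, first = 0xde
  return 0;
}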
+# Otherwise LLVMgold.so will not have a dt_needed entry for LTO +EXTRA_LIBS := -lLTO EXPORTED_SYMBOL_FILE = $(PROJ_SRC_DIR)/gold.exports # Include this here so we can get the configuration of the targets diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp index b0a0dd2a40..1c3a01b1e8 100644 --- a/tools/gold/gold-plugin.cpp +++ b/tools/gold/gold-plugin.cpp @@ -55,6 +55,25 @@ namespace { ld_plugin_set_extra_library_path set_extra_library_path = NULL; ld_plugin_get_view get_view = NULL; ld_plugin_message message = discard_message; + // @LOCALMOD-BEGIN + // REL, DYN, or EXEC + ld_plugin_output_file_type linker_output; + + // Callback for getting link soname from gold + ld_plugin_get_output_soname get_output_soname = NULL; + + // Callback for getting needed libraries from gold + ld_plugin_get_needed get_needed = NULL; + + // Callback for getting number of needed library from gold + ld_plugin_get_num_needed get_num_needed = NULL; + + // Callback for getting the number of --wrap'd symbols. + ld_plugin_get_num_wrapped get_num_wrapped = NULL; + + // Callback for getting the name of a wrapped symbol. + ld_plugin_get_wrapped get_wrapped = NULL; + // @LOCALMOD-END int api_version = 0; int gold_version = 0; @@ -62,11 +81,17 @@ namespace { struct claimed_file { void *handle; std::vector<ld_plugin_symbol> syms; + bool is_linked_in; // @LOCALMOD }; lto_codegen_model output_type = LTO_CODEGEN_PIC_MODEL_STATIC; std::string output_name = ""; std::list<claimed_file> Modules; + + // @LOCALMOD-BEGIN + std::vector<std::string> DepLibs; + // @LOCALMOD-END + std::vector<sys::Path> Cleanup; lto_code_gen_t code_gen = NULL; } @@ -74,6 +99,7 @@ namespace { namespace options { enum generate_bc { BC_NO, BC_ALSO, BC_ONLY }; static bool generate_api_file = false; + static bool gather_then_link = true; // @LOCALMOD static generate_bc generate_bc_file = BC_NO; static std::string bc_path; static std::string obj_path; @@ -103,6 +129,10 @@ namespace options { triple = opt.substr(strlen("mtriple=")); } else if (opt.startswith("obj-path=")) { obj_path = opt.substr(strlen("obj-path=")); + // @LOCALMOD-BEGIN + } else if (opt == "no-gather-then-link") { + gather_then_link = false; + // @LOCALMOD-END } else if (opt == "emit-llvm") { generate_bc_file = BC_ONLY; } else if (opt == "also-emit-llvm") { @@ -123,6 +153,18 @@ namespace options { } } +// @LOCALMOD-BEGIN +static const char *get_basename(const char *path) { + if (path == NULL) + return NULL; + const char *slash = strrchr(path, '/'); + if (slash) + return slash + 1; + + return path; +} +// @LOCALMOD-END + static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file, int *claimed); static ld_plugin_status all_symbols_read_hook(void); @@ -150,6 +192,10 @@ ld_plugin_status onload(ld_plugin_tv *tv) { output_name = tv->tv_u.tv_string; break; case LDPT_LINKER_OUTPUT: + // @LOCALMOD-BEGIN + linker_output = + static_cast<ld_plugin_output_file_type>(tv->tv_u.tv_val); + // @LOCALMOD-END switch (tv->tv_u.tv_val) { case LDPO_REL: // .o case LDPO_DYN: // .so @@ -213,7 +259,23 @@ ld_plugin_status onload(ld_plugin_tv *tv) { break; case LDPT_GET_VIEW: get_view = tv->tv_u.tv_get_view; + // @LOCALMOD-BEGIN + case LDPT_GET_OUTPUT_SONAME: + get_output_soname = tv->tv_u.tv_get_output_soname; break; + case LDPT_GET_NEEDED: + get_needed = tv->tv_u.tv_get_needed; + break; + case LDPT_GET_NUM_NEEDED: + get_num_needed = tv->tv_u.tv_get_num_needed; + break; + case LDPT_GET_WRAPPED: + get_wrapped = tv->tv_u.tv_get_wrapped; + break; + case LDPT_GET_NUM_WRAPPED: + 
get_num_wrapped = tv->tv_u.tv_get_num_wrapped; + break; + // @LOCALMOD-END case LDPT_MESSAGE: message = tv->tv_u.tv_message; break; @@ -231,6 +293,24 @@ ld_plugin_status onload(ld_plugin_tv *tv) { return LDPS_ERR; } + // @LOCALMOD-BEGIN + // Parse extra command-line options + // Although lto_codegen provides a way to parse command-line arguments, + // we need the arguments to be parsed and applied before LTOModules are + // even created. In particular, this is needed because the + // "-add-nacl-read-tp-dependency" flag affects how modules are created. + if (!options::extra.empty()) { + for (std::vector<std::string>::iterator it = options::extra.begin(); + it != options::extra.end(); ++it) { + lto_add_command_line_option((*it).c_str()); + } + lto_parse_command_line_options(); + // We clear the options so that they don't get parsed again in + // lto_codegen_debug_options. + options::extra.clear(); + } + // @LOCALMOD-END + return LDPS_OK; } @@ -297,7 +377,21 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file, ld_plugin_symbol &sym = cf.syms.back(); sym.name = const_cast<char *>(lto_module_get_symbol_name(M, i)); sym.name = strdup(sym.name); + // @LOCALMOD-BEGIN + // Localmods have disabled the use of the 'version' field for passing + // version information to Gold. Instead, the version is now transmitted as + // part of the 'name' field, which has the form "sym@VER" or "sym@@VER". + // This is nicer because it communicates one extra bit of information (@@ + // marks the default version), and allows us to access the real symbol + // name in all_symbols_read. + + // These fields are set by Gold to communicate the updated version info + // to the plugin. They are used in all_symbols_read_hook(). + // Initialize them for predictability. sym.version = NULL; + sym.is_default = false; + sym.dynfile = NULL; + // @LOCALMOD-END int scope = attrs & LTO_SYMBOL_SCOPE_MASK; switch (scope) { @@ -346,18 +440,45 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file, } cf.syms.reserve(cf.syms.size()); + // @LOCALMOD-BEGIN + bool is_shared = + (lto_module_get_output_format(M) == LTO_OUTPUT_FORMAT_SHARED); + const char* soname = lto_module_get_soname(M); + if (soname[0] == '\0') + soname = NULL; + // @LOCALMOD-END if (!cf.syms.empty()) { - if ((*add_symbols)(cf.handle, cf.syms.size(), &cf.syms[0]) != LDPS_OK) { + if ((*add_symbols)(cf.handle, cf.syms.size(), &cf.syms[0], + is_shared, soname) != LDPS_OK) { // @LOCALMOD (*message)(LDPL_ERROR, "Unable to add symbols!"); return LDPS_ERR; } } - if (code_gen) - lto_codegen_add_module(code_gen, M); + // @LOCALMOD-BEGIN + // Do not merge the module if it's a PSO. + // If the PSO's soname is set, add it to DepLibs. + cf.is_linked_in = false; + if (code_gen) { + if (is_shared) { + if (soname && strlen(soname) > 0) { + DepLibs.push_back(soname); + } + } else { + if (options::gather_then_link) { + lto_codegen_gather_module_for_link(code_gen, M); + } else { + lto_codegen_add_module(code_gen, M); + } + cf.is_linked_in = true; + } + } - lto_module_dispose(M); + // With gather_then_link, the modules are disposed when linking. 
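The localmod above moves version information into the symbol name itself: "sym@VER" marks a non-default version and "sym@@VER" the default one. A small stand-alone parser showing the convention; the plugin receives these strings from gold, and the example inputs here are hypothetical.

#include <cstdio>
#include <string>

static void SplitVersionedName(const std::string &Full, std::string *Name,
                               std::string *Version, bool *IsDefault) {
  size_t At = Full.find('@');
  if (At == std::string::npos) {
    *Name = Full;
    Version->clear();
    *IsDefault = false;
    return;
  }
  *Name = Full.substr(0, At);
  *IsDefault = (At + 1 < Full.size() && Full[At + 1] == '@');
  *Version = Full.substr(At + (*IsDefault ? 2 : 1));
}

int main() {
  std::string N, V;
  bool D;
  SplitVersionedName("foo@@GLIBC_2.0", &N, &V, &D);
  std::printf("%s %s default=%d\n", N.c_str(), V.c_str(), D);  // foo GLIBC_2.0 1
  SplitVersionedName("bar@V1", &N, &V, &D);
  std::printf("%s %s default=%d\n", N.c_str(), V.c_str(), D);  // bar V1 0
  return 0;
}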
+ if (!options::gather_then_link) + lto_module_dispose(M); + // @LOCALMOD-END return LDPS_OK; } @@ -370,6 +491,12 @@ static ld_plugin_status all_symbols_read_hook(void) { std::ofstream api_file; assert(code_gen); + // @LOCALMOD-BEGIN + if (options::gather_then_link) { + lto_codegen_link_gathered_modules_and_dispose(code_gen); + } + // @LOCALMOD-END + if (options::generate_api_file) { api_file.open("apifile.txt", std::ofstream::out | std::ofstream::trunc); if (!api_file.is_open()) { @@ -384,12 +511,45 @@ static ld_plugin_status all_symbols_read_hook(void) { continue; (*get_symbols)(I->handle, I->syms.size(), &I->syms[0]); for (unsigned i = 0, e = I->syms.size(); i != e; i++) { + // @LOCALMOD-BEGIN + // Don't process the symbols inside a dynamic object. + if (!I->is_linked_in) + continue; + // @LOCALMOD-END + if (I->syms[i].resolution == LDPR_PREVAILING_DEF) { + // @LOCALMOD-BEGIN + // Set the symbol version in the module. + if (linker_output != LDPO_REL && I->syms[i].version) { + // NOTE: This may change the name of the symbol, so it must happen + // before the call to lto_codegen_add_must_preserve_symbols() below. + I->syms[i].name = const_cast<char *>( + lto_codegen_set_symbol_def_version(code_gen, I->syms[i].name, + I->syms[i].version, + I->syms[i].is_default)); + } lto_codegen_add_must_preserve_symbol(code_gen, I->syms[i].name); + // @LOCALMOD-END if (options::generate_api_file) api_file << I->syms[i].name << "\n"; } + // @LOCALMOD-BEGIN + else if (linker_output != LDPO_REL && + (I->syms[i].resolution == LDPR_RESOLVED_DYN || + I->syms[i].resolution == LDPR_UNDEF)) { + // This symbol is provided by an external object. + // Set the version and source dynamic file for it. + const char *ver = I->syms[i].version; + const char *dynfile = I->syms[i].dynfile; + dynfile = get_basename(dynfile); + // NOTE: This may change the name of the symbol. + I->syms[i].name = const_cast<char *>( + lto_codegen_set_symbol_needed(code_gen, I->syms[i].name, + ver ? ver : "", + dynfile ? dynfile : "")); + } + // @LOCALMOD-END } } @@ -401,6 +561,11 @@ static ld_plugin_status all_symbols_read_hook(void) { if (!options::mcpu.empty()) lto_codegen_set_cpu(code_gen, options::mcpu.c_str()); + // @LOCALMOD-BEGIN (COMMENT) + // "extra" will always be empty below, because we process the extra + // options earlier, at the end of onload(). + // @LOCALMOD-END + // Pass through extra options to the code generator. if (!options::extra.empty()) { for (std::vector<std::string>::iterator it = options::extra.begin(); @@ -409,6 +574,57 @@ static ld_plugin_status all_symbols_read_hook(void) { } } + // @LOCALMOD-BEGIN + // Store the linker output format into the bitcode. + lto_output_format format; + switch (linker_output) { + case LDPO_REL: + format = LTO_OUTPUT_FORMAT_OBJECT; + break; + case LDPO_DYN: + format = LTO_OUTPUT_FORMAT_SHARED; + break; + case LDPO_EXEC: + format = LTO_OUTPUT_FORMAT_EXEC; + break; + default: + (*message)(LDPL_FATAL, "Unknown linker output format (gold-plugin)"); + abort(); + break; + } + lto_codegen_set_merged_module_output_format(code_gen, format); + // @LOCALMOD-END + + // @LOCALMOD-BEGIN + // For -shared linking, store the soname into the bitcode. + if (linker_output == LDPO_DYN) { + const char *soname = (*get_output_soname)(); + lto_codegen_set_merged_module_soname(code_gen, soname); + } + // @LOCALMOD-END + + // @LOCALMOD-BEGIN + // Add the needed libraries to the bitcode. 
+ unsigned int num_needed = (*get_num_needed)(); + for (unsigned i=0; i < num_needed; ++i) { + const char *soname = (*get_needed)(i); + soname = get_basename(soname); + lto_codegen_add_merged_module_library_dep(code_gen, soname); + } + for (std::vector<std::string>::iterator I = DepLibs.begin(), + E = DepLibs.end(); I != E; ++I) { + lto_codegen_add_merged_module_library_dep(code_gen, I->c_str()); + } + // @LOCALMOD-END + + // @LOCALMOD-BEGIN + // Perform symbol wrapping. + unsigned int num_wrapped = (*get_num_wrapped)(); + for (unsigned i=0; i < num_wrapped; ++i) { + const char *sym = (*get_wrapped)(i); + lto_codegen_wrap_symbol_in_merged_module(code_gen, sym); + } + // @LOCALMOD-END if (options::generate_bc_file != options::BC_NO) { std::string path; if (options::generate_bc_file == options::BC_ONLY) diff --git a/tools/llc/CMakeLists.txt b/tools/llc/CMakeLists.txt index 683f29862d..9c695bcdea 100644 --- a/tools/llc/CMakeLists.txt +++ b/tools/llc/CMakeLists.txt @@ -1,5 +1,11 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} bitreader asmparser) add_llvm_tool(llc +# LOCALMOD BEGIN +# This file provides wrappers to lseek(2), read(2), etc. + nacl_file.cpp + StubMaker.cpp + TextStubWriter.cpp +# LOCALMOD END llc.cpp ) diff --git a/tools/llc/ELFStub.h b/tools/llc/ELFStub.h new file mode 100644 index 0000000000..a79fecff0f --- /dev/null +++ b/tools/llc/ELFStub.h @@ -0,0 +1,55 @@ +// This file describes a simple high-level representation of an ELF stub. + +#ifndef __ELF_STUB_H +#define __ELF_STUB_H + +#include <llvm/Support/ELF.h> +#include <llvm/ADT/StringMap.h> +#include <string> +#include <vector> + +namespace llvm { + +struct SymbolStub; +struct VersionDefinition; + +using ELF::Elf32_Half; + +struct ELFStub { + Elf32_Half Machine; + std::string SOName; + std::vector<SymbolStub> Symbols; + std::vector<VersionDefinition> VerDefs; + + // These are used for constructing the version definitions. + // They are not directly emitted to the ELF stub. + StringMap<Elf32_Half> IndexMap; // Maps version name to version index. + Elf32_Half NextIndex; // Next available version index +}; + + +// Dynamic symbol entries +struct SymbolStub { + // Symbol Table info. + std::string Name; + unsigned char Type; // STT_* + unsigned char Binding; // STB_* + unsigned char Visibility; // STV_* + ELF::Elf32_Word Size; // Guess for st_size. + // st_value, etc. are stubbed out. + + // Version info matching each of the symbols. + Elf32_Half VersionIndex; // vd_ndx + bool IsDefault; +}; + +// Versions defined in this module +struct VersionDefinition { + Elf32_Half Index; // vd_ndx + bool IsWeak; // TODO(pdox): Implement this (for vd_flags) + std::string Name; // for vda_name, etc. + std::vector<std::string> Parents; // TODO(pdox): Implement this +}; + +} +#endif diff --git a/tools/llc/SRPCStreamer.cpp b/tools/llc/SRPCStreamer.cpp new file mode 100644 index 0000000000..3eaa7c17c6 --- /dev/null +++ b/tools/llc/SRPCStreamer.cpp @@ -0,0 +1,116 @@ +//===-- SRPCStreamer.cpp - Stream bitcode over SRPC ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
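The symbol-wrapping loop above forwards gold's --wrap list into the merged module. For reference, --wrap=sym makes references to sym resolve to __wrap_sym, while __real_sym refers back to the original definition. A trivial sketch of just that naming rule, not of the plugin API:

#include <cstdio>
#include <string>

static std::string WrapName(const std::string &Sym) { return "__wrap_" + Sym; }
static std::string RealName(const std::string &Sym) { return "__real_" + Sym; }

int main() {
  // With --wrap=malloc, calls to malloc land in __wrap_malloc, which can
  // call __real_malloc to reach the original allocator.
  std::printf("%s / %s\n", WrapName("malloc").c_str(), RealName("malloc").c_str());
  return 0;
}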
+// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#if defined(__native_client__) && defined(NACL_SRPC) +#define DEBUG_TYPE "bitcode-stream" +#include "SRPCStreamer.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <errno.h> + +using llvm::dbgs; + +size_t QueueStreamer::GetBytes(unsigned char *buf, size_t len) { + pthread_mutex_lock(&Mutex); + while (!Done && queueSize() < len) { + DEBUG(dbgs() << "QueueStreamer::GetBytes len " << len << " size " << + queueSize() <<" << waiting\n"); + pthread_cond_wait(&Cond, &Mutex); + } + if (Done && queueSize() < len) len = queueSize(); + queueGet(buf, len); + pthread_mutex_unlock(&Mutex); + return len; +} + +size_t QueueStreamer::PutBytes(unsigned char *buf, size_t len) { + pthread_mutex_lock(&Mutex); + queuePut(buf, len); + pthread_cond_signal(&Cond); + pthread_mutex_unlock(&Mutex); + return len; +} + +void QueueStreamer::SetDone() { + // Still need the lock to avoid signaling between the check and + // the wait in GetBytes. + pthread_mutex_lock(&Mutex); + Done = true; + pthread_cond_signal(&Cond); + pthread_mutex_unlock(&Mutex); +} + +// Called with Mutex held to protect Cons, Prod, and Bytes +void QueueStreamer::queuePut(unsigned char *buf, size_t len) { + while (capacityRemaining() < len) { + int leftover = Bytes.size() - Cons; + DEBUG(dbgs() << "resizing " << leftover << " " << Prod << " " << + Cons << "\n"); + Bytes.resize(Bytes.size() * 2); + if (Cons > Prod) { + // There are unread bytes left between Cons and the previous end of the + // buffer. Move them to the new end of the buffer. + memmove(&Bytes[Bytes.size() - leftover], &Bytes[Cons], leftover); + Cons = Bytes.size() - leftover; + } + } + size_t EndSpace = std::min(len, Bytes.size() - Prod); + DEBUG(dbgs() << "put, len " << len << " Endspace " << EndSpace << " p " << + Prod << " c " << Cons << "\n"); + // Copy up to the end of the buffer + memcpy(&Bytes[Prod], buf, EndSpace); + // Wrap around if necessary + memcpy(&Bytes[0], buf + EndSpace, len - EndSpace); + Prod = (Prod + len) % Bytes.size(); +} + +// Called with Mutex held to protect Cons, Prod, and Bytes +void QueueStreamer::queueGet(unsigned char *buf, size_t len) { + assert(len <= queueSize()); + size_t EndSpace = std::min(len, Bytes.size() - Cons); + DEBUG(dbgs() << "get, len " << len << " Endspace " << EndSpace << " p " << + Prod << " c " << Cons << "\n"); + // Copy up to the end of the buffer + memcpy(buf, &Bytes[Cons], EndSpace); + // Wrap around if necessary + memcpy(buf + EndSpace, &Bytes[0], len - EndSpace); + Cons = (Cons + len) % Bytes.size(); +} + +llvm::DataStreamer *SRPCStreamer::init(void *(*Callback)(void *), void *arg, + std::string *ErrMsg) { + int err = pthread_create(&CompileThread, NULL, Callback, arg); + if (err) { + if (ErrMsg) *ErrMsg = std::string(strerror(errno)); + return NULL; + } + return &Q; +} + +size_t SRPCStreamer::gotChunk(unsigned char *bytes, size_t len) { + if (Error) return 0; + return Q.PutBytes(bytes, len); +} + +int SRPCStreamer::streamEnd(std::string *ErrMsg) { + Q.SetDone(); + int err = pthread_join(CompileThread, NULL); + if (err) { + if (ErrMsg) *ErrMsg = std::string(strerror(errno)); + return err; + } + if (Error && ErrMsg) *ErrMsg = std::string("compile failed."); + return Error; +} + +#endif diff --git a/tools/llc/SRPCStreamer.h b/tools/llc/SRPCStreamer.h new file mode 100644 index 0000000000..a326d9276d --- /dev/null +++ 
b/tools/llc/SRPCStreamer.h @@ -0,0 +1,93 @@ +//===-- SRPCStreamer.cpp - Stream bitcode over SRPC ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#ifndef SRPCSTREAMER_H +#define SRPCSTREAMER_H + +#include <pthread.h> +#include <cassert> +#include <cstdio> +#include <cstring> +#include <vector> +#include "llvm/Support/DataStream.h" + +// Implements LLVM's interface for fetching data from a stream source. +// Bitcode bytes from the RPC thread are placed here with PutBytes and buffered +// until the bitcode reader calls GetBytes to remove them. +class QueueStreamer : public llvm::DataStreamer { + public: + QueueStreamer() : Done(false), Prod(0), Cons(0) { + pthread_mutex_init(&Mutex, NULL); + pthread_cond_init(&Cond, NULL); + Bytes.resize(64 * 1024); + } + // Called by the compilation thread. Wait for len bytes to become available, + // and copy them into buf. If all bytes have been received and there are + // fewer than len bytes available, copy all remaining bytes. + // Return the number of bytes copied. + virtual size_t GetBytes(unsigned char *buf, size_t len); + + // Called by the RPC thread. Copy len bytes from buf and wake up the + // compilation thread if it is waiting. Return the number of bytes copied. + size_t PutBytes(unsigned char *buf, size_t len); + + // Called by the RPC thread. Signal that all bytes have been received, + // so the last call to GetBytes will return the remaining bytes rather + // than waiting for the entire requested amound. + void SetDone(); + + private: + bool Done; + pthread_mutex_t Mutex; + pthread_cond_t Cond; + + // Variables and functions to manage the circular queue + std::vector<unsigned char> Bytes; + size_t Prod; // Queue producer index + size_t Cons; // Queue consumer index + size_t queueSize() { + return Prod >= Cons ? Prod - Cons : Bytes.size() - (Cons - Prod); + } + size_t capacityRemaining() { + return (Prod >= Cons ? Bytes.size() - (Prod - Cons) : (Cons - Prod)) - 1; + } + void queuePut(unsigned char *buf, size_t len); + void queueGet(unsigned char *buf, size_t len); +}; + +// Class to manage the compliation thread and serve as the interface from +// the SRPC thread +class SRPCStreamer { +public: + SRPCStreamer() : Error(false) {} + // Initialize streamer, create a new thread running Callback, and + // return a pointer to the DataStreamer the threads will use to + // synchronize. On error, return NULL and fill in the ErrorMsg string + llvm::DataStreamer *init(void *(*Callback)(void *), + void *arg, std::string *ErrMsg); + // Called by the RPC thread. Copy len bytes from buf. Return bytes copied. + size_t gotChunk(unsigned char *bytes, size_t len); + // Called by the RPC thread. Wait for the compilation thread to finish. + int streamEnd(std::string *ErrMsg); + // Called by the compilation thread. Signal that there was a compilation + // error so the RPC thread can abort the stream. 
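QueueStreamer above keeps one slot of its circular buffer empty so that Prod == Cons always means "empty"; queueSize() and capacityRemaining() encode that invariant. A stand-alone check of the two formulas with made-up index values:

#include <cstddef>
#include <cstdio>

static size_t QueueSize(size_t Prod, size_t Cons, size_t Cap) {
  return Prod >= Cons ? Prod - Cons : Cap - (Cons - Prod);
}

static size_t CapacityRemaining(size_t Prod, size_t Cons, size_t Cap) {
  return (Prod >= Cons ? Cap - (Prod - Cons) : (Cons - Prod)) - 1;
}

int main() {
  // 8-byte buffer, producer wrapped around to index 2, consumer at index 6:
  // 4 bytes are queued and 3 more can be written (4 + 3 == 8 - 1).
  std::printf("size=%u free=%u\n", (unsigned)QueueSize(2, 6, 8),
              (unsigned)CapacityRemaining(2, 6, 8));
  return 0;
}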
+ void setError() { Error = true; } +private: + bool Error; + QueueStreamer Q; + pthread_t CompileThread; +}; + + + +#endif // SRPCSTREAMER_H diff --git a/tools/llc/StubMaker.cpp b/tools/llc/StubMaker.cpp new file mode 100644 index 0000000000..cc343280a3 --- /dev/null +++ b/tools/llc/StubMaker.cpp @@ -0,0 +1,233 @@ +// Create a high-level representation of the needed library. + +#include "StubMaker.h" + +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Triple.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/Debug.h" +#include "llvm/Module.h" +#include "llvm/Type.h" +#include "ELFStub.h" + +using namespace llvm; + +// Extract the Name, Version, and IsDefault flag from the FullName string. +// e.g. foo@V1 --> foo, V1, false +// bar@@V2 --> bar, V2, true +static void ExtractVersion(StringRef FullName, + StringRef &Name, + StringRef &Version, + bool &IsDefault) { + size_t atpos = FullName.find('@'); + if (atpos == StringRef::npos) { + Name = FullName; + Version = ""; + IsDefault = false; + return; + } + Name = FullName.substr(0, atpos); + ++atpos; + if (FullName[atpos] == '@') { + IsDefault = true; + ++atpos; + } else { + IsDefault = false; + } + Version = FullName.substr(atpos); +} + + +// This implicitly creates a version record as a result of locating a symbol +// with this version. There is normally more information attached to a +// version definition: the parent version(s) and definition flags (weak +// or base). This information is currently not stored in the bitcode +// module. It may be necessary to add this in the future. +static Elf32_Half AddVersionDef(ELFStub *Stub, StringRef Name) { + VersionDefinition VD; + VD.Name = Name; + VD.Index = Stub->NextIndex++; + VD.IsWeak = false; // TODO(pdox): Implement + VD.Parents.clear(); // TODO(pdox): Implement + Stub->VerDefs.push_back(VD); + Stub->IndexMap[VD.Name] = VD.Index; + return VD.Index; +} + +static Elf32_Half GetVersionIndex(StringRef Version, ELFStub *Stub) { + // Handle unversioned symbols + if (Version.empty()) + return 1; /* ELF::VER_NDX_GLOBAL */ + // Find the version definition, if it already exists. + StringMap<Elf32_Half>::const_iterator I = Stub->IndexMap.find(Version); + if (I != Stub->IndexMap.end()) { + return I->second; + } + // If not, create it. + return AddVersionDef(Stub, Version); +} + +static Elf32_Half GetELFMachine(const Triple &T) { + switch (T.getArch()) { + default: llvm_unreachable("Unknown target triple in StubMaker.cpp"); + case Triple::x86_64: return ELF::EM_X86_64; + case Triple::x86: return ELF::EM_386; + case Triple::arm: return ELF::EM_ARM; + case Triple::mipsel: return ELF::EM_MIPS; + } +} + +static unsigned char GetELFVisibility(const GlobalValue *GV) { + switch (GV->getVisibility()) { + case GlobalValue::DefaultVisibility: return ELF::STV_DEFAULT; + case GlobalValue::HiddenVisibility: return ELF::STV_HIDDEN; + case GlobalValue::ProtectedVisibility: return ELF::STV_PROTECTED; + } + llvm_unreachable("Unknown visibility in GETELFVisibility"); +} + +static ELF::Elf32_Word GetElfSizeForType(const GlobalValue *GV, + const Type *ElemType) { + unsigned bit_size = ElemType->getPrimitiveSizeInBits(); + if (bit_size != 0) { + // Check against 0 to see if it was actually a primitive. + return bit_size / 8; + } + if (isa<PointerType>(ElemType)) { + // Pointers are 32-bit for NaCl. + return 4; + } + if (isa<FunctionType>(ElemType)) { + // This is not a data object, so just say unknown (0). 
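GetVersionIndex above hands out vd_ndx values: unversioned symbols map to index 1 (VER_NDX_GLOBAL) and each new version name gets the next free index starting at 2, in order of first appearance. A worked example with hypothetical input symbols foo@@V2, bar@V1 and an unversioned baz:

#include <cstdio>
#include <map>
#include <string>

int main() {
  std::map<std::string, unsigned> IndexMap;
  unsigned NextIndex = 2;                       // 0 and 1 are reserved
  const char *Versions[] = { "V2", "V1", "" };  // from foo@@V2, bar@V1, baz
  for (unsigned i = 0; i < 3; ++i) {
    std::string V = Versions[i];
    unsigned Idx;
    if (V.empty())
      Idx = 1;                                  // VER_NDX_GLOBAL
    else if (IndexMap.count(V))
      Idx = IndexMap[V];
    else
      IndexMap[V] = Idx = NextIndex++;
    std::printf("version '%s' -> vd_ndx %u\n", V.c_str(), Idx);  // 2, 3, 1
  }
  return 0;
}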
+ return 0; + } + if (const ArrayType *ATy = dyn_cast<ArrayType>(ElemType)) { + unsigned elem_size = GetElfSizeForType(GV, ATy->getElementType()); + unsigned num_elems = ATy->getNumElements(); + // TODO(jvoung): Come up with a test for what to do with 0-length arrays. + // Not sure what to do here actually. It may be that the 0-length + // array is meant to be an opaque type, which you can never check the + // "sizeof". For now, return 0 instead of asserting. + // Known instance of this in library code is in basic_string.h: + // static size_type _S_empty_rep_storage[]; + return elem_size * num_elems; + } + if (const VectorType *VTy = dyn_cast<VectorType>(ElemType)) { + unsigned bit_width = VTy->getBitWidth(); + if (bit_width) { + return bit_width / 8; + } else { + // It's a vector of pointers, and pointers are 32-bit in NaCl + return VTy->getNumElements() * 4; + } + } + if (const StructType *STy = dyn_cast<StructType>(ElemType)) { + // Alignment padding should have been added to the type in the front-end. + unsigned size_so_far = 0; + for (unsigned i = 0; i < STy->getNumElements(); ++i) { + size_so_far += GetElfSizeForType(GV, STy->getElementType(i)); + } + return size_so_far; + } + // Unknown type! + DEBUG({ + dbgs() << "Unknown GetELFSize for var="; + GV->dump(); + dbgs() << " type= "; + ElemType->dump(); + dbgs() << "\n"; + }); + llvm_unreachable("Unhandled type for GetELFSize"); + return 0; +} + +// Return a value for the symbol table's st_size, which is the number of bytes +// in a data object. Functions may report unknown size 0 (not data objects). +// This is known to be important for symbols that may sit in BSS +// with copy relocations (to know how much to copy). +static ELF::Elf32_Word GetELFSize(const GlobalValue *GV) { + const class PointerType *PT = GV->getType(); + const Type *ElemType = PT->getElementType(); + return GetElfSizeForType(GV, ElemType); +} + +static unsigned char GetELFType(const GlobalValue *GV) { + if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) { + return GVar->isThreadLocal() ? ELF::STT_TLS : ELF::STT_OBJECT; + } else if (isa<Function>(GV)) { + // TODO(pdox): Handle STT_GNU_IFUNC + return ELF::STT_FUNC; + } + // TODO(pdox): Do we need to resolve GlobalAliases? + llvm_unreachable("Unknown GlobalValue type in GetELFType!"); +} + +static unsigned char GetELFBinding(const GlobalValue *GV) { + // TODO(pdox): + // This information would ideally be made to match the symbol binding + // as declared in the original shared object. However, GV is only the + // declaration for this symbol, so we cannot derive the definition's + // binding here. But it seems like it should be fine to always set it to + // STB_GLOBAL, since we already know this symbol is the prevailing + // definition. + return ELF::STB_GLOBAL; +} + +static void MakeOneStub(const Module &M, + const Module::NeededRecord &NR, + ELFStub *Stub) { + Stub->SOName = NR.DynFile; + Stub->NextIndex = 2; // 0,1 are reserved + for (unsigned j = 0; j < NR.Symbols.size(); ++j) { + StringRef FullName = NR.Symbols[j]; + GlobalValue *GV = M.getNamedValue(FullName); + if (!GV) { + // The symbol may have been removed by optimization or dead code + // elimination, so this is not an error. 
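The size rules above boil down to: primitives report PrimitiveSizeInBits/8, pointers are 4 bytes under NaCl's ILP32 model, functions report 0, and aggregates sum their elements (padding is assumed to have been made explicit by the front-end). A worked example for a hypothetical global of type { i32, i8*, [4 x i16] }:

#include <cstdio>

int main() {
  unsigned I32Field = 32 / 8;        // i32 -> 4 bytes
  unsigned PtrField = 4;             // i8* -> 4 bytes under NaCl
  unsigned ArrField = 4 * (16 / 8);  // [4 x i16] -> 8 bytes
  std::printf("st_size = %u\n", I32Field + PtrField + ArrField);  // 16
  return 0;
}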
+ continue; + } + StringRef Name; + StringRef Version; + bool IsDefault; + ExtractVersion(FullName, Name, Version, IsDefault); + + SymbolStub SS; + SS.Name = Name; + SS.Type = GetELFType(GV); + SS.Binding = GetELFBinding(GV); + SS.Visibility = GetELFVisibility(GV); + SS.Size = GetELFSize(GV); + SS.VersionIndex = GetVersionIndex(Version, Stub); + SS.IsDefault = IsDefault; + Stub->Symbols.push_back(SS); + } +} + +namespace llvm { + +// For module M, make all the stubs neededs and insert them into StubList. +void MakeAllStubs(const Module &M, const Triple &T, + SmallVectorImpl<ELFStub*> *StubList) { + std::vector<Module::NeededRecord> NRList; + M.getNeededRecords(&NRList); + Elf32_Half Machine = GetELFMachine(T); + for (unsigned i = 0; i < NRList.size(); ++i) { + const Module::NeededRecord &NR = NRList[i]; + ELFStub *Stub = new ELFStub(); + Stub->Machine = Machine; + MakeOneStub(M, NR, Stub); + StubList->push_back(Stub); + } +} + +void FreeStubList(llvm::SmallVectorImpl<ELFStub*> *StubList) { + for (unsigned i = 0; i < StubList->size(); ++i) { + delete (*StubList)[i]; + } + StubList->clear(); +} + +} // namespace diff --git a/tools/llc/StubMaker.h b/tools/llc/StubMaker.h new file mode 100644 index 0000000000..27e1e55d7f --- /dev/null +++ b/tools/llc/StubMaker.h @@ -0,0 +1,20 @@ +#ifndef __STUB_MAKER_H +#define __STUB_MAKER_H + +#include "llvm/ADT/SmallVector.h" + +namespace llvm { + +class Module; +class Triple; +class ELFStub; + +// For module M, make all required ELF stubs and insert them into StubList. +void MakeAllStubs(const Module &M, + const Triple &T, + SmallVectorImpl<ELFStub*> *StubList); +void FreeStubList(SmallVectorImpl<ELFStub*> *StubList); + +} + +#endif diff --git a/tools/llc/TextStubWriter.cpp b/tools/llc/TextStubWriter.cpp new file mode 100644 index 0000000000..ae6e2f77d3 --- /dev/null +++ b/tools/llc/TextStubWriter.cpp @@ -0,0 +1,84 @@ +// Using the high-level representation of an ELF stub, create a text version +// of the ELF stub object. + +#include "TextStubWriter.h" + +#include <sstream> + +#include "ELFStub.h" +#include "llvm/Support/ELF.h" + +using namespace llvm; + +namespace { + +std::string LibShortname(const std::string &fullname) { + std::string result = fullname; + if (result.find("lib") != std::string::npos) { + result = result.substr(3); + } + size_t so_pos = result.find(".so"); + if (so_pos != std::string::npos) { + result = result.substr(0, so_pos); + } + return result; +} + +const ELF::Elf32_Half kDummyCodeShndx = 5; +const ELF::Elf32_Half kDummyDataShndx = 6; + +} // namespace + +namespace llvm { + +// Write out the dynamic symbol table information. The format must be kept +// in sync with the changes in NaCl's version of gold (see gold/metadata.cc). +void WriteTextELFStub(const ELFStub *Stub, std::string *output) { + std::stringstream ss; + + ss << "#### Symtab for " << Stub->SOName << "\n"; + ss << "@obj " << LibShortname(Stub->SOName) << " " << Stub->SOName << "\n"; + + // st_value is usually a relative address for .so, and .exe files. + // So, make some up. + ELF::Elf32_Addr fake_relative_addr = 0; + for (size_t i = 0; i < Stub->Symbols.size(); ++i) { + const SymbolStub &sym = Stub->Symbols[i]; + + ELF::Elf32_Addr st_value = fake_relative_addr; + ELF::Elf32_Word st_size = sym.Size; + unsigned int st_info = sym.Type | (sym.Binding << 4); + unsigned int st_other = sym.Visibility; + ELF::Elf32_Half st_shndx = sym.Type == ELF::STT_FUNC ? + kDummyCodeShndx : kDummyDataShndx; + ELF::Elf32_Half vd_ndx = sym.VersionIndex; + // Mark non-default versions hidden. 
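Each "@sym" record above packs the usual ELF symbol-table fields into text. For a hypothetical default-versioned function foo@@V1 (STT_FUNC = 2, STB_GLOBAL = 1, STV_DEFAULT = 0), st_info = Type | (Binding << 4) = 18, the dummy code section index 5 is used, st_size is 0 because functions report no data size, and vd_ndx stays unhidden. A small sketch of the encoding, with illustrative values rather than output from a real module:

#include <cstdio>

int main() {
  unsigned Type = 2, Binding = 1, Visibility = 0;  // STT_FUNC, STB_GLOBAL, STV_DEFAULT
  unsigned StInfo = Type | (Binding << 4);         // 18
  unsigned Shndx = (Type == 2) ? 5 : 6;            // dummy code vs. data section
  unsigned VdNdx = 2;                              // first non-reserved version index
  // Field order: name, st_value, st_size, st_info, st_other, st_shndx, vd_ndx
  std::printf("@sym foo 0 0 %u %u %u %u\n", StInfo, Visibility, Shndx, VdNdx);
  // -> @sym foo 0 0 18 0 5 2
  return 0;
}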
+ if (!sym.IsDefault) { + vd_ndx |= ELF::VERSYM_HIDDEN; + } + + ss << "@sym " + << sym.Name << " " // Representative for st_name. + << (st_value) << " " + << (st_size) << " " + << (st_info) << " " + << (st_other) << " " + << (st_shndx) << " " + << (vd_ndx) << " " + << "\n"; + fake_relative_addr += (sym.Size == 0 ? 4 : sym.Size); + } + + // Now dump the version map. + ss << "#### VerDefs for " << Stub->SOName << "\n"; + for (size_t i = 0; i < Stub->VerDefs.size(); ++i) { + const VersionDefinition &verdef = Stub->VerDefs[i]; + ss << "@ver " << (Elf32_Half)(verdef.Index) << " " << verdef.Name << "\n"; + } + + ss << "\n"; + + output->append(ss.str()); +} + +} // namespace llvm diff --git a/tools/llc/TextStubWriter.h b/tools/llc/TextStubWriter.h new file mode 100644 index 0000000000..4dbc5978b2 --- /dev/null +++ b/tools/llc/TextStubWriter.h @@ -0,0 +1,12 @@ +#ifndef __TEXT_STUB_WRITER_H +#define __TEXT_STUB_WRITER_H + +#include "ELFStub.h" + +namespace llvm { + +void WriteTextELFStub(const ELFStub *Stub, std::string *output); + +} + +#endif diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp index 4d4a74c009..bd2fa4c1cd 100644 --- a/tools/llc/llc.cpp +++ b/tools/llc/llc.cpp @@ -20,8 +20,10 @@ #include "llvm/Pass.h" #include "llvm/ADT/Triple.h" #include "llvm/Assembly/PrintModulePass.h" +#include "llvm/Support/DataStream.h" // @LOCALMOD #include "llvm/Support/IRReader.h" #include "llvm/CodeGen/CommandFlags.h" +#include "llvm/CodeGen/IntrinsicLowering.h" // @LOCALMOD #include "llvm/CodeGen/LinkAllAsmWriterComponents.h" #include "llvm/CodeGen/LinkAllCodegenComponents.h" #include "llvm/MC/SubtargetFeature.h" @@ -29,7 +31,9 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/FormattedStream.h" #include "llvm/Support/ManagedStatic.h" +#if !defined(__native_client__) #include "llvm/Support/PluginLoader.h" +#endif #include "llvm/Support/PrettyStackTrace.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/Host.h" @@ -39,8 +43,33 @@ #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Target/TargetMachine.h" #include <memory> + +// @LOCALMOD-BEGIN +#include "StubMaker.h" +#include "TextStubWriter.h" +// @LOCALMOD-END + using namespace llvm; +// @LOCALMOD-BEGIN +// NOTE: this tool can be build as a "sandboxed" translator. +// There are two ways to build the translator +// SRPC-style: no file operations are allowed +// see nacl_file.cc for support code +// non-SRPC-style: some basic file operations are allowed +// This can be useful for debugging but will +// not be deployed. +#if defined(__native_client__) && defined(NACL_SRPC) +MemoryBuffer* NaClGetMemoryBufferForFile(const char* filename); +void NaClOutputStringToFile(const char* filename, const std::string& data); +// The following two functions communicate metadata to the SRPC wrapper for LLC. +void NaClRecordObjectInformation(bool is_shared, const std::string& soname); +void NaClRecordSharedLibraryDependency(const std::string& library_name); +DataStreamer* NaClBitcodeStreamer; +#endif +// @LOCALMOD-END + + // General options for llc. Other pass-specific options are specified // within the corresponding llc passes, and target-specific options // and back-end code generation options are specified with the target machine. 
@@ -51,6 +80,32 @@ InputFilename(cl::Positional, cl::desc("<input bitcode>"), cl::init("-")); static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename")); +// @LOCALMOD-BEGIN +static cl::opt<std::string> +MetadataTextFilename("metadata-text", cl::desc("Metadata as text, out filename"), + cl::value_desc("filename")); + +// Using bitcode streaming has a couple of ramifications. Primarily it means +// that the module in the file will be compiled one function at a time rather +// than the whole module. This allows earlier functions to be compiled before +// later functions are read from the bitcode but of course means no whole-module +// optimizations. For now, streaming is only supported for files and stdin. +static cl::opt<bool> +LazyBitcode("streaming-bitcode", + cl::desc("Use lazy bitcode streaming for file inputs"), + cl::init(false)); + +// The option below overlaps very much with bitcode streaming. +// We keep it separate because it is still experimental and we want +// to use it without changing the outside behavior which is especially +// relevant for the sandboxed case. +static cl::opt<bool> +ReduceMemoryFootprint("reduce-memory-footprint", + cl::desc("Aggressively reduce memory used by llc"), + cl::init(false)); + +// @LOCALMOD-END + // Determine optimization level. static cl::opt<char> OptLevel("O", @@ -149,9 +204,60 @@ static tool_output_file *GetOutputStream(const char *TargetName, return FDOut; } +// @LOCALMOD-BEGIN +#if defined(__native_client__) && defined(NACL_SRPC) +void RecordMetadataForSrpc(const Module &mod) { + bool is_shared = (mod.getOutputFormat() == Module::SharedOutputFormat); + std::string soname = mod.getSOName(); + NaClRecordObjectInformation(is_shared, soname); + for (Module::lib_iterator L = mod.lib_begin(), + E = mod.lib_end(); + L != E; ++L) { + NaClRecordSharedLibraryDependency(*L); + } +} +#endif // defined(__native_client__) && defined(NACL_SRPC) +// @LOCALMOD-END + + +// @LOCALMOD-BEGIN + +// Write the ELF Stubs to the metadata file, in text format +// Returns 0 on success, non-zero on error. +int WriteTextMetadataFile(const Module &M, const Triple &TheTriple) { + // Build the ELF stubs (in high level format) + SmallVector<ELFStub*, 8> StubList; + // NOTE: The triple is unnecessary for the text version. + MakeAllStubs(M, TheTriple, &StubList); + // For each stub, write the ELF object to the metadata file. + std::string s; + for (unsigned i = 0; i < StubList.size(); i++) { + WriteTextELFStub(StubList[i], &s); + } + FreeStubList(&StubList); + +#if defined(__native_client__) && defined(NACL_SRPC) + NaClOutputStringToFile(MetadataTextFilename.c_str(), s); +#else + std::string error; + OwningPtr<tool_output_file> MOut( + new tool_output_file(MetadataTextFilename.c_str(), error, + raw_fd_ostream::F_Binary)); + if (!error.empty()) { + errs() << error << '\n'; + return 1; + } + MOut->os().write(s.data(), s.size()); + MOut->keep(); +#endif + return 0; +} + +// @LOCALMOD-END + // main - Entry point for the llc compiler. 
// -int main(int argc, char **argv) { +int llc_main(int argc, char **argv) { sys::PrintStackTraceOnErrorSignal(); PrettyStackTraceProgram X(argc, argv); @@ -192,13 +298,66 @@ int main(int argc, char **argv) { // If user just wants to list available options, skip module loading if (!SkipModule) { + // @LOCALMOD-BEGIN +#if defined(__native_client__) && defined(NACL_SRPC) + if (LazyBitcode) { + std::string StrError; + M.reset(getStreamedBitcodeModule(std::string("<SRPC stream>"), + NaClBitcodeStreamer, Context, &StrError)); + if (!StrError.empty()) { + Err = SMDiagnostic(InputFilename, SourceMgr::DK_Error, StrError); + } + } else { + // In the NACL_SRPC case, open the file with our special wrapper, which + // is aware of pre-opened file descriptors. + // NOTE: we could remove this if we only support streaming. + // ParseIR() should take ownership of the MemoryBuffer. + M.reset(ParseIR(NaClGetMemoryBufferForFile(InputFilename.c_str()), + Err, + Context)); + M->setModuleIdentifier(InputFilename); + } +#else + if (LazyBitcode) { + std::string StrError; + DataStreamer *streamer = getDataFileStreamer(InputFilename, &StrError); + if (streamer) { + M.reset(getStreamedBitcodeModule(InputFilename, streamer, Context, + &StrError)); + } + if (!StrError.empty()) { + Err = SMDiagnostic(InputFilename, SourceMgr::DK_Error, StrError); + } + } else { M.reset(ParseIRFile(InputFilename, Err, Context)); + } +#endif + // @LOCALMOD-END + mod = M.get(); if (mod == 0) { Err.print(argv[0], errs()); return 1; } + // @LOCALMOD-BEGIN +#if defined(__native_client__) && defined(NACL_SRPC) + RecordMetadataForSrpc(*mod); + + // To determine if we should compile PIC or not, we needed to load at + // least the metadata. Since we've already constructed the commandline, + // we have to hack this in after commandline processing. + if (mod->getOutputFormat() == Module::SharedOutputFormat) { + RelocModel = Reloc::PIC_; + } + // Also set PIC_ for dynamic executables: + // BUG= http://code.google.com/p/nativeclient/issues/detail?id=2351 + if (mod->lib_size() > 0) { + RelocModel = Reloc::PIC_; + } +#endif // defined(__native_client__) && defined(NACL_SRPC) + // @LOCALMOD-END + // If we are supposed to override the target triple, do so now. if (!TargetTriple.empty()) mod->setTargetTriple(Triple::normalize(TargetTriple)); @@ -223,6 +382,11 @@ int main(int argc, char **argv) { std::string FeaturesStr; if (MAttrs.size()) { SubtargetFeatures Features; + // @LOCALMOD-BEGIN + // Use the same default attribute settings as libLTO. + // TODO(pdox): Figure out why this isn't done for upstream llc. + Features.getDefaultSubtargetFeatures(TheTriple); + // @LOCALMOD-END for (unsigned i = 0; i != MAttrs.size(); ++i) Features.AddFeature(MAttrs[i]); FeaturesStr = Features.getString(); @@ -289,30 +453,38 @@ int main(int argc, char **argv) { TheTriple.isMacOSXVersionLT(10, 6)) Target.setMCUseLoc(false); +#if !defined(NACL_SRPC) // Figure out where we are going to send the output. OwningPtr<tool_output_file> Out (GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0])); if (!Out) return 1; +#endif // Build up all of the passes that we want to do to the module. - PassManager PM; + // @LOCALMOD-BEGIN + OwningPtr<PassManagerBase> PM; + if (LazyBitcode || ReduceMemoryFootprint) + PM.reset(new FunctionPassManager(mod)); + else + PM.reset(new PassManager()); + // @LOCALMOD-END // Add an appropriate TargetLibraryInfo pass for the module's triple. 
TargetLibraryInfo *TLI = new TargetLibraryInfo(TheTriple); if (DisableSimplifyLibCalls) TLI->disableAllFunctions(); - PM.add(TLI); + PM->add(TLI); if (target.get()) { - PM.add(new TargetTransformInfo(target->getScalarTargetTransformInfo(), + PM->add(new TargetTransformInfo(target->getScalarTargetTransformInfo(), target->getVectorTargetTransformInfo())); } // Add the target data from the target machine, if it exists, or the module. if (const DataLayout *TD = Target.getDataLayout()) - PM.add(new DataLayout(*TD)); + PM->add(new DataLayout(*TD)); else - PM.add(new DataLayout(mod)); + PM->add(new DataLayout(mod)); // Override default to generate verbose assembly. Target.setAsmVerbosityDefault(true); @@ -325,6 +497,39 @@ int main(int argc, char **argv) { Target.setMCRelaxAll(true); } + + +#if defined __native_client__ && defined(NACL_SRPC) + { + std::string s; + raw_string_ostream ROS(s); + formatted_raw_ostream FOS(ROS); + // Ask the target to add backend passes as necessary. + if (Target.addPassesToEmitFile(*PM, FOS, FileType, NoVerify)) { + errs() << argv[0] << ": target does not support generation of this" + << " file type!\n"; + return 1; + } + + if (LazyBitcode || ReduceMemoryFootprint) { + FunctionPassManager* P = static_cast<FunctionPassManager*>(PM.get()); + P->doInitialization(); + for (Module::iterator I = mod->begin(), E = mod->end(); I != E; ++I) { + P->run(*I); + if (ReduceMemoryFootprint) { + I->Dematerialize(); + } + } + P->doFinalization(); + } else { + static_cast<PassManager*>(PM.get())->run(*mod); + } + FOS.flush(); + ROS.flush(); + NaClOutputStringToFile(OutputFilename.c_str(), ROS.str()); + } +#else + { formatted_raw_ostream FOS(Out->os()); @@ -349,7 +554,7 @@ int main(int argc, char **argv) { } // Ask the target to add backend passes as necessary. - if (Target.addPassesToEmitFile(PM, FOS, FileType, NoVerify, + if (Target.addPassesToEmitFile(*PM, FOS, FileType, NoVerify, StartAfterID, StopAfterID)) { errs() << argv[0] << ": target does not support generation of this" << " file type!\n"; @@ -359,11 +564,50 @@ int main(int argc, char **argv) { // Before executing passes, print the final values of the LLVM options. cl::PrintOptionValues(); - PM.run(*mod); + if (LazyBitcode || ReduceMemoryFootprint) { + FunctionPassManager *P = static_cast<FunctionPassManager*>(PM.get()); + P->doInitialization(); + for (Module::iterator I = mod->begin(), E = mod->end(); I != E; ++I) { + P->run(*I); + if (ReduceMemoryFootprint) { + I->Dematerialize(); + } + } + P->doFinalization(); + } else { + static_cast<PassManager*>(PM.get())->run(*mod); + } } // Declare success. Out->keep(); +#endif + + // @LOCALMOD-BEGIN + // Write out the metadata. + // + // We need to ensure that intrinsic prototypes are available, in case + // we have a NeededRecord for one of them. + // They may have been eliminated by the StripDeadPrototypes pass, + // or some other pass that is unaware of NeededRecords / IntrinsicLowering. + if (!MetadataTextFilename.empty()) { + IntrinsicLowering IL(*target->getDataLayout()); + IL.AddPrototypes(*M); + + int err = WriteTextMetadataFile(*M.get(), TheTriple); + if (err != 0) + return err; + } + // @LOCALMOD-END return 0; } + +#if !defined(NACL_SRPC) +int +main (int argc, char **argv) { + return llc_main(argc, argv); +} +#else +// main() is in nacl_file.cpp. +#endif diff --git a/tools/llc/nacl_file.cpp b/tools/llc/nacl_file.cpp new file mode 100644 index 0000000000..13dcda128a --- /dev/null +++ b/tools/llc/nacl_file.cpp @@ -0,0 +1,480 @@ +/* Copyright 2012 The Native Client Authors. 
All rights reserved. + * Use of this source code is governed by a BSD-style license that can + * be found in the LICENSE file. + * + * This file provides wrappers to open() to use pre-opened file descriptors + * for the input bitcode and the output file. + * + * It also has the SRPC interfaces, but that should probably be refactored + * into a separate file. + */ + +#if defined(__native_client__) && defined(NACL_SRPC) + +#include <argz.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +// Headers which are not properly part of the SDK are included by their +// path in the nacl tree +#include "native_client/src/shared/srpc/nacl_srpc.h" +#ifdef __pnacl__ +#include <nacl/pnacl.h> +#endif +#include "SRPCStreamer.h" + + +#include <string> +#include <map> +#include <vector> + +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/system_error.h" + + +using llvm::MemoryBuffer; +using llvm::StringRef; +using std::string; +using std::map; + +#define printerr(...) fprintf(stderr, __VA_ARGS__) +// Temporarily enabling debug prints to debug temp-file usage on windows bots. +#define printdbg(...) fprintf(stderr, __VA_ARGS__) + +#define ARRAY_SIZE(array) (sizeof array / sizeof array[0]) + +namespace { + +typedef std::vector<std::string> string_vector; + +// True if the bitcode to be compiled is for a shared library. +// Used to return to the coordinator. +bool g_bitcode_is_shared_library; +// The soname of the current compilation unit, if it is a shared library. +// Empty string otherwise. +std::string* g_bitcode_soname = NULL; +// The newline separated list of libraries that the current bitcode compilation +// unit depends on. +std::string* g_bitcode_lib_dependencies = NULL; +// The filename used internally for looking up the bitcode file. +char kBitcodeFilename[] = "pnacl.pexe"; +// The filename used internally for looking up the object code file. +char kObjectFilename[] = "pnacl.o"; +// Object which manages streaming bitcode over SRPC and threading. +SRPCStreamer *srpc_streamer; + +} // namespace + +//TODO(dschuff): a little more elegant interface into llc than this? +extern llvm::DataStreamer* NaClBitcodeStreamer; + +class FileInfo { + private: + static map<string, FileInfo*> descriptor_map_; + + string filename_; + int fd_; + + public: + // Construct a FileInfo for a file descriptor. + // File descriptors are used for the bitcode (input) file and for the + // object (output) file passed in by the coordinator when using the Run + // SRPC. 
+ FileInfo(string fn, int fd) : + filename_(fn), fd_(fd) { + printdbg("LLVM-SB-DBG: registering file %d (%s)\n", fd, fn.c_str()); + descriptor_map_[fn] = this; + } + + int GetFd() { + return fd_; + } + + MemoryBuffer* ReadAllDataAsMemoryBuffer() { + printdbg("LLVM-SB-DBG: opening file %d (%s)\n", fd_, filename_.c_str()); + llvm::OwningPtr<MemoryBuffer> mb; + if (llvm::error_code::success() != MemoryBuffer::getOpenFile( + fd_, filename_.c_str(), mb, + -1, -1, 0, false)) { + perror("LLVM-SB-ERROR: ReadAllDataAsMemoryBuffer getOpenFile failed!\n"); + return 0; + } + return mb.take(); + } + + void WriteAllDataToTmpFile(string data) { + printdbg("LLVM-SB-DBG: writing file %d (%s): %d bytes\n", + fd_, filename_.c_str(), data.size()); + + if (fd_ < 0) { + printerr("LLVM-SB-ERROR: invalid fd for write\n"); + return; + } + size_t bytes_to_write = data.size(); + const char* buf = data.c_str(); + while (bytes_to_write > 0) { + ssize_t bytes_written = write(fd_, (const void*) buf, bytes_to_write); + printdbg("LLVM-SB-DBG: write call to file %d (req: %zu, got: %zd)\n", + fd_, bytes_to_write, bytes_written); + if (bytes_written < 0) { + printerr("LLVM-SB-ERROR: write to file %d failed with %zd\n", + fd_, bytes_written); + perror("LLVM-SB-ERROR: WriteAllDataToTmpFile write failed"); + return; + } + buf += bytes_written; + bytes_to_write -= (size_t) bytes_written; + } + } + + void WriteAllData(string data) { + WriteAllDataToTmpFile(data); + } + + static FileInfo* FindFileInfo(const string& fn) { + map<string, FileInfo*>::iterator it = descriptor_map_.find(fn); + if (it == descriptor_map_.end()) { + printerr("LLVM-SB-ERROR: no mapping for filename\n"); + return NULL; + } + return it->second; + } + +}; + +map<string, FileInfo*> FileInfo::descriptor_map_; + +extern int llc_main(int argc, char **argv); + + +MemoryBuffer* NaClGetMemoryBufferForFile(const char* filename) { + FileInfo* fi = FileInfo::FindFileInfo(filename); + if (fi == NULL) { + printerr("LLVM-SB-ERROR: unknown file %s\n", filename); + return NULL; + } + return fi->ReadAllDataAsMemoryBuffer(); +} + +void NaClOutputStringToFile(const char* filename, const string& data) { + FileInfo* fi = FileInfo::FindFileInfo(filename); + fi->WriteAllData(data); +} + +void NaClRecordObjectInformation(bool is_shared, const std::string& soname) { + // This function is invoked to begin recording library information. + // To make it reentrant, we clean up what might be left over from last time. + delete g_bitcode_soname; + delete g_bitcode_lib_dependencies; + // Then remember the module global information. + g_bitcode_is_shared_library = is_shared; + g_bitcode_soname = new std::string(soname); + g_bitcode_lib_dependencies = new std::string(); +} + +void NaClRecordSharedLibraryDependency(const std::string& library_name) { + const std::string& kDelimiterString("\n"); + *g_bitcode_lib_dependencies += (library_name + kDelimiterString); +} + +namespace { + +int DoTranslate(string_vector* cmd_line_vec, int bitcode_fd, int object_fd) { + if (cmd_line_vec == NULL) { + return 1; + } + if (bitcode_fd) { + // Add mapping for bitcode file (side effect is to register the file). + new FileInfo(kBitcodeFilename, bitcode_fd); + } + // Add mapping for object file (side effect is to register the file). + new FileInfo(kObjectFilename, object_fd); + // Make an argv array from the input vector. + size_t argc = cmd_line_vec->size(); + char** argv = new char*[argc]; + for (size_t i = 0; i < argc; ++i) { + // llc_main will not mutate the command line, so this is safe. 
+ argv[i] = const_cast<char*>((*cmd_line_vec)[i].c_str()); + } + argv[argc] = NULL; + // Call main. + return llc_main(static_cast<int>(argc), argv); +} + +string_vector* CommandLineFromArgz(char* str, size_t str_len) { + char* entry = str; + string_vector* vec = new string_vector; + while (entry != NULL) { + vec->push_back(entry); + entry = argz_next(str, str_len, entry); + } + // Add fixed arguments to the command line. These specify the bitcode + // and object code filenames, removing them from the contract with the + // coordinator. + vec->push_back(kBitcodeFilename); + vec->push_back("-o"); + vec->push_back(kObjectFilename); + return vec; +} + +void run(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done) { + NaClSrpcClosureRunner runner(done); + rpc->result = NACL_SRPC_RESULT_APP_ERROR; + int bitcode_fd = in_args[0]->u.hval; + int object_fd = in_args[1]->u.hval; + char* command_line = in_args[2]->arrays.carr; + size_t command_line_len = in_args[2]->u.count; + string_vector* cmd_line_vec = + CommandLineFromArgz(command_line, command_line_len); + if (DoTranslate(cmd_line_vec, bitcode_fd, object_fd) != 0) { + printerr("DoTranslate failed.\n"); + return; + } + delete cmd_line_vec; + out_args[0]->u.ival = g_bitcode_is_shared_library; + // SRPC deletes the strings returned when the closure is invoked. + // Therefore we need to use strdup. + out_args[1]->arrays.str = strdup(g_bitcode_soname->c_str()); + out_args[2]->arrays.str = strdup(g_bitcode_lib_dependencies->c_str()); + rpc->result = NACL_SRPC_RESULT_OK; +} + +string_vector* GetDefaultCommandLine() { + string_vector* command_line = new string_vector; + size_t i; + // First, those common to all architectures. + static const char* common_args[] = { "pnacl_translator", + "-filetype=obj", + kBitcodeFilename, + "-o", + kObjectFilename }; + for (i = 0; i < ARRAY_SIZE(common_args); ++i) { + command_line->push_back(common_args[i]); + } + // Then those particular to a platform. 
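
For readers unfamiliar with argz: the command line arrives over SRPC as an argz buffer, a flat char array of NUL-separated strings, which CommandLineFromArgz() above walks with argz_next() before appending the fixed pnacl.pexe / -o pnacl.o arguments. A small stand-alone illustration of the format (the buffer contents and flags are invented for the example; argz.h is a glibc/newlib extension):

#include <argz.h>
#include <cstdio>

int main() {
  // "llc\0-O2\0-mtriple=i686-none-nacl-gnu\0" as it would arrive on the wire.
  const char buf[] = "llc\0-O2\0-mtriple=i686-none-nacl-gnu";
  size_t len = sizeof(buf);     // length includes the final NUL of the last entry
  for (const char *entry = buf; entry != NULL;
       entry = argz_next(buf, len, entry))
    printf("arg: %s\n", entry);
  return 0;
}
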
+ static const char* llc_args_x8632[] = { "-march=x86", + "-mcpu=pentium4", + "-mtriple=i686-none-nacl-gnu", + NULL }; + static const char* llc_args_x8664[] = { "-march=x86-64", + "-mcpu=core2", + "-mtriple=x86_64-none-nacl-gnu", + NULL }; + static const char* llc_args_arm[] = { "-mcpu=cortex-a8", + "-mtriple=armv7a-none-nacl-gnueabi", + "-arm-reserve-r9", + "-sfi-disable-cp", + "-sfi-store", + "-sfi-load", + "-sfi-stack", + "-sfi-branch", + "-sfi-data", + "-no-inline-jumptables", + "-float-abi=hard", + NULL }; + + const char **llc_args = NULL; +#if defined (__pnacl__) + switch (__builtin_nacl_target_arch()) { + case PnaclTargetArchitectureX86_32: { + llc_args = llc_args_x8632; + break; + } + case PnaclTargetArchitectureX86_64: { + llc_args = llc_args_x8664; + break; + } + case PnaclTargetArchitectureARM_32: { + llc_args = llc_args_arm; + break; + } + default: + printerr("no target architecture match.\n"); + delete command_line; + command_line = NULL; + break; + } +#elif defined (__i386__) + llc_args = llc_args_x8632; +#elif defined (__x86_64__) + llc_args = llc_args_x8664; +#else +#error +#endif + for (i = 0; llc_args[i] != NULL; i++) command_line->push_back(llc_args[i]); + return command_line; +} + +void run_with_default_command_line(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done) { + NaClSrpcClosureRunner runner(done); + rpc->result = NACL_SRPC_RESULT_APP_ERROR; + int bitcode_fd = in_args[0]->u.hval; + int object_fd = in_args[1]->u.hval; + string_vector* cmd_line_vec = GetDefaultCommandLine(); + if (DoTranslate(cmd_line_vec, bitcode_fd, object_fd) != 0) { + printerr("DoTranslate failed.\n"); + return; + } + delete cmd_line_vec; + out_args[0]->u.ival = g_bitcode_is_shared_library; + // SRPC deletes the strings returned when the closure is invoked. + // Therefore we need to use strdup. + out_args[1]->arrays.str = strdup(g_bitcode_soname->c_str()); + out_args[2]->arrays.str = strdup(g_bitcode_lib_dependencies->c_str()); + rpc->result = NACL_SRPC_RESULT_OK; +} + +// Data passed from main thread to compile thread. +// Takes ownership of the commandline vector. +class StreamingThreadData { + public: + StreamingThreadData(int object_fd, string_vector* cmd_line_vec) : + object_fd_(object_fd), cmd_line_vec_(cmd_line_vec) {} + int ObjectFD() const { return object_fd_; } + string_vector* CmdLineVec() const { return cmd_line_vec_.get(); } + const int object_fd_; + const llvm::OwningPtr<string_vector> cmd_line_vec_; +}; + +void *run_streamed(void *arg) { + StreamingThreadData* data = reinterpret_cast<StreamingThreadData*>(arg); + data->CmdLineVec()->push_back("-streaming-bitcode"); + if (DoTranslate(data->CmdLineVec(), 0, data->ObjectFD()) != 0) { + printerr("DoTranslate failed.\n"); + srpc_streamer->setError(); + return NULL; + } + delete data; + return NULL; +} + +// Actually do the work for stream initialization. 
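
The streaming RPCs that follow hand bitcode to the compile thread through SRPCStreamer, whose implementation lives in SRPCStreamer.h/.cpp and is not part of this hunk. As a rough mental model only, and not the actual SRPCStreamer API, the hand-off amounts to a mutex/condvar protected byte queue: the SRPC thread appends each StreamChunk payload, and the compile thread blocks in its DataStreamer until bytes (or end-of-stream) arrive. A minimal sketch under those assumptions:

#include <pthread.h>
#include <stdio.h>
#include <string.h>
#include <deque>

struct ChunkQueue {
  pthread_mutex_t lock;
  pthread_cond_t ready;
  std::deque<char> bytes;
  bool done;
  ChunkQueue() : done(false) {
    pthread_mutex_init(&lock, NULL);
    pthread_cond_init(&ready, NULL);
  }
  // SRPC thread: called once per StreamChunk RPC.
  void Put(const char *data, size_t len) {
    pthread_mutex_lock(&lock);
    bytes.insert(bytes.end(), data, data + len);
    pthread_cond_signal(&ready);
    pthread_mutex_unlock(&lock);
  }
  // SRPC thread: called when StreamEnd arrives.
  void Finish() {
    pthread_mutex_lock(&lock);
    done = true;
    pthread_cond_signal(&ready);
    pthread_mutex_unlock(&lock);
  }
  // Compile thread: blocks until data is available or the stream ends.
  size_t Get(char *buf, size_t want) {
    pthread_mutex_lock(&lock);
    while (bytes.empty() && !done)
      pthread_cond_wait(&ready, &lock);
    size_t n = bytes.size() < want ? bytes.size() : want;
    for (size_t i = 0; i < n; ++i) { buf[i] = bytes.front(); bytes.pop_front(); }
    pthread_mutex_unlock(&lock);
    return n;
  }
};

static void *CompileThread(void *arg) {
  ChunkQueue *q = static_cast<ChunkQueue*>(arg);
  char buf[8];
  size_t n, total = 0;
  while ((n = q->Get(buf, sizeof buf)) > 0)
    total += n;                 // a real consumer would parse bitcode here
  printf("compile thread saw %zu bytes\n", total);
  return NULL;
}

int main() {
  ChunkQueue q;
  pthread_t tid;
  pthread_create(&tid, NULL, CompileThread, &q);
  const char chunk[] = "fake bitcode chunk";   // stand-in for SRPC payloads
  q.Put(chunk, strlen(chunk));
  q.Put(chunk, strlen(chunk));
  q.Finish();
  pthread_join(tid, NULL);
  return 0;
}
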
+void do_stream_init(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done, + string_vector* command_line_vec) { + NaClSrpcClosureRunner runner(done); + rpc->result = NACL_SRPC_RESULT_APP_ERROR; + srpc_streamer = new SRPCStreamer(); + std::string StrError; + StreamingThreadData* thread_data = new StreamingThreadData( + in_args[0]->u.hval, command_line_vec); + NaClBitcodeStreamer = srpc_streamer->init(run_streamed, + reinterpret_cast<void *>(thread_data), + &StrError); + if (NaClBitcodeStreamer) { + rpc->result = NACL_SRPC_RESULT_OK; + out_args[0]->arrays.str = strdup("no error"); + } else { + out_args[0]->arrays.str = strdup(StrError.c_str()); + } +} + +// Invoked by the StreamInit RPC to initialize bitcode streaming over SRPC. +// Under the hood it forks a new thread at starts the llc_main, which sets +// up the compilation and blocks when it tries to start reading the bitcode. +// Input arg is a file descriptor to write the output object file to. +// Returns a string, containing an error message if the call fails. +void stream_init(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done) { + // cmd_line_vec allocated by GetDefaultCommandLine() is freed by the + // translation thread in run_streamed() + do_stream_init(rpc, in_args, out_args, done, GetDefaultCommandLine()); +} + +// Invoked by StreamInitWithCommandLine RPC. Same as stream_init, but +// provides a command line to use instead of the default. +void stream_init_with_command_line(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done) { + char* command_line = in_args[1]->arrays.carr; + size_t command_line_len = in_args[1]->u.count; + string_vector* cmd_line_vec = + CommandLineFromArgz(command_line, command_line_len); + // cmd_line_vec is freed by the translation thread in run_streamed + do_stream_init(rpc, in_args, out_args, done, cmd_line_vec); +} + +// Invoked by the StreamChunk RPC. Receives a chunk of the bitcode and +// buffers it for later retrieval by the compilation thread. +void stream_chunk(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done) { + NaClSrpcClosureRunner runner(done); + rpc->result = NACL_SRPC_RESULT_APP_ERROR; + size_t len = in_args[0]->u.count; + unsigned char *bytes = reinterpret_cast<unsigned char*>( + in_args[0]->arrays.carr); + if (srpc_streamer->gotChunk(bytes, len) != len) { + return; + } + rpc->result = NACL_SRPC_RESULT_OK; +} + +// Invoked by the StreamEnd RPC. Waits until the compilation finishes, +// then returns. Returns an int indicating whether the bitcode is a +// shared library, a string with the soname, a string with dependencies, +// and a string which contains an error message if applicable. +void stream_end(NaClSrpcRpc *rpc, + NaClSrpcArg **in_args, + NaClSrpcArg **out_args, + NaClSrpcClosure *done) { + NaClSrpcClosureRunner runner(done); + rpc->result = NACL_SRPC_RESULT_APP_ERROR; + std::string StrError; + if (srpc_streamer->streamEnd(&StrError)) { + out_args[3]->arrays.str = strdup(StrError.c_str()); + return; + } + out_args[0]->u.ival = g_bitcode_is_shared_library; + // SRPC deletes the strings returned when the closure is invoked. + // Therefore we need to use strdup. 
+ out_args[1]->arrays.str = strdup(g_bitcode_soname->c_str()); + out_args[2]->arrays.str = strdup(g_bitcode_lib_dependencies->c_str()); + rpc->result = NACL_SRPC_RESULT_OK; +} + +const struct NaClSrpcHandlerDesc srpc_methods[] = { + { "Run:hhC:iss", run }, + { "RunWithDefaultCommandLine:hh:iss", run_with_default_command_line }, + // Protocol for streaming: + // (StreamInit(obj_fd) -> error_str | + // StreamInitWIthCommandLine(obj_fd, escaped_cmdline) -> error_str) + // StreamChunk(data) + + // StreamEnd() -> (is_shared_lib,soname,dependencies,error_str) + { "StreamInit:h:s", stream_init }, + { "StreamInitWithCommandLine:hC:s:", stream_init_with_command_line }, + { "StreamChunk:C:", stream_chunk }, + { "StreamEnd::isss", stream_end }, + { NULL, NULL }, +}; + +} // namespace + +int +main() { + if (!NaClSrpcModuleInit()) { + return 1; + } + + if (!NaClSrpcAcceptClientConnection(srpc_methods)) { + return 1; + } + NaClSrpcModuleFini(); + return 0; +} + +#endif diff --git a/tools/llvm-dis/llvm-dis.cpp b/tools/llvm-dis/llvm-dis.cpp index 41f023d4c4..75ceda61ad 100644 --- a/tools/llvm-dis/llvm-dis.cpp +++ b/tools/llvm-dis/llvm-dis.cpp @@ -51,6 +51,13 @@ static cl::opt<bool> ShowAnnotations("show-annotations", cl::desc("Add informational comments to the .ll file")); +// @LOCALMOD-BEGIN +// Print bitcode metadata only, in text format. +// (includes output format, soname, and dependencies). +static cl::opt<bool> +DumpMetadata("dump-metadata", cl::desc("Dump bitcode metadata")); +// @LOCALMOD-END + namespace { static void printDebugLoc(const DebugLoc &DL, formatted_raw_ostream &OS) { @@ -154,7 +161,7 @@ int main(int argc, char **argv) { OutputFilename = "-"; if (OutputFilename.empty()) { // Unspecified output, infer it. - if (InputFilename == "-") { + if (InputFilename == "-" || DumpMetadata) { // @LOCALMOD OutputFilename = "-"; } else { const std::string &IFN = InputFilename; @@ -176,6 +183,14 @@ int main(int argc, char **argv) { return 1; } + // @LOCALMOD-BEGIN + if (DumpMetadata) { + M->dumpMeta(Out->os()); + Out->keep(); + return 0; + } + // @LOCALMOD-END + OwningPtr<AssemblyAnnotationWriter> Annotator; if (ShowAnnotations) Annotator.reset(new CommentWriter()); diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp index ac82d98b3b..40fd51331e 100644 --- a/tools/llvm-extract/llvm-extract.cpp +++ b/tools/llvm-extract/llvm-extract.cpp @@ -20,6 +20,7 @@ #include "llvm/Transforms/IPO.h" #include "llvm/DataLayout.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" // @LOCALMOD #include "llvm/Support/IRReader.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/PrettyStackTrace.h" @@ -47,6 +48,18 @@ Force("f", cl::desc("Enable binary output on terminals")); static cl::opt<bool> DeleteFn("delete", cl::desc("Delete specified Globals from Module")); +// @LOCALMOD-BEGIN +static cl::opt<unsigned> +Divisor("divisor", + cl::init(0), + cl::desc("select GV by position (pos % divisor = remainder ")); + +static cl::opt<unsigned> +Remainder("remainder", + cl::init(0), + cl::desc("select GV by position (pos % divisor = remainder ")); +// @LOCALMOD-END + // ExtractFuncs - The functions to extract from the module. static cl::list<std::string> ExtractFuncs("func", cl::desc("Specify function to extract"), @@ -178,6 +191,24 @@ int main(int argc, char **argv) { } } + // @LOCALMOD-BEGIN + // Extract globals via modulo operation. 
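
The -divisor/-remainder pair introduced above selects globals (and, further down, functions) purely by their position in the module, which makes it usable as a crude bisection knob when hunting for a miscompiled symbol. A toy illustration of the selection rule, outside of LLVM, with invented names and positions:

#include <stdio.h>

int main() {
  // e.g. llvm-extract -divisor=2 -remainder=0 keeps the even-positioned globals.
  const unsigned Divisor = 2, Remainder = 0;
  const char *globals[] = { "g0", "g1", "g2", "g3", "g4" };
  for (unsigned pos = 0; pos < 5; ++pos)
    if (pos % Divisor == Remainder)
      printf("selected: %s (pos %u)\n", globals[pos], pos);
  return 0;
}

Doubling the divisor on each run (and varying the remainder) halves the selected set, which is the intended bisection workflow.
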
+ size_t count_globals = 0; + if (Divisor != 0) { + size_t pos = 0; + for (Module::global_iterator GV = M->global_begin(), E = M->global_end(); + GV != E; + GV++, pos++) { + if (pos % Divisor == Remainder) { + GVs.insert(&*GV); + } + } + dbgs() << "total globals: " << pos << "\n"; + count_globals = GVs.size(); + dbgs() << "selected globals: " << count_globals << "\n"; + } + // @LOCALMOD-END + // Figure out which functions we should extract. for (size_t i = 0, e = ExtractFuncs.size(); i != e; ++i) { GlobalValue *GV = M->getFunction(ExtractFuncs[i]); @@ -212,6 +243,22 @@ int main(int argc, char **argv) { } } + // @LOCALMOD-BEGIN + // Extract functions via modulo operation. + if (Divisor != 0) { + size_t pos = 0; + for (Module::iterator F = M->begin(), E = M->end(); + F != E; + F++, pos++) { + if (pos % Divisor == Remainder) { + GVs.insert(&*F); + } + } + dbgs() << "total functions: " << pos << "\n"; + dbgs() << "selected functions: " << GVs.size() - count_globals << "\n"; + } + // @LOCALMOD-END + // Materialize requisite global values. if (!DeleteFn) for (size_t i = 0, e = GVs.size(); i != e; ++i) { diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp index b1c4f437ff..5d79fda5aa 100644 --- a/tools/lto/LTOCodeGenerator.cpp +++ b/tools/lto/LTOCodeGenerator.cpp @@ -24,6 +24,7 @@ #include "llvm/Analysis/Passes.h" #include "llvm/Analysis/Verifier.h" #include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/CodeGen/IntrinsicLowering.h" // @LOCALMOD #include "llvm/Config/config.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" @@ -71,6 +72,16 @@ LTOCodeGenerator::LTOCodeGenerator() InitializeAllTargets(); InitializeAllTargetMCs(); InitializeAllAsmPrinters(); + + // @LOCALMOD-BEGIN + // Preserve symbols which may be referenced due to the lowering + // of an intrinsic. + const llvm::StringSet<> &IntrinsicSymbols = IntrinsicLowering::GetFuncNames(); + for (llvm::StringSet<>::const_iterator it = IntrinsicSymbols.begin(), + ie = IntrinsicSymbols.end(); it != ie; ++it) { + _mustPreserveSymbols[it->getKey().str().c_str()] = 1; + } + // @LOCALMOD-END } LTOCodeGenerator::~LTOCodeGenerator() { @@ -92,6 +103,68 @@ bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) { return ret; } +// @LOCALMOD-BEGIN +/// Add a module that will be merged with the final output module. +/// The merging does not happen until linkGatheredModulesAndDispose(). +bool LTOCodeGenerator::gatherModuleForLinking(LTOModule* mod) { + _gatheredModules.push_back(mod); +} + +/// Merge all modules gathered from gatherModuleForLinking(), and +/// destroy the source modules in the process. +bool LTOCodeGenerator::linkGatheredModulesAndDispose(std::string& errMsg) { + + // We gather the asm undefs earlier than addModule() does, + // since we delete the modules during linking, and would not be + // able to do this after linking. The undefs vector contain lists + // of global variable names which are considered "used", which will be + // appended into the "llvm.compiler.used" list. The names must be the + // same before linking as they are after linking, since we have switched + // the order. + for (unsigned i = 0, ei = _gatheredModules.size(); i != ei; ++i) { + const std::vector<const char*> &undefs = + _gatheredModules[i]->getAsmUndefinedRefs(); + for (int j = 0, ej = undefs.size(); j != ej; ++j) { + _asmUndefinedRefs[undefs[j]] = 1; + } + } + + // Tree-reduce the mods, re-using the incoming mods as scratch + // intermediate results. 
Module i is linked with (i + stride), with i as + // the dest. We begin with a stride of 1, and double each time. E.g., + // after the first round, only the even-indexed modules are still available, + // and after the second, only those with index that are a multiple of 4 + // are available. Eventually the Module with the content of all other modules + // will be Module 0. + // NOTE: we may be able to be smarter about linking if we did not do them + // pairwise using Linker::LinkModules. We also disregard module sizes + // and try our best to keep the modules in order (linking adjacent modules). + for (unsigned stride = 1, len = _gatheredModules.size(); + stride < len; + stride *= 2) { + for (unsigned i = 0; i + stride < len; i = i + (stride * 2)) { + if (Linker::LinkModules(_gatheredModules[i]->getLLVVMModule(), + _gatheredModules[i+stride]->getLLVVMModule(), + Linker::DestroySource, &errMsg)) { + errs() << "LinkModules " << i << " w/ " << i + stride << " failed...\n"; + // We leak the memory in this case... + return true; + } + delete _gatheredModules[i+stride]; + } + } + + // Finally, link Node 0 with the Dest and delete Node 0. + if (_linker.LinkInModule(_gatheredModules[0]->getLLVVMModule(), &errMsg)) { + errs() << "LinkModules Dst w/ _gatheredModules[0] failed...\n"; + return true; + } + delete _gatheredModules[0]; + + return false; +} +// @LOCALMOD-END + bool LTOCodeGenerator::setDebugInfo(lto_debug_model debug, std::string& errMsg) { switch (debug) { @@ -118,6 +191,81 @@ bool LTOCodeGenerator::setCodePICModel(lto_codegen_model model, llvm_unreachable("Unknown PIC model!"); } +// @LOCALMOD-BEGIN +void LTOCodeGenerator::setMergedModuleOutputFormat(lto_output_format format) +{ + Module::OutputFormat outputFormat; + switch (format) { + case LTO_OUTPUT_FORMAT_OBJECT: + outputFormat = Module::ObjectOutputFormat; + break; + case LTO_OUTPUT_FORMAT_SHARED: + outputFormat = Module::SharedOutputFormat; + break; + case LTO_OUTPUT_FORMAT_EXEC: + outputFormat = Module::ExecutableOutputFormat; + break; + } + Module *mergedModule = _linker.getModule(); + mergedModule->setOutputFormat(outputFormat); +} + +void LTOCodeGenerator::setMergedModuleSOName(const char *soname) +{ + Module *mergedModule = _linker.getModule(); + mergedModule->setSOName(soname); +} + +void LTOCodeGenerator::addLibraryDep(const char *lib) +{ + Module *mergedModule = _linker.getModule(); + mergedModule->addLibrary(lib); +} + +void LTOCodeGenerator::wrapSymbol(const char *sym) +{ + Module *mergedModule = _linker.getModule(); + mergedModule->wrapSymbol(sym); +} + +const char* LTOCodeGenerator::setSymbolDefVersion(const char *sym, + const char *ver, + bool is_default) +{ + Module *mergedModule = _linker.getModule(); + GlobalValue *GV = mergedModule->getNamedValue(sym); + if (!GV) { + llvm_unreachable("Invalid global in setSymbolDefVersion"); + } + GV->setVersionDef(ver, is_default); + return strdup(GV->getName().str().c_str()); +} + +const char* LTOCodeGenerator::setSymbolNeeded(const char *sym, + const char *ver, + const char *dynfile) +{ + Module *mergedModule = _linker.getModule(); + GlobalValue *GV = mergedModule->getNamedValue(sym); + if (!GV) { + // Symbol lookup may have failed because this symbol was already + // renamed for versioning. Make sure this is the case. 
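
The pairwise reduction in linkGatheredModulesAndDispose() above is easier to see with a concrete module count. The following stand-alone simulation only prints which indices the loop would pair; the real code calls Linker::LinkModules with DestroySource at each step:

#include <stdio.h>

int main() {
  const unsigned len = 5;                       // pretend 5 modules were gathered
  for (unsigned stride = 1; stride < len; stride *= 2)
    for (unsigned i = 0; i + stride < len; i += stride * 2)
      printf("link module %u  <-  module %u\n", i, i + stride);
  printf("link dest <- module 0\n");
  return 0;
}

For five modules this prints 0<-1, 2<-3, then 0<-2, then 0<-4, and finally the merge into the destination module: everything funnels into module 0 before the single LinkInModule() call.
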
+ if (strchr(sym, '@') != NULL || ver == NULL || ver[0] == '\0') { + llvm_unreachable("Unexpected condition in setSymbolNeeded"); + } + std::string NewName = std::string(sym) + "@" + ver; + GV = mergedModule->getNamedValue(NewName); + } + if (!GV) { + // Ignore failures due to unused declarations. + // This caused a falure to build libppruntime.so for glibc. + // TODO(sehr): better document under which circumstances this is needed. + return sym; + } + GV->setNeeded(ver, dynfile); + return strdup(GV->getName().str().c_str()); +} +// @LOCALMOD-END bool LTOCodeGenerator::writeMergedModules(const char *path, std::string &errMsg) { if (determineTarget(errMsg)) diff --git a/tools/lto/LTOCodeGenerator.h b/tools/lto/LTOCodeGenerator.h index 3081b7dad1..de3d1fa8a5 100644 --- a/tools/lto/LTOCodeGenerator.h +++ b/tools/lto/LTOCodeGenerator.h @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm-c/lto.h" #include <string> +#include <vector> namespace llvm { class LLVMContext; @@ -40,6 +41,12 @@ struct LTOCodeGenerator { ~LTOCodeGenerator(); bool addModule(struct LTOModule*, std::string &errMsg); + // @LOCALMOD-BEGIN + // Alternative methods of adding modules, which delay merging modules until + // all modules are available. + bool gatherModuleForLinking(struct LTOModule*); + bool linkGatheredModulesAndDispose(std::string &errMsg); + // @LOCALMOD-END bool setDebugInfo(lto_debug_model, std::string &errMsg); bool setCodePICModel(lto_codegen_model, std::string &errMsg); @@ -50,6 +57,16 @@ struct LTOCodeGenerator { } bool writeMergedModules(const char *path, std::string &errMsg); + // @LOCALMOD-BEGIN + void setMergedModuleOutputFormat(lto_output_format format); + void setMergedModuleSOName(const char *soname); + void addLibraryDep(const char *lib); + void wrapSymbol(const char *sym); + const char* setSymbolDefVersion(const char *sym, const char *ver, + bool is_default); + const char* setSymbolNeeded(const char *sym, const char *ver, + const char *dynfile); + // @LOCALMOD-END bool compile_to_file(const char **name, std::string &errMsg); const void *compile(size_t *length, std::string &errMsg); void setCodeGenDebugOptions(const char *opts); @@ -77,6 +94,9 @@ private: std::vector<char*> _codegenOptions; std::string _mCpu; std::string _nativeObjectPath; + + // @LOCALMOD + std::vector<LTOModule*> _gatheredModules; }; #endif // LTO_CODE_GENERATOR_H diff --git a/tools/lto/LTOModule.cpp b/tools/lto/LTOModule.cpp index ffdcbe644c..cb8a4e5f0d 100644 --- a/tools/lto/LTOModule.cpp +++ b/tools/lto/LTOModule.cpp @@ -17,6 +17,8 @@ #include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/CodeGen/IntrinsicLowering.h" // @LOCALMOD + #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/MCStreamer.h" @@ -27,6 +29,7 @@ #include "llvm/MC/MCParser/MCAsmParser.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" // @LOCALMOD #include "llvm/Support/Host.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -271,7 +274,7 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, } // parse bitcode buffer - OwningPtr<Module> m(getLazyBitcodeModule(buffer, getGlobalContext(), + OwningPtr<Module> m(ParseBitcodeFile(buffer, getGlobalContext(), // @LOCALMOD &errMsg)); if (!m) { delete buffer; @@ -304,6 +307,13 @@ LTOModule *LTOModule::makeLTOModule(MemoryBuffer *buffer, getTargetOptions(Options); TargetMachine *target = 
march->createTargetMachine(TripleStr, CPU, FeatureStr, Options); + + // @LOCALMOD-BEGIN + // Add declarations for functions which may be used by intrinsics. + IntrinsicLowering IL(*target->getDataLayout()); + IL.AddPrototypes(*m); + // @LOCALMOD-END + LTOModule *Ret = new LTOModule(m.take(), target); if (Ret->parseSymbols(errMsg)) { delete Ret; @@ -319,6 +329,33 @@ MemoryBuffer *LTOModule::makeBuffer(const void *mem, size_t length) { return MemoryBuffer::getMemBuffer(StringRef(startPtr, length), "", false); } +// @LOCALMOD-BEGIN +lto_output_format LTOModule::getOutputFormat() { + Module::OutputFormat format = _module->getOutputFormat(); + switch (format) { + case Module::ObjectOutputFormat: return LTO_OUTPUT_FORMAT_OBJECT; + case Module::SharedOutputFormat: return LTO_OUTPUT_FORMAT_SHARED; + case Module::ExecutableOutputFormat: return LTO_OUTPUT_FORMAT_EXEC; + } + llvm_unreachable("Unknown output format in LTOModule"); +} + +const char *LTOModule::getSOName() { + return _module->getSOName().c_str(); +} + +const char* LTOModule::getLibraryDep(uint32_t index) { + const Module::LibraryListType &Libs = _module->getLibraries(); + if (index < Libs.size()) + return Libs[index].c_str(); + return NULL; +} + +uint32_t LTOModule::getNumLibraryDeps() { + return _module->getLibraries().size(); +} +// @LOCALMOD-END + /// objcClassNameFromExpression - Get string that the data pointer points to. bool LTOModule::objcClassNameFromExpression(Constant *c, std::string &name) { if (ConstantExpr *ce = dyn_cast<ConstantExpr>(c)) { @@ -612,6 +649,16 @@ void LTOModule::addPotentialUndefinedSymbol(GlobalValue *decl, bool isFunc) { if (decl->getName().startswith("llvm.")) return; + // @LOCALMOD-BEGIN + // Bitcode modules may have declarations for functions or globals + // which are unused. Ignore them here so that gold does not mistake + // them for undefined symbols. But don't ignore declarations for + // functions which are potentially used by intrinsics. + if (decl->use_empty() && + !IntrinsicLowering::IsCalledByIntrinsic(decl->getName())) + return; + // @LOCALMOD-END + // ignore all aliases if (isa<GlobalAlias>(decl)) return; @@ -788,6 +835,12 @@ namespace { unsigned MaxBytesToEmit) {} virtual bool EmitValueToOffset(const MCExpr *Offset, unsigned char Value ) { return false; } + // @LOCALMOD-BEGIN + virtual void EmitBundleLock() {} + virtual void EmitBundleUnlock() {} + virtual void EmitBundleAlignStart() {} + virtual void EmitBundleAlignEnd() {} + // @LOCALMOD-END virtual void EmitFileDirective(StringRef Filename) {} virtual void EmitDwarfAdvanceLineAddr(int64_t LineDelta, const MCSymbol *LastLabel, diff --git a/tools/lto/LTOModule.h b/tools/lto/LTOModule.h index 8e52206b5b..03c16d08db 100644 --- a/tools/lto/LTOModule.h +++ b/tools/lto/LTOModule.h @@ -99,6 +99,14 @@ public: _module->setTargetTriple(triple); } + // @LOCALMOD-BEGIN + lto_output_format getOutputFormat(); + const char* getSOName(); + const char* getLibraryDep(uint32_t index); + uint32_t getNumLibraryDeps(); + // @LOCALMOD-END + + /// getSymbolCount - Get the number of symbols uint32_t getSymbolCount() { return _symbols.size(); diff --git a/tools/lto/Makefile b/tools/lto/Makefile index 3610fed03b..f9392a6911 100644 --- a/tools/lto/Makefile +++ b/tools/lto/Makefile @@ -57,3 +57,11 @@ ifeq ($(HOST_OS),Darwin) -Wl,-object_path_lto -Wl,$(TempFile) endif endif + +#@ LOCALMOD-BEGIN +# This is to fix an upstream bug. It is in the process of being upstreamed. +# This line can be removed after it has been fixed upstream and we've merged. 
+ifneq ($(HOST_OS),Darwin) + LLVMLibsOptions := $(LLVMLibsOptions) -Wl,-soname=$(SharedPrefix)LTO$(SHLIBEXT) +endif +#@ LOCALMOD-END diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp index a7e633d14b..a7c335c934 100644 --- a/tools/lto/lto.cpp +++ b/tools/lto/lto.cpp @@ -15,6 +15,8 @@ #include "llvm-c/lto.h" #include "llvm-c/Core.h" +#include "llvm/Support/CommandLine.h" // @LOCALMOD + #include "LTOModule.h" #include "LTOCodeGenerator.h" @@ -23,6 +25,25 @@ // *** Not thread safe *** static std::string sLastErrorString; +// @LOCALMOD-BEGIN +static std::vector<const char*> lto_options; +extern void lto_add_command_line_option(const char* opt) +{ + // ParseCommandLineOptions() expects argv[0] to be program name. + if (lto_options.empty()) + lto_options.push_back("libLTO"); + + lto_options.push_back(strdup(opt)); +} + +extern void lto_parse_command_line_options() +{ + if ( !lto_options.empty() ) + llvm::cl::ParseCommandLineOptions(lto_options.size(), + const_cast<char **>(<o_options[0])); +} +// @LOCALMOD-END + /// lto_get_version - Returns a printable string. extern const char* lto_get_version() { return LTOCodeGenerator::getVersionString(); @@ -107,6 +128,45 @@ void lto_module_set_target_triple(lto_module_t mod, const char *triple) { return mod->setTargetTriple(triple); } +// @LOCALMOD-BEGIN + +// +// Get the module format for this module +// +lto_output_format lto_module_get_output_format(lto_module_t mod) +{ + return mod->getOutputFormat(); +} + +// +// Get the module soname +// +const char* lto_module_get_soname(lto_module_t mod) +{ + return mod->getSOName(); +} + +// +// Get the i'th library dependency. +// Returns NULL if i >= lto_module_get_num_library_deps() +// +const char * +lto_module_get_library_dep(lto_module_t mod, unsigned int i) +{ + return mod->getLibraryDep(i); +} + +// +// Return the number of library dependencies of this module. +// +unsigned int +lto_module_get_num_library_deps(lto_module_t mod) +{ + return mod->getNumLibraryDeps(); +} + +// @LOCALMOD-END + /// lto_module_get_num_symbols - Returns the number of symbols in the object /// module. unsigned int lto_module_get_num_symbols(lto_module_t mod) { @@ -145,6 +205,16 @@ bool lto_codegen_add_module(lto_code_gen_t cg, lto_module_t mod) { return cg->addModule(mod, sLastErrorString); } +// @LOCALMOD-BEGIN +bool lto_codegen_gather_module_for_link(lto_code_gen_t cg, lto_module_t mod) { + return cg->gatherModuleForLinking(mod); +} + +bool lto_codegen_link_gathered_modules_and_dispose(lto_code_gen_t cg) { + return cg->linkGatheredModulesAndDispose(sLastErrorString); +} +// @LOCALMOD-END + /// lto_codegen_set_debug_model - Sets what if any format of debug info should /// be generated. Returns true on error (check lto_get_error_message() for /// details). @@ -183,6 +253,77 @@ void lto_codegen_add_must_preserve_symbol(lto_code_gen_t cg, cg->addMustPreserveSymbol(symbol); } +// @LOCALMOD-BEGIN + +// +// Set the module format for the merged module +// +void lto_codegen_set_merged_module_output_format(lto_code_gen_t cg, + lto_output_format format) +{ + cg->setMergedModuleOutputFormat(format); +} + +// +// Set the module soname (for shared library bitcode) +// +void lto_codegen_set_merged_module_soname(lto_code_gen_t cg, + const char* soname) +{ + cg->setMergedModuleSOName(soname); +} + +// +// Add a library dependency to the linked bitcode module. 
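
To show how a client such as a linker driver or plugin would consume the metadata getters added to the libLTO C API in this patch, here is a hedged sketch. It assumes the @LOCALMOD declarations for lto_output_format and the new getters are visible through llvm-c/lto.h, and it omits most error handling:

#include <stdio.h>
#include "llvm-c/lto.h"

int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "usage: %s <bitcode file>\n", argv[0]);
    return 1;
  }
  lto_module_t mod = lto_module_create(argv[1]);
  if (!mod) {
    fprintf(stderr, "load failed: %s\n", lto_get_error_message());
    return 1;
  }
  if (lto_module_get_output_format(mod) == LTO_OUTPUT_FORMAT_SHARED)
    printf("soname: %s\n", lto_module_get_soname(mod));
  unsigned n = lto_module_get_num_library_deps(mod);
  for (unsigned i = 0; i < n; ++i)
    printf("needs: %s\n", lto_module_get_library_dep(mod, i));
  lto_module_dispose(mod);
  return 0;
}
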
+// +void lto_codegen_add_merged_module_library_dep(lto_code_gen_t cg, + const char* soname) +{ + cg->addLibraryDep(soname); +} + +// +// Apply symbol wrapping in the linked bitcode module. +// +void lto_codegen_wrap_symbol_in_merged_module(lto_code_gen_t cg, + const char* sym) { + cg->wrapSymbol(sym); +} + +// +// Set the symbol version of defined symbol 'sym'. +// 'sym' is the name of the GlobalValue, exactly as it is +// in the LLVM module. It may already have a version suffix. +// In that case, this function verifies that the old version +// and new version match. +// Returns a reference to the new name. +// +const char * +lto_codegen_set_symbol_def_version(lto_code_gen_t cg, + const char *sym, + const char *version, + bool is_default) { + return cg->setSymbolDefVersion(sym, version, is_default); +} + +// +// Set the symbol version of needed symbol 'sym' from file 'dynfile'. +// 'sym' is the name of the GlobalValue, exactly as it is +// in the LLVM module. It may already have a version suffix. +// In that case, this function verifies that the old version +// and new version match. +// In any case, it adds a NeededRecord entry. +// Returns a reference to the new name. +// +const char* +lto_codegen_set_symbol_needed(lto_code_gen_t cg, + const char *sym, + const char *version, + const char *dynfile) { + return cg->setSymbolNeeded(sym, version, dynfile); +} +// @LOCALMOD-END + /// lto_codegen_write_merged_modules - Writes a new file at the specified path /// that contains the merged contents of all modules added so far. Returns true /// on error (check lto_get_error_message() for details). diff --git a/tools/lto/lto.exports b/tools/lto/lto.exports index 4940bb147e..e589c5d2c6 100644 --- a/tools/lto/lto.exports +++ b/tools/lto/lto.exports @@ -1,3 +1,5 @@ +lto_add_command_line_option +lto_parse_command_line_options lto_get_error_message lto_get_version lto_module_create @@ -9,16 +11,25 @@ lto_module_get_symbol_attribute lto_module_get_symbol_name lto_module_get_target_triple lto_module_set_target_triple +lto_module_get_output_format +lto_module_get_soname +lto_module_get_library_dep +lto_module_get_num_library_deps lto_module_is_object_file lto_module_is_object_file_for_target lto_module_is_object_file_in_memory lto_module_is_object_file_in_memory_for_target lto_module_dispose lto_codegen_add_module +lto_codegen_gather_module_for_link +lto_codegen_link_gathered_modules_and_dispose lto_codegen_add_must_preserve_symbol lto_codegen_compile lto_codegen_create lto_codegen_dispose +lto_codegen_set_assembler_args +lto_codegen_set_assembler_path +lto_codegen_set_cpu lto_codegen_set_debug_model lto_codegen_set_pic_model lto_codegen_write_merged_modules @@ -26,6 +37,12 @@ lto_codegen_debug_options lto_codegen_set_assembler_args lto_codegen_set_assembler_path lto_codegen_set_cpu +lto_codegen_set_merged_module_output_format +lto_codegen_set_merged_module_soname +lto_codegen_add_merged_module_library_dep +lto_codegen_set_symbol_def_version +lto_codegen_set_symbol_needed +lto_codegen_wrap_symbol_in_merged_module lto_codegen_compile_to_file LLVMCreateDisasm LLVMDisasmDispose diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index bac0d46947..0390bc470a 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -580,6 +580,7 @@ int main(int argc, char **argv) { initializeInstCombine(Registry); initializeInstrumentation(Registry); initializeTarget(Registry); + initializeExpandCtorsPass(Registry); cl::ParseCommandLineOptions(argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); diff --git 
a/tools/pso-stub/CMakeLists.txt b/tools/pso-stub/CMakeLists.txt new file mode 100644 index 0000000000..4b2f779cb0 --- /dev/null +++ b/tools/pso-stub/CMakeLists.txt @@ -0,0 +1,5 @@ +set(LLVM_LINK_COMPONENTS bitreader bitwriter object support analysis) + +add_llvm_tool(pso-stub + pso-stub.cpp + ) diff --git a/tools/pso-stub/LLVMBuild.txt b/tools/pso-stub/LLVMBuild.txt new file mode 100644 index 0000000000..e643053dbf --- /dev/null +++ b/tools/pso-stub/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./tools/pso-stub/LLVMBuild.txt ---------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Tool +name = pso-stub +parent = Tools +required_libraries = BitReader BitWriter Object Support Analysis diff --git a/tools/pso-stub/Makefile b/tools/pso-stub/Makefile new file mode 100644 index 0000000000..c2860e65f6 --- /dev/null +++ b/tools/pso-stub/Makefile @@ -0,0 +1,18 @@ +##===- tools/pso-stub/Makefile -----------------------------*- Makefile -*-===## +# +# The LLVM Compiler Infrastructure +# +# This file is distributed under the University of Illinois Open Source +# License. See LICENSE.TXT for details. +# +##===----------------------------------------------------------------------===## + +LEVEL := ../.. +TOOLNAME := pso-stub +LINK_COMPONENTS := bitreader bitwriter object support analysis + +# This tool has no plugins, optimize startup time. +TOOL_NO_EXPORTS := 1 + +include $(LEVEL)/Makefile.common + diff --git a/tools/pso-stub/pso-stub.cpp b/tools/pso-stub/pso-stub.cpp new file mode 100644 index 0000000000..1fdc868499 --- /dev/null +++ b/tools/pso-stub/pso-stub.cpp @@ -0,0 +1,309 @@ +/*===- pso-stub.c - Create bitcode shared object stubs -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Create a bitcode stub for a native shared object. +// Usage: pso-stub <input.so> -o <output.pso> +// +// The stub bitcode file contains the same dynamic symbols as the input shared +// object, with identical attributes (e.g. weak, undefined, TLS). +// +// Undefined functions become declarations in the bitcode. +// Undefined variables become external variable declarations in the bitcode. +// Defined functions become trivial stub functions in the bitcode (which do +// nothing but "ret void"). +// Defined object/tls symbols became dummy variable definitions (int foo = 0). +// +// The generated bitcode is suitable for linking against (as a shared object), +// but nothing else. +// +// TODO(pdox): Implement GNU symbol versioning. +// TODO(pdox): Mark IFUNC symbols as functions, and store +// this attribute as metadata. 
+//===----------------------------------------------------------------------===*/ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/GlobalValue.h" +#include "llvm/Type.h" +#include "llvm/DerivedTypes.h" +#include "llvm/Function.h" +#include "llvm/Constant.h" +#include "llvm/Constants.h" +#include "llvm/Instructions.h" +#include "llvm/Object/ObjectFile.h" +#include "llvm/Object/ELF.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Bitcode/ReaderWriter.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/ELF.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/PrettyStackTrace.h" +#include "llvm/Support/ManagedStatic.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Signals.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/ADT/APInt.h" + +using namespace llvm; +using namespace llvm::object; + +namespace { + +cl::opt<std::string> +InputFilename(cl::Positional, cl::desc("<input native shared object>"), + cl::init("")); + +cl::opt<std::string> +OutputFilename("o", cl::desc("Output filename"), cl::value_desc("filename")); + +// Variables / declarations to place in llvm.used array. +std::vector<GlobalValue*> LLVMUsed; + +void AddUsedGlobal(GlobalValue *GV) { + // Clang normally asserts that these are not decls. We do need + // decls to survive though, and those are really the ones we + // worry about, so only add those. + // We run verifyModule() below, so that we know this is somewhat valid. + if (GV->isDeclaration()) { + LLVMUsed.push_back(GV); + } +} + +// Emit llvm.used array. +// This is almost exactly like clang/lib/CodeGen/CodeGenModule.cpp::EmitLLVMUsed +void EmitLLVMUsed(Module *M) { + // Don't create llvm.used if there is no need. + if (LLVMUsed.empty()) + return; + + Type *Int8PtrTy = Type::getInt8PtrTy(M->getContext()); + // Convert LLVMUsed to what ConstantArray needs. + SmallVector<llvm::Constant*, 8> UsedArray; + UsedArray.resize(LLVMUsed.size()); + for (unsigned i = 0, e = LLVMUsed.size(); i != e; ++i) { + UsedArray[i] = + llvm::ConstantExpr::getBitCast(cast<llvm::Constant>(&*LLVMUsed[i]), + Int8PtrTy); + } + + if (UsedArray.empty()) + return; + llvm::ArrayType *ATy = llvm::ArrayType::get(Int8PtrTy, UsedArray.size()); + + llvm::GlobalVariable *GV = + new llvm::GlobalVariable(*M, ATy, false, + llvm::GlobalValue::AppendingLinkage, + llvm::ConstantArray::get(ATy, UsedArray), + "llvm.used"); + + GV->setSection("llvm.metadata"); +} + +// Add a stub function definition or declaration +void +AddFunction(Module *M, + GlobalValue::LinkageTypes Linkage, + const StringRef &Name, + bool isDefine) { + // Create an empty function with no arguments. + // void Name(void); + Type *RetTy = Type::getVoidTy(M->getContext()); + FunctionType *FT = FunctionType::get(RetTy, /*isVarArg=*/ false); + Function *F = Function::Create(FT, Linkage, Name, M); + if (isDefine) { + // Add a single basic block with "ret void" + BasicBlock *BB = BasicBlock::Create(F->getContext(), "", F); + BB->getInstList().push_back(ReturnInst::Create(F->getContext())); + } + AddUsedGlobal(F); +} + +// Add a stub global variable declaration or definition. +void +AddGlobalVariable(Module *M, + GlobalValue::LinkageTypes Linkage, + const StringRef &Name, + bool isTLS, + bool isDefine) { + // Use 'int' as the dummy type. + Type *Ty = Type::getInt32Ty(M->getContext()); + + Constant *InitVal = NULL; + if (isDefine) { + // Define to dummy value, 0. 
+ InitVal = Constant::getIntegerValue(Ty, APInt(32, 0)); + } + GlobalVariable *GV = + new GlobalVariable(*M, Ty, /*isConstant=*/ false, + Linkage, /*Initializer=*/ InitVal, + Twine(Name), /*InsertBefore=*/ NULL, + isTLS ? GlobalVariable::GeneralDynamicTLSModel : + GlobalVariable::NotThreadLocal, + /*AddressSpace=*/ 0); + AddUsedGlobal(GV); +} + +// Iterate through the ObjectFile's needed libraries, and +// add them to the module. +void TransferLibrariesNeeded(Module *M, const ObjectFile *obj) { + library_iterator it = obj->begin_libraries_needed(); + library_iterator ie = obj->end_libraries_needed(); + error_code ec; + for (; it != ie; it.increment(ec)) { + StringRef path; + it->getPath(path); + outs() << "Adding library " << path << "\n"; + M->addLibrary(path); + } +} + +// Set the Module's SONAME from the ObjectFile +void TransferLibraryName(Module *M, const ObjectFile *obj) { + StringRef soname = obj->getLoadName(); + outs() << "Setting soname to: " << soname << "\n"; + M->setSOName(soname); +} + +// Create stubs in the module for the dynamic symbols +void TransferDynamicSymbols(Module *M, const ObjectFile *obj) { + // Iterate through the dynamic symbols in the ObjectFile. + symbol_iterator it = obj->begin_dynamic_symbols(); + symbol_iterator ie = obj->end_dynamic_symbols(); + error_code ec; + for (; it != ie; it.increment(ec)) { + const SymbolRef &sym = *it; + StringRef Name; + SymbolRef::Type Type; + uint32_t Flags; + + sym.getName(Name); + sym.getType(Type); + sym.getFlags(Flags); + + // Ignore debug info and section labels + if (Flags & SymbolRef::SF_FormatSpecific) + continue; + + // Ignore local symbols + if (!(Flags & SymbolRef::SF_Global)) + continue; + outs() << "Transferring symbol " << Name << "\n"; + + bool isFunc = (Type == SymbolRef::ST_Function); + bool isUndef = (Flags & SymbolRef::SF_Undefined); + bool isTLS = (Flags & SymbolRef::SF_ThreadLocal); + bool isCommon = (Flags & SymbolRef::SF_Common); + bool isWeak = (Flags & SymbolRef::SF_Weak); + + if (Type == SymbolRef::ST_Unknown) { + // Weak symbols can be "v" according to NM, which are definitely + // data, but they may also be "w", which are of unknown type. + // Thus there is already a mechanism to say "weak object", but not + // for weak function. Assume unknown weak symbols are functions. + if (isWeak) { + outs() << "Warning: Symbol '" << Name << + "' has unknown type (weak). Assuming function.\n"; + Type = SymbolRef::ST_Function; + isFunc = true; + } else { + // If it is undef, we likely don't care, since it won't be used + // to bind to unresolved symbols in the real pexe and real pso. + // Other cases seen where it is not undef: _end, __bss_start, + // which are markers provided by the linker scripts. + outs() << "Warning: Symbol '" << Name << + "' has unknown type (isUndef=" << isUndef << "). Assuming data.\n"; + Type = SymbolRef::ST_Data; + isFunc = false; + } + } + + // Determine Linkage type. + GlobalValue::LinkageTypes Linkage; + if (isWeak) + Linkage = isUndef ? GlobalValue::ExternalWeakLinkage : + GlobalValue::WeakAnyLinkage; + else if (isCommon) + Linkage = GlobalValue::CommonLinkage; + else + Linkage = GlobalValue::ExternalLinkage; + + if (isFunc) + AddFunction(M, Linkage, Name, !isUndef); + else + AddGlobalVariable(M, Linkage, Name, isTLS, !isUndef); + } +} + +} // namespace + + +int main(int argc, const char** argv) { + sys::PrintStackTraceOnErrorSignal(); + PrettyStackTraceProgram X(argc, argv); + LLVMContext &Context = getGlobalContext(); + llvm_shutdown_obj Y; // Call llvm_shutdown() on exit. 
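
The linkage selection inside TransferDynamicSymbols() above reduces to a small decision table. Restated as a free function for readability (a paraphrase of the logic in the patch, not code from the tree):

#include "llvm/GlobalValue.h"

// Weak + undefined  -> extern_weak declaration
// Weak + defined    -> weak definition
// Common            -> common definition
// Everything else   -> plain external linkage
static llvm::GlobalValue::LinkageTypes
ChooseLinkage(bool isWeak, bool isUndef, bool isCommon) {
  if (isWeak)
    return isUndef ? llvm::GlobalValue::ExternalWeakLinkage
                   : llvm::GlobalValue::WeakAnyLinkage;
  if (isCommon)
    return llvm::GlobalValue::CommonLinkage;
  return llvm::GlobalValue::ExternalLinkage;
}
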
+ + cl::ParseCommandLineOptions(argc, argv, + "Portable Shared Object Stub Maker\n"); + + if (InputFilename.empty()) { + errs() << "Please specify an input filename\n"; + return 1; + } + if (OutputFilename.empty()) { + errs() << "Please specify an output filename with -o\n"; + return 1; + } + + // Open the object file + OwningPtr<MemoryBuffer> File; + if (MemoryBuffer::getFile(InputFilename, File)) { + errs() << InputFilename << ": Open failed\n"; + return 1; + } + + ObjectFile *obj = ObjectFile::createObjectFile(File.take()); + if (!obj) { + errs() << InputFilename << ": Object type not recognized\n"; + } + + // Create the new module + OwningPtr<Module> M(new Module(InputFilename, Context)); + + // Transfer the relevant ELF information + M->setOutputFormat(Module::SharedOutputFormat); + TransferLibrariesNeeded(M.get(), obj); + TransferLibraryName(M.get(), obj); + TransferDynamicSymbols(M.get(), obj); + EmitLLVMUsed(M.get()); + + // Verify the module + std::string Err; + if (verifyModule(*M.get(), ReturnStatusAction, &Err)) { + errs() << "Module created is invalid:\n"; + errs() << Err; + return 1; + } + + // Write the module to a file + std::string ErrorInfo; + OwningPtr<tool_output_file> Out( + new tool_output_file(OutputFilename.c_str(), ErrorInfo, + raw_fd_ostream::F_Binary)); + if (!ErrorInfo.empty()) { + errs() << ErrorInfo << '\n'; + return 1; + } + WriteBitcodeToFile(M.get(), Out->os()); + Out->keep(); + return 0; +} diff --git a/utils/Makefile b/utils/Makefile index 7a3c17d032..f972b6596f 100644 --- a/utils/Makefile +++ b/utils/Makefile @@ -11,6 +11,15 @@ LEVEL = .. PARALLEL_DIRS := FileCheck FileUpdate TableGen PerfectShuffle \ count fpcmp llvm-lit not unittest yaml2obj +ifeq ($(NACL_SANDBOX),1) + # In sandboxed mode, just build the bare minimum + # Note: TableGen is usually built twice: + # * once with host compiler + # * also with the "given" compiler + # Here we just disable that second build + PARALLEL_DIRS := +endif + EXTRA_DIST := check-each-file codegen-diff countloc.sh \ DSAclean.py DSAextract.py emacs findsym.pl GenLibDeps.pl \ getsrcs.sh llvmdo llvmgrep llvm-native-gcc \ diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp index c9992eb392..bd55e697c5 100644 --- a/utils/TableGen/CodeGenTarget.cpp +++ b/utils/TableGen/CodeGenTarget.cpp @@ -310,6 +310,12 @@ void CodeGenTarget::ComputeInstrsByEnum() const { "BUNDLE", "LIFETIME_START", "LIFETIME_END", + // @LOCALMOD-BEGIN + "BUNDLE_ALIGN_START", + "BUNDLE_ALIGN_END", + "BUNDLE_LOCK", + "BUNDLE_UNLOCK", + // @LOCALMOD-END 0 }; const DenseMap<const Record*, CodeGenInstruction*> &Insts = getInstructions(); diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp index ea2545050b..4101076f33 100644 --- a/utils/TableGen/EDEmitter.cpp +++ b/utils/TableGen/EDEmitter.cpp @@ -273,6 +273,7 @@ static int X86TypeFromOpName(LiteralConstantEmitter *type, REG("RFP32"); REG("GR64"); REG("GR64_NOAX"); + REG("GR32_TC_64"); // @LOCALMOD REG("GR64_TC"); REG("FR64"); REG("VR64"); |
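
The BUNDLE_* pseudo-instructions registered above pair with the EmitBundleLock()/EmitBundleUnlock() hooks that this patch adds to MCStreamer (seen earlier in the LTOModule stub streamer). How each backend drives those hooks is outside this diff, so the following is only an illustrative shape, not code from the tree:

#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCInst.h"

// Keep an address-masking instruction and the jump it guards inside one
// NaCl bundle so the sandbox can never observe the unmasked target.
static void EmitSandboxedIndirectJump(llvm::MCStreamer &Out,
                                      const llvm::MCInst &Mask,
                                      const llvm::MCInst &Jump) {
  Out.EmitBundleLock();      // @LOCALMOD hook: no bundle boundary may intervene
  Out.EmitInstruction(Mask);
  Out.EmitInstruction(Jump);
  Out.EmitBundleUnlock();
}
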