-rw-r--r--  docs/ReleaseNotes.html | 24
-rw-r--r--  include/llvm/MC/MCExpr.h | 1
-rw-r--r--  include/llvm/Target/TargetLibraryInfo.h | 29
-rw-r--r--  include/llvm/Transforms/Utils/SimplifyLibCalls.h | 11
-rw-r--r--  lib/Analysis/ConstantFolding.cpp | 28
-rw-r--r--  lib/CodeGen/LiveIntervalAnalysis.cpp | 18
-rw-r--r--  lib/CodeGen/MachineScheduler.cpp | 1
-rw-r--r--  lib/MC/MCExpr.cpp | 4
-rw-r--r--  lib/Support/APFloat.cpp | 5
-rw-r--r--  lib/Target/ARM/ARMAsmPrinter.cpp | 48
-rw-r--r--  lib/Target/ARM/ARMISelLowering.cpp | 10
-rw-r--r--  lib/Target/ARM/ARMInstrThumb2.td | 14
-rw-r--r--  lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp | 3
-rw-r--r--  lib/Target/NVPTX/NVPTX.td | 34
-rw-r--r--  lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 6
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.cpp | 20
-rw-r--r--  lib/Target/NVPTX/NVPTXSubtarget.h | 11
-rw-r--r--  lib/Target/TargetLibraryInfo.cpp | 18
-rw-r--r--  lib/Target/TargetTransformImpl.cpp | 12
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 79
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 141
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 19
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 154
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 5
-rw-r--r--  lib/Transforms/InstCombine/InstructionCombining.cpp | 20
-rw-r--r--  lib/Transforms/Instrumentation/ThreadSanitizer.cpp | 99
-rw-r--r--  lib/Transforms/Scalar/SimplifyLibCalls.cpp | 247
-rw-r--r--  lib/Transforms/Utils/SimplifyLibCalls.cpp | 295
-rw-r--r--  lib/Transforms/Vectorize/LoopVectorize.cpp | 229
-rw-r--r--  test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll | 4
-rw-r--r--  test/CodeGen/ARM/call-noret-minsize.ll | 7
-rw-r--r--  test/CodeGen/ARM/call-noret.ll | 8
-rw-r--r--  test/CodeGen/NVPTX/param-align.ll | 25
-rw-r--r--  test/CodeGen/NVPTX/ptx-version-30.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/ptx-version-31.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-10.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-11.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-12.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-13.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-20.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-21.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-30.ll | 6
-rw-r--r--  test/CodeGen/NVPTX/sm-version-35.ll | 6
-rw-r--r--  test/CodeGen/PowerPC/misched.ll | 45
-rw-r--r--  test/CodeGen/Thumb/thumb_jump24_fixup.ll | 23
-rw-r--r--  test/CodeGen/X86/avx-intrinsics-x86.ll | 52
-rw-r--r--  test/Instrumentation/ThreadSanitizer/atomic.ll | 1544
-rw-r--r--  test/MC/ARM/elf-jump24-fixup.s | 9
-rw-r--r--  test/Transforms/ConstProp/loads.ll | 132
-rw-r--r--  test/Transforms/GVN/rle.ll | 8
-rw-r--r--  test/Transforms/InstCombine/2012-07-25-LoadPart.ll | 10
-rw-r--r--  test/Transforms/InstCombine/disable-simplify-libcalls.ll | 236
-rw-r--r--  test/Transforms/InstCombine/memcmp-1.ll | 72
-rw-r--r--  test/Transforms/InstCombine/memcmp-2.ll | 17
-rw-r--r--  test/Transforms/InstCombine/memcpy-1.ll | 17
-rw-r--r--  test/Transforms/InstCombine/memcpy-2.ll | 17
-rw-r--r--  test/Transforms/InstCombine/memmove-1.ll | 17
-rw-r--r--  test/Transforms/InstCombine/memmove-2.ll | 17
-rw-r--r--  test/Transforms/InstCombine/memset-1.ll | 17
-rw-r--r--  test/Transforms/InstCombine/memset-2.ll | 17
-rw-r--r--  test/Transforms/InstCombine/strcspn-1.ll | 57
-rw-r--r--  test/Transforms/InstCombine/strcspn-2.ll | 21
-rw-r--r--  test/Transforms/InstCombine/strncmp-1.ll | 10
-rw-r--r--  test/Transforms/InstCombine/strstr-1.ll | 65
-rw-r--r--  test/Transforms/InstCombine/strstr-2.ll | 18
-rw-r--r--  test/Transforms/LoopVectorize/runtime-check.ll | 36
-rw-r--r--  test/Transforms/SCCP/loadtest.ll | 5
-rw-r--r--  test/Transforms/SimplifyLibCalls/StrSpn.ll | 25
-rw-r--r--  test/Transforms/SimplifyLibCalls/StrStr.ll | 60
-rw-r--r--  test/Transforms/SimplifyLibCalls/memcmp.ll | 35
-rw-r--r--  test/Transforms/SimplifyLibCalls/memmove.ll | 12
-rw-r--r--  test/Transforms/SimplifyLibCalls/memset-64.ll | 12
-rw-r--r--  test/Transforms/SimplifyLibCalls/memset.ll | 12
-rw-r--r--  unittests/ADT/APFloatTest.cpp | 6
-rw-r--r--  unittests/Support/AlignOfTest.cpp | 11
-rw-r--r--  utils/TableGen/CodeEmitterGen.cpp | 5
-rw-r--r--  utils/lldbDataFormatters.py | 24
77 files changed, 3419 insertions, 934 deletions
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 45a9cc5dec..fc3a8b71bd 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -473,15 +473,31 @@ Release Notes</a>.</h1>
<b>-mllvm -force-vector-width=4</b>.
The default value is <b>0</b> which means auto-select.
<br/>
- We can now vectorize this code:
+ We can now vectorize this function:
<pre class="doc_code">
- for (i=0; i&lt;n; i++) {
- a[i] = b[i+1] + c[i+3] + i;
- sum += d[i];
+ unsigned sum_arrays(int *A, int *B, int start, int end) {
+ unsigned sum = 0;
+ for (int i = start; i &lt; end; ++i)
+ sum += A[i] + B[i] + i;
+
+ return sum;
}
</pre>
+ We vectorize loops that satisfy the following constraints:
+ <ul>
+ <li>The innermost loop must consist of a single basic block.</li>
+ <li>The number of iterations is known before the loop starts to execute.</li>
+ <li>The loop counter must be incremented by one.</li>
+ <li>The loop trip count <b>can</b> be a variable.</li>
+ <li>Loops do <b>not</b> need to start at zero.</li>
+ <li>The induction variable can be used inside the loop.</li>
+ <li>Loop reductions are supported.</li>
+ <li>Arrays with affine access patterns do <b>not</b> need to be marked as 'noalias' and are checked at runtime.</li>
+ <li>...</li>
+ </ul>
+
</p>
<p>SROA - We've re-written SROA to be significantly more powerful.
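A hedged companion example to the constraint list in the release notes above (the function and variable names are illustrative, not from the patch): this loop has a single-block body, a counter incremented by one, a runtime trip count, a non-zero start, an induction variable used inside the loop, and a reduction, so it satisfies every listed requirement.

    // Illustrative only: a loop meeting all constraints listed above.
    int dot_from(int *a, int *b, int start, int n) {
      int sum = 0;                     // reduction variable
      for (int i = start; i < n; ++i)  // unit-increment counter; 'n' is a
        sum += a[i] * b[i] + i;        //   runtime value; 'i' used in body
      return sum;                      // affine accesses, checked at runtime
    }
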
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index 4c10e5114a..00eef270d6 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -167,6 +167,7 @@ public:
VK_ARM_TPOFF,
VK_ARM_GOTTPOFF,
VK_ARM_TARGET1,
+ VK_ARM_TARGET2,
VK_PPC_TOC, // TOC base
VK_PPC_TOC_ENTRY, // TOC entry
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index 2a0a43229f..a2c97d782e 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -276,12 +276,18 @@ namespace llvm {
sqrtf,
/// long double sqrtl(long double x);
sqrtl,
+ /// char *stpcpy(char *s1, const char *s2);
+ stpcpy,
/// char *strcat(char *s1, const char *s2);
strcat,
/// char *strchr(const char *s, int c);
strchr,
+ /// int strcmp(const char *s1, const char *s2);
+ strcmp,
/// char *strcpy(char *s1, const char *s2);
strcpy,
+ /// size_t strcspn(const char *s1, const char *s2);
+ strcspn,
/// char *strdup(const char *s1);
strdup,
/// size_t strlen(const char *s);
@@ -296,6 +302,29 @@ namespace llvm {
strndup,
/// size_t strnlen(const char *s, size_t maxlen);
strnlen,
+ /// char *strpbrk(const char *s1, const char *s2);
+ strpbrk,
+ /// char *strrchr(const char *s, int c);
+ strrchr,
+ /// size_t strspn(const char *s1, const char *s2);
+ strspn,
+ /// char *strstr(const char *s1, const char *s2);
+ strstr,
+ /// double strtod(const char *nptr, char **endptr);
+ strtod,
+ /// float strtof(const char *nptr, char **endptr);
+ strtof,
+ /// long int strtol(const char *nptr, char **endptr, int base);
+ strtol,
+ /// long double strtold(const char *nptr, char **endptr);
+ strtold,
+ /// long long int strtoll(const char *nptr, char **endptr, int base);
+ strtoll,
+ /// unsigned long int strtoul(const char *nptr, char **endptr, int base);
+ strtoul,
+ /// unsigned long long int strtoull(const char *nptr, char **endptr,
+ /// int base);
+ strtoull,
/// double tan(double x);
tan,
/// float tanf(float x);
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 5db2d00181..fde452bca2 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -19,6 +19,7 @@ namespace llvm {
class Value;
class CallInst;
class DataLayout;
+ class Instruction;
class TargetLibraryInfo;
class LibCallSimplifierImpl;
@@ -35,8 +36,16 @@ namespace llvm {
/// optimizeCall - Take the given call instruction and return a more
/// optimal value to replace the instruction with or 0 if a more
- /// optimal form can't be found.
+ /// optimal form can't be found. Note that the returned value may
+ /// be equal to the instruction being optimized. In this case all
+ /// other instructions that use the given instruction were modified
+ /// and the given instruction is dead.
Value *optimizeCall(CallInst *CI);
+
+ /// replaceAllUsesWith - This method is used when the library call
+ /// simplifier needs to replace instructions other than the library
+ /// call being modified.
+ virtual void replaceAllUsesWith(Instruction *I, Value *With) const;
};
} // End llvm namespace
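A minimal caller sketch of the contract documented above, assuming only the LibCallSimplifier interface declared in this header (the helper name and variables are illustrative): when optimizeCall returns the call itself, its users have already been rewritten through replaceAllUsesWith, so the call is simply erased; otherwise the returned value stands in for the call first.

    #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
    using namespace llvm;

    // Hypothetical driver; CI is a call to a known library function.
    static void simplifyOneCall(LibCallSimplifier &Simplifier, CallInst *CI) {
      if (Value *Result = Simplifier.optimizeCall(CI)) {
        if (Result != CI)                // a genuinely new value was produced
          CI->replaceAllUsesWith(Result);
        CI->eraseFromParent();           // per the contract, CI is now dead
      }
    }
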
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 5cac8ca3ba..91a5b84e8a 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -292,7 +292,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
unsigned IntBytes = unsigned(CI->getBitWidth()/8);
for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) {
- CurPtr[i] = (unsigned char)(Val >> (ByteOffset * 8));
+ int n = ByteOffset;
+ if (!TD.isLittleEndian())
+ n = IntBytes - n - 1;
+ CurPtr[i] = (unsigned char)(Val >> (n * 8));
++ByteOffset;
}
return true;
@@ -442,10 +445,19 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
BytesLoaded, TD))
return 0;
- APInt ResultVal = APInt(IntType->getBitWidth(), RawBytes[BytesLoaded-1]);
- for (unsigned i = 1; i != BytesLoaded; ++i) {
- ResultVal <<= 8;
- ResultVal |= RawBytes[BytesLoaded-1-i];
+ APInt ResultVal = APInt(IntType->getBitWidth(), 0);
+ if (TD.isLittleEndian()) {
+ ResultVal = RawBytes[BytesLoaded - 1];
+ for (unsigned i = 1; i != BytesLoaded; ++i) {
+ ResultVal <<= 8;
+ ResultVal |= RawBytes[BytesLoaded-1-i];
+ }
+ } else {
+ ResultVal = RawBytes[0];
+ for (unsigned i = 1; i != BytesLoaded; ++i) {
+ ResultVal <<= 8;
+ ResultVal |= RawBytes[i];
+ }
}
return ConstantInt::get(IntType->getContext(), ResultVal);
@@ -521,10 +533,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
}
}
- // Try hard to fold loads from bitcasted strange and non-type-safe things. We
- // currently don't do any of this for big endian systems. It can be
- // generalized in the future if someone is interested.
- if (TD && TD->isLittleEndian())
+ // Try hard to fold loads from bitcasted strange and non-type-safe things.
+ if (TD)
return FoldReinterpretLoadFromConstPtr(CE, *TD);
return 0;
}
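A hedged, standalone restatement of the byte-assembly logic in the hunk above, outside LLVM's types: the same four raw bytes fold to different integers depending on target endianness, with little-endian treating the highest-addressed byte as most significant.

    #include <cstdint>

    // Mirrors the two loops above for a fixed 4-byte load.
    uint32_t assemble(const uint8_t Bytes[4], bool LittleEndian) {
      uint32_t Result = 0;
      if (LittleEndian) {
        for (int i = 3; i >= 0; --i)     // highest address = most significant
          Result = (Result << 8) | Bytes[i];
      } else {
        for (int i = 0; i < 4; ++i)      // lowest address = most significant
          Result = (Result << 8) | Bytes[i];
      }
      return Result;
    }
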
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 65bc4af99e..4e75d892e5 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -146,6 +146,11 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
}
+ OS << "RegMasks:";
+ for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
+ OS << ' ' << RegMaskSlots[i];
+ OS << '\n';
+
printInstrs(OS);
}
@@ -1257,10 +1262,15 @@ private:
SmallVectorImpl<SlotIndex>::iterator RI =
std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
OldIdx);
- assert(*RI == OldIdx && "No RegMask at OldIdx.");
- *RI = NewIdx;
- assert(*prior(RI) < *RI && *RI < *next(RI) &&
- "RegSlots out of order. Did you move one call across another?");
+ assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+ "No RegMask at OldIdx.");
+ *RI = NewIdx.getRegSlot();
+ assert((RI == LIS.RegMaskSlots.begin() ||
+ SlotIndex::isEarlierInstr(*llvm::prior(RI), *RI)) &&
+ "Cannot move regmask instruction above another call");
+ assert((llvm::next(RI) == LIS.RegMaskSlots.end() ||
+ SlotIndex::isEarlierInstr(*RI, *llvm::next(RI))) &&
+ "Cannot move regmask instruction below another call");
}
// Return the last use of reg between NewIdx and OldIdx.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index de16932c06..a4817d09c0 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -1583,6 +1583,7 @@ const char *ConvergingScheduler::getReasonStr(
case NextDefUse: return "DEF-USE ";
case NodeOrder: return "ORDER ";
};
+ llvm_unreachable("Unknown reason!");
}
void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand,
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index ffa79761f2..e0336342d6 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -60,7 +60,8 @@ void MCExpr::print(raw_ostream &OS) const {
SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTOFF ||
SRE.getKind() == MCSymbolRefExpr::VK_ARM_TPOFF ||
SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF ||
- SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1)
+ SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1 ||
+ SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2)
OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
@@ -199,6 +200,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_ARM_GOTTPOFF: return "(gottpoff)";
case VK_ARM_TLSGD: return "(tlsgd)";
case VK_ARM_TARGET1: return "(target1)";
+ case VK_ARM_TARGET2: return "(target2)";
case VK_PPC_TOC: return "tocbase";
case VK_PPC_TOC_ENTRY: return "toc";
case VK_PPC_DARWIN_HA16: return "ha16";
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 43c68f4d1d..7e8b4a3d0d 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -3553,11 +3553,6 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
}
bool APFloat::getExactInverse(APFloat *inv) const {
- // We can only guarantee the existence of an exact inverse for IEEE floats.
- if (semantics != &IEEEhalf && semantics != &IEEEsingle &&
- semantics != &IEEEdouble && semantics != &IEEEquad)
- return false;
-
// Special floats and denormals have no exact inverse.
if (category != fcNormal)
return false;
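A small usage sketch of the relaxed check (hedged; the values are illustrative): with the IEEE-only guard removed, any normal float whose reciprocal is exactly representable qualifies, so 2.0 yields 0.5 while 3.0 still fails because 1/3 is not representable.

    #include "llvm/ADT/APFloat.h"
    #include <cassert>
    using namespace llvm;

    void exactInverseDemo() {
      APFloat Two(2.0), Inv(0.0);
      bool OK = Two.getExactInverse(&Inv);   // OK == true; Inv is exactly 0.5
      assert(OK);
      APFloat Three(3.0);
      assert(!Three.getExactInverse(0) && "1/3 has no exact inverse");
    }
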
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 8d1a301a67..f67decc550 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -853,13 +853,28 @@ void ARMAsmPrinter::emitAttributes() {
AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
ARMBuildAttrs::Allowed);
} else if (CPUString == "generic") {
- // FIXME: Why these defaults?
- AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+ // For a generic CPU, we assume a standard v7a architecture in Subtarget.
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+ ARMBuildAttrs::ApplicationProfile);
AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
ARMBuildAttrs::Allowed);
AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
- ARMBuildAttrs::Allowed);
- }
+ ARMBuildAttrs::AllowThumb32);
+ } else if (Subtarget->hasV7Ops()) {
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+ ARMBuildAttrs::AllowThumb32);
+ } else if (Subtarget->hasV6T2Ops())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+ else if (Subtarget->hasV6Ops())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+ else if (Subtarget->hasV5TEOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+ else if (Subtarget->hasV5TOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+ else if (Subtarget->hasV4TOps())
+ AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
if (Subtarget->hasNEON() && emitFPU) {
/* NEON is not exactly a VFP architecture, but GAS emit one of
@@ -1515,31 +1530,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
return;
}
- case ARM::t2BMOVPCB_CALL: {
- {
- MCInst TmpInst;
- TmpInst.setOpcode(ARM::tMOVr);
- TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
- TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
- // Add predicate operands.
- TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
- TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
- }
- {
- MCInst TmpInst;
- TmpInst.setOpcode(ARM::t2B);
- const GlobalValue *GV = MI->getOperand(0).getGlobal();
- MCSymbol *GVSym = Mang->getSymbol(GV);
- const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
- TmpInst.addOperand(MCOperand::CreateExpr(GVSymExpr));
- // Add predicate operands.
- TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
- TmpInst.addOperand(MCOperand::CreateReg(0));
- OutStreamer.EmitInstruction(TmpInst);
- }
- return;
- }
case ARM::MOVi16_ga_pcrel:
case ARM::t2MOVi16_ga_pcrel: {
MCInst TmpInst;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index dec498a4f7..0893826427 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1639,18 +1639,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (Subtarget->isThumb()) {
if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- else if (doesNotRet && isDirect && !isARMFunc &&
- Subtarget->hasRAS() && !Subtarget->isThumb1Only() &&
- // Emit regular call when code size is the priority
- !HasMinSizeAttr)
- // "mov lr, pc; b _foo" to avoid confusing the RSP
- CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
} else {
- if (!isDirect && !Subtarget->hasV5TOps()) {
+ if (!isDirect && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
- } else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+ else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
// Emit regular call when code size is the priority
!HasMinSizeAttr)
// "mov lr, pc; b _foo" to avoid confusing the RSP
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 248bab6b12..c2800acccd 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -3331,20 +3331,6 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1 in {
Requires<[IsThumb2, IsIOS]>;
}
-let isCall = 1, Defs = [LR], Uses = [SP] in {
- // mov lr, pc; b if callee is marked noreturn to avoid confusing the
- // return stack predictor.
- def t2BMOVPCB_CALL : tPseudoInst<(outs),
- (ins t_bltarget:$func),
- 6, IIC_Br, [(ARMcall_nolink tglobaladdr:$func)]>,
- Requires<[IsThumb]>;
-}
-
-// Direct calls
-def : T2Pat<(ARMcall_nolink texternalsym:$func),
- (t2BMOVPCB_CALL texternalsym:$func)>,
- Requires<[IsThumb]>;
-
// IT block
let Defs = [ITSTATE] in
def t2IT : Thumb2XI<(outs), (ins it_pred:$cc, it_mask:$mask),
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 7a7ce27d48..253d1fa2ab 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -247,6 +247,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case MCSymbolRefExpr::VK_ARM_TARGET1:
Type = ELF::R_ARM_TARGET1;
break;
+ case MCSymbolRefExpr::VK_ARM_TARGET2:
+ Type = ELF::R_ARM_TARGET2;
+ break;
}
break;
case ARM::fixup_arm_ldst_pcrel_12:
diff --git a/lib/Target/NVPTX/NVPTX.td b/lib/Target/NVPTX/NVPTX.td
index ae7710e54f..7aee3595c6 100644
--- a/lib/Target/NVPTX/NVPTX.td
+++ b/lib/Target/NVPTX/NVPTX.td
@@ -24,7 +24,30 @@ include "NVPTXInstrInfo.td"
// - Need at least one feature to avoid generating zero sized array by
// TableGen in NVPTXGenSubtarget.inc.
//===----------------------------------------------------------------------===//
-def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
+
+// SM Versions
+def SM10 : SubtargetFeature<"sm_10", "SmVersion", "10",
+ "Target SM 1.0">;
+def SM11 : SubtargetFeature<"sm_11", "SmVersion", "11",
+ "Target SM 1.1">;
+def SM12 : SubtargetFeature<"sm_12", "SmVersion", "12",
+ "Target SM 1.2">;
+def SM13 : SubtargetFeature<"sm_13", "SmVersion", "13",
+ "Target SM 1.3">;
+def SM20 : SubtargetFeature<"sm_20", "SmVersion", "20",
+ "Target SM 2.0">;
+def SM21 : SubtargetFeature<"sm_21", "SmVersion", "21",
+ "Target SM 2.1">;
+def SM30 : SubtargetFeature<"sm_30", "SmVersion", "30",
+ "Target SM 3.0">;
+def SM35 : SubtargetFeature<"sm_35", "SmVersion", "35",
+ "Target SM 3.5">;
+
+// PTX Versions
+def PTX30 : SubtargetFeature<"ptx30", "PTXVersion", "30",
+ "Use PTX version 3.0">;
+def PTX31 : SubtargetFeature<"ptx31", "PTXVersion", "31",
+ "Use PTX version 3.1">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -33,7 +56,14 @@ def FeatureDummy : SubtargetFeature<"dummy", "dummy", "true", "">;
class Proc<string Name, list<SubtargetFeature> Features>
: Processor<Name, NoItineraries, Features>;
-def : Proc<"sm_10", [FeatureDummy]>;
+def : Proc<"sm_10", [SM10]>;
+def : Proc<"sm_11", [SM11]>;
+def : Proc<"sm_12", [SM12]>;
+def : Proc<"sm_13", [SM13]>;
+def : Proc<"sm_20", [SM20]>;
+def : Proc<"sm_21", [SM21]>;
+def : Proc<"sm_30", [SM30]>;
+def : Proc<"sm_35", [SM35]>;
def NVPTXInstrInfo : InstrInfo {
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index d3dfb35e26..3dd9bf5613 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -910,7 +910,8 @@ void NVPTXAsmPrinter::emitHeader (Module &M, raw_ostream &O) {
O << "//\n";
O << "\n";
- O << ".version 3.0\n";
+ unsigned PTXVersion = nvptxSubtarget.getPTXVersion();
+ O << ".version " << (PTXVersion / 10) << "." << (PTXVersion % 10) << "\n";
O << ".target ";
O << nvptxSubtarget.getTargetName();
@@ -1525,6 +1526,9 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F,
// <a> = PAL.getparamalignment
// size = typeallocsize of element type
unsigned align = PAL.getParamAlignment(paramIndex+1);
+ if (align == 0)
+ align = TD->getABITypeAlignment(ETy);
+
unsigned sz = TD->getTypeAllocSize(ETy);
O << "\t.param .align " << align
<< " .b8 ";
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.cpp b/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 6aadd43e94..7b62cce2c6 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -34,16 +34,18 @@ DriverInterface(cl::desc("Choose driver interface:"),
NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
const std::string &FS, bool is64Bit)
-:NVPTXGenSubtargetInfo(TT, "", FS), // Don't pass CPU to subtarget,
- // because we don't register all
- // nvptx targets.
- Is64Bit(is64Bit) {
+: NVPTXGenSubtargetInfo(TT, CPU, FS),
+ Is64Bit(is64Bit),
+ PTXVersion(0),
+ SmVersion(10) {
drvInterface = DriverInterface;
// Provide the default CPU if none
std::string defCPU = "sm_10";
+ ParseSubtargetFeatures((CPU.empty() ? defCPU : CPU), FS);
+
// Get the TargetName from the FS if available
if (FS.empty() && CPU.empty())
TargetName = defCPU;
@@ -52,6 +54,12 @@ NVPTXSubtarget::NVPTXSubtarget(const std::string &TT, const std::string &CPU,
else
llvm_unreachable("we are not using FeatureStr");
- // Set up the SmVersion
- SmVersion = atoi(TargetName.c_str()+3);
+ // We default to PTX 3.1, but we cannot just default to it in the initializer
+ // since the attribute parser checks if the given option is >= the default.
+ // So if we set ptx31 as the default, the ptx30 attribute would never match.
+ // Instead, we use 0 as the default and manually set 31 if the default is
+ // used.
+ if (PTXVersion == 0) {
+ PTXVersion = 31;
+ }
}
diff --git a/lib/Target/NVPTX/NVPTXSubtarget.h b/lib/Target/NVPTX/NVPTXSubtarget.h
index 8f2a629d22..c3a683a2c6 100644
--- a/lib/Target/NVPTX/NVPTXSubtarget.h
+++ b/lib/Target/NVPTX/NVPTXSubtarget.h
@@ -25,13 +25,18 @@
namespace llvm {
class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
-
- unsigned int SmVersion;
+
std::string TargetName;
NVPTX::DrvInterface drvInterface;
bool dummy; // For the 'dummy' feature, see NVPTX.td
bool Is64Bit;
+ // PTX version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned PTXVersion;
+
+ // SM version x.y is represented as 10*x+y, e.g. 3.1 == 31
+ unsigned int SmVersion;
+
public:
/// This constructor initializes the data members to match that
/// of the specified module.
@@ -69,6 +74,8 @@ public:
NVPTX::DrvInterface getDrvInterface() const { return drvInterface; }
std::string getTargetName() const { return TargetName; }
+ unsigned getPTXVersion() const { return PTXVersion; }
+
void ParseSubtargetFeatures(StringRef CPU, StringRef FS);
std::string getDataLayout() const {
diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp
index 62f973e658..6d4eab1204 100644
--- a/lib/Target/TargetLibraryInfo.cpp
+++ b/lib/Target/TargetLibraryInfo.cpp
@@ -152,9 +152,12 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"sqrt",
"sqrtf",
"sqrtl",
+ "stpcpy",
"strcat",
"strchr",
+ "strcmp",
"strcpy",
+ "strcspn",
"strdup",
"strlen",
"strncat",
@@ -162,6 +165,17 @@ const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] =
"strncpy",
"strndup",
"strnlen",
+ "strpbrk",
+ "strrchr",
+ "strspn",
+ "strstr",
+ "strtod",
+ "strtof",
+ "strtol",
+ "strtold",
+ "strtoll",
+ "strtoul",
+ "strtoull",
"tan",
"tanf",
"tanh",
@@ -309,6 +323,10 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T,
TLI.setUnavailable(LibFunc::tanf);
TLI.setUnavailable(LibFunc::tanhf);
}
+
+ // Win32 does *not* provide stpcpy. It is provided on POSIX systems:
+ // http://pubs.opengroup.org/onlinepubs/9699919799/functions/stpcpy.html
+ TLI.setUnavailable(LibFunc::stpcpy);
}
}
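For reference (hedged; this describes standard POSIX semantics rather than anything in the patch): stpcpy behaves like strcpy but returns a pointer to the copied string's terminating NUL, which is why it is modeled as a distinct libcall and must be marked unavailable on Win32.

    #include <string.h>  // stpcpy is POSIX, not ISO C; absent on Win32

    void stpcpyDemo() {
      char Buf[16];
      char *End = stpcpy(Buf, "hi");  // copies "hi"; End == Buf + 2, *End == 0
      // strcpy(Buf, "hi") would instead return Buf itself.
      (void)End;
    }
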
diff --git a/lib/Target/TargetTransformImpl.cpp b/lib/Target/TargetTransformImpl.cpp
index c07332de32..b36e6f858f 100644
--- a/lib/Target/TargetTransformImpl.cpp
+++ b/lib/Target/TargetTransformImpl.cpp
@@ -214,8 +214,16 @@ unsigned VectorTargetTransformImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
// Handle scalar conversions.
if (!Src->isVectorTy() && !Dst->isVectorTy()) {
- // Scalar bitcasts and truncs are usually free.
- if (Opcode == Instruction::BitCast || Opcode == Instruction::Trunc)
+ // Scalar bitcasts are usually free.
+ if (Opcode == Instruction::BitCast)
+ return 0;
+
+ if (Opcode == Instruction::Trunc &&
+ TLI->isTruncateFree(SrcLT.second, DstLT.second))
+ return 0;
+
+ if (Opcode == Instruction::ZExt &&
+ TLI->isZExtFree(SrcLT.second, DstLT.second))
return 0;
// Just check the op cost. If the operation is legal then assume it costs 1.
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index e86c1000f1..42134256e3 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2892,85 +2892,6 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
return Result;
}
-
- // FIXME: Custom handling because TableGen doesn't support multiple implicit
- // defs in an instruction pattern
- case X86ISD::PCMPESTRI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
- SDValue N3 = Node->getOperand(3);
- SDValue N4 = Node->getOperand(4);
-
- // Make sure last argument is a constant
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N4);
- if (!Cst)
- break;
-
- uint64_t Imm = Cst->getZExtValue();
-
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl,
- X86::EAX, N1, SDValue()).getValue(1);
- InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EDX,
- N3, InFlag).getValue(1);
-
- SDValue Ops[] = { N0, N2, getI8Imm(Imm), InFlag };
- unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPESTRIrr :
- X86::PCMPESTRIrr;
- InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
- array_lengthof(Ops)), 0);
-
- if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::ECX, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- }
- if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::EFLAGS, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- }
-
- return NULL;
- }
-
- // FIXME: Custom handling because TableGen doesn't support multiple implicit
- // defs in an instruction pattern
- case X86ISD::PCMPISTRI: {
- SDValue N0 = Node->getOperand(0);
- SDValue N1 = Node->getOperand(1);
- SDValue N2 = Node->getOperand(2);
-
- // Make sure last argument is a constant
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(N2);
- if (!Cst)
- break;
-
- uint64_t Imm = Cst->getZExtValue();
-
- SDValue Ops[] = { N0, N1, getI8Imm(Imm) };
- unsigned Opc = Subtarget->hasAVX() ? X86::VPCMPISTRIrr :
- X86::PCMPISTRIrr;
- SDValue InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, Ops,
- array_lengthof(Ops)), 0);
-
- if (!SDValue(Node, 0).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::ECX, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 0), Result);
- }
- if (!SDValue(Node, 1).use_empty()) {
- SDValue Result = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl,
- X86::EFLAGS, NVT, InFlag);
- InFlag = Result.getValue(2);
- ReplaceUses(SDValue(Node, 1), Result);
- }
-
- return NULL;
- }
}
SDNode *ResNode = SelectCode(Node);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index fd0a8a27d6..5610bb5ba3 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -12240,6 +12240,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
+ case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
+ case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
}
}
@@ -12388,13 +12390,10 @@ X86TargetLowering::isVectorClearMaskLegal(const SmallVectorImpl<int> &Mask,
// X86 Scheduler Hooks
//===----------------------------------------------------------------------===//
-// private utility function
-
/// Utility function to emit xbegin specifying the start of an RTM region.
-MachineBasicBlock *
-X86TargetLowering::EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB) const {
+static MachineBasicBlock *EmitXBegin(MachineInstr *MI, MachineBasicBlock *MBB,
+ const TargetInstrInfo *TII) {
DebugLoc DL = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
const BasicBlock *BB = MBB->getBasicBlock();
MachineFunction::iterator I = MBB;
@@ -13033,45 +13032,82 @@ X86TargetLowering::EmitAtomicLoadArith6432(MachineInstr *MI,
// FIXME: When we get size specific XMM0 registers, i.e. XMM0_V16I8
// or XMM0_V32I8 in AVX all of this code can be replaced with that
// in the .td file.
-MachineBasicBlock *
-X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB,
- unsigned numArgs, bool memArg) const {
- assert(Subtarget->hasSSE42() &&
- "Target must have SSE4.2 or AVX features enabled");
+static MachineBasicBlock *EmitPCMPSTRM(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
+ unsigned Opc;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRM128REG: Opc = X86::PCMPISTRM128rr; break;
+ case X86::VPCMPISTRM128REG: Opc = X86::VPCMPISTRM128rr; break;
+ case X86::PCMPISTRM128MEM: Opc = X86::PCMPISTRM128rm; break;
+ case X86::VPCMPISTRM128MEM: Opc = X86::VPCMPISTRM128rm; break;
+ case X86::PCMPESTRM128REG: Opc = X86::PCMPESTRM128rr; break;
+ case X86::VPCMPESTRM128REG: Opc = X86::VPCMPESTRM128rr; break;
+ case X86::PCMPESTRM128MEM: Opc = X86::PCMPESTRM128rm; break;
+ case X86::VPCMPESTRM128MEM: Opc = X86::VPCMPESTRM128rm; break;
+ }
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
+
+ unsigned NumArgs = MI->getNumOperands();
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
+ if (!(Op.isReg() && Op.isImplicit()))
+ MIB.addOperand(Op);
+ }
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
+ BuildMI(*BB, MI, dl,
+ TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+ .addReg(X86::XMM0);
+
+ MI->eraseFromParent();
+ return BB;
+}
+
+// FIXME: Custom handling because TableGen doesn't support multiple implicit
+// defs in an instruction pattern
+static MachineBasicBlock *EmitPCMPSTRI(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII) {
unsigned Opc;
- if (!Subtarget->hasAVX()) {
- if (memArg)
- Opc = numArgs == 3 ? X86::PCMPISTRM128rm : X86::PCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::PCMPISTRM128rr : X86::PCMPESTRM128rr;
- } else {
- if (memArg)
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rm : X86::VPCMPESTRM128rm;
- else
- Opc = numArgs == 3 ? X86::VPCMPISTRM128rr : X86::VPCMPESTRM128rr;
+ switch (MI->getOpcode()) {
+ default: llvm_unreachable("illegal opcode!");
+ case X86::PCMPISTRIREG: Opc = X86::PCMPISTRIrr; break;
+ case X86::VPCMPISTRIREG: Opc = X86::VPCMPISTRIrr; break;
+ case X86::PCMPISTRIMEM: Opc = X86::PCMPISTRIrm; break;
+ case X86::VPCMPISTRIMEM: Opc = X86::VPCMPISTRIrm; break;
+ case X86::PCMPESTRIREG: Opc = X86::PCMPESTRIrr; break;
+ case X86::VPCMPESTRIREG: Opc = X86::VPCMPESTRIrr; break;
+ case X86::PCMPESTRIMEM: Opc = X86::PCMPESTRIrm; break;
+ case X86::VPCMPESTRIMEM: Opc = X86::VPCMPESTRIrm; break;
}
+ DebugLoc dl = MI->getDebugLoc();
MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(Opc));
- for (unsigned i = 0; i < numArgs; ++i) {
- MachineOperand &Op = MI->getOperand(i+1);
+
+ unsigned NumArgs = MI->getNumOperands(); // remove the results
+ for (unsigned i = 1; i < NumArgs; ++i) {
+ MachineOperand &Op = MI->getOperand(i);
if (!(Op.isReg() && Op.isImplicit()))
MIB.addOperand(Op);
}
+ if (MI->hasOneMemOperand())
+ MIB->setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
+
BuildMI(*BB, MI, dl,
TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
- .addReg(X86::XMM0);
+ .addReg(X86::ECX);
MI->eraseFromParent();
return BB;
}
-MachineBasicBlock *
-X86TargetLowering::EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB) const {
+static MachineBasicBlock *EmitMonitor(MachineInstr *MI, MachineBasicBlock *BB,
+ const TargetInstrInfo *TII,
+ const X86Subtarget* Subtarget) {
DebugLoc dl = MI->getDebugLoc();
- const TargetInstrInfo *TII = getTargetMachine().getInstrInfo();
// Address into RAX/EAX, other two args into ECX, EDX.
unsigned MemOpc = Subtarget->is64Bit() ? X86::LEA64r : X86::LEA32r;
@@ -14125,36 +14161,33 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr *MI,
case X86::PCMPESTRM128REG:
case X86::VPCMPESTRM128REG:
case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM: {
- unsigned NumArgs;
- bool MemArg;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("illegal opcode!");
- case X86::PCMPISTRM128REG:
- case X86::VPCMPISTRM128REG:
- NumArgs = 3; MemArg = false; break;
- case X86::PCMPISTRM128MEM:
- case X86::VPCMPISTRM128MEM:
- NumArgs = 3; MemArg = true; break;
- case X86::PCMPESTRM128REG:
- case X86::VPCMPESTRM128REG:
- NumArgs = 5; MemArg = false; break;
- case X86::PCMPESTRM128MEM:
- case X86::VPCMPESTRM128MEM:
- NumArgs = 5; MemArg = true; break;
- }
- return EmitPCMP(MI, BB, NumArgs, MemArg);
- }
-
- // Thread synchronization.
+ case X86::VPCMPESTRM128MEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRM(MI, BB, getTargetMachine().getInstrInfo());
+
+ // String/text processing lowering.
+ case X86::PCMPISTRIREG:
+ case X86::VPCMPISTRIREG:
+ case X86::PCMPISTRIMEM:
+ case X86::VPCMPISTRIMEM:
+ case X86::PCMPESTRIREG:
+ case X86::VPCMPESTRIREG:
+ case X86::PCMPESTRIMEM:
+ case X86::VPCMPESTRIMEM:
+ assert(Subtarget->hasSSE42() &&
+ "Target must have SSE4.2 or AVX features enabled");
+ return EmitPCMPSTRI(MI, BB, getTargetMachine().getInstrInfo());
+
+ // Thread synchronization.
case X86::MONITOR:
- return EmitMonitor(MI, BB);
+ return EmitMonitor(MI, BB, getTargetMachine().getInstrInfo(), Subtarget);
// xbegin
case X86::XBEGIN:
- return EmitXBegin(MI, BB);
+ return EmitXBegin(MI, BB, getTargetMachine().getInstrInfo());
- // Atomic Lowering.
+ // Atomic Lowering.
case X86::ATOMAND8:
case X86::ATOMAND16:
case X86::ATOMAND32:
@@ -17993,8 +18026,8 @@ unsigned X86VectorTargetTransformInfo::getCastInstrCost(unsigned Opcode,
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i8, 1 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 6 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 9 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 09f175db53..b6e8960f76 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -893,21 +893,6 @@ namespace llvm {
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const;
- /// Utility function to emit string processing sse4.2 instructions
- /// that return in xmm0.
- /// This takes the instruction to expand, the associated machine basic
- /// block, the number of args, and whether or not the second arg is
- /// in memory or not.
- MachineBasicBlock *EmitPCMP(MachineInstr *BInstr, MachineBasicBlock *BB,
- unsigned argNum, bool inMem) const;
-
- /// Utility functions to emit monitor and mwait instructions. These
- /// need to make sure that the arguments to the intrinsic are in the
- /// correct registers.
- MachineBasicBlock *EmitMonitor(MachineInstr *MI,
- MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitMwait(MachineInstr *MI, MachineBasicBlock *BB) const;
-
/// Utility function to emit atomic-load-arith operations (and, or, xor,
/// nand, max, min, umax, umin). It takes the corresponding instruction to
/// expand, the associated machine basic block, and the associated X86
@@ -920,10 +905,6 @@ namespace llvm {
MachineBasicBlock *EmitAtomicLoadArith6432(MachineInstr *MI,
MachineBasicBlock *MBB) const;
- /// Utility function to emit xbegin specifying the start of an RTM region.
- MachineBasicBlock *EmitXBegin(MachineInstr *MI,
- MachineBasicBlock *MBB) const;
-
// Utility function to emit the low-level va_arg code for X86-64.
MachineBasicBlock *EmitVAARG64WithCustomInserter(
MachineInstr *MI,
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 28dfbe7a1f..6f48d7ed7f 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -7002,8 +7002,8 @@ multiclass pseudo_pcmpistrm<string asm> {
imm:$src3))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- [(set VR128:$dst, (int_x86_sse42_pcmpistrm128
- VR128:$src1, (load addr:$src2), imm:$src3))]>;
+ [(set VR128:$dst, (int_x86_sse42_pcmpistrm128 VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}
let Defs = [EFLAGS], usesCustomInserter = 1 in {
@@ -7011,24 +7011,22 @@ let Defs = [EFLAGS], usesCustomInserter = 1 in {
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128">, Requires<[UseSSE42]>;
}
-let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1, Predicates = [HasAVX] in {
- def VPCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+multiclass pcmpistrm_SS42AI<string asm> {
+ def rr : SS42AI<0x62, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
let mayLoad = 1 in
- def VPCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "vpcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize, VEX;
+ def rm :SS42AI<0x62, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
}
let Defs = [XMM0, EFLAGS], neverHasSideEffects = 1 in {
- def PCMPISTRM128rr : SS42AI<0x62, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
- let mayLoad = 1 in
- def PCMPISTRM128rm : SS42AI<0x62, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- "pcmpistrm\t{$src3, $src2, $src1|$src1, $src2, $src3}", []>, OpSize;
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRM128 : pcmpistrm_SS42AI<"vpcmpistrm">, VEX;
+ defm PCMPISTRM128 : pcmpistrm_SS42AI<"pcmpistrm">;
}
// Packed Compare Explicit Length Strings, Return Mask
@@ -7039,8 +7037,8 @@ multiclass pseudo_pcmpestrm<string asm> {
VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
def MEM : PseudoI<(outs VR128:$dst),
(ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- [(set VR128:$dst, (int_x86_sse42_pcmpestrm128
- VR128:$src1, EAX, (load addr:$src3), EDX, imm:$src5))]>;
+ [(set VR128:$dst, (int_x86_sse42_pcmpestrm128 VR128:$src1, EAX,
+ (bc_v16i8 (memopv2i64 addr:$src3)), EDX, imm:$src5))]>;
}
let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
@@ -7048,64 +7046,94 @@ let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128">, Requires<[UseSSE42]>;
}
-let Predicates = [HasAVX],
- Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- def VPCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+multiclass SS42AI_pcmpestrm<string asm> {
+ def rr : SS42AI<0x60, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
let mayLoad = 1 in
- def VPCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- "vpcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize, VEX;
+ def rm : SS42AI<0x60, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
}
let Defs = [XMM0, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- def PCMPESTRM128rr : SS42AI<0x60, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
- let mayLoad = 1 in
- def PCMPESTRM128rm : SS42AI<0x60, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- "pcmpestrm\t{$src5, $src3, $src1|$src1, $src3, $src5}", []>, OpSize;
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRM128 : SS42AI_pcmpestrm<"vpcmpestrm">, VEX;
+ defm PCMPESTRM128 : SS42AI_pcmpestrm<"pcmpestrm">;
}
// Packed Compare Implicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
- multiclass SS42AI_pcmpistri<string asm> {
- def rr : SS42AI<0x63, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src2, i8imm:$src3),
- !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
- let mayLoad = 1 in
- def rm : SS42AI<0x63, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
- !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
- []>, OpSize;
- }
+multiclass pseudo_pcmpistri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpistri VR128:$src1, VR128:$src2, imm:$src3))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ [(set GR32:$dst, EFLAGS, (X86pcmpistri VR128:$src1,
+ (bc_v16i8 (memopv2i64 addr:$src2)), imm:$src3))]>;
}
-let Predicates = [HasAVX] in
-defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
-defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+let Defs = [EFLAGS], usesCustomInserter = 1 in {
+ defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI">, Requires<[HasAVX]>;
+ defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpistri<string asm> {
+ def rr : SS42AI<0x63, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x63, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src2, i8imm:$src3),
+ !strconcat(asm, "\t{$src3, $src2, $src1|$src1, $src2, $src3}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPISTRI : SS42AI_pcmpistri<"vpcmpistri">, VEX;
+ defm PCMPISTRI : SS42AI_pcmpistri<"pcmpistri">;
+}
// Packed Compare Explicit Length Strings, Return Index
-let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
- multiclass SS42AI_pcmpestri<string asm> {
- def rr : SS42AI<0x61, MRMSrcReg, (outs),
- (ins VR128:$src1, VR128:$src3, i8imm:$src5),
- !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
- let mayLoad = 1 in
- def rm : SS42AI<0x61, MRMSrcMem, (outs),
- (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
- !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
- []>, OpSize;
- }
+multiclass pseudo_pcmpestri<string asm> {
+ def REG : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, VR128:$src3, EDX, imm:$src5))]>;
+ def MEM : PseudoI<(outs GR32:$dst),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ [(set GR32:$dst, EFLAGS,
+ (X86pcmpestri VR128:$src1, EAX, (bc_v16i8 (memopv2i64 addr:$src3)), EDX,
+ imm:$src5))]>;
}
-let Predicates = [HasAVX] in
-defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
-defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+ defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI">, Requires<[HasAVX]>;
+ defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI">, Requires<[UseSSE42]>;
+}
+
+multiclass SS42AI_pcmpestri<string asm> {
+ def rr : SS42AI<0x61, MRMSrcReg, (outs),
+ (ins VR128:$src1, VR128:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+ let mayLoad = 1 in
+ def rm : SS42AI<0x61, MRMSrcMem, (outs),
+ (ins VR128:$src1, i128mem:$src3, i8imm:$src5),
+ !strconcat(asm, "\t{$src5, $src3, $src1|$src1, $src3, $src5}"),
+ []>, OpSize;
+}
+
+let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], neverHasSideEffects = 1 in {
+ let Predicates = [HasAVX] in
+ defm VPCMPESTRI : SS42AI_pcmpestri<"vpcmpestri">, VEX;
+ defm PCMPESTRI : SS42AI_pcmpestri<"pcmpestri">;
+}
//===----------------------------------------------------------------------===//
// SSE4.2 - CRC Instructions
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 31be6b7a7b..0132f81410 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -423,12 +423,11 @@ X86Subtarget::X86Subtarget(const std::string &TT, const std::string &CPU,
assert((!In64BitMode || HasX86_64) &&
"64-bit code requested on a subtarget that doesn't support it!");
- // Stack alignment is 16 bytes on Darwin, FreeBSD, Linux, Solaris (both
+ // Stack alignment is 16 bytes on Darwin, Linux and Solaris (both
// 32 and 64 bit), NaCl and for all 64-bit targets.
if (StackAlignOverride)
stackAlignment = StackAlignOverride;
- else if (isTargetDarwin() || isTargetFreeBSD() || isTargetLinux() ||
- isTargetSolaris() ||
+ else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetNaCl() || // @LOCALMOD
In64BitMode)
stackAlignment = 16;
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index ccf75bca2b..9a46f25e66 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -2367,6 +2367,24 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
return MadeIRChange;
}
+namespace {
+class InstCombinerLibCallSimplifier : public LibCallSimplifier {
+ InstCombiner *IC;
+public:
+ InstCombinerLibCallSimplifier(const DataLayout *TD,
+ const TargetLibraryInfo *TLI,
+ InstCombiner *IC)
+ : LibCallSimplifier(TD, TLI) {
+ this->IC = IC;
+ }
+
+ /// replaceAllUsesWith - override so that instruction replacement
+ /// can be defined in terms of the instruction combiner framework.
+ virtual void replaceAllUsesWith(Instruction *I, Value *With) const {
+ IC->ReplaceInstUsesWith(*I, With);
+ }
+};
+}
bool InstCombiner::runOnFunction(Function &F) {
TD = getAnalysisIfAvailable<DataLayout>();
@@ -2379,7 +2397,7 @@ bool InstCombiner::runOnFunction(Function &F) {
InstCombineIRInserter(Worklist));
Builder = &TheBuilder;
- LibCallSimplifier TheSimplifier(TD, TLI);
+ InstCombinerLibCallSimplifier TheSimplifier(TD, TLI, this);
Simplifier = &TheSimplifier;
bool EverMadeChange = false;
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index c6244a55c9..9e10fc4416 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -97,6 +97,10 @@ struct ThreadSanitizer : public FunctionPass {
Function *TsanWrite[kNumberOfAccessSizes];
Function *TsanAtomicLoad[kNumberOfAccessSizes];
Function *TsanAtomicStore[kNumberOfAccessSizes];
+ Function *TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1][kNumberOfAccessSizes];
+ Function *TsanAtomicCAS[kNumberOfAccessSizes];
+ Function *TsanAtomicThreadFence;
+ Function *TsanAtomicSignalFence;
Function *TsanVptrUpdate;
};
} // namespace
@@ -167,10 +171,42 @@ bool ThreadSanitizer::doInitialization(Module &M) {
TsanAtomicStore[i] = checkInterfaceFunction(M.getOrInsertFunction(
AtomicStoreName, IRB.getVoidTy(), PtrTy, Ty, OrdTy,
NULL));
+
+ for (int op = AtomicRMWInst::FIRST_BINOP;
+ op <= AtomicRMWInst::LAST_BINOP; ++op) {
+ TsanAtomicRMW[op][i] = NULL;
+ const char *NamePart = NULL;
+ if (op == AtomicRMWInst::Xchg)
+ NamePart = "_exchange";
+ else if (op == AtomicRMWInst::Add)
+ NamePart = "_fetch_add";
+ else if (op == AtomicRMWInst::Sub)
+ NamePart = "_fetch_sub";
+ else if (op == AtomicRMWInst::And)
+ NamePart = "_fetch_and";
+ else if (op == AtomicRMWInst::Or)
+ NamePart = "_fetch_or";
+ else if (op == AtomicRMWInst::Xor)
+ NamePart = "_fetch_xor";
+ else
+ continue;
+ SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
+ TsanAtomicRMW[op][i] = checkInterfaceFunction(M.getOrInsertFunction(
+ RMWName, Ty, PtrTy, Ty, OrdTy, NULL));
+ }
+
+ SmallString<32> AtomicCASName("__tsan_atomic" + itostr(BitSize) +
+ "_compare_exchange_val");
+ TsanAtomicCAS[i] = checkInterfaceFunction(M.getOrInsertFunction(
+ AtomicCASName, Ty, PtrTy, Ty, Ty, OrdTy, NULL));
}
TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction(
"__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(),
IRB.getInt8PtrTy(), NULL));
+ TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
+ TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
+ "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
return true;
}
@@ -253,8 +289,8 @@ static bool isAtomic(Instruction *I) {
return true;
if (isa<AtomicCmpXchgInst>(I))
return true;
- if (FenceInst *FI = dyn_cast<FenceInst>(I))
- return FI->getSynchScope() == CrossThread;
+ if (isa<FenceInst>(I))
+ return true;
return false;
}
@@ -354,15 +390,14 @@ static ConstantInt *createOrdering(IRBuilder<> *IRB, AtomicOrdering ord) {
switch (ord) {
case NotAtomic: assert(false);
case Unordered: // Fall-through.
- case Monotonic: v = 1 << 0; break;
- // case Consume: v = 1 << 1; break; // Not specified yet.
- case Acquire: v = 1 << 2; break;
- case Release: v = 1 << 3; break;
- case AcquireRelease: v = 1 << 4; break;
- case SequentiallyConsistent: v = 1 << 5; break;
+ case Monotonic: v = 0; break;
+ // case Consume: v = 1; break; // Not specified yet.
+ case Acquire: v = 2; break;
+ case Release: v = 3; break;
+ case AcquireRelease: v = 4; break;
+ case SequentiallyConsistent: v = 5; break;
}
- // +100500 is temporal to migrate to new enum values.
- return IRB->getInt32(v + 100500);
+ return IRB->getInt32(v);
}
bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
@@ -397,12 +432,44 @@ bool ThreadSanitizer::instrumentAtomic(Instruction *I) {
CallInst *C = CallInst::Create(TsanAtomicStore[Idx],
ArrayRef<Value*>(Args));
ReplaceInstWithInst(I, C);
- } else if (isa<AtomicRMWInst>(I)) {
- // FIXME: Not yet supported.
- } else if (isa<AtomicCmpXchgInst>(I)) {
- // FIXME: Not yet supported.
- } else if (isa<FenceInst>(I)) {
- // FIXME: Not yet supported.
+ } else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ Value *Addr = RMWI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ Function *F = TsanAtomicRMW[RMWI->getOperation()][Idx];
+ if (F == NULL)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(RMWI->getValOperand(), Ty, false),
+ createOrdering(&IRB, RMWI->getOrdering())};
+ CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (AtomicCmpXchgInst *CASI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ Value *Addr = CASI->getPointerOperand();
+ int Idx = getMemoryAccessFuncIndex(Addr);
+ if (Idx < 0)
+ return false;
+ const size_t ByteSize = 1 << Idx;
+ const size_t BitSize = ByteSize * 8;
+ Type *Ty = Type::getIntNTy(IRB.getContext(), BitSize);
+ Type *PtrTy = Ty->getPointerTo();
+ Value *Args[] = {IRB.CreatePointerCast(Addr, PtrTy),
+ IRB.CreateIntCast(CASI->getCompareOperand(), Ty, false),
+ IRB.CreateIntCast(CASI->getNewValOperand(), Ty, false),
+ createOrdering(&IRB, CASI->getOrdering())};
+ CallInst *C = CallInst::Create(TsanAtomicCAS[Idx], ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
+ } else if (FenceInst *FI = dyn_cast<FenceInst>(I)) {
+ Value *Args[] = {createOrdering(&IRB, FI->getOrdering())};
+ Function *F = FI->getSynchScope() == SingleThread ?
+ TsanAtomicSignalFence : TsanAtomicThreadFence;
+ CallInst *C = CallInst::Create(F, ArrayRef<Value*>(Args));
+ ReplaceInstWithInst(I, C);
}
return true;
}
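A hedged before/after sketch of the instrumentation added above. The runtime entry-point name follows the string construction in doInitialization ("__tsan_atomic" + bit size + "_fetch_add"); the C-level signature shown here is an assumption for illustration, and the ordering constant 4 corresponds to AcquireRelease in createOrdering (monotonic=0, acquire=2, release=3, acq_rel=4, seq_cst=5).

    // Before: the IR contains   atomicrmw add i32* %p, i32 1 acq_rel
    // After instrumentation it is replaced by a call shaped like this.
    extern "C" int __tsan_atomic32_fetch_add(volatile int *Addr, int Val,
                                             int MemoryOrder); // assumed shape

    int incrementShared(volatile int *P) {
      return __tsan_atomic32_fetch_add(P, 1, /*AcquireRelease*/ 4);
    }
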
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 7d652dea48..17d07cdb2d 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -99,243 +99,7 @@ static bool CallHasFloatingPointArgument(const CallInst *CI) {
return false;
}
-/// IsOnlyUsedInEqualityComparison - Return true if it is only used in equality
-/// comparisons with With.
-static bool IsOnlyUsedInEqualityComparison(Value *V, Value *With) {
- for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
- UI != E; ++UI) {
- if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
- if (IC->isEquality() && IC->getOperand(1) == With)
- continue;
- // Unknown instruction.
- return false;
- }
- return true;
-}
-
-//===----------------------------------------------------------------------===//
-// String and Memory LibCall Optimizations
-//===----------------------------------------------------------------------===//
-
namespace {
-//===---------------------------------------===//
-// 'strcspn' Optimizations
-
-struct StrCSpnOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- FT->getParamType(0) != B.getInt8PtrTy() ||
- FT->getParamType(1) != FT->getParamType(0) ||
- !FT->getReturnType()->isIntegerTy())
- return 0;
-
- StringRef S1, S2;
- bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
- bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
-
- // strcspn("", s) -> 0
- if (HasS1 && S1.empty())
- return Constant::getNullValue(CI->getType());
-
- // Constant folding.
- if (HasS1 && HasS2) {
- size_t Pos = S1.find_first_of(S2);
- if (Pos == StringRef::npos) Pos = S1.size();
- return ConstantInt::get(CI->getType(), Pos);
- }
-
- // strcspn(s, "") -> strlen(s)
- if (TD && HasS2 && S2.empty())
- return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'strstr' Optimizations
-
-struct StrStrOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 2 ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isPointerTy())
- return 0;
-
- // fold strstr(x, x) -> x.
- if (CI->getArgOperand(0) == CI->getArgOperand(1))
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
- if (TD && IsOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
- Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
- if (!StrLen)
- return 0;
- Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
- StrLen, B, TD, TLI);
- if (!StrNCmp)
- return 0;
- for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
- UI != UE; ) {
- ICmpInst *Old = cast<ICmpInst>(*UI++);
- Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
- ConstantInt::getNullValue(StrNCmp->getType()),
- "cmp");
- Old->replaceAllUsesWith(Cmp);
- Old->eraseFromParent();
- }
- return CI;
- }
-
- // See if either input string is a constant string.
- StringRef SearchStr, ToFindStr;
- bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
- bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
-
- // fold strstr(x, "") -> x.
- if (HasStr2 && ToFindStr.empty())
- return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
-
- // If both strings are known, constant fold it.
- if (HasStr1 && HasStr2) {
- std::string::size_type Offset = SearchStr.find(ToFindStr);
-
- if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
- return Constant::getNullValue(CI->getType());
-
- // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
- Value *Result = CastToCStr(CI->getArgOperand(0), B);
- Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
- return B.CreateBitCast(Result, CI->getType());
- }
-
- // fold strstr(x, "y") -> strchr(x, 'y').
- if (HasStr2 && ToFindStr.size() == 1) {
- Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
- return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
- }
- return 0;
- }
-};
-
-
-//===---------------------------------------===//
-// 'memcmp' Optimizations
-
-struct MemCmpOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- !FT->getReturnType()->isIntegerTy(32))
- return 0;
-
- Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
-
- if (LHS == RHS) // memcmp(s,s,x) -> 0
- return Constant::getNullValue(CI->getType());
-
- // Make sure we have a constant length.
- ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
- if (!LenC) return 0;
- uint64_t Len = LenC->getZExtValue();
-
- if (Len == 0) // memcmp(s1,s2,0) -> 0
- return Constant::getNullValue(CI->getType());
-
- // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
- if (Len == 1) {
- Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
- CI->getType(), "lhsv");
- Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
- CI->getType(), "rhsv");
- return B.CreateSub(LHSV, RHSV, "chardiff");
- }
-
- // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
- StringRef LHSStr, RHSStr;
- if (getConstantStringInfo(LHS, LHSStr) &&
- getConstantStringInfo(RHS, RHSStr)) {
- // Make sure we're not reading out-of-bounds memory.
- if (Len > LHSStr.size() || Len > RHSStr.size())
- return 0;
- uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
- return ConstantInt::get(CI->getType(), Ret);
- }
-
- return 0;
- }
-};
-
-//===---------------------------------------===//
-// 'memcpy' Optimizations
-
-struct MemCpyOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
- B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
-//===---------------------------------------===//
-// 'memmove' Optimizations
-
-struct MemMoveOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isPointerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
- B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
-//===---------------------------------------===//
-// 'memset' Optimizations
-
-struct MemSetOpt : public LibCallOptimization {
- virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
- // These optimizations require DataLayout.
- if (!TD) return 0;
-
- FunctionType *FT = Callee->getFunctionType();
- if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
- !FT->getParamType(0)->isPointerTy() ||
- !FT->getParamType(1)->isIntegerTy() ||
- FT->getParamType(2) != TD->getIntPtrType(*Context))
- return 0;
-
- // memset(p, v, n) -> llvm.memset(p, v, n, 1)
- Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
- return CI->getArgOperand(0);
- }
-};
-
//===----------------------------------------------------------------------===//
// Math Library Optimizations
//===----------------------------------------------------------------------===//
@@ -1004,9 +768,6 @@ namespace {
TargetLibraryInfo *TLI;
StringMap<LibCallOptimization*> Optimizations;
- // String and Memory LibCall Optimizations
- StrCSpnOpt StrCSpn; StrStrOpt StrStr;
- MemCmpOpt MemCmp; MemCpyOpt MemCpy; MemMoveOpt MemMove; MemSetOpt MemSet;
// Math Library Optimizations
CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
@@ -1072,14 +833,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
/// Optimizations - Populate the Optimizations map with all the optimizations
/// we know.
void SimplifyLibCalls::InitOptimizations() {
- // String and Memory LibCall Optimizations
- Optimizations["strcspn"] = &StrCSpn;
- Optimizations["strstr"] = &StrStr;
- Optimizations["memcmp"] = &MemCmp;
- AddOpt(LibFunc::memcpy, &MemCpy);
- Optimizations["memmove"] = &MemMove;
- AddOpt(LibFunc::memset, &MemSet);
-
// Math Library Optimizations
Optimizations["cosf"] = &Cos;
Optimizations["cos"] = &Cos;
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 64c7011660..c3ea63852f 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -34,6 +34,7 @@ protected:
Function *Caller;
const DataLayout *TD;
const TargetLibraryInfo *TLI;
+ const LibCallSimplifier *LCS;
LLVMContext* Context;
public:
LibCallOptimization() { }
@@ -48,10 +49,12 @@ public:
=0;
Value *optimizeCall(CallInst *CI, const DataLayout *TD,
- const TargetLibraryInfo *TLI, IRBuilder<> &B) {
+ const TargetLibraryInfo *TLI,
+ const LibCallSimplifier *LCS, IRBuilder<> &B) {
Caller = CI->getParent()->getParent();
this->TD = TD;
this->TLI = TLI;
+ this->LCS = LCS;
if (CI->getCalledFunction())
Context = &CI->getCalledFunction()->getContext();
@@ -83,6 +86,20 @@ static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
return true;
}
+/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality
+/// comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+ for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+ UI != E; ++UI) {
+ if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+ if (IC->isEquality() && IC->getOperand(1) == With)
+ continue;
+ // Unknown instruction.
+ return false;
+ }
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// Fortified Library Call Optimizations
//===----------------------------------------------------------------------===//
@@ -801,6 +818,204 @@ struct StrSpnOpt : public LibCallOptimization {
}
};
+struct StrCSpnOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ FT->getParamType(0) != B.getInt8PtrTy() ||
+ FT->getParamType(1) != FT->getParamType(0) ||
+ !FT->getReturnType()->isIntegerTy())
+ return 0;
+
+ StringRef S1, S2;
+ bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+ bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+ // strcspn("", s) -> 0
+ if (HasS1 && S1.empty())
+ return Constant::getNullValue(CI->getType());
+
+ // Constant folding.
+ if (HasS1 && HasS2) {
+ size_t Pos = S1.find_first_of(S2);
+ if (Pos == StringRef::npos) Pos = S1.size();
+ return ConstantInt::get(CI->getType(), Pos);
+ }
+
+ // strcspn(s, "") -> strlen(s)
+ if (TD && HasS2 && S2.empty())
+ return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
+
+ return 0;
+ }
+};
+
+struct StrStrOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 2 ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isPointerTy())
+ return 0;
+
+ // fold strstr(x, x) -> x.
+ if (CI->getArgOperand(0) == CI->getArgOperand(1))
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+ if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+ Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
+ if (!StrLen)
+ return 0;
+ Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+ StrLen, B, TD, TLI);
+ if (!StrNCmp)
+ return 0;
+ for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
+ UI != UE; ) {
+ ICmpInst *Old = cast<ICmpInst>(*UI++);
+ Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
+ ConstantInt::getNullValue(StrNCmp->getType()),
+ "cmp");
+ LCS->replaceAllUsesWith(Old, Cmp);
+ }
+ return CI;
+ }
+
+ // See if either input string is a constant string.
+ StringRef SearchStr, ToFindStr;
+ bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+ bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+ // fold strstr(x, "") -> x.
+ if (HasStr2 && ToFindStr.empty())
+ return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+ // If both strings are known, constant fold it.
+ if (HasStr1 && HasStr2) {
+ std::string::size_type Offset = SearchStr.find(ToFindStr);
+
+ if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+ return Constant::getNullValue(CI->getType());
+
+ // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+ Value *Result = CastToCStr(CI->getArgOperand(0), B);
+ Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+ return B.CreateBitCast(Result, CI->getType());
+ }
+
+ // fold strstr(x, "y") -> strchr(x, 'y').
+ if (HasStr2 && ToFindStr.size() == 1) {
+ Value *StrChr = EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
+ return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
+ }
+ return 0;
+ }
+};
+
+struct MemCmpOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ !FT->getReturnType()->isIntegerTy(32))
+ return 0;
+
+ Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+
+ if (LHS == RHS) // memcmp(s,s,x) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // Make sure we have a constant length.
+ ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+ if (!LenC) return 0;
+ uint64_t Len = LenC->getZExtValue();
+
+ if (Len == 0) // memcmp(s1,s2,0) -> 0
+ return Constant::getNullValue(CI->getType());
+
+ // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+ if (Len == 1) {
+ Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+ CI->getType(), "lhsv");
+ Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+ CI->getType(), "rhsv");
+ return B.CreateSub(LHSV, RHSV, "chardiff");
+ }
+
+ // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+ StringRef LHSStr, RHSStr;
+ if (getConstantStringInfo(LHS, LHSStr) &&
+ getConstantStringInfo(RHS, RHSStr)) {
+ // Make sure we're not reading out-of-bounds memory.
+ if (Len > LHSStr.size() || Len > RHSStr.size())
+ return 0;
+ uint64_t Ret = memcmp(LHSStr.data(), RHSStr.data(), Len);
+ return ConstantInt::get(CI->getType(), Ret);
+ }
+
+ return 0;
+ }
+};
+
+struct MemCpyOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+ B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+struct MemMoveOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isPointerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+ B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+ CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
+struct MemSetOpt : public LibCallOptimization {
+ virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+ // These optimizations require DataLayout.
+ if (!TD) return 0;
+
+ FunctionType *FT = Callee->getFunctionType();
+ if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+ !FT->getParamType(0)->isPointerTy() ||
+ !FT->getParamType(1)->isIntegerTy() ||
+ FT->getParamType(2) != TD->getIntPtrType(*Context))
+ return 0;
+
+ // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+ Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+ B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+ return CI->getArgOperand(0);
+ }
+};
+
} // End anonymous namespace.
namespace llvm {
@@ -808,6 +1023,7 @@ namespace llvm {
class LibCallSimplifierImpl {
const DataLayout *TD;
const TargetLibraryInfo *TLI;
+ const LibCallSimplifier *LCS;
StringMap<LibCallOptimization*> Optimizations;
// Fortified library call optimizations.
@@ -818,7 +1034,7 @@ class LibCallSimplifierImpl {
StpCpyChkOpt StpCpyChk;
StrNCpyChkOpt StrNCpyChk;
- // String and memory library call optimizations.
+ // String library call optimizations.
StrCatOpt StrCat;
StrNCatOpt StrNCat;
StrChrOpt StrChr;
@@ -832,12 +1048,23 @@ class LibCallSimplifierImpl {
StrPBrkOpt StrPBrk;
StrToOpt StrTo;
StrSpnOpt StrSpn;
+ StrCSpnOpt StrCSpn;
+ StrStrOpt StrStr;
+
+ // Memory library call optimizations.
+ MemCmpOpt MemCmp;
+ MemCpyOpt MemCpy;
+ MemMoveOpt MemMove;
+ MemSetOpt MemSet;
void initOptimizations();
+ void addOpt(LibFunc::Func F, LibCallOptimization* Opt);
public:
- LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI) {
+ LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI,
+ const LibCallSimplifier *LCS) {
this->TD = TD;
this->TLI = TLI;
+ this->LCS = LCS;
}
Value *optimizeCall(CallInst *CI);
@@ -853,26 +1080,34 @@ void LibCallSimplifierImpl::initOptimizations() {
Optimizations["__strncpy_chk"] = &StrNCpyChk;
Optimizations["__stpncpy_chk"] = &StrNCpyChk;
- // String and memory library call optimizations.
- Optimizations["strcat"] = &StrCat;
- Optimizations["strncat"] = &StrNCat;
- Optimizations["strchr"] = &StrChr;
- Optimizations["strrchr"] = &StrRChr;
- Optimizations["strcmp"] = &StrCmp;
- Optimizations["strncmp"] = &StrNCmp;
- Optimizations["strcpy"] = &StrCpy;
- Optimizations["stpcpy"] = &StpCpy;
- Optimizations["strncpy"] = &StrNCpy;
- Optimizations["strlen"] = &StrLen;
- Optimizations["strpbrk"] = &StrPBrk;
- Optimizations["strtol"] = &StrTo;
- Optimizations["strtod"] = &StrTo;
- Optimizations["strtof"] = &StrTo;
- Optimizations["strtoul"] = &StrTo;
- Optimizations["strtoll"] = &StrTo;
- Optimizations["strtold"] = &StrTo;
- Optimizations["strtoull"] = &StrTo;
- Optimizations["strspn"] = &StrSpn;
+ // String library call optimizations.
+ addOpt(LibFunc::strcat, &StrCat);
+ addOpt(LibFunc::strncat, &StrNCat);
+ addOpt(LibFunc::strchr, &StrChr);
+ addOpt(LibFunc::strrchr, &StrRChr);
+ addOpt(LibFunc::strcmp, &StrCmp);
+ addOpt(LibFunc::strncmp, &StrNCmp);
+ addOpt(LibFunc::strcpy, &StrCpy);
+ addOpt(LibFunc::stpcpy, &StpCpy);
+ addOpt(LibFunc::strncpy, &StrNCpy);
+ addOpt(LibFunc::strlen, &StrLen);
+ addOpt(LibFunc::strpbrk, &StrPBrk);
+ addOpt(LibFunc::strtol, &StrTo);
+ addOpt(LibFunc::strtod, &StrTo);
+ addOpt(LibFunc::strtof, &StrTo);
+ addOpt(LibFunc::strtoul, &StrTo);
+ addOpt(LibFunc::strtoll, &StrTo);
+ addOpt(LibFunc::strtold, &StrTo);
+ addOpt(LibFunc::strtoull, &StrTo);
+ addOpt(LibFunc::strspn, &StrSpn);
+ addOpt(LibFunc::strcspn, &StrCSpn);
+ addOpt(LibFunc::strstr, &StrStr);
+
+ // Memory library call optimizations.
+ addOpt(LibFunc::memcmp, &MemCmp);
+ addOpt(LibFunc::memcpy, &MemCpy);
+ addOpt(LibFunc::memmove, &MemMove);
+ addOpt(LibFunc::memset, &MemSet);
}
Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
@@ -883,14 +1118,19 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
if (LCO) {
IRBuilder<> Builder(CI);
- return LCO->optimizeCall(CI, TD, TLI, Builder);
+ return LCO->optimizeCall(CI, TD, TLI, LCS, Builder);
}
return 0;
}
+void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) {
+ if (TLI->has(F))
+ Optimizations[TLI->getName(F)] = Opt;
+}
+
LibCallSimplifier::LibCallSimplifier(const DataLayout *TD,
const TargetLibraryInfo *TLI) {
- Impl = new LibCallSimplifierImpl(TD, TLI);
+ Impl = new LibCallSimplifierImpl(TD, TLI, this);
}
LibCallSimplifier::~LibCallSimplifier() {
@@ -901,4 +1141,9 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return Impl->optimizeCall(CI);
}
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
+ I->replaceAllUsesWith(With);
+ I->eraseFromParent();
+}
+
}
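The StrStrOpt added above leans on a plain libc identity: strstr(a, b) == a exactly when b is a prefix of a, i.e. strncmp(a, b, strlen(b)) == 0. A small standalone check of that identity (illustration only, not part of the patch):

#include <cassert>
#include <cstring>

int main() {
  const char *A = "abcdef", *B = "abc", *C = "bcd";
  // strstr(a, b) == a  <=>  strncmp(a, b, strlen(b)) == 0
  assert((std::strstr(A, B) == A) == (std::strncmp(A, B, std::strlen(B)) == 0));
  assert((std::strstr(A, C) == A) == (std::strncmp(A, C, std::strlen(C)) == 0));
  return 0;
}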
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 892808760f..a7ef248e6e 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -78,6 +78,10 @@ VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
/// We don't vectorize loops with a known constant trip count below this number.
const unsigned TinyTripCountThreshold = 16;
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 2;
+
namespace {
// Forward declarations.
@@ -114,7 +118,7 @@ public:
/// Widen each instruction in the old loop to a new one in the new loop.
/// Use the Legality module to find the induction and reduction variables.
vectorizeLoop(Legal);
- // register the new loop.
+ // Register the new loop and update the analysis passes.
updateAnalysis();
}
@@ -123,7 +127,8 @@ private:
void createEmptyLoop(LoopVectorizationLegality *Legal);
/// Copy and widen the instructions from the old loop.
void vectorizeLoop(LoopVectorizationLegality *Legal);
- /// Insert the new loop to the loop hierarchy and pass manager.
+ /// Insert the new loop into the loop hierarchy and pass manager,
+ /// and update the analysis passes.
void updateAnalysis();
/// This instruction is un-vectorizable. Implement it as a sequence
@@ -242,6 +247,15 @@ public:
ReductionKind Kind;
};
+ // This POD struct holds information about the runtime memory legality
+ // check, which verifies that a group of pointers do not overlap.
+ struct RuntimePointerCheck {
+ /// This flag indicates if we need to add the runtime check.
+ bool Need;
+ /// Holds the pointers that we need to check.
+ SmallVector<Value*, 2> Pointers;
+ };
+
/// ReductionList contains the reduction descriptors for all
/// of the reductions that were found in the loop.
typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
@@ -263,9 +277,14 @@ public:
/// This check allows us to vectorize A[idx] into a wide load/store.
bool isConsecutiveGep(Value *Ptr);
+ /// Returns true if the value V is uniform within the loop.
+ bool isUniform(Value *V);
+
/// Returns true if this instruction will remain scalar after vectorization.
bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+ /// Returns the information that we collected about the runtime memory check.
+ RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; }
private:
/// Check if a single basic block loop is vectorizable.
/// At this point we know that this is a loop with a constant trip count
@@ -286,6 +305,8 @@ private:
bool isReductionInstr(Instruction *I, ReductionKind Kind);
/// Returns True, if 'Phi' is an induction variable.
bool isInductionVariable(PHINode *Phi);
+ /// Return true if we can compute the address bounds of Ptr within the loop.
+ bool hasComputableBounds(Value *Ptr);
/// The loop that we evaluate.
Loop *TheLoop;
@@ -306,6 +327,9 @@ private:
/// This set holds the variables which are known to be uniform after
/// vectorization.
SmallPtrSet<Instruction*, 4> Uniforms;
+ /// We need to check that all of the pointers in this list are disjoint
+ /// at runtime.
+ RuntimePointerCheck PtrRtCheck;
};
/// LoopVectorizationCostModel - estimates the expected speedups due to
@@ -506,6 +530,10 @@ bool LoopVectorizationLegality::isConsecutiveGep(Value *Ptr) {
return false;
}
+bool LoopVectorizationLegality::isUniform(Value *V) {
+ return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
+}
+
Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
assert(!V->getType()->isVectorTy() && "Can't widen a vector");
// If we saved a vectorized copy of V, use it.
@@ -631,13 +659,29 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
...
*/
+ OldInduction = Legal->getInduction();
+ assert(OldInduction && "We must have a single phi node.");
+ Type *IdxTy = OldInduction->getType();
+
+ // Find the loop boundaries.
+ const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
+
+ // Get the total trip count from the exit count by adding 1.
+ ExitCount = SE->getAddExpr(ExitCount,
+ SE->getConstant(ExitCount->getType(), 1));
+ // We may need to extend the index in case there is a type mismatch.
+ // We know that the count starts at zero and does not overflow.
+ // We are using Zext because it should be less expensive.
+ if (ExitCount->getType() != IdxTy)
+ ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
+
// This is the original scalar-loop preheader.
BasicBlock *BypassBlock = OrigLoop->getLoopPreheader();
BasicBlock *ExitBlock = OrigLoop->getExitBlock();
assert(ExitBlock && "Must have an exit block");
// The loop index does not have to start at Zero. It starts with this value.
- OldInduction = Legal->getInduction();
Value *StartIdx = OldInduction->getIncomingValueForBlock(BypassBlock);
assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
@@ -655,8 +699,6 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
"scalar.preheader");
// Find the induction variable.
BasicBlock *OldBasicBlock = OrigLoop->getHeader();
- assert(OldInduction && "We must have a single phi node.");
- Type *IdxTy = OldInduction->getType();
// Use this IR builder to create the loop instructions (Phi, Br, Cmp)
// inside the loop.
@@ -666,25 +708,11 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
Induction = Builder.CreatePHI(IdxTy, 2, "index");
Constant *Step = ConstantInt::get(IdxTy, VF);
- // Find the loop boundaries.
- const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
- assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
-
- // Get the total trip count from the count by adding 1.
- ExitCount = SE->getAddExpr(ExitCount,
- SE->getConstant(ExitCount->getType(), 1));
-
// Expand the trip count and place the new instructions in the preheader.
// Notice that the pre-header does not change, only the loop body.
SCEVExpander Exp(*SE, "induction");
Instruction *Loc = BypassBlock->getTerminator();
- // We may need to extend the index in case there is a type mismatch.
- // We know that the count starts at zero and does not overflow.
- // We are using Zext because it should be less expensive.
- if (ExitCount->getType() != Induction->getType())
- ExitCount = SE->getZeroExtendExpr(ExitCount, IdxTy);
-
// Count holds the overall loop count (N).
Value *Count = Exp.expandCodeFor(ExitCount, Induction->getType(), Loc);
@@ -704,15 +732,85 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
IdxEndRoundDown,
StartIdx,
"cmp.zero", Loc);
+
+ LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
+ Legal->getRuntimePointerCheck();
+ Value *MemoryRuntimeCheck = 0;
+ if (PtrRtCheck->Need) {
+ unsigned NumPointers = PtrRtCheck->Pointers.size();
+ SmallVector<Value* , 2> Starts;
+ SmallVector<Value* , 2> Ends;
+
+ // Use this type for pointer arithmetic.
+ Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+
+ for (unsigned i=0; i < NumPointers; ++i) {
+ Value *Ptr = PtrRtCheck->Pointers[i];
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ if (SE->isLoopInvariant(Sc, OrigLoop)) {
+ DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
+ *Ptr << "\n");
+ Starts.push_back(Ptr);
+ Ends.push_back(Ptr);
+ } else {
+ DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+ Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
+ const SCEV *Ex = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+ const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+ assert(!isa<SCEVCouldNotCompute>(ScEnd) && "Invalid scev range.");
+ Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
+ Starts.push_back(Start);
+ Ends.push_back(End);
+ }
+ }
+
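+ // Compare every pair of pointer ranges; any overlap is OR-ed into the check.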
+ for (unsigned i=0; i < NumPointers; ++i) {
+ for (unsigned j=i+1; j < NumPointers; ++j) {
+ Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[i], Ends[j], "bound0", Loc);
+ Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
+ Starts[j], Ends[i], "bound1", Loc);
+ Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
+ "found.conflict", Loc);
+ if (MemoryRuntimeCheck) {
+ MemoryRuntimeCheck = BinaryOperator::Create(Instruction::Or,
+ MemoryRuntimeCheck,
+ IsConflict,
+ "conflict.rdx", Loc);
+ } else {
+ MemoryRuntimeCheck = IsConflict;
+ }
+ }
+ }
+ } // end of need-runtime-check code.
+
+ // If we are using memory runtime checks, fold them into the bypass condition.
+ if (MemoryRuntimeCheck) {
+ Cmp = BinaryOperator::Create(Instruction::Or, Cmp, MemoryRuntimeCheck,
+ "CntOrMem", Loc);
+ }
+
BranchInst::Create(MiddleBlock, VectorPH, Cmp, Loc);
// Remove the old terminator.
Loc->eraseFromParent();
+ // We are going to resume the execution of the scalar loop.
+ // This PHI decides which index the scalar loop resumes at. If we come
+ // from the vector loop then we need to start at the end index minus
+ // the index modulo VF. If we come from a bypass edge then we need to
+ // start from the real start index.
+ PHINode* ResumeIndex = PHINode::Create(IdxTy, 2, "resume.idx",
+ MiddleBlock->getTerminator());
+ ResumeIndex->addIncoming(StartIdx, BypassBlock);
+ ResumeIndex->addIncoming(IdxEndRoundDown, VecBody);
+
// Add a check in the middle block to see if we have completed
// all of the iterations in the first vector loop.
// If (N - N%VF) == N, then we *don't* need to run the remainder.
Value *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, IdxEnd,
- IdxEndRoundDown, "cmp.n",
+ ResumeIndex, "cmp.n",
MiddleBlock->getTerminator());
BranchInst::Create(ExitBlock, ScalarPH, CmpN, MiddleBlock->getTerminator());
@@ -732,7 +830,7 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
// Fix the scalar body iteration count.
unsigned BlockIdx = OldInduction->getBasicBlockIndex(ScalarPH);
- OldInduction->setIncomingValue(BlockIdx, IdxEndRoundDown);
+ OldInduction->setIncomingValue(BlockIdx, ResumeIndex);
// Get ready to start creating new instructions into the vectorized body.
Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
@@ -905,7 +1003,12 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
Value *Ptr = SI->getPointerOperand();
unsigned Alignment = SI->getAlignment();
+
+ assert(!Legal->isUniform(Ptr) &&
+ "We do not allow storing to uniform addresses");
+
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
// This store does not use GEPs.
if (!Legal->isConsecutiveGep(Gep)) {
scalarizeInstruction(Inst);
@@ -935,8 +1038,9 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
unsigned Alignment = LI->getAlignment();
GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
- // We don't have a gep. Scalarize the load.
- if (!Legal->isConsecutiveGep(Gep)) {
+ // If we don't have a GEP, or if the pointer is loop invariant,
+ // scalarize the load.
+ if (!Gep || Legal->isUniform(Gep) || !Legal->isConsecutiveGep(Gep)) {
scalarizeInstruction(Inst);
break;
}
@@ -1146,12 +1250,6 @@ bool LoopVectorizationLegality::canVectorize() {
BasicBlock *BB = TheLoop->getHeader();
DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
- // Go over each instruction and look at memory deps.
- if (!canVectorizeBlock(*BB)) {
- DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
- return false;
- }
-
// ScalarEvolution needs to be able to find the exit count.
const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
if (ExitCount == SE->getCouldNotCompute()) {
@@ -1167,7 +1265,15 @@ bool LoopVectorizationLegality::canVectorize() {
return false;
}
- DEBUG(dbgs() << "LV: We can vectorize this loop!\n");
+ // Go over each instruction and look at memory deps.
+ if (!canVectorizeBlock(*BB)) {
+ DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+ return false;
+ }
+
+ DEBUG(dbgs() << "LV: We can vectorize this loop" <<
+ (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
+ <<"!\n");
// Okay! We can vectorize. At this point we don't have any other mem analysis
// which may limit our maximum vectorization factor, so just return true with
@@ -1304,6 +1410,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
// Holds the Load and Store *instructions*.
ValueVector Loads;
ValueVector Stores;
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
// Scan the BB and collect legal loads and stores.
for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
@@ -1361,6 +1469,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
StoreInst *ST = dyn_cast<StoreInst>(*I);
assert(ST && "Bad StoreInst");
Value* Ptr = ST->getPointerOperand();
+
+ if (isUniform(Ptr)) {
+ DEBUG(dbgs() << "LV: We don't allow storing to uniform addresses\n");
+ return false;
+ }
+
// If we did *not* see this pointer before, insert it to
// the read-write list. At this phase it is only a 'write' list.
if (Seen.insert(Ptr))
@@ -1390,6 +1504,39 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
return true;
}
+ // Find pointers with computable bounds. We are going to use this information
+ // to place a runtime bound check.
+ bool RT = true;
+ for (I = ReadWrites.begin(), IE = ReadWrites.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.Pointers.push_back(*I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+ for (I = Reads.begin(), IE = Reads.end(); I != IE; ++I)
+ if (hasComputableBounds(*I)) {
+ PtrRtCheck.Pointers.push_back(*I);
+ DEBUG(dbgs() << "LV: Found a runtime check ptr:" << **I <<"\n");
+ } else {
+ RT = false;
+ break;
+ }
+
+ // Check that we did not collect too many pointers or run into a
+ // pointer whose bounds we cannot compute.
+ if (!RT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) {
+ PtrRtCheck.Pointers.clear();
+ RT = false;
+ }
+
+ PtrRtCheck.Need = RT;
+
+ if (RT) {
+ DEBUG(dbgs() << "LV: We can perform a memory runtime check if needed.\n");
+ }
+
// Now that the pointers are in two lists (Reads and ReadWrites), we
// can check that there are no conflicts between each of the writes and
// between the writes to the reads.
@@ -1404,12 +1551,12 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified write ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (!WriteObjects.insert(*it)) {
DEBUG(dbgs() << "LV: Found a possible write-write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
@@ -1422,18 +1569,21 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
it != e; ++it) {
if (!isIdentifiedObject(*it)) {
DEBUG(dbgs() << "LV: Found an unidentified read ptr:"<< **it <<"\n");
- return false;
+ return RT;
}
if (WriteObjects.count(*it)) {
DEBUG(dbgs() << "LV: Found a possible read/write reorder:"
<< **it <<"\n");
- return false;
+ return RT;
}
}
TempObjects.clear();
}
- // All is okay.
+ // It is safe to vectorize and we don't need any runtime checks.
+ DEBUG(dbgs() << "LV: We don't need a runtime memory check.\n");
+ PtrRtCheck.Pointers.clear();
+ PtrRtCheck.Need = false;
return true;
}
@@ -1556,6 +1706,15 @@ bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
return true;
}
+bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
+ const SCEV *PhiScev = SE->getSCEV(Ptr);
+ const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
+ if (!AR)
+ return false;
+
+ return AR->isAffine();
+}
+
unsigned
LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
if (!VTTI) {
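For reference, each emitted bound0/bound1 pair computes the standard interval-overlap test: two accessed ranges conflict iff each one starts no later than the other ends. A minimal sketch, assuming Start/End behave as unsigned addresses (matching the ICMP_ULE compares above); with RuntimeMemoryCheckThreshold at 2 the quadratic pairing degenerates to a single such test:

#include <cstdint>

// True iff [StartA, EndA] and [StartB, EndB] intersect.
static bool rangesConflict(std::uintptr_t StartA, std::uintptr_t EndA,
                           std::uintptr_t StartB, std::uintptr_t EndB) {
  return StartA <= EndB && StartB <= EndA;
}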
diff --git a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
index 99db63713d..36d15757c3 100644
--- a/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
+++ b/test/CodeGen/ARM/2010-10-19-mc-elf-objheader.ll
@@ -13,12 +13,12 @@
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x0000003c
-; BASIC-NEXT: 0x00000020
+; BASIC-NEXT: 0x00000022
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x00000000
; BASIC-NEXT: 0x00000001
; BASIC-NEXT: 0x00000000
-; BASIC-NEXT: '411f0000 00616561 62690001 15000000 06020801 09011401 15011703 18011901'
+; BASIC-NEXT: '41210000 00616561 62690001 17000000 060a0741 08010902 14011501 17031801 1901'
; CORTEXA8: .ARM.attributes
; CORTEXA8-NEXT: 0x70000003
diff --git a/test/CodeGen/ARM/call-noret-minsize.ll b/test/CodeGen/ARM/call-noret-minsize.ll
index 35490ac69b..df3c19eca6 100644
--- a/test/CodeGen/ARM/call-noret-minsize.ll
+++ b/test/CodeGen/ARM/call-noret-minsize.ll
@@ -1,6 +1,5 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM
; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=T2
; rdar://12348580
define void @t1() noreturn minsize nounwind ssp {
@@ -10,9 +9,6 @@ entry:
; SWIFT: t1:
; SWIFT: bl _bar
-
-; T2: t1:
-; T2: blx _bar
tail call void @bar() noreturn nounwind
unreachable
}
@@ -24,9 +20,6 @@ entry:
; SWIFT: t2:
; SWIFT: bl _t1
-
-; T2: t2:
-; T2: bl _t1
tail call void @t1() noreturn nounwind
unreachable
}
diff --git a/test/CodeGen/ARM/call-noret.ll b/test/CodeGen/ARM/call-noret.ll
index d294f2cf1a..27062dca38 100644
--- a/test/CodeGen/ARM/call-noret.ll
+++ b/test/CodeGen/ARM/call-noret.ll
@@ -1,6 +1,5 @@
; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=ARM
; RUN: llc < %s -mtriple=armv7-apple-ios -mcpu=swift | FileCheck %s -check-prefix=SWIFT
-; RUN: llc < %s -mtriple=thumbv7-apple-ios -mcpu=cortex-a8 | FileCheck %s -check-prefix=T2
; rdar://8979299
define void @t1() noreturn nounwind ssp {
@@ -12,9 +11,6 @@ entry:
; SWIFT: t1:
; SWIFT: mov lr, pc
; SWIFT: b _bar
-
-; T2: t1:
-; T2: blx _bar
tail call void @bar() noreturn nounwind
unreachable
}
@@ -28,10 +24,6 @@ entry:
; SWIFT: t2:
; SWIFT: mov lr, pc
; SWIFT: b _t1
-
-; T2: t2:
-; T2: mov lr, pc
-; T2: b.w _t1
tail call void @t1() noreturn nounwind
unreachable
}
diff --git a/test/CodeGen/NVPTX/param-align.ll b/test/CodeGen/NVPTX/param-align.ll
new file mode 100644
index 0000000000..84ccb650d4
--- /dev/null
+++ b/test/CodeGen/NVPTX/param-align.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+
+;;; Need 4-byte alignment on float* passed byval
+define ptx_device void @t1(float* byval %x) {
+; CHECK: .func t1
+; CHECK: .param .align 4 .b8 t1_param_0[4]
+ ret void
+}
+
+
+;;; Need 8-byte alignment on double* passed byval
+define ptx_device void @t2(double* byval %x) {
+; CHECK: .func t2
+; CHECK: .param .align 8 .b8 t2_param_0[8]
+ ret void
+}
+
+
+;;; Need 4-byte alignment on float2* passed byval
+%struct.float2 = type { float, float }
+define ptx_device void @t3(%struct.float2* byval %x) {
+; CHECK: .func t3
+; CHECK: .param .align 4 .b8 t3_param_0[8]
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/ptx-version-30.ll b/test/CodeGen/NVPTX/ptx-version-30.ll
new file mode 100644
index 0000000000..0422b01f4e
--- /dev/null
+++ b/test/CodeGen/NVPTX/ptx-version-30.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx30 | FileCheck %s
+
+
+; CHECK: .version 3.0
+
diff --git a/test/CodeGen/NVPTX/ptx-version-31.ll b/test/CodeGen/NVPTX/ptx-version-31.ll
new file mode 100644
index 0000000000..d6e57301a3
--- /dev/null
+++ b/test/CodeGen/NVPTX/ptx-version-31.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -mattr=ptx31 | FileCheck %s
+
+
+; CHECK: .version 3.1
+
diff --git a/test/CodeGen/NVPTX/sm-version-10.ll b/test/CodeGen/NVPTX/sm-version-10.ll
new file mode 100644
index 0000000000..9324a37809
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-10.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_10 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_10 | FileCheck %s
+
+
+; CHECK: .target sm_10
+
diff --git a/test/CodeGen/NVPTX/sm-version-11.ll b/test/CodeGen/NVPTX/sm-version-11.ll
new file mode 100644
index 0000000000..9033a4eba5
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-11.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_11 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_11 | FileCheck %s
+
+
+; CHECK: .target sm_11
+
diff --git a/test/CodeGen/NVPTX/sm-version-12.ll b/test/CodeGen/NVPTX/sm-version-12.ll
new file mode 100644
index 0000000000..d8ee85c901
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-12.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_12 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_12 | FileCheck %s
+
+
+; CHECK: .target sm_12
+
diff --git a/test/CodeGen/NVPTX/sm-version-13.ll b/test/CodeGen/NVPTX/sm-version-13.ll
new file mode 100644
index 0000000000..ad67d642ce
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-13.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_13 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_13 | FileCheck %s
+
+
+; CHECK: .target sm_13
+
diff --git a/test/CodeGen/NVPTX/sm-version-20.ll b/test/CodeGen/NVPTX/sm-version-20.ll
new file mode 100644
index 0000000000..c21f49e6ae
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-20.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
+
+
+; CHECK: .target sm_20
+
diff --git a/test/CodeGen/NVPTX/sm-version-21.ll b/test/CodeGen/NVPTX/sm-version-21.ll
new file mode 100644
index 0000000000..4fb6de3e63
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-21.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_21 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_21 | FileCheck %s
+
+
+; CHECK: .target sm_21
+
diff --git a/test/CodeGen/NVPTX/sm-version-30.ll b/test/CodeGen/NVPTX/sm-version-30.ll
new file mode 100644
index 0000000000..692b49a0d6
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-30.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_30 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
+
+
+; CHECK: .target sm_30
+
diff --git a/test/CodeGen/NVPTX/sm-version-35.ll b/test/CodeGen/NVPTX/sm-version-35.ll
new file mode 100644
index 0000000000..25368a0133
--- /dev/null
+++ b/test/CodeGen/NVPTX/sm-version-35.ll
@@ -0,0 +1,6 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_35 | FileCheck %s
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
+
+
+; CHECK: .target sm_35
+
diff --git a/test/CodeGen/PowerPC/misched.ll b/test/CodeGen/PowerPC/misched.ll
new file mode 100644
index 0000000000..d6fb3b3046
--- /dev/null
+++ b/test/CodeGen/PowerPC/misched.ll
@@ -0,0 +1,45 @@
+; RUN: llc < %s -enable-misched -verify-machineinstrs
+; PR14302
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-bgq-linux"
+
+@b = external global [16000 x double], align 32
+
+define void @pr14302() nounwind {
+entry:
+ tail call void @putchar() nounwind
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ br i1 undef, label %for.body, label %for.body24.i
+
+for.body24.i: ; preds = %for.body24.i, %for.body
+ store double 1.000000e+00, double* undef, align 8
+ br i1 undef, label %for.body24.i58, label %for.body24.i
+
+for.body24.i58: ; preds = %for.body24.i58, %for.body24.i
+ %arrayidx26.i55.1 = getelementptr inbounds [16000 x double]* @b, i64 0, i64 undef
+ store double 1.000000e+00, double* %arrayidx26.i55.1, align 8
+ br i1 undef, label %for.body24.i64, label %for.body24.i58
+
+for.body24.i64: ; preds = %for.body24.i64, %for.body24.i58
+ %exitcond.2489 = icmp eq i32 0, 16000
+ br i1 %exitcond.2489, label %for.body24.i70, label %for.body24.i64
+
+for.body24.i70: ; preds = %for.body24.i70, %for.body24.i64
+ br i1 undef, label %for.body24.i76, label %for.body24.i70
+
+for.body24.i76: ; preds = %for.body24.i76, %for.body24.i70
+ br i1 undef, label %set1d.exit77, label %for.body24.i76
+
+set1d.exit77: ; preds = %for.body24.i76
+ br label %for.body29
+
+for.body29: ; preds = %for.body29, %set1d.exit77
+ br i1 undef, label %for.end35, label %for.body29
+
+for.end35: ; preds = %for.body29
+ ret void
+}
+
+declare void @putchar()
diff --git a/test/CodeGen/Thumb/thumb_jump24_fixup.ll b/test/CodeGen/Thumb/thumb_jump24_fixup.ll
deleted file mode 100644
index e6a6b25ca1..0000000000
--- a/test/CodeGen/Thumb/thumb_jump24_fixup.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc -mtriple thumbv7-none-linux-gnueabi -mcpu=cortex-a8 -march=thumb -mattr=thumb2 -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
-target triple = "thumbv7-none-linux-gnueabi"
-
-define i32 @test_fixup_t2_uncondbranch() {
-b0:
- invoke void @__cxa_throw(i8* null, i8* null, i8* null) noreturn
- to label %unreachable unwind label %lpad
-
-; CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 __cxa_throw
-
-lpad:
- %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) cleanup
- ret i32 0
-
-unreachable:
- unreachable
-}
-
-declare i32 @__gxx_personality_v0(...)
-
-declare void @__cxa_throw(i8*, i8*, i8*)
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index c44beb4bc2..88ecd5a5d3 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1140,9 +1140,9 @@ declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) noun
define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
- ; CHECK: movl
- ; CHECK: movl
- ; CHECK: vpcmpestri
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestri $7
; CHECK: movl
%res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
ret i32 %res
@@ -1150,6 +1150,18 @@ define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a2
+ %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) {
; CHECK: movl
; CHECK: movl
@@ -1216,8 +1228,19 @@ define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
+define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
+ ; CHECK: movl $7
+ ; CHECK: movl $7
+ ; CHECK: vpcmpestrm $7,
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a2
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcmpistri
+ ; CHECK: vpcmpistri $7
; CHECK: movl
%res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
ret i32 %res
@@ -1225,6 +1248,16 @@ define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
+ ; CHECK: vpcmpistri $7, (
+ ; CHECK: movl
+ %1 = load <16 x i8>* %a0
+ %2 = load <16 x i8>* %a1
+ %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
+ ret i32 %res
+}
+
+
define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
; CHECK: vpcmpistri
; CHECK: seta
@@ -1271,7 +1304,7 @@ declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind rea
define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
- ; CHECK: vpcmpistrm
+ ; CHECK: vpcmpistrm $7
; CHECK-NOT: vmov
%res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
ret <16 x i8> %res
@@ -1279,6 +1312,15 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
+define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
+ ; CHECK: vpcmpistrm $7, (
+ ; CHECK-NOT: vmov
+ %1 = load <16 x i8>* %a1
+ %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
+ ret <16 x i8> %res
+}
+
+
define <4 x float> @test_x86_sse_add_ss(<4 x float> %a0, <4 x float> %a1) {
; CHECK: vaddss
%res = call <4 x float> @llvm.x86.sse.add.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
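The new *_load variants above check that the load is folded into the string-compare instruction's memory operand (the "$7, (" patterns) instead of going through a separate register move. A rough source-level equivalent of what they exercise (illustrative only; build with SSE4.2 enabled):

#include <immintrin.h>

int cmpistri_from_memory(const __m128i *a, const __m128i *b) {
  // The immediate (7, as in the tests) must be a compile-time constant.
  return _mm_cmpistri(*a, *b, 7);
}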
diff --git a/test/Instrumentation/ThreadSanitizer/atomic.ll b/test/Instrumentation/ThreadSanitizer/atomic.ll
index ed3c821205..107dbdc0f2 100644
--- a/test/Instrumentation/ThreadSanitizer/atomic.ll
+++ b/test/Instrumentation/ThreadSanitizer/atomic.ll
@@ -8,7 +8,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_unordered
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 100501)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0)
define i8 @atomic8_load_monotonic(i8* %a) nounwind uwtable {
entry:
@@ -16,7 +16,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_monotonic
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 100501)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 0)
define i8 @atomic8_load_acquire(i8* %a) nounwind uwtable {
entry:
@@ -24,7 +24,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_acquire
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 100504)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 2)
define i8 @atomic8_load_seq_cst(i8* %a) nounwind uwtable {
entry:
@@ -32,7 +32,7 @@ entry:
ret i8 %0
}
; CHECK: atomic8_load_seq_cst
-; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 100532)
+; CHECK: call i8 @__tsan_atomic8_load(i8* %a, i32 5)
define void @atomic8_store_unordered(i8* %a) nounwind uwtable {
entry:
@@ -40,7 +40,7 @@ entry:
ret void
}
; CHECK: atomic8_store_unordered
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 100501)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0)
define void @atomic8_store_monotonic(i8* %a) nounwind uwtable {
entry:
@@ -48,7 +48,7 @@ entry:
ret void
}
; CHECK: atomic8_store_monotonic
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 100501)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 0)
define void @atomic8_store_release(i8* %a) nounwind uwtable {
entry:
@@ -56,7 +56,7 @@ entry:
ret void
}
; CHECK: atomic8_store_release
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 100508)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 3)
define void @atomic8_store_seq_cst(i8* %a) nounwind uwtable {
entry:
@@ -64,7 +64,287 @@ entry:
ret void
}
; CHECK: atomic8_store_seq_cst
-; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 100532)
+; CHECK: call void @__tsan_atomic8_store(i8* %a, i8 0, i32 5)
+
+define void @atomic8_xchg_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_xchg_monotonic
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 0)
+
+define void @atomic8_add_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_add_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 0)
+
+define void @atomic8_sub_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_sub_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 0)
+
+define void @atomic8_and_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_and_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 0)
+
+define void @atomic8_or_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_or_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 0)
+
+define void @atomic8_xor_monotonic(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 monotonic
+ ret void
+}
+; CHECK: atomic8_xor_monotonic
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 0)
+
+define void @atomic8_xchg_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_xchg_acquire
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 2)
+
+define void @atomic8_add_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_add_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 2)
+
+define void @atomic8_sub_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_sub_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 2)
+
+define void @atomic8_and_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_and_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 2)
+
+define void @atomic8_or_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_or_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 2)
+
+define void @atomic8_xor_acquire(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 acquire
+ ret void
+}
+; CHECK: atomic8_xor_acquire
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 2)
+
+define void @atomic8_xchg_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_xchg_release
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 3)
+
+define void @atomic8_add_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_add_release
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 3)
+
+define void @atomic8_sub_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_sub_release
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 3)
+
+define void @atomic8_and_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_and_release
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 3)
+
+define void @atomic8_or_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_or_release
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 3)
+
+define void @atomic8_xor_release(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 release
+ ret void
+}
+; CHECK: atomic8_xor_release
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 3)
+
+define void @atomic8_xchg_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_xchg_acq_rel
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 4)
+
+define void @atomic8_add_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_add_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 4)
+
+define void @atomic8_sub_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_sub_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 4)
+
+define void @atomic8_and_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_and_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 4)
+
+define void @atomic8_or_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_or_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 4)
+
+define void @atomic8_xor_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 acq_rel
+ ret void
+}
+; CHECK: atomic8_xor_acq_rel
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 4)
+
+define void @atomic8_xchg_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_xchg_seq_cst
+; CHECK: call i8 @__tsan_atomic8_exchange(i8* %a, i8 0, i32 5)
+
+define void @atomic8_add_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw add i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_add_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_add(i8* %a, i8 0, i32 5)
+
+define void @atomic8_sub_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_sub_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_sub(i8* %a, i8 0, i32 5)
+
+define void @atomic8_and_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw and i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_and_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_and(i8* %a, i8 0, i32 5)
+
+define void @atomic8_or_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw or i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_or_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_or(i8* %a, i8 0, i32 5)
+
+define void @atomic8_xor_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i8* %a, i8 0 seq_cst
+ ret void
+}
+; CHECK: atomic8_xor_seq_cst
+; CHECK: call i8 @__tsan_atomic8_fetch_xor(i8* %a, i8 0, i32 5)
+
+define void @atomic8_cas_monotonic(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 monotonic
+ ret void
+}
+; CHECK: atomic8_cas_monotonic
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 0)
+
+define void @atomic8_cas_acquire(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 acquire
+ ret void
+}
+; CHECK: atomic8_cas_acquire
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 2)
+
+define void @atomic8_cas_release(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 release
+ ret void
+}
+; CHECK: atomic8_cas_release
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 3)
+
+define void @atomic8_cas_acq_rel(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 acq_rel
+ ret void
+}
+; CHECK: atomic8_cas_acq_rel
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 4)
+
+define void @atomic8_cas_seq_cst(i8* %a) nounwind uwtable {
+entry:
+ cmpxchg i8* %a, i8 0, i8 1 seq_cst
+ ret void
+}
+; CHECK: atomic8_cas_seq_cst
+; CHECK: call i8 @__tsan_atomic8_compare_exchange_val(i8* %a, i8 0, i8 1, i32 5)
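+
+; Note: the trailing i32 argument in these __tsan_* checks encodes the memory
+; order. The values line up with the C11/GCC __ATOMIC_* numbering:
+; 0 = relaxed (used for both monotonic and unordered here), 2 = acquire,
+; 3 = release, 4 = acq_rel, 5 = seq_cst; 1 (consume) does not appear.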
define i16 @atomic16_load_unordered(i16* %a) nounwind uwtable {
entry:
@@ -72,7 +352,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_unordered
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 100501)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0)
define i16 @atomic16_load_monotonic(i16* %a) nounwind uwtable {
entry:
@@ -80,7 +360,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_monotonic
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 100501)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 0)
define i16 @atomic16_load_acquire(i16* %a) nounwind uwtable {
entry:
@@ -88,7 +368,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_acquire
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 100504)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 2)
define i16 @atomic16_load_seq_cst(i16* %a) nounwind uwtable {
entry:
@@ -96,7 +376,7 @@ entry:
ret i16 %0
}
; CHECK: atomic16_load_seq_cst
-; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 100532)
+; CHECK: call i16 @__tsan_atomic16_load(i16* %a, i32 5)
define void @atomic16_store_unordered(i16* %a) nounwind uwtable {
entry:
@@ -104,7 +384,7 @@ entry:
ret void
}
; CHECK: atomic16_store_unordered
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 100501)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0)
define void @atomic16_store_monotonic(i16* %a) nounwind uwtable {
entry:
@@ -112,7 +392,7 @@ entry:
ret void
}
; CHECK: atomic16_store_monotonic
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 100501)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 0)
define void @atomic16_store_release(i16* %a) nounwind uwtable {
entry:
@@ -120,7 +400,7 @@ entry:
ret void
}
; CHECK: atomic16_store_release
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 100508)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 3)
define void @atomic16_store_seq_cst(i16* %a) nounwind uwtable {
entry:
@@ -128,7 +408,287 @@ entry:
ret void
}
; CHECK: atomic16_store_seq_cst
-; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 100532)
+; CHECK: call void @__tsan_atomic16_store(i16* %a, i16 0, i32 5)
+
+define void @atomic16_xchg_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_xchg_monotonic
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 0)
+
+define void @atomic16_add_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_add_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 0)
+
+define void @atomic16_sub_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_sub_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 0)
+
+define void @atomic16_and_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_and_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 0)
+
+define void @atomic16_or_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_or_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 0)
+
+define void @atomic16_xor_monotonic(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 monotonic
+ ret void
+}
+; CHECK: atomic16_xor_monotonic
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 0)
+
+define void @atomic16_xchg_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_xchg_acquire
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 2)
+
+define void @atomic16_add_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_add_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 2)
+
+define void @atomic16_sub_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_sub_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 2)
+
+define void @atomic16_and_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_and_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 2)
+
+define void @atomic16_or_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_or_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 2)
+
+define void @atomic16_xor_acquire(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 acquire
+ ret void
+}
+; CHECK: atomic16_xor_acquire
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 2)
+
+define void @atomic16_xchg_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_xchg_release
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 3)
+
+define void @atomic16_add_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_add_release
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 3)
+
+define void @atomic16_sub_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_sub_release
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 3)
+
+define void @atomic16_and_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_and_release
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 3)
+
+define void @atomic16_or_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_or_release
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 3)
+
+define void @atomic16_xor_release(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 release
+ ret void
+}
+; CHECK: atomic16_xor_release
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 3)
+
+define void @atomic16_xchg_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_xchg_acq_rel
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 4)
+
+define void @atomic16_add_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_add_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 4)
+
+define void @atomic16_sub_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_sub_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 4)
+
+define void @atomic16_and_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_and_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 4)
+
+define void @atomic16_or_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_or_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 4)
+
+define void @atomic16_xor_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 acq_rel
+ ret void
+}
+; CHECK: atomic16_xor_acq_rel
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 4)
+
+define void @atomic16_xchg_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_xchg_seq_cst
+; CHECK: call i16 @__tsan_atomic16_exchange(i16* %a, i16 0, i32 5)
+
+define void @atomic16_add_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw add i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_add_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_add(i16* %a, i16 0, i32 5)
+
+define void @atomic16_sub_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_sub_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_sub(i16* %a, i16 0, i32 5)
+
+define void @atomic16_and_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw and i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_and_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_and(i16* %a, i16 0, i32 5)
+
+define void @atomic16_or_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw or i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_or_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_or(i16* %a, i16 0, i32 5)
+
+define void @atomic16_xor_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i16* %a, i16 0 seq_cst
+ ret void
+}
+; CHECK: atomic16_xor_seq_cst
+; CHECK: call i16 @__tsan_atomic16_fetch_xor(i16* %a, i16 0, i32 5)
+
+define void @atomic16_cas_monotonic(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 monotonic
+ ret void
+}
+; CHECK: atomic16_cas_monotonic
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 0)
+
+define void @atomic16_cas_acquire(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 acquire
+ ret void
+}
+; CHECK: atomic16_cas_acquire
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 2)
+
+define void @atomic16_cas_release(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 release
+ ret void
+}
+; CHECK: atomic16_cas_release
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 3)
+
+define void @atomic16_cas_acq_rel(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 acq_rel
+ ret void
+}
+; CHECK: atomic16_cas_acq_rel
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 4)
+
+define void @atomic16_cas_seq_cst(i16* %a) nounwind uwtable {
+entry:
+ cmpxchg i16* %a, i16 0, i16 1 seq_cst
+ ret void
+}
+; CHECK: atomic16_cas_seq_cst
+; CHECK: call i16 @__tsan_atomic16_compare_exchange_val(i16* %a, i16 0, i16 1, i32 5)
define i32 @atomic32_load_unordered(i32* %a) nounwind uwtable {
entry:
@@ -136,7 +696,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_unordered
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 100501)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0)
define i32 @atomic32_load_monotonic(i32* %a) nounwind uwtable {
entry:
@@ -144,7 +704,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_monotonic
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 100501)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 0)
define i32 @atomic32_load_acquire(i32* %a) nounwind uwtable {
entry:
@@ -152,7 +712,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_acquire
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 100504)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 2)
define i32 @atomic32_load_seq_cst(i32* %a) nounwind uwtable {
entry:
@@ -160,7 +720,7 @@ entry:
ret i32 %0
}
; CHECK: atomic32_load_seq_cst
-; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 100532)
+; CHECK: call i32 @__tsan_atomic32_load(i32* %a, i32 5)
define void @atomic32_store_unordered(i32* %a) nounwind uwtable {
entry:
@@ -168,7 +728,7 @@ entry:
ret void
}
; CHECK: atomic32_store_unordered
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 100501)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0)
define void @atomic32_store_monotonic(i32* %a) nounwind uwtable {
entry:
@@ -176,7 +736,7 @@ entry:
ret void
}
; CHECK: atomic32_store_monotonic
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 100501)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 0)
define void @atomic32_store_release(i32* %a) nounwind uwtable {
entry:
@@ -184,7 +744,7 @@ entry:
ret void
}
; CHECK: atomic32_store_release
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 100508)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 3)
define void @atomic32_store_seq_cst(i32* %a) nounwind uwtable {
entry:
@@ -192,7 +752,287 @@ entry:
ret void
}
; CHECK: atomic32_store_seq_cst
-; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 100532)
+; CHECK: call void @__tsan_atomic32_store(i32* %a, i32 0, i32 5)
+
+define void @atomic32_xchg_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_xchg_monotonic
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 0)
+
+define void @atomic32_add_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_add_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 0)
+
+define void @atomic32_sub_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_sub_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 0)
+
+define void @atomic32_and_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_and_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 0)
+
+define void @atomic32_or_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_or_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 0)
+
+define void @atomic32_xor_monotonic(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 monotonic
+ ret void
+}
+; CHECK: atomic32_xor_monotonic
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 0)
+
+define void @atomic32_xchg_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_xchg_acquire
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 2)
+
+define void @atomic32_add_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_add_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 2)
+
+define void @atomic32_sub_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_sub_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 2)
+
+define void @atomic32_and_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_and_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 2)
+
+define void @atomic32_or_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_or_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 2)
+
+define void @atomic32_xor_acquire(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 acquire
+ ret void
+}
+; CHECK: atomic32_xor_acquire
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 2)
+
+define void @atomic32_xchg_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_xchg_release
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 3)
+
+define void @atomic32_add_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_add_release
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 3)
+
+define void @atomic32_sub_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_sub_release
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 3)
+
+define void @atomic32_and_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_and_release
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 3)
+
+define void @atomic32_or_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_or_release
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 3)
+
+define void @atomic32_xor_release(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 release
+ ret void
+}
+; CHECK: atomic32_xor_release
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 3)
+
+define void @atomic32_xchg_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_xchg_acq_rel
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 4)
+
+define void @atomic32_add_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_add_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 4)
+
+define void @atomic32_sub_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_sub_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 4)
+
+define void @atomic32_and_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_and_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 4)
+
+define void @atomic32_or_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_or_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 4)
+
+define void @atomic32_xor_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 acq_rel
+ ret void
+}
+; CHECK: atomic32_xor_acq_rel
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 4)
+
+define void @atomic32_xchg_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_xchg_seq_cst
+; CHECK: call i32 @__tsan_atomic32_exchange(i32* %a, i32 0, i32 5)
+
+define void @atomic32_add_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw add i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_add_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_add(i32* %a, i32 0, i32 5)
+
+define void @atomic32_sub_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_sub_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_sub(i32* %a, i32 0, i32 5)
+
+define void @atomic32_and_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw and i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_and_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_and(i32* %a, i32 0, i32 5)
+
+define void @atomic32_or_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw or i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_or_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_or(i32* %a, i32 0, i32 5)
+
+define void @atomic32_xor_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i32* %a, i32 0 seq_cst
+ ret void
+}
+; CHECK: atomic32_xor_seq_cst
+; CHECK: call i32 @__tsan_atomic32_fetch_xor(i32* %a, i32 0, i32 5)
+
+define void @atomic32_cas_monotonic(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 monotonic
+ ret void
+}
+; CHECK: atomic32_cas_monotonic
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 0)
+
+define void @atomic32_cas_acquire(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 acquire
+ ret void
+}
+; CHECK: atomic32_cas_acquire
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 2)
+
+define void @atomic32_cas_release(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 release
+ ret void
+}
+; CHECK: atomic32_cas_release
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 3)
+
+define void @atomic32_cas_acq_rel(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 acq_rel
+ ret void
+}
+; CHECK: atomic32_cas_acq_rel
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 4)
+
+define void @atomic32_cas_seq_cst(i32* %a) nounwind uwtable {
+entry:
+ cmpxchg i32* %a, i32 0, i32 1 seq_cst
+ ret void
+}
+; CHECK: atomic32_cas_seq_cst
+; CHECK: call i32 @__tsan_atomic32_compare_exchange_val(i32* %a, i32 0, i32 1, i32 5)
define i64 @atomic64_load_unordered(i64* %a) nounwind uwtable {
entry:
@@ -200,7 +1040,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_unordered
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 100501)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0)
define i64 @atomic64_load_monotonic(i64* %a) nounwind uwtable {
entry:
@@ -208,7 +1048,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_monotonic
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 100501)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 0)
define i64 @atomic64_load_acquire(i64* %a) nounwind uwtable {
entry:
@@ -216,7 +1056,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_acquire
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 100504)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 2)
define i64 @atomic64_load_seq_cst(i64* %a) nounwind uwtable {
entry:
@@ -224,7 +1064,7 @@ entry:
ret i64 %0
}
; CHECK: atomic64_load_seq_cst
-; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 100532)
+; CHECK: call i64 @__tsan_atomic64_load(i64* %a, i32 5)
define void @atomic64_store_unordered(i64* %a) nounwind uwtable {
entry:
@@ -232,7 +1072,7 @@ entry:
ret void
}
; CHECK: atomic64_store_unordered
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 100501)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0)
define void @atomic64_store_monotonic(i64* %a) nounwind uwtable {
entry:
@@ -240,7 +1080,7 @@ entry:
ret void
}
; CHECK: atomic64_store_monotonic
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 100501)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 0)
define void @atomic64_store_release(i64* %a) nounwind uwtable {
entry:
@@ -248,7 +1088,7 @@ entry:
ret void
}
; CHECK: atomic64_store_release
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 100508)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 3)
define void @atomic64_store_seq_cst(i64* %a) nounwind uwtable {
entry:
@@ -256,7 +1096,287 @@ entry:
ret void
}
; CHECK: atomic64_store_seq_cst
-; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 100532)
+; CHECK: call void @__tsan_atomic64_store(i64* %a, i64 0, i32 5)
+
+define void @atomic64_xchg_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_xchg_monotonic
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 0)
+
+define void @atomic64_add_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_add_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 0)
+
+define void @atomic64_sub_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_sub_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 0)
+
+define void @atomic64_and_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_and_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 0)
+
+define void @atomic64_or_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_or_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 0)
+
+define void @atomic64_xor_monotonic(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 monotonic
+ ret void
+}
+; CHECK: atomic64_xor_monotonic
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 0)
+
+define void @atomic64_xchg_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_xchg_acquire
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 2)
+
+define void @atomic64_add_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_add_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 2)
+
+define void @atomic64_sub_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_sub_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 2)
+
+define void @atomic64_and_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_and_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 2)
+
+define void @atomic64_or_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_or_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 2)
+
+define void @atomic64_xor_acquire(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 acquire
+ ret void
+}
+; CHECK: atomic64_xor_acquire
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 2)
+
+define void @atomic64_xchg_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_xchg_release
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 3)
+
+define void @atomic64_add_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_add_release
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 3)
+
+define void @atomic64_sub_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_sub_release
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 3)
+
+define void @atomic64_and_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_and_release
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 3)
+
+define void @atomic64_or_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_or_release
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 3)
+
+define void @atomic64_xor_release(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 release
+ ret void
+}
+; CHECK: atomic64_xor_release
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 3)
+
+define void @atomic64_xchg_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_xchg_acq_rel
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 4)
+
+define void @atomic64_add_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_add_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 4)
+
+define void @atomic64_sub_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_sub_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 4)
+
+define void @atomic64_and_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_and_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 4)
+
+define void @atomic64_or_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_or_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 4)
+
+define void @atomic64_xor_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 acq_rel
+ ret void
+}
+; CHECK: atomic64_xor_acq_rel
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 4)
+
+define void @atomic64_xchg_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_xchg_seq_cst
+; CHECK: call i64 @__tsan_atomic64_exchange(i64* %a, i64 0, i32 5)
+
+define void @atomic64_add_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw add i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_add_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_add(i64* %a, i64 0, i32 5)
+
+define void @atomic64_sub_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_sub_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_sub(i64* %a, i64 0, i32 5)
+
+define void @atomic64_and_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw and i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_and_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_and(i64* %a, i64 0, i32 5)
+
+define void @atomic64_or_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw or i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_or_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_or(i64* %a, i64 0, i32 5)
+
+define void @atomic64_xor_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i64* %a, i64 0 seq_cst
+ ret void
+}
+; CHECK: atomic64_xor_seq_cst
+; CHECK: call i64 @__tsan_atomic64_fetch_xor(i64* %a, i64 0, i32 5)
+
+define void @atomic64_cas_monotonic(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 monotonic
+ ret void
+}
+; CHECK: atomic64_cas_monotonic
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 0)
+
+define void @atomic64_cas_acquire(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 acquire
+ ret void
+}
+; CHECK: atomic64_cas_acquire
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 2)
+
+define void @atomic64_cas_release(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 release
+ ret void
+}
+; CHECK: atomic64_cas_release
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 3)
+
+define void @atomic64_cas_acq_rel(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 acq_rel
+ ret void
+}
+; CHECK: atomic64_cas_acq_rel
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 4)
+
+define void @atomic64_cas_seq_cst(i64* %a) nounwind uwtable {
+entry:
+ cmpxchg i64* %a, i64 0, i64 1 seq_cst
+ ret void
+}
+; CHECK: atomic64_cas_seq_cst
+; CHECK: call i64 @__tsan_atomic64_compare_exchange_val(i64* %a, i64 0, i64 1, i32 5)
define i128 @atomic128_load_unordered(i128* %a) nounwind uwtable {
entry:
@@ -264,7 +1384,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_unordered
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 100501)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0)
define i128 @atomic128_load_monotonic(i128* %a) nounwind uwtable {
entry:
@@ -272,7 +1392,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_monotonic
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 100501)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 0)
define i128 @atomic128_load_acquire(i128* %a) nounwind uwtable {
entry:
@@ -280,7 +1400,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_acquire
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 100504)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 2)
define i128 @atomic128_load_seq_cst(i128* %a) nounwind uwtable {
entry:
@@ -288,7 +1408,7 @@ entry:
ret i128 %0
}
; CHECK: atomic128_load_seq_cst
-; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 100532)
+; CHECK: call i128 @__tsan_atomic128_load(i128* %a, i32 5)
define void @atomic128_store_unordered(i128* %a) nounwind uwtable {
entry:
@@ -296,7 +1416,7 @@ entry:
ret void
}
; CHECK: atomic128_store_unordered
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 100501)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0)
define void @atomic128_store_monotonic(i128* %a) nounwind uwtable {
entry:
@@ -304,7 +1424,7 @@ entry:
ret void
}
; CHECK: atomic128_store_monotonic
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 100501)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 0)
define void @atomic128_store_release(i128* %a) nounwind uwtable {
entry:
@@ -312,7 +1432,7 @@ entry:
ret void
}
; CHECK: atomic128_store_release
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 100508)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 3)
define void @atomic128_store_seq_cst(i128* %a) nounwind uwtable {
entry:
@@ -320,4 +1440,348 @@ entry:
ret void
}
; CHECK: atomic128_store_seq_cst
-; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 100532)
+; CHECK: call void @__tsan_atomic128_store(i128* %a, i128 0, i32 5)
+
+define void @atomic128_xchg_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_xchg_monotonic
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 0)
+
+define void @atomic128_add_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_add_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 0)
+
+define void @atomic128_sub_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_sub_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 0)
+
+define void @atomic128_and_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_and_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 0)
+
+define void @atomic128_or_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_or_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 0)
+
+define void @atomic128_xor_monotonic(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 monotonic
+ ret void
+}
+; CHECK: atomic128_xor_monotonic
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 0)
+
+define void @atomic128_xchg_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_xchg_acquire
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 2)
+
+define void @atomic128_add_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_add_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 2)
+
+define void @atomic128_sub_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_sub_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 2)
+
+define void @atomic128_and_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_and_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 2)
+
+define void @atomic128_or_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_or_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 2)
+
+define void @atomic128_xor_acquire(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 acquire
+ ret void
+}
+; CHECK: atomic128_xor_acquire
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 2)
+
+define void @atomic128_xchg_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_xchg_release
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 3)
+
+define void @atomic128_add_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_add_release
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 3)
+
+define void @atomic128_sub_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_sub_release
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 3)
+
+define void @atomic128_and_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_and_release
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 3)
+
+define void @atomic128_or_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_or_release
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 3)
+
+define void @atomic128_xor_release(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 release
+ ret void
+}
+; CHECK: atomic128_xor_release
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 3)
+
+define void @atomic128_xchg_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_xchg_acq_rel
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 4)
+
+define void @atomic128_add_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_add_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 4)
+
+define void @atomic128_sub_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_sub_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 4)
+
+define void @atomic128_and_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_and_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 4)
+
+define void @atomic128_or_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_or_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 4)
+
+define void @atomic128_xor_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 acq_rel
+ ret void
+}
+; CHECK: atomic128_xor_acq_rel
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 4)
+
+define void @atomic128_xchg_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xchg i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_xchg_seq_cst
+; CHECK: call i128 @__tsan_atomic128_exchange(i128* %a, i128 0, i32 5)
+
+define void @atomic128_add_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw add i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_add_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_add(i128* %a, i128 0, i32 5)
+
+define void @atomic128_sub_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw sub i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_sub_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_sub(i128* %a, i128 0, i32 5)
+
+define void @atomic128_and_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw and i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_and_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_and(i128* %a, i128 0, i32 5)
+
+define void @atomic128_or_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw or i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_or_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_or(i128* %a, i128 0, i32 5)
+
+define void @atomic128_xor_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ atomicrmw xor i128* %a, i128 0 seq_cst
+ ret void
+}
+; CHECK: atomic128_xor_seq_cst
+; CHECK: call i128 @__tsan_atomic128_fetch_xor(i128* %a, i128 0, i32 5)
+
+define void @atomic128_cas_monotonic(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 monotonic
+ ret void
+}
+; CHECK: atomic128_cas_monotonic
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 0)
+
+define void @atomic128_cas_acquire(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 acquire
+ ret void
+}
+; CHECK: atomic128_cas_acquire
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 2)
+
+define void @atomic128_cas_release(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 release
+ ret void
+}
+; CHECK: atomic128_cas_release
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 3)
+
+define void @atomic128_cas_acq_rel(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 acq_rel
+ ret void
+}
+; CHECK: atomic128_cas_acq_rel
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 4)
+
+define void @atomic128_cas_seq_cst(i128* %a) nounwind uwtable {
+entry:
+ cmpxchg i128* %a, i128 0, i128 1 seq_cst
+ ret void
+}
+; CHECK: atomic128_cas_seq_cst
+; CHECK: call i128 @__tsan_atomic128_compare_exchange_val(i128* %a, i128 0, i128 1, i32 5)
+
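+; The fence tests below cover the scope split: a fence marked singlethread
+; lowers to __tsan_atomic_signal_fence, any other fence to
+; __tsan_atomic_thread_fence, with the i32 argument reusing the same
+; memory-order encoding as the atomic operations above.
+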
+define void @atomic_signal_fence_acquire() nounwind uwtable {
+entry:
+ fence singlethread acquire
+ ret void
+}
+; CHECK: atomic_signal_fence_acquire
+; CHECK: call void @__tsan_atomic_signal_fence(i32 2)
+
+define void @atomic_thread_fence_acquire() nounwind uwtable {
+entry:
+ fence acquire
+ ret void
+}
+; CHECK: atomic_thread_fence_acquire
+; CHECK: call void @__tsan_atomic_thread_fence(i32 2)
+
+define void @atomic_signal_fence_release() nounwind uwtable {
+entry:
+ fence singlethread release
+ ret void
+}
+; CHECK: atomic_signal_fence_release
+; CHECK: call void @__tsan_atomic_signal_fence(i32 3)
+
+define void @atomic_thread_fence_release() nounwind uwtable {
+entry:
+ fence release
+ ret void
+}
+; CHECK: atomic_thread_fence_release
+; CHECK: call void @__tsan_atomic_thread_fence(i32 3)
+
+define void @atomic_signal_fence_acq_rel() nounwind uwtable {
+entry:
+ fence singlethread acq_rel
+ ret void
+}
+; CHECK: atomic_signal_fence_acq_rel
+; CHECK: call void @__tsan_atomic_signal_fence(i32 4)
+
+define void @atomic_thread_fence_acq_rel() nounwind uwtable {
+entry:
+ fence acq_rel
+ ret void
+}
+; CHECK: atomic_thread_fence_acq_rel
+; CHECK: call void @__tsan_atomic_thread_fence(i32 4)
+
+define void @atomic_signal_fence_seq_cst() nounwind uwtable {
+entry:
+ fence singlethread seq_cst
+ ret void
+}
+; CHECK: atomic_signal_fence_seq_cst
+; CHECK: call void @__tsan_atomic_signal_fence(i32 5)
+
+define void @atomic_thread_fence_seq_cst() nounwind uwtable {
+entry:
+ fence seq_cst
+ ret void
+}
+; CHECK: atomic_thread_fence_seq_cst
+; CHECK: call void @__tsan_atomic_thread_fence(i32 5)
diff --git a/test/MC/ARM/elf-jump24-fixup.s b/test/MC/ARM/elf-jump24-fixup.s
new file mode 100644
index 0000000000..75a4b869dc
--- /dev/null
+++ b/test/MC/ARM/elf-jump24-fixup.s
@@ -0,0 +1,9 @@
+@ RUN: llvm-mc %s -triple=thumbv7-linux-gnueabi -filetype=obj -o - | llvm-objdump -r - | FileCheck %s
+ .syntax unified
+ .text
+ .code 16
+ .thumb_func
+foo:
+ b.w bar
+
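+@ bar is not defined in this file, so the fixup on b.w cannot be resolved at
+@ assembly time and must be emitted as an R_ARM_THM_JUMP24 relocation.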
+@ CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 bar
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 74d80aa187..6794288a0e 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -1,17 +1,24 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
+; RUN: opt < %s -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
+; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE}
@g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 }
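+; For reference, assuming the inner {i32,i8} struct is padded out to 8 bytes,
+; the byte image of @g1 is:
+;   LE: ef be ad de ba .. .. .. be ba fe ca
+;   BE: de ad be ef ba .. .. .. ca fe ba be
+; The LE/BE checks below read this image back at various offsets and widths.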
@g2 = constant double 1.0
+; { 0x7B, 0x06B1BFF8 }
@g3 = constant {i64, i64} { i64 123, i64 112312312 }
; Simple load
define i32 @test1() {
%r = load i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0)
ret i32 %r
-; CHECK: @test1
-; CHECK: ret i32 -559038737
+
+; 0xDEADBEEF
+; LE: @test1
+; LE: ret i32 -559038737
+
+; 0xDEADBEEF
+; BE: @test1
+; BE: ret i32 -559038737
}
; PR3152
@@ -20,8 +27,13 @@ define i16 @test2() {
%r = load i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*)
ret i16 %r
-; CHECK: @test2
-; CHECK: ret i16 -16657
+; 0xBEEF
+; LE: @test2
+; LE: ret i16 -16657
+
+; 0xDEAD
+; BE: @test2
+; BE: ret i16 -8531
}
; Load of second 16 bits of 32-bit value.
@@ -29,16 +41,27 @@ define i16 @test3() {
%r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 1)
ret i16 %r
-; CHECK: @test3
-; CHECK: ret i16 -8531
+; 0xDEAD
+; LE: @test3
+; LE: ret i16 -8531
+
+; 0xBEEF
+; BE: @test3
+; BE: ret i16 -16657
}
; Load of 8 bit field + tail padding.
define i16 @test4() {
%r = load i16* getelementptr(i16* bitcast(i32* getelementptr ({{i32,i8},i32}* @g1, i32 0, i32 0, i32 0) to i16*), i32 2)
ret i16 %r
-; CHECK: @test4
-; CHECK: ret i16 186
+
+; 0x00BA
+; LE: @test4
+; LE: ret i16 186
+
+; 0xBA00
+; BE: @test4
+; BE: ret i16 -17920
}
; Load of double bits.
@@ -46,8 +69,13 @@ define i64 @test6() {
%r = load i64* bitcast(double* @g2 to i64*)
ret i64 %r
-; CHECK: @test6
-; CHECK: ret i64 4607182418800017408
+; 0x3FF_0000000000000
+; LE: @test6
+; LE: ret i64 4607182418800017408
+
+; 0x3FF_0000000000000
+; BE: @test6
+; BE: ret i64 4607182418800017408
}
; Load of double bits.
@@ -55,8 +83,13 @@ define i16 @test7() {
%r = load i16* bitcast(double* @g2 to i16*)
ret i16 %r
-; CHECK: @test7
-; CHECK: ret i16 0
+; 0x0000
+; LE: @test7
+; LE: ret i16 0
+
+; 0x3FF0
+; BE: @test7
+; BE: ret i16 16368
}
; Double load.
@@ -64,8 +97,11 @@ define double @test8() {
%r = load double* bitcast({{i32,i8},i32}* @g1 to double*)
ret double %r
-; CHECK: @test8
-; CHECK: ret double 0xBADEADBEEF
+; LE: @test8
+; LE: ret double 0xBADEADBEEF
+
+; BE: @test8
+; BE: ret double 0xDEADBEEFBA000000
}
@@ -74,8 +110,13 @@ define i128 @test9() {
%r = load i128* bitcast({i64, i64}* @g3 to i128*)
ret i128 %r
-; CHECK: @test9
-; CHECK: ret i128 2071796475790618158476296315
+; 0x00000000_06B1BFF8_00000000_0000007B
+; LE: @test9
+; LE: ret i128 2071796475790618158476296315
+
+; 0x00000000_0000007B_00000000_06B1BFF8
+; BE: @test9
+; BE: ret i128 2268949521066387161080
}
; vector load.
@@ -83,21 +124,30 @@ define <2 x i64> @test10() {
%r = load <2 x i64>* bitcast({i64, i64}* @g3 to <2 x i64>*)
ret <2 x i64> %r
-; CHECK: @test10
-; CHECK: ret <2 x i64> <i64 123, i64 112312312>
+; LE: @test10
+; LE: ret <2 x i64> <i64 123, i64 112312312>
+
+; BE: @test10
+; BE: ret <2 x i64> <i64 123, i64 112312312>
}
; PR5287
+; { 0xA1, 0x08 }
@g4 = internal constant { i8, i8 } { i8 -95, i8 8 }
define i16 @test11() nounwind {
entry:
%a = load i16* bitcast ({ i8, i8 }* @g4 to i16*)
ret i16 %a
-
-; CHECK: @test11
-; CHECK: ret i16 2209
+
+; 0x08A1
+; LE: @test11
+; LE: ret i16 2209
+
+; 0xA108
+; BE: @test11
+; BE: ret i16 -24312
}
@@ -107,8 +157,14 @@ entry:
define i16 @test12() {
%a = load i16* getelementptr inbounds ([3 x i16]* bitcast ([6 x i8]* @test12g to [3 x i16]*), i32 0, i64 1)
ret i16 %a
-; CHECK: @test12
-; CHECK: ret i16 98
+
+; 0x0062
+; LE: @test12
+; LE: ret i16 98
+
+; 0x6200
+; BE: @test12
+; BE: ret i16 25088
}
@@ -117,8 +173,12 @@ define i16 @test12() {
define i1 @test13() {
%A = load i1* bitcast (i8* @g5 to i1*)
ret i1 %A
-; CHECK: @test13
-; CHECK: ret i1 false
+
+; LE: @test13
+; LE: ret i1 false
+
+; BE: @test13
+; BE: ret i1 false
}
@g6 = constant [2 x i8*] [i8* inttoptr (i64 1 to i8*), i8* inttoptr (i64 2 to i8*)]
@@ -126,14 +186,22 @@ define i64 @test14() nounwind {
entry:
%tmp = load i64* bitcast ([2 x i8*]* @g6 to i64*)
ret i64 %tmp
-; CHECK: @test14
-; CHECK: ret i64 1
+
+; LE: @test14
+; LE: ret i64 1
+
+; BE: @test14
+; BE: ret i64 1
}
define i64 @test15() nounwind {
entry:
%tmp = load i64* bitcast (i8** getelementptr inbounds ([2 x i8*]* @g6, i32 0, i64 1) to i64*)
ret i64 %tmp
-; CHECK: @test15
-; CHECK: ret i64 2
+
+; LE: @test15
+; LE: ret i64 2
+
+; BE: @test15
+; BE: ret i64 2
}
diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll
index e764169126..72fa819d1c 100644
--- a/test/Transforms/GVN/rle.ll
+++ b/test/Transforms/GVN/rle.ll
@@ -1,7 +1,5 @@
-; RUN: opt < %s -basicaa -gvn -S -die | FileCheck %s
-
-; 32-bit little endian target.
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
+; RUN: opt < %s -default-data-layout="e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32" -basicaa -gvn -S -die | FileCheck %s
;; Trivial RLE test.
define i32 @test0(i32 %V, i32* %P) {
@@ -318,7 +316,7 @@ define i8 @coerce_offset_nonlocal0(i32* %P, i1 %cond) {
%P4 = getelementptr i8* %P3, i32 2
br i1 %cond, label %T, label %F
T:
- store i32 42, i32* %P
+ store i32 57005, i32* %P
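+ ; 57005 = 0xDEAD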
br label %Cont
F:
diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
index 73e5a6653e..18aab7f27e 100644
--- a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
+++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
@@ -1,12 +1,14 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -default-data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE
; PR13442
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128"
-
@test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
define i64 @foo() {
%ret = load i64* bitcast (i8* getelementptr (i8* bitcast ([4 x i32]* @test to i8*), i64 2) to i64*), align 1
ret i64 %ret
- ; CHECK: ret i64 844424930263040
+ ; 0x00030000_00020000 in [01 00/00 00 02 00 00 00 03 00/00 00 04 00 00 00]
+ ; LE: ret i64 844424930263040
+ ; 0x00010000_00020000 in [00 00/00 01 00 00 00 02 00 00/00 03 00 00 00 04]
+ ; BE: ret i64 281474976841728
}
diff --git a/test/Transforms/InstCombine/disable-simplify-libcalls.ll b/test/Transforms/InstCombine/disable-simplify-libcalls.ll
new file mode 100644
index 0000000000..d81e9ae5bd
--- /dev/null
+++ b/test/Transforms/InstCombine/disable-simplify-libcalls.ll
@@ -0,0 +1,236 @@
+; Test that -disable-simplify-libcalls is wired up correctly.
+;
+; RUN: opt < %s -instcombine -disable-simplify-libcalls -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
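+; With -disable-simplify-libcalls, every call below should reach the output
+; unchanged; without the flag most of them would be simplified (for example,
+; the strcmp of two distinct constant strings in @t10 would fold to a
+; constant).
+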
+@.str = constant [1 x i8] zeroinitializer, align 1
+@.str1 = constant [13 x i8] c"hello, world\00", align 1
+@.str2 = constant [4 x i8] c"foo\00", align 1
+@.str3 = constant [4 x i8] c"bar\00", align 1
+@.str4 = constant [6 x i8] c"123.4\00", align 1
+@.str5 = constant [5 x i8] c"1234\00", align 1
+@empty = constant [1 x i8] c"\00", align 1
+
+declare double @ceil(double)
+declare double @copysign(double, double)
+declare double @cos(double)
+declare double @fabs(double)
+declare double @floor(double)
+declare i8* @strcat(i8*, i8*)
+declare i8* @strncat(i8*, i8*, i32)
+declare i8* @strchr(i8*, i32)
+declare i8* @strrchr(i8*, i32)
+declare i32 @strcmp(i8*, i8*)
+declare i32 @strncmp(i8*, i8*, i64)
+declare i8* @strcpy(i8*, i8*)
+declare i8* @stpcpy(i8*, i8*)
+declare i8* @strncpy(i8*, i8*, i64)
+declare i64 @strlen(i8*)
+declare i8* @strpbrk(i8*, i8*)
+declare i64 @strspn(i8*, i8*)
+declare double @strtod(i8*, i8**)
+declare float @strtof(i8*, i8**)
+declare x86_fp80 @strtold(i8*, i8**)
+declare i64 @strtol(i8*, i8**, i32)
+declare i64 @strtoll(i8*, i8**, i32)
+declare i64 @strtoul(i8*, i8**, i32)
+declare i64 @strtoull(i8*, i8**, i32)
+declare i64 @strcspn(i8*, i8*)
+
+define double @t1(double %x) {
+; CHECK: @t1
+ %ret = call double @ceil(double %x)
+ ret double %ret
+; CHECK: call double @ceil
+}
+
+define double @t2(double %x, double %y) {
+; CHECK: @t2
+ %ret = call double @copysign(double %x, double %y)
+ ret double %ret
+; CHECK: call double @copysign
+}
+
+define double @t3(double %x) {
+; CHECK: @t3
+ %call = call double @cos(double %x)
+ ret double %call
+; CHECK: call double @cos
+}
+
+define double @t4(double %x) {
+; CHECK: @t4
+ %ret = call double @fabs(double %x)
+ ret double %ret
+; CHECK: call double @fabs
+}
+
+define double @t5(double %x) {
+; CHECK: @t5
+ %ret = call double @floor(double %x)
+ ret double %ret
+; CHECK: call double @floor
+}
+
+define i8* @t6(i8* %x) {
+; CHECK: @t6
+ %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ %ret = call i8* @strcat(i8* %x, i8* %empty)
+ ret i8* %ret
+; CHECK: call i8* @strcat
+}
+
+define i8* @t7(i8* %x) {
+; CHECK: @t7
+ %empty = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ %ret = call i8* @strncat(i8* %x, i8* %empty, i32 1)
+ ret i8* %ret
+; CHECK: call i8* @strncat
+}
+
+define i8* @t8() {
+; CHECK: @t8
+ %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0
+ %ret = call i8* @strchr(i8* %x, i32 119)
+ ret i8* %ret
+; CHECK: call i8* @strchr
+}
+
+define i8* @t9() {
+; CHECK: @t9
+ %x = getelementptr inbounds [13 x i8]* @.str1, i32 0, i32 0
+ %ret = call i8* @strrchr(i8* %x, i32 119)
+ ret i8* %ret
+; CHECK: call i8* @strrchr
+}
+
+define i32 @t10() {
+; CHECK: @t10
+ %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+ %ret = call i32 @strcmp(i8* %x, i8* %y)
+ ret i32 %ret
+; CHECK: call i32 @strcmp
+}
+
+define i32 @t11() {
+; CHECK: @t11
+ %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %y = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+ %ret = call i32 @strncmp(i8* %x, i8* %y, i64 3)
+ ret i32 %ret
+; CHECK: call i32 @strncmp
+}
+
+define i8* @t12(i8* %x) {
+; CHECK: @t12
+ %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i8* @strcpy(i8* %x, i8* %y)
+ ret i8* %ret
+; CHECK: call i8* @strcpy
+}
+
+define i8* @t13(i8* %x) {
+; CHECK: @t13
+ %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i8* @stpcpy(i8* %x, i8* %y)
+ ret i8* %ret
+; CHECK: call i8* @stpcpy
+}
+
+define i8* @t14(i8* %x) {
+; CHECK: @t14
+ %y = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i8* @strncpy(i8* %x, i8* %y, i64 3)
+ ret i8* %ret
+; CHECK: call i8* @strncpy
+}
+
+define i64 @t15() {
+; CHECK: @t15
+ %x = getelementptr inbounds [4 x i8]* @.str2, i32 0, i32 0
+ %ret = call i64 @strlen(i8* %x)
+ ret i64 %ret
+; CHECK: call i64 @strlen
+}
+
+define i8* @t16(i8* %x) {
+; CHECK: @t16
+ %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+ %ret = call i8* @strpbrk(i8* %x, i8* %y)
+ ret i8* %ret
+; CHECK: call i8* @strpbrk
+}
+
+define i64 @t17(i8* %x) {
+; CHECK: @t17
+ %y = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+ %ret = call i64 @strspn(i8* %x, i8* %y)
+ ret i64 %ret
+; CHECK: call i64 @strspn
+}
+
+define double @t18(i8** %y) {
+; CHECK: @t18
+ %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+ %ret = call double @strtod(i8* %x, i8** %y)
+ ret double %ret
+; CHECK: call double @strtod
+}
+
+define float @t19(i8** %y) {
+; CHECK: @t19
+ %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+ %ret = call float @strtof(i8* %x, i8** %y)
+ ret float %ret
+; CHECK: call float @strtof
+}
+
+define x86_fp80 @t20(i8** %y) {
+; CHECK: @t20
+ %x = getelementptr inbounds [6 x i8]* @.str4, i64 0, i64 0
+ %ret = call x86_fp80 @strtold(i8* %x, i8** %y)
+ ret x86_fp80 %ret
+; CHECK: call x86_fp80 @strtold
+}
+
+define i64 @t21(i8** %y) {
+; CHECK: @t21
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtol(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtol
+}
+
+define i64 @t22(i8** %y) {
+; CHECK: @t22
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtoll(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtoll
+}
+
+define i64 @t23(i8** %y) {
+; CHECK: @t23
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtoul(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtoul
+}
+
+define i64 @t24(i8** %y) {
+; CHECK: @t24
+ %x = getelementptr inbounds [5 x i8]* @.str5, i64 0, i64 0
+ %ret = call i64 @strtoull(i8* %x, i8** %y, i32 10)
+ ret i64 %ret
+; CHECK: call i64 @strtoull
+}
+
+define i64 @t25(i8* %y) {
+; CHECK: @t25
+ %x = getelementptr [1 x i8]* @empty, i32 0, i32 0
+ %ret = call i64 @strcspn(i8* %x, i8* %y)
+ ret i64 %ret
+; CHECK: call i64 @strcspn
+}
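
For context, a minimal C sketch (illustrative only; names hypothetical) of the source-level patterns these t1-t25 checks guard: each call below would normally be folded by the libcall simplifier, but must survive as a genuine call when simplification is disabled.

#include <string.h>

/* Illustrative only: strcat(x, "") normally folds to x and strlen of a
 * constant string folds to its length, yet both must remain real calls
 * when libcall simplification is switched off. */
char *keep_strcat(char *x) { return strcat(x, ""); }
size_t keep_strlen(void) { return strlen("foo"); }
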
diff --git a/test/Transforms/InstCombine/memcmp-1.ll b/test/Transforms/InstCombine/memcmp-1.ll
new file mode 100644
index 0000000000..4238c5f8fb
--- /dev/null
+++ b/test/Transforms/InstCombine/memcmp-1.ll
@@ -0,0 +1,72 @@
+; Test that the memcmp library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+@foo = constant [4 x i8] c"foo\00"
+@hel = constant [4 x i8] c"hel\00"
+@hello_u = constant [8 x i8] c"hello_u\00"
+
+declare i32 @memcmp(i8*, i8*, i32)
+
+; Check memcmp(mem, mem, size) -> 0.
+
+define i32 @test_simplify1(i8* %mem, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i32 @memcmp(i8* %mem, i8* %mem, i32 %size)
+ ret i32 %ret
+; CHECK: ret i32 0
+}
+
+; Check memcmp(mem1, mem2, 0) -> 0.
+
+define i32 @test_simplify2(i8* %mem1, i8* %mem2) {
+; CHECK: @test_simplify2
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 0)
+ ret i32 %ret
+; CHECK: ret i32 0
+}
+
+; Check memcmp(mem1, mem2, 1) -> *(unsigned char*)mem1 - *(unsigned char*)mem2.
+
+define i32 @test_simplify3(i8* %mem1, i8* %mem2) {
+; CHECK: @test_simplify3
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 1)
+; CHECK: [[LOAD1:%[a-z]+]] = load i8* %mem1, align 1
+; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
+; CHECK: [[LOAD2:%[a-z]+]] = load i8* %mem2, align 1
+; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
+; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+ ret i32 %ret
+; CHECK: ret i32 [[RET]]
+}
+
+; Check memcmp(mem1, mem2, size) -> a constant, where all arguments are constants.
+
+define i32 @test_simplify4() {
+; CHECK: @test_simplify4
+ %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+ %mem2 = getelementptr [8 x i8]* @hello_u, i32 0, i32 0
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+ ret i32 %ret
+; CHECK: ret i32 0
+}
+
+define i32 @test_simplify5() {
+; CHECK: @test_simplify5
+ %mem1 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+ %mem2 = getelementptr [4 x i8]* @foo, i32 0, i32 0
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+ ret i32 %ret
+; CHECK: ret i32 {{[0-9]+}}
+}
+
+define i32 @test_simplify6() {
+; CHECK: @test_simplify6
+ %mem1 = getelementptr [4 x i8]* @foo, i32 0, i32 0
+ %mem2 = getelementptr [4 x i8]* @hel, i32 0, i32 0
+ %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
+ ret i32 %ret
+; CHECK: ret i32 {{-[0-9]+}}
+}
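
As a rough C-level restatement of the folds exercised above (a sketch, not the pass's actual code): memcmp with identical pointers or a zero length is 0, and a length-1 compare reduces to an unsigned byte subtraction.

#include <string.h>

/* Sketch of the size-1 fold checked above:
 * memcmp(a, b, 1) == (int)*(unsigned char *)a - (int)*(unsigned char *)b. */
int memcmp1(const void *a, const void *b) {
    return (int)*(const unsigned char *)a - (int)*(const unsigned char *)b;
}
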
diff --git a/test/Transforms/InstCombine/memcmp-2.ll b/test/Transforms/InstCombine/memcmp-2.ll
new file mode 100644
index 0000000000..3796117bc2
--- /dev/null
+++ b/test/Transforms/InstCombine/memcmp-2.ll
@@ -0,0 +1,17 @@
+; Test that the memcmp library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i32* @memcmp(i8*, i8*, i32)
+
+; Check that memcmp functions with the wrong prototype aren't simplified.
+
+define i32* @test_no_simplify1(i8* %mem, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i32* @memcmp(i8* %mem, i8* %mem, i32 %size)
+; CHECK-NEXT: call i32* @memcmp
+ ret i32* %ret
+; CHECK-NEXT: ret i32* %ret
+}
diff --git a/test/Transforms/InstCombine/memcpy-1.ll b/test/Transforms/InstCombine/memcpy-1.ll
new file mode 100644
index 0000000000..65b79ad03d
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-1.ll
@@ -0,0 +1,17 @@
+; Test that the memcpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memcpy(i8*, i8*, i32)
+
+; Check memcpy(mem1, mem2, size) -> llvm.memcpy(mem1, mem2, size, 1).
+
+define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i8* @memcpy(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call void @llvm.memcpy
+ ret i8* %ret
+; CHECK: ret i8* %mem1
+}
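
The fold rewrites the libcall as the llvm.memcpy intrinsic, which has no return value, so the test also checks that the original result (the destination pointer, per the C contract) is re-materialized as %mem1. A hedged C analogue:

#include <stddef.h>

/* memcpy's C contract is to return its destination; the intrinsic form
 * carries no return value, so the fold re-uses the original pointer. */
void *memcpy_like(void *dst, const void *src, size_t n) {
    __builtin_memcpy(dst, src, n); /* assumption: GCC/Clang builtin lowering */
    return dst;                    /* mirrors "ret i8* %mem1" in the test above */
}
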
diff --git a/test/Transforms/InstCombine/memcpy-2.ll b/test/Transforms/InstCombine/memcpy-2.ll
new file mode 100644
index 0000000000..4a8a02018f
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-2.ll
@@ -0,0 +1,17 @@
+; Test that the memcpy library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memcpy(i8*, i8*, i32)
+
+; Check that memcpy functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i8 @memcpy(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call i8 @memcpy
+ ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/memmove-1.ll b/test/Transforms/InstCombine/memmove-1.ll
new file mode 100644
index 0000000000..53f2f116c7
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove-1.ll
@@ -0,0 +1,17 @@
+; Test that the memmove library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memmove(i8*, i8*, i32)
+
+; Check memmove(mem1, mem2, size) -> llvm.memmove(mem1, mem2, size, 1).
+
+define i8* @test_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i8* @memmove(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call void @llvm.memmove
+ ret i8* %ret
+; CHECK: ret i8* %mem1
+}
diff --git a/test/Transforms/InstCombine/memmove-2.ll b/test/Transforms/InstCombine/memmove-2.ll
new file mode 100644
index 0000000000..23887bce31
--- /dev/null
+++ b/test/Transforms/InstCombine/memmove-2.ll
@@ -0,0 +1,17 @@
+; Test that the memmove library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memmove(i8*, i8*, i32)
+
+; Check that memmove functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem1, i8* %mem2, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i8 @memmove(i8* %mem1, i8* %mem2, i32 %size)
+; CHECK: call i8 @memmove
+ ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/memset-1.ll b/test/Transforms/InstCombine/memset-1.ll
new file mode 100644
index 0000000000..48b433e137
--- /dev/null
+++ b/test/Transforms/InstCombine/memset-1.ll
@@ -0,0 +1,17 @@
+; Test that the memset library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8* @memset(i8*, i32, i32)
+
+; Check memset(mem, val, size) -> llvm.memset(mem, val, size, 1).
+
+define i8* @test_simplify1(i8* %mem, i32 %val, i32 %size) {
+; CHECK: @test_simplify1
+ %ret = call i8* @memset(i8* %mem, i32 %val, i32 %size)
+; CHECK: call void @llvm.memset
+ ret i8* %ret
+; CHECK: ret i8* %mem
+}
diff --git a/test/Transforms/InstCombine/memset-2.ll b/test/Transforms/InstCombine/memset-2.ll
new file mode 100644
index 0000000000..8a9033302d
--- /dev/null
+++ b/test/Transforms/InstCombine/memset-2.ll
@@ -0,0 +1,17 @@
+; Test that the memset library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+
+declare i8 @memset(i8*, i32, i32)
+
+; Check that memset functions with the wrong prototype aren't simplified.
+
+define i8 @test_no_simplify1(i8* %mem, i32 %val, i32 %size) {
+; CHECK: @test_no_simplify1
+ %ret = call i8 @memset(i8* %mem, i32 %val, i32 %size)
+; CHECK: call i8 @memset
+ ret i8 %ret
+; CHECK: ret i8 %ret
+}
diff --git a/test/Transforms/InstCombine/strcspn-1.ll b/test/Transforms/InstCombine/strcspn-1.ll
new file mode 100644
index 0000000000..60fad897b2
--- /dev/null
+++ b/test/Transforms/InstCombine/strcspn-1.ll
@@ -0,0 +1,57 @@
+; Test that the strcspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@abcba = constant [6 x i8] c"abcba\00"
+@abc = constant [4 x i8] c"abc\00"
+@null = constant [1 x i8] zeroinitializer
+
+declare i64 @strcspn(i8*, i8*)
+
+; Check strcspn(s, "") -> strlen(s).
+
+define i64 @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: [[VAR:%[a-z]+]] = call i64 @strlen(i8* %str)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 [[VAR]]
+}
+
+; Check strcspn("", s) -> 0.
+
+define i64 @test_simplify2(i8* %pat) {
+; CHECK: @test_simplify2
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check strcspn(s1, s2), where s1 and s2 are constants.
+
+define i64 @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr [6 x i8]* @abcba, i32 0, i32 0
+ %pat = getelementptr [4 x i8]* @abc, i32 0, i32 0
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 0
+}
+
+; Check cases that shouldn't be simplified.
+
+define i64 @test_no_simplify1(i8* %str, i8* %pat) {
+; CHECK: @test_no_simplify1
+
+ %ret = call i64 @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: %ret = call i64 @strcspn(i8* %str, i8* %pat)
+ ret i64 %ret
+; CHECK-NEXT: ret i64 %ret
+}
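
Restated in C (a sketch of the semantics, not the simplifier itself): strcspn(s, "") scans the whole string, so it equals strlen(s); strcspn("", s) is always 0; and with two constant strings the result folds to a constant (0 here, since 'a' already appears in "abc").

#include <string.h>

/* Naive reference strcspn: length of the initial segment of s containing
 * no character from reject. With reject == "" this degenerates to strlen. */
size_t strcspn_ref(const char *s, const char *reject) {
    size_t n = 0;
    while (s[n] && !strchr(reject, s[n]))
        n++;
    return n;
}
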
diff --git a/test/Transforms/InstCombine/strcspn-2.ll b/test/Transforms/InstCombine/strcspn-2.ll
new file mode 100644
index 0000000000..4e2393686c
--- /dev/null
+++ b/test/Transforms/InstCombine/strcspn-2.ll
@@ -0,0 +1,21 @@
+; Test that the strcspn library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@null = constant [1 x i8] zeroinitializer
+
+declare double @strcspn(i8*, i8*)
+
+; Check that strcspn functions with the wrong prototype aren't simplified.
+
+define double @test_no_simplify1(i8* %pat) {
+; CHECK: @test_no_simplify1
+ %str = getelementptr [1 x i8]* @null, i32 0, i32 0
+
+ %ret = call double @strcspn(i8* %str, i8* %pat)
+; CHECK-NEXT: call double @strcspn
+ ret double %ret
+; CHECK-NEXT: ret double %ret
+}
diff --git a/test/Transforms/InstCombine/strncmp-1.ll b/test/Transforms/InstCombine/strncmp-1.ll
index 48b26d1a5f..187c2fa50e 100644
--- a/test/Transforms/InstCombine/strncmp-1.ll
+++ b/test/Transforms/InstCombine/strncmp-1.ll
@@ -67,12 +67,14 @@ define i32 @test5() {
}
; strncmp(x,y,1) -> memcmp(x,y,1)
-; TODO: Once the memcmp simplifier gets moved into the instcombine pass
-; the following memcmp will be folded into two loads and a subtract.
define i32 @test6(i8* %str1, i8* %str2) {
; CHECK: @test6
-; CHECK: call i32 @memcmp
-; CHECK: ret i32 %memcmp
+; CHECK: [[LOAD1:%[a-z]+]] = load i8* %str1, align 1
+; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
+; CHECK: [[LOAD2:%[a-z]+]] = load i8* %str2, align 1
+; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
+; CHECK: [[RET:%[a-z]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+; CHECK: ret i32 [[RET]]
%temp1 = call i32 @strncmp(i8* %str1, i8* %str2, i32 1)
ret i32 %temp1
diff --git a/test/Transforms/InstCombine/strstr-1.ll b/test/Transforms/InstCombine/strstr-1.ll
new file mode 100644
index 0000000000..81f5271874
--- /dev/null
+++ b/test/Transforms/InstCombine/strstr-1.ll
@@ -0,0 +1,65 @@
+; Test that the strstr library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@.str = private constant [1 x i8] zeroinitializer
+@.str1 = private constant [2 x i8] c"a\00"
+@.str2 = private constant [6 x i8] c"abcde\00"
+@.str3 = private constant [4 x i8] c"bcd\00"
+
+declare i8* @strstr(i8*, i8*)
+
+; Check strstr(str, "") -> str.
+
+define i8* @test_simplify1(i8* %str) {
+; CHECK: @test_simplify1
+ %pat = getelementptr inbounds [1 x i8]* @.str, i32 0, i32 0
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* %str
+}
+
+; Check strstr(str, "a") -> strchr(str, 'a').
+
+define i8* @test_simplify2(i8* %str) {
+; CHECK: @test_simplify2
+ %pat = getelementptr inbounds [2 x i8]* @.str1, i32 0, i32 0
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: @strchr(i8* %str, i32 97)
+}
+
+; Check strstr("abcde", "bcd") -> "abcde" + 1.
+
+define i8* @test_simplify3() {
+; CHECK: @test_simplify3
+ %str = getelementptr inbounds [6 x i8]* @.str2, i32 0, i32 0
+ %pat = getelementptr inbounds [4 x i8]* @.str3, i32 0, i32 0
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ ret i8* %ret
+; CHECK-NEXT: getelementptr inbounds ([6 x i8]* @.str2, i64 0, i64 1)
+}
+
+; Check strstr(str, str) -> str.
+
+define i8* @test_simplify4(i8* %str) {
+; CHECK: @test_simplify4
+ %ret = call i8* @strstr(i8* %str, i8* %str)
+ ret i8* %ret
+; CHECK-NEXT: ret i8* %str
+}
+
+; Check strstr(str, pat) == str -> strncmp(str, pat, strlen(pat)) == 0.
+
+define i1 @test_simplify5(i8* %str, i8* %pat) {
+; CHECK: @test_simplify5
+ %ret = call i8* @strstr(i8* %str, i8* %pat)
+ %cmp = icmp eq i8* %ret, %str
+ ret i1 %cmp
+; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %pat)
+; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %str, i8* %pat, {{i[0-9]+}} [[LEN]])
+; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0
+; CHECK: ret i1
+}
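
A C-level reading of these folds (illustrative sketch): an empty needle matches at the start, a one-character needle becomes a strchr, two constant strings fold to a constant offset, and strstr(s, s) is s. The final test relies on strstr(str, pat) == str holding exactly when str begins with pat.

#include <string.h>

/* Sketch of the last equivalence: s starts with pat exactly when
 * strstr(s, pat) returns s itself. */
int starts_with(const char *s, const char *pat) {
    return strncmp(s, pat, strlen(pat)) == 0;
}
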
diff --git a/test/Transforms/InstCombine/strstr-2.ll b/test/Transforms/InstCombine/strstr-2.ll
new file mode 100644
index 0000000000..5092f9b4f8
--- /dev/null
+++ b/test/Transforms/InstCombine/strstr-2.ll
@@ -0,0 +1,18 @@
+; Test that the strstr library call simplifier works correctly.
+;
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+@null = private constant [1 x i8] zeroinitializer
+
+declare i8 @strstr(i8*, i8*)
+
+define i8 @test_no_simplify1(i8* %str) {
+; CHECK: @test_no_simplify1
+ %pat = getelementptr inbounds [1 x i8]* @null, i32 0, i32 0
+ %ret = call i8 @strstr(i8* %str, i8* %pat)
+; CHECK-NEXT: call i8 @strstr
+ ret i8 %ret
+; CHECK-NEXT: ret i8 %ret
+}
diff --git a/test/Transforms/LoopVectorize/runtime-check.ll b/test/Transforms/LoopVectorize/runtime-check.ll
new file mode 100644
index 0000000000..23933cf7c7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/runtime-check.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Make sure we vectorize this loop:
+; int foo(float *a, float *b, int n) {
+; for (int i=0; i<n; ++i)
+; a[i] = b[i] * 3;
+; }
+
+;CHECK: load <4 x float>
+define i32 @foo(float* nocapture %a, float* nocapture %b, i32 %n) nounwind uwtable ssp {
+entry:
+ %cmp6 = icmp sgt i32 %n, 0
+ br i1 %cmp6, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds float* %b, i64 %indvars.iv
+ %0 = load float* %arrayidx, align 4, !tbaa !0
+ %mul = fmul float %0, 3.000000e+00
+ %arrayidx2 = getelementptr inbounds float* %a, i64 %indvars.iv
+ store float %mul, float* %arrayidx2, align 4, !tbaa !0
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret i32 undef
+}
+
+!0 = metadata !{metadata !"float", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA"}
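
Since a and b carry no noalias information, the vectorizer must guard the <4 x float> body with a runtime overlap check, roughly of this shape (a hand-written C sketch, not the pass's exact predicate):

/* Hypothetical shape of the guarded loop: take the vector body only when
 * the arrays provably do not overlap, else fall back to the scalar loop. */
void foo_sketch(float *a, float *b, int n) {
    if (a + n <= b || b + n <= a) {
        for (int i = 0; i < n; ++i)   /* vectorizable copy of the loop */
            a[i] = b[i] * 3.0f;
    } else {
        for (int i = 0; i < n; ++i)   /* scalar fallback */
            a[i] = b[i] * 3.0f;
    }
}
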
diff --git a/test/Transforms/SCCP/loadtest.ll b/test/Transforms/SCCP/loadtest.ll
index add2af483f..dd1dba6914 100644
--- a/test/Transforms/SCCP/loadtest.ll
+++ b/test/Transforms/SCCP/loadtest.ll
@@ -1,8 +1,9 @@
; This test makes sure that these instructions are properly constant-propagated.
-target datalayout = "e-p:32:32"
+; RUN: opt < %s -default-data-layout="e-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -default-data-layout="E-p:32:32" -sccp -S | FileCheck %s
-; RUN: opt < %s -sccp -S | not grep load
+; CHECK-NOT: load
@X = constant i32 42 ; <i32*> [#uses=1]
diff --git a/test/Transforms/SimplifyLibCalls/StrSpn.ll b/test/Transforms/SimplifyLibCalls/StrSpn.ll
deleted file mode 100644
index 2660ee9800..0000000000
--- a/test/Transforms/SimplifyLibCalls/StrSpn.ll
+++ /dev/null
@@ -1,25 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-target datalayout = "-p:64:64:64"
-
-@abcba = constant [6 x i8] c"abcba\00"
-@abc = constant [4 x i8] c"abc\00"
-@null = constant [1 x i8] zeroinitializer
-
-declare i64 @strcspn(i8*, i8*)
-
-define i64 @testcspn(i8* %s1, i8* %s2) {
- %abcba_p = getelementptr [6 x i8]* @abcba, i32 0, i32 0
- %abc_p = getelementptr [4 x i8]* @abc, i32 0, i32 0
- %null_p = getelementptr [1 x i8]* @null, i32 0, i32 0
- %test1 = call i64 @strcspn(i8* %s1, i8* %null_p)
-; CHECK: call i64 @strlen(i8* %s1)
- %test2 = call i64 @strcspn(i8* %null_p, i8* %s2)
- %test3 = call i64 @strcspn(i8* %abcba_p, i8* %abc_p)
-; CHECK-NOT: call i64 @strcspn
- %test4 = call i64 @strcspn(i8* %s1, i8* %s2)
-; CHECK: call i64 @strcspn(i8* %s1, i8* %s2)
- %add0 = add i64 %test1, %test3
-; CHECK: add i64 %{{.+}}, 0
- ret i64 %add0
-}
diff --git a/test/Transforms/SimplifyLibCalls/StrStr.ll b/test/Transforms/SimplifyLibCalls/StrStr.ll
deleted file mode 100644
index eefd2e8006..0000000000
--- a/test/Transforms/SimplifyLibCalls/StrStr.ll
+++ /dev/null
@@ -1,60 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-; PR5783
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
-target triple = "i386-apple-darwin9.0"
-
-@.str = private constant [1 x i8] zeroinitializer ; <[1 x i8]*> [#uses=1]
-@.str1 = private constant [2 x i8] c"a\00" ; <[2 x i8]*> [#uses=1]
-@.str2 = private constant [6 x i8] c"abcde\00" ; <[6 x i8]*> [#uses=1]
-@.str3 = private constant [4 x i8] c"bcd\00" ; <[4 x i8]*> [#uses=1]
-
-define i8* @test1(i8* %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([1 x i8]* @.str, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr(P, "") -> P
-; CHECK: @test1
-; CHECK: ret i8* %P
-}
-
-declare i8* @strstr(i8*, i8* nocapture) nounwind readonly
-
-define i8* @test2(i8* %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* getelementptr inbounds ([2 x i8]* @.str1, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr(P, "a") -> strchr(P, 'a')
-; CHECK: @test2
-; CHECK: @strchr(i8* %P, i32 97)
-}
-
-define i8* @test3(i8* nocapture %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* getelementptr inbounds ([6 x i8]* @.str2, i32 0, i32 0), i8* getelementptr inbounds ([4 x i8]* @.str3, i32 0, i32 0)) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr("abcde", "bcd") -> "abcde"+1
-; CHECK: @test3
-; CHECK: getelementptr inbounds ([6 x i8]* @.str2, i32 0, i64 1)
-}
-
-define i8* @test4(i8* %P) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* %P) nounwind ; <i8*> [#uses=1]
- ret i8* %call
-; strstr(P, P) -> P
-; CHECK: @test4
-; CHECK: ret i8* %P
-}
-
-define i1 @test5(i8* %P, i8* %Q) nounwind readonly {
-entry:
- %call = tail call i8* @strstr(i8* %P, i8* %Q) nounwind ; <i8*> [#uses=1]
- %cmp = icmp eq i8* %call, %P
- ret i1 %cmp
-; CHECK: @test5
-; CHECK: [[LEN:%[a-z]+]] = call {{i[0-9]+}} @strlen(i8* %Q)
-; CHECK: [[NCMP:%[a-z]+]] = call {{i[0-9]+}} @strncmp(i8* %P, i8* %Q, {{i[0-9]+}} [[LEN]])
-; CHECK: icmp eq {{i[0-9]+}} [[NCMP]], 0
-; CHECK: ret i1
-}
diff --git a/test/Transforms/SimplifyLibCalls/memcmp.ll b/test/Transforms/SimplifyLibCalls/memcmp.ll
deleted file mode 100644
index 6ca4dc97a1..0000000000
--- a/test/Transforms/SimplifyLibCalls/memcmp.ll
+++ /dev/null
@@ -1,35 +0,0 @@
-; Test that the memcmp optimizer works correctly
-; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
-
-@h = constant [2 x i8] c"h\00" ; <[2 x i8]*> [#uses=0]
-@hel = constant [4 x i8] c"hel\00" ; <[4 x i8]*> [#uses=0]
-@hello_u = constant [8 x i8] c"hello_u\00" ; <[8 x i8]*> [#uses=0]
-
-declare i32 @memcmp(i8*, i8*, i32)
-
-define void @test(i8* %P, i8* %Q, i32 %N, i32* %IP, i1* %BP) {
- %A = call i32 @memcmp( i8* %P, i8* %P, i32 %N ) ; <i32> [#uses=1]
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
- store volatile i32 %A, i32* %IP
- %B = call i32 @memcmp( i8* %P, i8* %Q, i32 0 ) ; <i32> [#uses=1]
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
- store volatile i32 %B, i32* %IP
- %C = call i32 @memcmp( i8* %P, i8* %Q, i32 1 ) ; <i32> [#uses=1]
-; CHECK: load
-; CHECK: zext
-; CHECK: load
-; CHECK: zext
-; CHECK: sub
-; CHECK: store volatile
- store volatile i32 %C, i32* %IP
- %F = call i32 @memcmp(i8* getelementptr ([4 x i8]* @hel, i32 0, i32 0),
- i8* getelementptr ([8 x i8]* @hello_u, i32 0, i32 0),
- i32 3)
-; CHECK-NOT: call {{.*}} memcmp
-; CHECK: store volatile
- store volatile i32 %F, i32* %IP
- ret void
-}
-
diff --git a/test/Transforms/SimplifyLibCalls/memmove.ll b/test/Transforms/SimplifyLibCalls/memmove.ll
deleted file mode 100644
index 5aaeeeb024..0000000000
--- a/test/Transforms/SimplifyLibCalls/memmove.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memmove"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-
-define i8* @test(i8* %a, i8* %b, i32 %x) {
-entry:
- %call = call i8* @memmove(i8* %a, i8* %b, i32 %x )
- ret i8* %call
-}
-
-declare i8* @memmove(i8*,i8*,i32)
-
diff --git a/test/Transforms/SimplifyLibCalls/memset-64.ll b/test/Transforms/SimplifyLibCalls/memset-64.ll
deleted file mode 100644
index 92412dee71..0000000000
--- a/test/Transforms/SimplifyLibCalls/memset-64.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-target triple = "x86_64-pc-linux-gnu"
-
-define void @a(i8* %x) nounwind {
-entry:
- %call = call i8* @memset(i8* %x, i32 1, i64 100) ; <i8*> [#uses=0]
- ret void
-}
-
-declare i8* @memset(i8*, i32, i64)
-
diff --git a/test/Transforms/SimplifyLibCalls/memset.ll b/test/Transforms/SimplifyLibCalls/memset.ll
deleted file mode 100644
index 853215a4d2..0000000000
--- a/test/Transforms/SimplifyLibCalls/memset.ll
+++ /dev/null
@@ -1,12 +0,0 @@
-; RUN: opt < %s -simplify-libcalls -S | grep "llvm.memset"
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i686-pc-linux-gnu"
-
-define i8* @test(i8* %a, i32 %b, i32 %x) {
-entry:
- %call = call i8* @memset(i8* %a, i32 %b, i32 %x )
- ret i8* %call
-}
-
-declare i8* @memset(i8*,i32,i32)
-
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 48d5d83019..117b8204b9 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -635,6 +635,12 @@ TEST(APFloatTest, exactInverse) {
EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(0.5)));
EXPECT_TRUE(APFloat(2.0f).getExactInverse(&inv));
EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(0.5f)));
+ EXPECT_TRUE(APFloat(APFloat::IEEEquad, "2.0").getExactInverse(&inv));
+ EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(APFloat::IEEEquad, "0.5")));
+ EXPECT_TRUE(APFloat(APFloat::PPCDoubleDouble, "2.0").getExactInverse(&inv));
+ EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(APFloat::PPCDoubleDouble, "0.5")));
+ EXPECT_TRUE(APFloat(APFloat::x87DoubleExtended, "2.0").getExactInverse(&inv));
+ EXPECT_TRUE(inv.bitwiseIsEqual(APFloat(APFloat::x87DoubleExtended, "0.5")));
// FLT_MIN
EXPECT_TRUE(APFloat(1.17549435e-38f).getExactInverse(&inv));
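
getExactInverse succeeds only when the reciprocal is exactly representable, which holds for 2.0 in every format tested since 0.5 is a power of two. A one-line C analogue of the double case (a sketch of the property, not the APFloat API):

#include <assert.h>

int main(void) {
    /* 2.0 and 0.5 are adjacent powers of two, so the inverse is exact
     * in any binary floating-point format wide enough to hold them. */
    assert(1.0 / 2.0 == 0.5);
    return 0;
}
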
diff --git a/unittests/Support/AlignOfTest.cpp b/unittests/Support/AlignOfTest.cpp
index f2d11708a2..f01e660939 100644
--- a/unittests/Support/AlignOfTest.cpp
+++ b/unittests/Support/AlignOfTest.cpp
@@ -66,6 +66,17 @@ struct V6 : S1 { virtual ~V6(); };
struct V7 : virtual V2, virtual V6 { virtual ~V7(); };
struct V8 : V5, virtual V6, V7 { double zz; virtual ~V8(); };
+double S6::f() { return 0.0; }
+float D2::g() { return 0.0f; }
+V1::~V1() {}
+V2::~V2() {}
+V3::~V3() {}
+V4::~V4() {}
+V5::~V5() {}
+V6::~V6() {}
+V7::~V7() {}
+V8::~V8() {}
+
// Ensure alignment is a compile-time constant.
char LLVM_ATTRIBUTE_UNUSED test_arr1
[AlignOf<char>::Alignment > 0]
diff --git a/utils/TableGen/CodeEmitterGen.cpp b/utils/TableGen/CodeEmitterGen.cpp
index e60aec9568..3e4f626d48 100644
--- a/utils/TableGen/CodeEmitterGen.cpp
+++ b/utils/TableGen/CodeEmitterGen.cpp
@@ -134,10 +134,13 @@ AddCodeToMergeInOperand(Record *R, BitsInit *BI, const std::string &VarName,
assert(!CGI.Operands.isFlatOperandNotEmitted(OpIdx) &&
"Explicitly used operand also marked as not emitted!");
} else {
+ unsigned NumberOps = CGI.Operands.size();
/// If this operand is not supposed to be emitted by the
/// generated emitter, skip it.
- while (CGI.Operands.isFlatOperandNotEmitted(NumberedOp))
+ while (NumberedOp < NumberOps &&
+ CGI.Operands.isFlatOperandNotEmitted(NumberedOp))
++NumberedOp;
+
OpIdx = NumberedOp++;
}
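
The fix bounds the skip loop: if every remaining operand were marked not-emitted, the old loop walked past the end of the operand list. The defensive pattern, in a small C sketch (names hypothetical):

/* Hypothetical skeleton of the guarded scan: never advance the cursor
 * past the operand count, even if all trailing operands are skipped. */
unsigned skip_unemitted(unsigned cursor, unsigned count,
                        int (*is_skipped)(unsigned)) {
    while (cursor < count && is_skipped(cursor))
        ++cursor;
    return cursor;
}
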
diff --git a/utils/lldbDataFormatters.py b/utils/lldbDataFormatters.py
index 18b407a02a..1baf398aa5 100644
--- a/utils/lldbDataFormatters.py
+++ b/utils/lldbDataFormatters.py
@@ -2,6 +2,7 @@
Load into LLDB with:
script import lldbDataFormatters
type synthetic add -x "^llvm::SmallVectorImpl<.+>$" -l lldbDataFormatters.SmallVectorSynthProvider
+type synthetic add -x "^llvm::SmallVector<.+,.+>$" -l lldbDataFormatters.SmallVectorSynthProvider
"""
# Pretty printer for llvm::SmallVector/llvm::SmallVectorImpl
@@ -32,22 +33,15 @@ class SmallVectorSynthProvider:
return self.begin.CreateChildAtOffset('['+str(index)+']',
offset, self.data_type)
- def get_type_from_name(self):
- import re
- name = self.valobj.GetType().GetName()
- # This class works with both SmallVectors and SmallVectorImpls.
- res = re.match("^(llvm::)?SmallVectorImpl<(.+)>$", name)
- if res:
- return res.group(2)
- res = re.match("^(llvm::)?SmallVector<(.+), \d+>$", name)
- if res:
- return res.group(2)
- return None
-
def update(self):
self.begin = self.valobj.GetChildMemberWithName('BeginX')
self.end = self.valobj.GetChildMemberWithName('EndX')
- data_type = self.get_type_from_name()
- # FIXME: this sometimes returns an invalid type.
- self.data_type = self.valobj.GetTarget().FindFirstType(data_type)
+ the_type = self.valobj.GetType()
+ # If this is a reference type we have to dereference it to get to the
+ # template parameter.
+ if the_type.IsReferenceType():
+ the_type = the_type.GetDereferencedType()
+
+ self.data_type = the_type.GetTemplateArgumentType(0)
self.type_size = self.data_type.GetByteSize()
+ assert self.type_size != 0