author    | Eli Bendersky <eliben@chromium.org> | 2013-07-15 16:09:15 -0700
committer | Eli Bendersky <eliben@chromium.org> | 2013-07-15 16:09:15 -0700
commit    | c6cf05cb5108f356dde97c01ee4188b0671d4542 (patch)
tree      | 436fdc2a55296d3c202e7ef11f31be3be53efb5f /lib/Transforms
parent    | c75199c649c739aade160289d93f257edc798cde (diff)
parent    | 7dfcb84fc16b3bf6b2379713b53090757f0a45f9 (diff)
Merge commit '7dfcb84fc16b3bf6b2379713b53090757f0a45f9'
Conflicts:
docs/LangRef.rst
include/llvm/CodeGen/CallingConvLower.h
include/llvm/IRReader/IRReader.h
include/llvm/Target/TargetMachine.h
lib/CodeGen/CallingConvLower.cpp
lib/IRReader/IRReader.cpp
lib/IRReader/LLVMBuild.txt
lib/IRReader/Makefile
lib/LLVMBuild.txt
lib/Makefile
lib/Support/MemoryBuffer.cpp
lib/Support/Unix/PathV2.inc
lib/Target/ARM/ARMBaseInstrInfo.cpp
lib/Target/ARM/ARMISelLowering.cpp
lib/Target/ARM/ARMInstrInfo.td
lib/Target/ARM/ARMSubtarget.cpp
lib/Target/ARM/ARMTargetMachine.cpp
lib/Target/Mips/CMakeLists.txt
lib/Target/Mips/MipsDelaySlotFiller.cpp
lib/Target/Mips/MipsISelLowering.cpp
lib/Target/Mips/MipsInstrInfo.td
lib/Target/Mips/MipsSubtarget.cpp
lib/Target/Mips/MipsSubtarget.h
lib/Target/X86/X86FastISel.cpp
lib/Target/X86/X86ISelDAGToDAG.cpp
lib/Target/X86/X86ISelLowering.cpp
lib/Target/X86/X86InstrControl.td
lib/Target/X86/X86InstrFormats.td
lib/Transforms/IPO/ExtractGV.cpp
lib/Transforms/InstCombine/InstCombineCompares.cpp
lib/Transforms/Utils/SimplifyLibCalls.cpp
test/CodeGen/X86/fast-isel-divrem.ll
test/MC/ARM/data-in-code.ll
tools/Makefile
tools/llvm-extract/llvm-extract.cpp
tools/llvm-link/CMakeLists.txt
tools/opt/CMakeLists.txt
tools/opt/LLVMBuild.txt
tools/opt/Makefile
tools/opt/opt.cpp
Diffstat (limited to 'lib/Transforms')
59 files changed, 6156 insertions(+), 2616 deletions(-)
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp index 8336d3ad34..a7bf18896b 100644 --- a/lib/Transforms/IPO/ConstantMerge.cpp +++ b/lib/Transforms/IPO/ConstantMerge.cpp @@ -27,6 +27,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Operator.h" #include "llvm/Pass.h" using namespace llvm; @@ -66,13 +67,13 @@ ModulePass *llvm::createConstantMergePass() { return new ConstantMerge(); } static void FindUsedValues(GlobalVariable *LLVMUsed, SmallPtrSet<const GlobalValue*, 8> &UsedValues) { if (LLVMUsed == 0) return; - ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer()); - if (Inits == 0) return; - - for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) - if (GlobalValue *GV = - dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) - UsedValues.insert(GV); + ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); + + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) { + Value *Operand = Inits->getOperand(i)->stripPointerCastsNoFollowAliases(); + GlobalValue *GV = cast<GlobalValue>(Operand); + UsedValues.insert(GV); + } } // True if A is better than B. diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp index 09f9d7c788..b5a05a7f1c 100644 --- a/lib/Transforms/IPO/ExtractGV.cpp +++ b/lib/Transforms/IPO/ExtractGV.cpp @@ -69,7 +69,7 @@ namespace { // @LOCALMOD-END } - bool Local = I->hasLocalLinkage(); + bool Local = I->isDiscardableIfUnused(); if (Local) I->setVisibility(GlobalValue::HiddenVisibility); @@ -95,8 +95,8 @@ namespace { continue; // @LOCALMOD-END } - - bool Local = I->hasLocalLinkage(); + + bool Local = I->isDiscardableIfUnused(); if (Local) I->setVisibility(GlobalValue::HiddenVisibility); @@ -113,7 +113,7 @@ namespace { Module::alias_iterator CurI = I; ++I; - if (CurI->hasLocalLinkage()) { + if (CurI->isDiscardableIfUnused()) { CurI->setVisibility(GlobalValue::HiddenVisibility); CurI->setLinkage(GlobalValue::ExternalLinkage); } diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp index a75212a386..bc5109b4d4 100644 --- a/lib/Transforms/IPO/FunctionAttrs.cpp +++ b/lib/Transforms/IPO/FunctionAttrs.cpp @@ -1,4 +1,4 @@ -//===- FunctionAttrs.cpp - Pass which marks functions readnone or readonly ===// +//===- FunctionAttrs.cpp - Pass which marks functions attributes ----------===// // // The LLVM Compiler Infrastructure // @@ -14,6 +14,8 @@ // to the function does not create any copies of the pointer value that // outlive the call. This more or less means that the pointer is only // dereferenced, and not returned from the function or stored in a global. +// Finally, well-known library call declarations are marked with all +// attributes that are consistent with the function's standard definition. // This pass is implemented as a bottom-up traversal of the call-graph. 
// //===----------------------------------------------------------------------===// @@ -32,12 +34,14 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/InstIterator.h" +#include "llvm/Target/TargetLibraryInfo.h" using namespace llvm; STATISTIC(NumReadNone, "Number of functions marked readnone"); STATISTIC(NumReadOnly, "Number of functions marked readonly"); STATISTIC(NumNoCapture, "Number of arguments marked nocapture"); STATISTIC(NumNoAlias, "Number of function returns marked noalias"); +STATISTIC(NumAnnotated, "Number of attributes added to library functions"); namespace { struct FunctionAttrs : public CallGraphSCCPass { @@ -62,14 +66,63 @@ namespace { // AddNoAliasAttrs - Deduce noalias attributes for the SCC. bool AddNoAliasAttrs(const CallGraphSCC &SCC); + // Utility methods used by inferPrototypeAttributes to add attributes + // and maintain annotation statistics. + + void setDoesNotAccessMemory(Function &F) { + if (!F.doesNotAccessMemory()) { + F.setDoesNotAccessMemory(); + ++NumAnnotated; + } + } + + void setOnlyReadsMemory(Function &F) { + if (!F.onlyReadsMemory()) { + F.setOnlyReadsMemory(); + ++NumAnnotated; + } + } + + void setDoesNotThrow(Function &F) { + if (!F.doesNotThrow()) { + F.setDoesNotThrow(); + ++NumAnnotated; + } + } + + void setDoesNotCapture(Function &F, unsigned n) { + if (!F.doesNotCapture(n)) { + F.setDoesNotCapture(n); + ++NumAnnotated; + } + } + + void setDoesNotAlias(Function &F, unsigned n) { + if (!F.doesNotAlias(n)) { + F.setDoesNotAlias(n); + ++NumAnnotated; + } + } + + // inferPrototypeAttributes - Analyze the name and prototype of the + // given function and set any applicable attributes. Returns true + // if any attributes were set and false otherwise. + bool inferPrototypeAttributes(Function &F); + + // annotateLibraryCalls - Adds attributes to well-known standard library + // call declarations. + bool annotateLibraryCalls(const CallGraphSCC &SCC); + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesCFG(); AU.addRequired<AliasAnalysis>(); + AU.addRequired<TargetLibraryInfo>(); CallGraphSCCPass::getAnalysisUsage(AU); } private: AliasAnalysis *AA; + TargetLibraryInfo *TLI; }; } @@ -77,6 +130,7 @@ char FunctionAttrs::ID = 0; INITIALIZE_PASS_BEGIN(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) INITIALIZE_AG_DEPENDENCY(CallGraph) +INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfo) INITIALIZE_PASS_END(FunctionAttrs, "functionattrs", "Deduce function attributes", false, false) @@ -598,10 +652,693 @@ bool FunctionAttrs::AddNoAliasAttrs(const CallGraphSCC &SCC) { return MadeChange; } +/// inferPrototypeAttributes - Analyze the name and prototype of the +/// given function and set any applicable attributes. Returns true +/// if any attributes were set and false otherwise. 
+bool FunctionAttrs::inferPrototypeAttributes(Function &F) { + FunctionType *FTy = F.getFunctionType(); + LibFunc::Func TheLibFunc; + if (!(TLI->getLibFunc(F.getName(), TheLibFunc) && TLI->has(TheLibFunc))) + return false; + + switch (TheLibFunc) { + case LibFunc::strlen: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::strchr: + case LibFunc::strrchr: + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isIntegerTy()) + return false; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + break; + case LibFunc::strcpy: + case LibFunc::stpcpy: + case LibFunc::strcat: + case LibFunc::strtol: + case LibFunc::strtod: + case LibFunc::strtof: + case LibFunc::strtoul: + case LibFunc::strtoll: + case LibFunc::strtold: + case LibFunc::strncat: + case LibFunc::strncpy: + case LibFunc::stpncpy: + case LibFunc::strtoull: + if (FTy->getNumParams() < 2 || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::strxfrm: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::strcmp: + case LibFunc::strspn: + case LibFunc::strncmp: + case LibFunc::strcspn: + case LibFunc::strcoll: + case LibFunc::strcasecmp: + case LibFunc::strncasecmp: + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::strstr: + case LibFunc::strpbrk: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::strtok: + case LibFunc::strtok_r: + if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::scanf: + case LibFunc::setbuf: + case LibFunc::setvbuf: + if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::strdup: + case LibFunc::strndup: + if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + break; + case LibFunc::stat: + case LibFunc::sscanf: + case LibFunc::sprintf: + case LibFunc::statvfs: + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::snprintf: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 3); + break; + case LibFunc::setitimer: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + setDoesNotCapture(F, 3); + break; + case LibFunc::system: + if (FTy->getNumParams() 
!= 1 || + !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "system" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + break; + case LibFunc::malloc: + if (FTy->getNumParams() != 1 || + !FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + break; + case LibFunc::memcmp: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::memchr: + case LibFunc::memrchr: + if (FTy->getNumParams() != 3) + return false; + setOnlyReadsMemory(F); + setDoesNotThrow(F); + break; + case LibFunc::modf: + case LibFunc::modff: + case LibFunc::modfl: + case LibFunc::memcpy: + case LibFunc::memccpy: + case LibFunc::memmove: + if (FTy->getNumParams() < 2 || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::memalign: + if (!FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotAlias(F, 0); + break; + case LibFunc::mkdir: + case LibFunc::mktime: + if (FTy->getNumParams() == 0 || + !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::realloc: + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + break; + case LibFunc::read: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "read" is a valid pthread cancellation point. + setDoesNotCapture(F, 2); + break; + case LibFunc::rmdir: + case LibFunc::rewind: + case LibFunc::remove: + case LibFunc::realpath: + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::rename: + case LibFunc::readlink: + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::write: + if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; "write" is a valid pthread cancellation point. 
+ setDoesNotCapture(F, 2); + break; + case LibFunc::bcopy: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::bcmp: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::bzero: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::calloc: + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + break; + case LibFunc::chmod: + case LibFunc::chown: + case LibFunc::ctermid: + case LibFunc::clearerr: + case LibFunc::closedir: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::atoi: + case LibFunc::atol: + case LibFunc::atof: + case LibFunc::atoll: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::access: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::fopen: + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::fdopen: + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 2); + break; + case LibFunc::feof: + case LibFunc::free: + case LibFunc::fseek: + case LibFunc::ftell: + case LibFunc::fgetc: + case LibFunc::fseeko: + case LibFunc::ftello: + case LibFunc::fileno: + case LibFunc::fflush: + case LibFunc::fclose: + case LibFunc::fsetpos: + case LibFunc::flockfile: + case LibFunc::funlockfile: + case LibFunc::ftrylockfile: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::ferror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setOnlyReadsMemory(F); + break; + case LibFunc::fputc: + case LibFunc::fstat: + case LibFunc::frexp: + case LibFunc::frexpf: + case LibFunc::frexpl: + case LibFunc::fstatvfs: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::fgets: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 3); + case LibFunc::fread: + case LibFunc::fwrite: + if (FTy->getNumParams() != 4 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(3)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 
4); + case LibFunc::fputs: + case LibFunc::fscanf: + case LibFunc::fprintf: + case LibFunc::fgetpos: + if (FTy->getNumParams() < 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::getc: + case LibFunc::getlogin_r: + case LibFunc::getc_unlocked: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::getenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setOnlyReadsMemory(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::gets: + case LibFunc::getchar: + setDoesNotThrow(F); + break; + case LibFunc::getitimer: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::getpwnam: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::ungetc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::uname: + case LibFunc::unlink: + case LibFunc::unsetenv: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::utime: + case LibFunc::utimes: + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::putc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::puts: + case LibFunc::printf: + case LibFunc::perror: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::pread: + case LibFunc::pwrite: + if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) + return false; + // May throw; these are valid pthread cancellation points. 
+ setDoesNotCapture(F, 2); + break; + case LibFunc::putchar: + setDoesNotThrow(F); + break; + case LibFunc::popen: + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::pclose: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::vscanf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::vsscanf: + case LibFunc::vfscanf: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::valloc: + if (!FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + break; + case LibFunc::vprintf: + if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::vfprintf: + case LibFunc::vsprintf: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::vsnprintf: + if (FTy->getNumParams() != 4 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(2)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 3); + break; + case LibFunc::open: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + break; + case LibFunc::opendir: + if (FTy->getNumParams() != 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + break; + case LibFunc::tmpfile: + if (!FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + break; + case LibFunc::times: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::htonl: + case LibFunc::htons: + case LibFunc::ntohl: + case LibFunc::ntohs: + setDoesNotThrow(F); + setDoesNotAccessMemory(F); + break; + case LibFunc::lstat: + if (FTy->getNumParams() != 2 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::lchown: + if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::qsort: + if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) + return false; + // May throw; places call through function pointer. 
+ setDoesNotCapture(F, 4); + break; + case LibFunc::dunder_strdup: + case LibFunc::dunder_strndup: + if (FTy->getNumParams() < 1 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + break; + case LibFunc::dunder_strtok_r: + if (FTy->getNumParams() != 3 || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::under_IO_getc: + if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::under_IO_putc: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::dunder_isoc99_scanf: + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::stat64: + case LibFunc::lstat64: + case LibFunc::statvfs64: + case LibFunc::dunder_isoc99_sscanf: + if (FTy->getNumParams() < 1 || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::fopen64: + if (FTy->getNumParams() != 2 || + !FTy->getReturnType()->isPointerTy() || + !FTy->getParamType(0)->isPointerTy() || + !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + setDoesNotCapture(F, 1); + setDoesNotCapture(F, 2); + break; + case LibFunc::fseeko64: + case LibFunc::ftello64: + if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 1); + break; + case LibFunc::tmpfile64: + if (!FTy->getReturnType()->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotAlias(F, 0); + break; + case LibFunc::fstat64: + case LibFunc::fstatvfs64: + if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) + return false; + setDoesNotThrow(F); + setDoesNotCapture(F, 2); + break; + case LibFunc::open64: + if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) + return false; + // May throw; "open" is a valid pthread cancellation point. + setDoesNotCapture(F, 1); + break; + default: + // Didn't mark any attributes. + return false; + } + + return true; +} + +/// annotateLibraryCalls - Adds attributes to well-known standard library +/// call declarations. +bool FunctionAttrs::annotateLibraryCalls(const CallGraphSCC &SCC) { + bool MadeChange = false; + + // Check each function in turn annotating well-known library function + // declarations with attributes. 
+ for (CallGraphSCC::iterator I = SCC.begin(), E = SCC.end(); I != E; ++I) { + Function *F = (*I)->getFunction(); + + if (F != 0 && F->isDeclaration()) + MadeChange |= inferPrototypeAttributes(*F); + } + + return MadeChange; +} + bool FunctionAttrs::runOnSCC(CallGraphSCC &SCC) { AA = &getAnalysis<AliasAnalysis>(); + TLI = &getAnalysis<TargetLibraryInfo>(); - bool Changed = AddReadAttrs(SCC); + bool Changed = annotateLibraryCalls(SCC); + Changed |= AddReadAttrs(SCC); Changed |= AddNoCaptureAttrs(SCC); Changed |= AddNoAliasAttrs(SCC); return Changed; diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp index dc99492990..201f320c43 100644 --- a/lib/Transforms/IPO/GlobalDCE.cpp +++ b/lib/Transforms/IPO/GlobalDCE.cpp @@ -42,6 +42,7 @@ namespace { private: SmallPtrSet<GlobalValue*, 32> AliveGlobals; + SmallPtrSet<Constant *, 8> SeenConstants; /// GlobalIsNeeded - mark the specific global value as needed, and /// recursively mark anything that it uses as also needed. @@ -151,6 +152,7 @@ bool GlobalDCE::runOnModule(Module &M) { // Make sure that all memory is released AliveGlobals.clear(); + SeenConstants.clear(); return Changed; } @@ -190,12 +192,15 @@ void GlobalDCE::GlobalIsNeeded(GlobalValue *G) { void GlobalDCE::MarkUsedGlobalsAsNeeded(Constant *C) { if (GlobalValue *GV = dyn_cast<GlobalValue>(C)) return GlobalIsNeeded(GV); - + // Loop over all of the operands of the constant, adding any globals they // use to the list of needed globals. - for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) - if (Constant *OpC = dyn_cast<Constant>(*I)) - MarkUsedGlobalsAsNeeded(OpC); + for (User::op_iterator I = C->op_begin(), E = C->op_end(); I != E; ++I) { + // If we've already processed this constant there's no need to do it again. + Constant *Op = dyn_cast<Constant>(*I); + if (Op && SeenConstants.insert(Op)) + MarkUsedGlobalsAsNeeded(Op); + } } // RemoveUnusedGlobalValue - Loop over all of the uses of the specified diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp index 700d0dfb5c..6cab6ed0ff 100644 --- a/lib/Transforms/IPO/GlobalOpt.cpp +++ b/lib/Transforms/IPO/GlobalOpt.cpp @@ -473,8 +473,9 @@ static bool CleanupPointerRootUsers(GlobalVariable *GV, static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, DataLayout *TD, TargetLibraryInfo *TLI) { bool Changed = false; - for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI != E;) { - User *U = *UI++; + SmallVector<User*, 8> WorkList(V->use_begin(), V->use_end()); + while (!WorkList.empty()) { + User *U = WorkList.pop_back_val(); if (LoadInst *LI = dyn_cast<LoadInst>(U)) { if (Init) { @@ -537,7 +538,6 @@ static bool CleanupConstantGlobalUsers(Value *V, Constant *Init, // us, and if they are all dead, nuke them without remorse. if (SafeToDestroyConstant(C)) { C->destroyConstant(); - // This could have invalidated UI, start over from scratch. 
CleanupConstantGlobalUsers(V, Init, TD, TLI); return true; } @@ -3062,6 +3062,105 @@ bool GlobalOpt::OptimizeGlobalCtorsList(GlobalVariable *&GCL) { return true; } +static Value::use_iterator getFirst(Value *V, SmallPtrSet<Use*, 8> &Tried) { + for (Value::use_iterator I = V->use_begin(), E = V->use_end(); I != E; ++I) { + Use *U = &I.getUse(); + if (Tried.count(U)) + continue; + + User *Usr = *I; + GlobalVariable *GV = dyn_cast<GlobalVariable>(Usr); + if (!GV || !GV->hasName()) { + Tried.insert(U); + return I; + } + + StringRef Name = GV->getName(); + if (Name != "llvm.used" && Name != "llvm.compiler_used") { + Tried.insert(U); + return I; + } + } + return V->use_end(); +} + +static bool replaceAllNonLLVMUsedUsesWith(Constant *Old, Constant *New); + +static bool replaceUsesOfWithOnConstant(ConstantArray *CA, Value *From, + Value *ToV, Use *U) { + Constant *To = cast<Constant>(ToV); + + SmallVector<Constant*, 8> NewOps; + for (unsigned i = 0, e = CA->getNumOperands(); i != e; ++i) { + Constant *Op = CA->getOperand(i); + NewOps.push_back(Op == From ? To : Op); + } + + Constant *Replacement = ConstantArray::get(CA->getType(), NewOps); + assert(Replacement != CA && "CA didn't contain From!"); + + bool Ret = replaceAllNonLLVMUsedUsesWith(CA, Replacement); + if (Replacement->use_empty()) + Replacement->destroyConstant(); + if (CA->use_empty()) + CA->destroyConstant(); + return Ret; +} + +static bool replaceUsesOfWithOnConstant(ConstantExpr *CE, Value *From, + Value *ToV, Use *U) { + Constant *To = cast<Constant>(ToV); + SmallVector<Constant*, 8> NewOps; + for (unsigned i = 0, e = CE->getNumOperands(); i != e; ++i) { + Constant *Op = CE->getOperand(i); + NewOps.push_back(Op == From ? To : Op); + } + + Constant *Replacement = CE->getWithOperands(NewOps); + assert(Replacement != CE && "CE didn't contain From!"); + + bool Ret = replaceAllNonLLVMUsedUsesWith(CE, Replacement); + if (Replacement->use_empty()) + Replacement->destroyConstant(); + if (CE->use_empty()) + CE->destroyConstant(); + return Ret; +} + +static bool replaceUsesOfWithOnConstant(Constant *C, Value *From, Value *To, + Use *U) { + if (ConstantArray *CA = dyn_cast<ConstantArray>(C)) + return replaceUsesOfWithOnConstant(CA, From, To, U); + if (ConstantExpr *CE = dyn_cast<ConstantExpr>(C)) + return replaceUsesOfWithOnConstant(CE, From, To, U); + C->replaceUsesOfWithOnConstant(From, To, U); + return true; +} + +static bool replaceAllNonLLVMUsedUsesWith(Constant *Old, Constant *New) { + SmallPtrSet<Use*, 8> Tried; + bool Ret = false; + for (;;) { + Value::use_iterator I = getFirst(Old, Tried); + if (I == Old->use_end()) + break; + Use &U = I.getUse(); + + // Must handle Constants specially, we cannot call replaceUsesOfWith on a + // constant because they are uniqued. + if (Constant *C = dyn_cast<Constant>(U.getUser())) { + if (!isa<GlobalValue>(C)) { + Ret |= replaceUsesOfWithOnConstant(C, Old, New, &U); + continue; + } + } + + U.set(New); + Ret = true; + } + return Ret; +} + bool GlobalOpt::OptimizeGlobalAliases(Module &M) { bool Changed = false; @@ -3081,11 +3180,12 @@ bool GlobalOpt::OptimizeGlobalAliases(Module &M) { bool hasOneUse = Target->hasOneUse() && Aliasee->hasOneUse(); // Make all users of the alias use the aliasee instead. - if (!J->use_empty()) { - J->replaceAllUsesWith(Aliasee); + if (replaceAllNonLLVMUsedUsesWith(J, Aliasee)) { ++NumAliasesResolved; Changed = true; } + if (!J->use_empty()) + continue; // If the alias is externally visible, we may still be able to simplify it. 
if (!J->hasLocalLinkage()) { diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp index 892100f058..4ce749cfec 100644 --- a/lib/Transforms/IPO/MergeFunctions.cpp +++ b/lib/Transforms/IPO/MergeFunctions.cpp @@ -72,6 +72,15 @@ STATISTIC(NumThunksWritten, "Number of thunks generated"); STATISTIC(NumAliasesWritten, "Number of aliases generated"); STATISTIC(NumDoubleWeak, "Number of new functions created"); +/// Returns the type id for a type to be hashed. We turn pointer types into +/// integers here because the actual compare logic below considers pointers and +/// integers of the same size as equal. +static Type::TypeID getTypeIDForHash(Type *Ty) { + if (Ty->isPointerTy()) + return Type::IntegerTyID; + return Ty->getTypeID(); +} + /// Creates a hash-code for the function which is the same for any two /// functions that will compare equal, without looking at the instructions /// inside the function. @@ -83,9 +92,9 @@ static unsigned profileFunction(const Function *F) { ID.AddInteger(F->getCallingConv()); ID.AddBoolean(F->hasGC()); ID.AddBoolean(FTy->isVarArg()); - ID.AddInteger(FTy->getReturnType()->getTypeID()); + ID.AddInteger(getTypeIDForHash(FTy->getReturnType())); for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) - ID.AddInteger(FTy->getParamType(i)->getTypeID()); + ID.AddInteger(getTypeIDForHash(FTy->getParamType(i))); return ID.ComputeHash(); } @@ -200,8 +209,7 @@ private: // Any two pointers in the same address space are equivalent, intptr_t and // pointers are equivalent. Otherwise, standard type equivalence rules apply. -bool FunctionComparator::isEquivalentType(Type *Ty1, - Type *Ty2) const { +bool FunctionComparator::isEquivalentType(Type *Ty1, Type *Ty2) const { if (Ty1 == Ty2) return true; if (Ty1->getTypeID() != Ty2->getTypeID()) { @@ -740,7 +748,13 @@ void MergeFunctions::writeThunk(Function *F, Function *G) { if (NewG->getReturnType()->isVoidTy()) { Builder.CreateRetVoid(); } else { - Builder.CreateRet(Builder.CreateBitCast(CI, NewG->getReturnType())); + Type *RetTy = NewG->getReturnType(); + if (CI->getType()->isIntegerTy() && RetTy->isPointerTy()) + Builder.CreateRet(Builder.CreateIntToPtr(CI, RetTy)); + else if (CI->getType()->isPointerTy() && RetTy->isIntegerTy()) + Builder.CreateRet(Builder.CreatePtrToInt(CI, RetTy)); + else + Builder.CreateRet(Builder.CreateBitCast(CI, RetTy)); } NewG->copyAttributesFrom(G); diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 47b2b51899..986c0b8928 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -33,7 +33,12 @@ RunLoopVectorization("vectorize-loops", cl::desc("Run the Loop vectorization passes")); static cl::opt<bool> -RunBBVectorization("vectorize", cl::desc("Run the BB vectorization passes")); +RunSLPVectorization("vectorize-slp", + cl::desc("Run the SLP vectorization passes")); + +static cl::opt<bool> +RunBBVectorization("vectorize-slp-aggressive", + cl::desc("Run the BB vectorization passes")); static cl::opt<bool> UseGVNAfterVectorization("use-gvn-after-vectorization", @@ -52,7 +57,8 @@ PassManagerBuilder::PassManagerBuilder() { DisableSimplifyLibCalls = false; DisableUnitAtATime = false; DisableUnrollLoops = false; - Vectorize = RunBBVectorization; + BBVectorize = RunBBVectorization; + SLPVectorize = RunSLPVectorization; LoopVectorize = RunLoopVectorization; } @@ -207,7 +213,10 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { 
addExtensionsToPM(EP_ScalarOptimizerLate, MPM); - if (Vectorize) { + if (SLPVectorize) + MPM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains. + + if (BBVectorize) { MPM.add(createBBVectorizePass()); MPM.add(createInstructionCombiningPass()); if (OptLevel > 1 && UseGVNAfterVectorization) @@ -321,6 +330,14 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createGlobalDCEPass()); } +inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) { + return reinterpret_cast<PassManagerBuilder*>(P); +} + +inline LLVMPassManagerBuilderRef wrap(PassManagerBuilder *P) { + return reinterpret_cast<LLVMPassManagerBuilderRef>(P); +} + LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() { PassManagerBuilder *PMB = new PassManagerBuilder(); return wrap(PMB); @@ -391,9 +408,9 @@ LLVMPassManagerBuilderPopulateModulePassManager(LLVMPassManagerBuilderRef PMB, void LLVMPassManagerBuilderPopulateLTOPassManager(LLVMPassManagerBuilderRef PMB, LLVMPassManagerRef PM, - bool Internalize, - bool RunInliner) { + LLVMBool Internalize, + LLVMBool RunInliner) { PassManagerBuilder *Builder = unwrap(PMB); PassManagerBase *LPM = unwrap(PM); - Builder->populateLTOPassManager(*LPM, Internalize, RunInliner); + Builder->populateLTOPassManager(*LPM, Internalize != 0, RunInliner != 0); } diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp index 5f8681ff45..3396f7929e 100644 --- a/lib/Transforms/IPO/StripSymbols.cpp +++ b/lib/Transforms/IPO/StripSymbols.cpp @@ -195,10 +195,9 @@ static void findUsedValues(GlobalVariable *LLVMUsed, SmallPtrSet<const GlobalValue*, 8> &UsedValues) { if (LLVMUsed == 0) return; UsedValues.insert(LLVMUsed); - - ConstantArray *Inits = dyn_cast<ConstantArray>(LLVMUsed->getInitializer()); - if (Inits == 0) return; - + + ConstantArray *Inits = cast<ConstantArray>(LLVMUsed->getInitializer()); + for (unsigned i = 0, e = Inits->getNumOperands(); i != e; ++i) if (GlobalValue *GV = dyn_cast<GlobalValue>(Inits->getOperand(i)->stripPointerCasts())) diff --git a/lib/Transforms/InstCombine/CMakeLists.txt b/lib/Transforms/InstCombine/CMakeLists.txt index 72cfe2c985..a25696ec03 100644 --- a/lib/Transforms/InstCombine/CMakeLists.txt +++ b/lib/Transforms/InstCombine/CMakeLists.txt @@ -9,7 +9,7 @@ add_llvm_library(LLVMInstCombine InstCombineMulDivRem.cpp InstCombinePHI.cpp InstCombineSelect.cpp - InstCombineShifts.cpp + InstCombineShifts.cpp InstCombineSimplifyDemanded.cpp InstCombineVectorOps.cpp ) diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h index 1f6a3a5e33..2a36074750 100644 --- a/lib/Transforms/InstCombine/InstCombine.h +++ b/lib/Transforms/InstCombine/InstCombine.h @@ -233,6 +233,7 @@ private: Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS); Value *EmitGEPOffset(User *GEP); + Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); public: // InsertNewInstBefore - insert an instruction New before instruction Old diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp index c6d60d6f00..166f8dfdb4 100644 --- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp +++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp @@ -24,9 +24,9 @@ namespace { /// Class representing coefficient of floating-point addend. /// This class needs to be highly efficient, which is especially true for /// the constructor. 
As of I write this comment, the cost of the default - /// constructor is merely 4-byte-store-zero (Assuming compiler is able to + /// constructor is merely 4-byte-store-zero (Assuming compiler is able to /// perform write-merging). - /// + /// class FAddendCoef { public: // The constructor has to initialize a APFloat, which is uncessary for @@ -37,31 +37,31 @@ namespace { // FAddendCoef() : IsFp(false), BufHasFpVal(false), IntVal(0) {} ~FAddendCoef(); - + void set(short C) { assert(!insaneIntVal(C) && "Insane coefficient"); IsFp = false; IntVal = C; } - + void set(const APFloat& C); - + void negate(); - + bool isZero() const { return isInt() ? !IntVal : getFpVal().isZero(); } Value *getValue(Type *) const; - + // If possible, don't define operator+/operator- etc because these // operators inevitably call FAddendCoef's constructor which is not cheap. void operator=(const FAddendCoef &A); void operator+=(const FAddendCoef &A); void operator-=(const FAddendCoef &A); void operator*=(const FAddendCoef &S); - + bool isOne() const { return isInt() && IntVal == 1; } bool isTwo() const { return isInt() && IntVal == 2; } bool isMinusOne() const { return isInt() && IntVal == -1; } bool isMinusTwo() const { return isInt() && IntVal == -2; } - + private: bool insaneIntVal(int V) { return V > 4 || V < -4; } APFloat *getFpValPtr(void) @@ -74,18 +74,28 @@ namespace { return *getFpValPtr(); } - APFloat &getFpVal(void) - { assert(IsFp && BufHasFpVal && "Incorret state"); return *getFpValPtr(); } - + APFloat &getFpVal(void) { + assert(IsFp && BufHasFpVal && "Incorret state"); + return *getFpValPtr(); + } + bool isInt() const { return !IsFp; } + // If the coefficient is represented by an integer, promote it to a + // floating point. + void convertToFpType(const fltSemantics &Sem); + + // Construct an APFloat from a signed integer. + // TODO: We should get rid of this function when APFloat can be constructed + // from an *SIGNED* integer. + APFloat createAPFloatFromInt(const fltSemantics &Sem, int Val); private: bool IsFp; - + // True iff FpValBuf contains an instance of APFloat. bool BufHasFpVal; - + // The integer coefficient of an individual addend is either 1 or -1, // and we try to simplify at most 4 addends from neighboring at most // two instructions. So the range of <IntVal> falls in [-4, 4]. APInt @@ -94,7 +104,7 @@ namespace { AlignedCharArrayUnion<APFloat> FpValBuf; }; - + /// FAddend is used to represent floating-point addend. An addend is /// represented as <C, V>, where the V is a symbolic value, and C is a /// constant coefficient. A constant addend is represented as <C, 0>. @@ -102,10 +112,10 @@ namespace { class FAddend { public: FAddend() { Val = 0; } - + Value *getSymVal (void) const { return Val; } const FAddendCoef &getCoef(void) const { return Coeff; } - + bool isConstant() const { return Val == 0; } bool isZero() const { return Coeff.isZero(); } @@ -114,17 +124,17 @@ namespace { { Coeff.set(Coefficient); Val = V; } void set(const ConstantFP* Coefficient, Value *V) { Coeff.set(Coefficient->getValueAPF()); Val = V; } - + void negate() { Coeff.negate(); } - + /// Drill down the U-D chain one step to find the definition of V, and /// try to break the definition into one or two addends. static unsigned drillValueDownOneStep(Value* V, FAddend &A0, FAddend &A1); - + /// Similar to FAddend::drillDownOneStep() except that the value being /// splitted is the addend itself. 
unsigned drillAddendDownOneStep(FAddend &Addend0, FAddend &Addend1) const; - + void operator+=(const FAddend &T) { assert((Val == T.Val) && "Symbolic-values disagree"); Coeff += T.Coeff; @@ -132,12 +142,12 @@ namespace { private: void Scale(const FAddendCoef& ScaleAmt) { Coeff *= ScaleAmt; } - + // This addend has the value of "Coeff * Val". Value *Val; FAddendCoef Coeff; }; - + /// FAddCombine is the class for optimizing an unsafe fadd/fsub along /// with its neighboring at most two instructions. /// @@ -145,27 +155,30 @@ namespace { public: FAddCombine(InstCombiner::BuilderTy *B) : Builder(B), Instr(0) {} Value *simplify(Instruction *FAdd); - + private: typedef SmallVector<const FAddend*, 4> AddendVect; - + Value *simplifyFAdd(AddendVect& V, unsigned InstrQuota); - + + Value *performFactorization(Instruction *I); + /// Convert given addend to a Value Value *createAddendVal(const FAddend &A, bool& NeedNeg); - + /// Return the number of instructions needed to emit the N-ary addition. unsigned calcInstrNumber(const AddendVect& Vect); Value *createFSub(Value *Opnd0, Value *Opnd1); Value *createFAdd(Value *Opnd0, Value *Opnd1); Value *createFMul(Value *Opnd0, Value *Opnd1); + Value *createFDiv(Value *Opnd0, Value *Opnd1); Value *createFNeg(Value *V); Value *createNaryFAdd(const AddendVect& Opnds, unsigned InstrQuota); void createInstPostProc(Instruction *NewInst); - + InstCombiner::BuilderTy *Builder; Instruction *Instr; - + private: // Debugging stuff are clustered here. #ifndef NDEBUG @@ -177,7 +190,7 @@ namespace { void incCreateInstNum() {} #endif }; -} +} //===----------------------------------------------------------------------===// // @@ -200,10 +213,34 @@ void FAddendCoef::set(const APFloat& C) { } else *P = C; - IsFp = BufHasFpVal = true; + IsFp = BufHasFpVal = true; +} + +void FAddendCoef::convertToFpType(const fltSemantics &Sem) { + if (!isInt()) + return; + + APFloat *P = getFpValPtr(); + if (IntVal > 0) + new(P) APFloat(Sem, IntVal); + else { + new(P) APFloat(Sem, 0 - IntVal); + P->changeSign(); + } + IsFp = BufHasFpVal = true; } -void FAddendCoef::operator=(const FAddendCoef& That) { +APFloat FAddendCoef::createAPFloatFromInt(const fltSemantics &Sem, int Val) { + if (Val >= 0) + return APFloat(Sem, Val); + + APFloat T(Sem, 0 - Val); + T.changeSign(); + + return T; +} + +void FAddendCoef::operator=(const FAddendCoef &That) { if (That.isInt()) set(That.IntVal); else @@ -219,16 +256,16 @@ void FAddendCoef::operator+=(const FAddendCoef &That) { getFpVal().add(That.getFpVal(), RndMode); return; } - + if (isInt()) { const APFloat &T = That.getFpVal(); - set(T); - getFpVal().add(APFloat(T.getSemantics(), IntVal), RndMode); + convertToFpType(T.getSemantics()); + getFpVal().add(T, RndMode); return; } - + APFloat &T = getFpVal(); - T.add(APFloat(T.getSemantics(), That.IntVal), RndMode); + T.add(createAPFloatFromInt(T.getSemantics(), That.IntVal), RndMode); } void FAddendCoef::operator-=(const FAddendCoef &That) { @@ -240,16 +277,16 @@ void FAddendCoef::operator-=(const FAddendCoef &That) { getFpVal().subtract(That.getFpVal(), RndMode); return; } - + if (isInt()) { const APFloat &T = That.getFpVal(); - set(T); - getFpVal().subtract(APFloat(T.getSemantics(), IntVal), RndMode); + convertToFpType(T.getSemantics()); + getFpVal().subtract(T, RndMode); return; } APFloat &T = getFpVal(); - T.subtract(APFloat(T.getSemantics(), IntVal), RndMode); + T.subtract(createAPFloatFromInt(T.getSemantics(), IntVal), RndMode); } void FAddendCoef::operator*=(const FAddendCoef &That) { @@ -268,15 +305,16 @@ 
void FAddendCoef::operator*=(const FAddendCoef &That) { return; } - const fltSemantics &Semantic = + const fltSemantics &Semantic = isInt() ? That.getFpVal().getSemantics() : getFpVal().getSemantics(); if (isInt()) - set(APFloat(Semantic, IntVal)); + convertToFpType(Semantic); APFloat &F0 = getFpVal(); if (That.isInt()) - F0.multiply(APFloat(Semantic, That.IntVal), APFloat::rmNearestTiesToEven); + F0.multiply(createAPFloatFromInt(Semantic, That.IntVal), + APFloat::rmNearestTiesToEven); else F0.multiply(That.getFpVal(), APFloat::rmNearestTiesToEven); @@ -302,11 +340,11 @@ Value *FAddendCoef::getValue(Type *Ty) const { // A - B <1, A>, <1,B> // 0 - B <-1, B> // C * A, <C, A> -// A + C <1, A> <C, NULL> +// A + C <1, A> <C, NULL> // 0 +/- 0 <0, NULL> (corner case) // // Legend: A and B are not constant, C is constant -// +// unsigned FAddend::drillValueDownOneStep (Value *Val, FAddend &Addend0, FAddend &Addend1) { Instruction *I = 0; @@ -377,7 +415,7 @@ unsigned FAddend::drillAddendDownOneStep return 0; unsigned BreakNum = FAddend::drillValueDownOneStep(Val, Addend0, Addend1); - if (!BreakNum || Coeff.isOne()) + if (!BreakNum || Coeff.isOne()) return BreakNum; Addend0.Scale(Coeff); @@ -388,6 +426,78 @@ unsigned FAddend::drillAddendDownOneStep return BreakNum; } +// Try to perform following optimization on the input instruction I. Return the +// simplified expression if was successful; otherwise, return 0. +// +// Instruction "I" is Simplified into +// ------------------------------------------------------- +// (x * y) +/- (x * z) x * (y +/- z) +// (y / x) +/- (z / x) (y +/- z) / x +// +Value *FAddCombine::performFactorization(Instruction *I) { + assert((I->getOpcode() == Instruction::FAdd || + I->getOpcode() == Instruction::FSub) && "Expect add/sub"); + + Instruction *I0 = dyn_cast<Instruction>(I->getOperand(0)); + Instruction *I1 = dyn_cast<Instruction>(I->getOperand(1)); + + if (!I0 || !I1 || I0->getOpcode() != I1->getOpcode()) + return 0; + + bool isMpy = false; + if (I0->getOpcode() == Instruction::FMul) + isMpy = true; + else if (I0->getOpcode() != Instruction::FDiv) + return 0; + + Value *Opnd0_0 = I0->getOperand(0); + Value *Opnd0_1 = I0->getOperand(1); + Value *Opnd1_0 = I1->getOperand(0); + Value *Opnd1_1 = I1->getOperand(1); + + // Input Instr I Factor AddSub0 AddSub1 + // ---------------------------------------------- + // (x*y) +/- (x*z) x y z + // (y/x) +/- (z/x) x y z + // + Value *Factor = 0; + Value *AddSub0 = 0, *AddSub1 = 0; + + if (isMpy) { + if (Opnd0_0 == Opnd1_0 || Opnd0_0 == Opnd1_1) + Factor = Opnd0_0; + else if (Opnd0_1 == Opnd1_0 || Opnd0_1 == Opnd1_1) + Factor = Opnd0_1; + + if (Factor) { + AddSub0 = (Factor == Opnd0_0) ? Opnd0_1 : Opnd0_0; + AddSub1 = (Factor == Opnd1_0) ? Opnd1_1 : Opnd1_0; + } + } else if (Opnd0_1 == Opnd1_1) { + Factor = Opnd0_1; + AddSub0 = Opnd0_0; + AddSub1 = Opnd1_0; + } + + if (!Factor) + return 0; + + // Create expression "NewAddSub = AddSub0 +/- AddsSub1" + Value *NewAddSub = (I->getOpcode() == Instruction::FAdd) ? 
+ createFAdd(AddSub0, AddSub1) : + createFSub(AddSub0, AddSub1); + if (ConstantFP *CFP = dyn_cast<ConstantFP>(NewAddSub)) { + const APFloat &F = CFP->getValueAPF(); + if (!F.isNormal() || F.isDenormal()) + return 0; + } + + if (isMpy) + return createFMul(Factor, NewAddSub); + + return createFDiv(NewAddSub, Factor); +} + Value *FAddCombine::simplify(Instruction *I) { assert(I->hasUnsafeAlgebra() && "Should be in unsafe mode"); @@ -398,7 +508,7 @@ Value *FAddCombine::simplify(Instruction *I) { assert((I->getOpcode() == Instruction::FAdd || I->getOpcode() == Instruction::FSub) && "Expect add/sub"); - // Save the instruction before calling other member-functions. + // Save the instruction before calling other member-functions. Instr = I; FAddend Opnd0, Opnd1, Opnd0_0, Opnd0_1, Opnd1_0, Opnd1_1; @@ -409,7 +519,7 @@ Value *FAddCombine::simplify(Instruction *I) { unsigned Opnd0_ExpNum = 0; unsigned Opnd1_ExpNum = 0; - if (!Opnd0.isConstant()) + if (!Opnd0.isConstant()) Opnd0_ExpNum = Opnd0.drillAddendDownOneStep(Opnd0_0, Opnd0_1); // Step 2: Expand the 2nd addend into Opnd1_0 and Opnd1_1. @@ -431,7 +541,7 @@ Value *FAddCombine::simplify(Instruction *I) { Value *V0 = I->getOperand(0); Value *V1 = I->getOperand(1); - InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) && + InstQuota = ((!isa<Constant>(V0) && V0->hasOneUse()) && (!isa<Constant>(V1) && V1->hasOneUse())) ? 2 : 1; if (Value *R = simplifyFAdd(AllOpnds, InstQuota)) @@ -471,7 +581,8 @@ Value *FAddCombine::simplify(Instruction *I) { return R; } - return 0; + // step 6: Try factorization as the last resort, + return performFactorization(I); } Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { @@ -479,7 +590,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { unsigned AddendNum = Addends.size(); assert(AddendNum <= 4 && "Too many addends"); - // For saving intermediate results; + // For saving intermediate results; unsigned NextTmpIdx = 0; FAddend TmpResult[3]; @@ -495,7 +606,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { AddendVect SimpVect; // The outer loop works on one symbolic-value at a time. Suppose the input - // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ... + // addends are : <a1, x>, <b1, y>, <a2, x>, <c1, z>, <b2, y>, ... // The symbolic-values will be processed in this order: x, y, z. // for (unsigned SymIdx = 0; SymIdx < AddendNum; SymIdx++) { @@ -522,7 +633,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { if (T && T->getSymVal() == Val) { // Set null such that next iteration of the outer loop will not process // this addend again. - Addends[SameSymIdx] = 0; + Addends[SameSymIdx] = 0; SimpVect.push_back(T); } } @@ -535,7 +646,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { R += *SimpVect[Idx]; // Pop all addends being folded and push the resulting folded addend. - SimpVect.resize(StartIdx); + SimpVect.resize(StartIdx); if (Val != 0) { if (!R.isZero()) { SimpVect.push_back(&R); @@ -548,7 +659,7 @@ Value *FAddCombine::simplifyFAdd(AddendVect& Addends, unsigned InstrQuota) { } } - assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) && + assert((NextTmpIdx <= sizeof(TmpResult)/sizeof(TmpResult[0]) + 1) && "out-of-bound access"); if (ConstAdd) @@ -570,7 +681,7 @@ Value *FAddCombine::createNaryFAdd assert(!Opnds.empty() && "Expect at least one addend"); // Step 1: Check if the # of instructions needed exceeds the quota. 
- // + // unsigned InstrNeeded = calcInstrNumber(Opnds); if (InstrNeeded > InstrQuota) return 0; @@ -591,7 +702,7 @@ Value *FAddCombine::createNaryFAdd // Iterate the addends, creating fadd/fsub using adjacent two addends. for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end(); I != E; I++) { - bool NeedNeg; + bool NeedNeg; Value *V = createAddendVal(**I, NeedNeg); if (!LastVal) { LastVal = V; @@ -617,7 +728,7 @@ Value *FAddCombine::createNaryFAdd } #ifndef NDEBUG - assert(CreateInstrNum == InstrNeeded && + assert(CreateInstrNum == InstrNeeded && "Inconsistent in instruction numbers"); #endif @@ -627,7 +738,8 @@ Value *FAddCombine::createNaryFAdd Value *FAddCombine::createFSub (Value *Opnd0, Value *Opnd1) { Value *V = Builder->CreateFSub(Opnd0, Opnd1); - createInstPostProc(cast<Instruction>(V)); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); return V; } @@ -639,13 +751,22 @@ Value *FAddCombine::createFNeg(Value *V) { Value *FAddCombine::createFAdd (Value *Opnd0, Value *Opnd1) { Value *V = Builder->CreateFAdd(Opnd0, Opnd1); - createInstPostProc(cast<Instruction>(V)); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); return V; } Value *FAddCombine::createFMul(Value *Opnd0, Value *Opnd1) { Value *V = Builder->CreateFMul(Opnd0, Opnd1); - createInstPostProc(cast<Instruction>(V)); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); + return V; +} + +Value *FAddCombine::createFDiv(Value *Opnd0, Value *Opnd1) { + Value *V = Builder->CreateFDiv(Opnd0, Opnd1); + if (Instruction *I = dyn_cast<Instruction>(V)) + createInstPostProc(I); return V; } @@ -665,8 +786,8 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) { unsigned OpndNum = Opnds.size(); unsigned InstrNeeded = OpndNum - 1; - // The number of addends in the form of "(-1)*x". - unsigned NegOpndNum = 0; + // The number of addends in the form of "(-1)*x". + unsigned NegOpndNum = 0; // Adjust the number of instructions needed to emit the N-ary add. 
for (AddendVect::const_iterator I = Opnds.begin(), E = Opnds.end(); @@ -853,6 +974,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { return BinaryOperator::CreateSub(ConstantExpr::getAdd(XorRHS, CI), XorLHS); } + // (X + signbit) + C could have gotten canonicalized to (X ^ signbit) + C, + // transform them into (X + (signbit ^ C)) + if (XorRHS->getValue().isSignBit()) + return BinaryOperator::CreateAdd(XorLHS, + ConstantExpr::getXor(XorRHS, CI)); } } @@ -1111,6 +1237,31 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) { } } + // select C, 0, B + select C, A, 0 -> select C, A, B + { + Value *A1, *B1, *C1, *A2, *B2, *C2; + if (match(LHS, m_Select(m_Value(C1), m_Value(A1), m_Value(B1))) && + match(RHS, m_Select(m_Value(C2), m_Value(A2), m_Value(B2)))) { + if (C1 == C2) { + Constant *Z1=0, *Z2=0; + Value *A, *B, *C=C1; + if (match(A1, m_AnyZero()) && match(B2, m_AnyZero())) { + Z1 = dyn_cast<Constant>(A1); A = A2; + Z2 = dyn_cast<Constant>(B2); B = B1; + } else if (match(B1, m_AnyZero()) && match(A2, m_AnyZero())) { + Z1 = dyn_cast<Constant>(B1); B = B2; + Z2 = dyn_cast<Constant>(A2); A = A1; + } + + if (Z1 && Z2 && + (I.hasNoSignedZeros() || + (Z1->isNegativeZeroValue() && Z2->isNegativeZeroValue()))) { + return SelectInst::Create(C, A, B); + } + } + } + } + if (I.hasUnsafeAlgebra()) { if (Value *V = FAddCombine(Builder).simplify(&I)) return ReplaceInstUsesWith(I, V); diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index 4332467371..ec75dd2e04 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -22,8 +22,8 @@ using namespace PatternMatch; /// AddOne - Add one to a ConstantInt. -static Constant *AddOne(Constant *C) { - return ConstantExpr::getAdd(C, ConstantInt::get(C->getType(), 1)); +static Constant *AddOne(ConstantInt *C) { + return ConstantInt::get(C->getContext(), C->getValue() + 1); } /// SubOne - Subtract one from a ConstantInt. static Constant *SubOne(ConstantInt *C) { @@ -266,9 +266,8 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op, return 0; } - -/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is -/// true, otherwise (V < Lo || V >= Hi). In practice, we emit the more efficient +/// Emit a computation of: (V >= Lo && V < Hi) if Inside is true, otherwise +/// (V < Lo || V >= Hi). In practice, we emit the more efficient /// (V-Lo) \<u Hi-Lo. This method expects that Lo <= Hi. isSigned indicates /// whether to treat the V, Lo and HI as signed or not. IB is the location to /// insert new instructions. 
@@ -935,6 +934,9 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) { Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { if (LHS->getPredicate() == FCmpInst::FCMP_ORD && RHS->getPredicate() == FCmpInst::FCMP_ORD) { + if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) + return 0; + // (fcmp ord x, c) & (fcmp ord y, c) -> (fcmp ord x, y) if (ConstantFP *LHSC = dyn_cast<ConstantFP>(LHS->getOperand(1))) if (ConstantFP *RHSC = dyn_cast<ConstantFP>(RHS->getOperand(1))) { @@ -1545,14 +1547,6 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { switch (RHSCC) { default: llvm_unreachable("Unknown integer condition code!"); case ICmpInst::ICMP_EQ: - if (LHSCst == SubOne(RHSCst)) { - // (X == 13 | X == 14) -> X-13 <u 2 - Constant *AddCST = ConstantExpr::getNeg(LHSCst); - Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off"); - AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst); - return Builder->CreateICmpULT(Add, AddCST); - } - if (LHS->getOperand(0) == RHS->getOperand(0)) { // if LHSCst and RHSCst differ only by one bit: // (A == C1 || A == C2) -> (A & ~(C1 ^ C2)) == C1 @@ -1566,6 +1560,14 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS) { } } + if (LHSCst == SubOne(RHSCst)) { + // (X == 13 | X == 14) -> X-13 <u 2 + Constant *AddCST = ConstantExpr::getNeg(LHSCst); + Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off"); + AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst); + return Builder->CreateICmpULT(Add, AddCST); + } + break; // (X == 13 | X == 15) -> no change case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 64cd1bd278..78b4a2c6c9 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1372,7 +1372,8 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS, NestF->getType() == PointerType::getUnqual(NewFTy) ? NestF : ConstantExpr::getBitCast(NestF, PointerType::getUnqual(NewFTy)); - const AttributeSet &NewPAL = AttributeSet::get(FTy->getContext(), NewAttrs); + const AttributeSet &NewPAL = + AttributeSet::get(FTy->getContext(), NewAttrs); Instruction *NewCaller; if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) { diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index a960ab2499..2ee1278d23 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -104,6 +104,12 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI, uint64_t CastElTySize = TD->getTypeAllocSize(CastElTy); if (CastElTySize == 0 || AllocElTySize == 0) return 0; + // If the allocation has multiple uses, only promote it if we're not + // shrinking the amount of memory being allocated. + uint64_t AllocElTyStoreSize = TD->getTypeStoreSize(AllocElTy); + uint64_t CastElTyStoreSize = TD->getTypeStoreSize(CastElTy); + if (!AI.hasOneUse() && CastElTyStoreSize < AllocElTyStoreSize) return 0; + // See if we can satisfy the modulus by pulling a scale out of the array // size argument. unsigned ArraySizeScale; @@ -1604,6 +1610,9 @@ static Value *OptimizeIntegerToVectorInsertions(BitCastInst &CI, /// OptimizeIntToFloatBitCast - See if we can optimize an integer->float/double /// bitcast. 
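The (X == 13 | X == 14) -> X-13 <u 2 rewrite that this hunk merely reorders relative to the differ-by-one-bit fold is easy to sanity-check outside LLVM; a small plain-C++ check, illustrative only:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int64_t X = -300; X <= 300; ++X) {
      bool Orig = (X == 13) || (X == 14);
      bool Folded = uint32_t(int32_t(X) - 13) < 2u;   // X-13 <u 2, wrapping subtract
      assert(Orig == Folded);
    }
    return 0;
  }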
The various long double bitcasts can't get in here. static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ + // We need to know the target byte order to perform this optimization. + if (!IC.getDataLayout()) return 0; + Value *Src = CI.getOperand(0); Type *DestTy = CI.getType(); @@ -1625,7 +1634,10 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ VecInput = IC.Builder->CreateBitCast(VecInput, VecTy); } - return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(0)); + unsigned Elt = 0; + if (IC.getDataLayout()->isBigEndian()) + Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1; + return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } } @@ -1647,6 +1659,8 @@ static Instruction *OptimizeIntToFloatBitCast(BitCastInst &CI,InstCombiner &IC){ } unsigned Elt = ShAmt->getZExtValue() / DestWidth; + if (IC.getDataLayout()->isBigEndian()) + Elt = VecTy->getPrimitiveSizeInBits() / DestWidth - 1 - Elt; return ExtractElementInst::Create(VecInput, IC.Builder->getInt32(Elt)); } } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 6eca399a40..518a8323b6 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -139,6 +139,31 @@ static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS, } } +/// Returns true if the exploded icmp can be expressed as a signed comparison +/// to zero and updates the predicate accordingly. +/// The signedness of the comparison is preserved. +static bool isSignTest(ICmpInst::Predicate &pred, const ConstantInt *RHS) { + if (!ICmpInst::isSigned(pred)) + return false; + + if (RHS->isZero()) + return ICmpInst::isRelational(pred); + + if (RHS->isOne()) { + if (pred == ICmpInst::ICMP_SLT) { + pred = ICmpInst::ICMP_SLE; + return true; + } + } else if (RHS->isAllOnesValue()) { + if (pred == ICmpInst::ICMP_SGT) { + pred = ICmpInst::ICMP_SGE; + return true; + } + } + + return false; +} + // isHighOnes - Return true if the constant is of the form 1+0+. // This is the same as lowones(~X). 
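The isSignTest helper added above encodes the equivalences x <s 1 <=> x <=s 0 and x >s -1 <=> x >=s 0 for signed integers, so later code can canonicalize such comparisons to a sign test against zero. A standalone check of those equivalences, not part of the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (int x = -1000; x <= 1000; ++x) {
      assert((x < 1)  == (x <= 0));   // slt 1  ==  sle 0
      assert((x > -1) == (x >= 0));   // sgt -1 ==  sge 0
    }
    // The type's limits behave the same way:
    assert((INT32_MIN < 1)  == (INT32_MIN <= 0));
    assert((INT32_MAX > -1) == (INT32_MAX >= 0));
    return 0;
  }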
static bool isHighOnes(const ConstantInt *CI) { @@ -443,20 +468,29 @@ FoldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP, GlobalVariable *GV, } - // If a 32-bit or 64-bit magic bitvector captures the entire comparison state + // If a magic bitvector captures the entire comparison state // of this load, replace it with computation that does: // ((magic_cst >> i) & 1) != 0 - if (ArrayElementCount <= 32 || - (TD && ArrayElementCount <= 64 && TD->isLegalInteger(64))) { - Type *Ty; - if (ArrayElementCount <= 32) + { + Type *Ty = 0; + + // Look for an appropriate type: + // - The type of Idx if the magic fits + // - The smallest fitting legal type if we have a DataLayout + // - Default to i32 + if (ArrayElementCount <= Idx->getType()->getIntegerBitWidth()) + Ty = Idx->getType(); + else if (TD) + Ty = TD->getSmallestLegalIntType(Init->getContext(), ArrayElementCount); + else if (ArrayElementCount <= 32) Ty = Type::getInt32Ty(Init->getContext()); - else - Ty = Type::getInt64Ty(Init->getContext()); - Value *V = Builder->CreateIntCast(Idx, Ty, false); - V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V); - V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V); - return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0)); + + if (Ty != 0) { + Value *V = Builder->CreateIntCast(Idx, Ty, false); + V = Builder->CreateLShr(ConstantInt::get(Ty, MagicBitvector), V); + V = Builder->CreateAnd(ConstantInt::get(Ty, 1), V); + return new ICmpInst(ICmpInst::ICMP_NE, V, ConstantInt::get(Ty, 0)); + } } return 0; @@ -1273,6 +1307,23 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, break; } + case Instruction::Mul: { // (icmp pred (mul X, Val), CI) + ConstantInt *Val = dyn_cast<ConstantInt>(LHSI->getOperand(1)); + if (!Val) break; + + // If this is a signed comparison to 0 and the mul is sign preserving, + // use the mul LHS operand instead. + ICmpInst::Predicate pred = ICI.getPredicate(); + if (isSignTest(pred, RHS) && !Val->isZero() && + cast<BinaryOperator>(LHSI)->hasNoSignedWrap()) + return new ICmpInst(Val->isNegative() ? + ICmpInst::getSwappedPredicate(pred) : pred, + LHSI->getOperand(0), + Constant::getNullValue(RHS->getType())); + + break; + } + case Instruction::Shl: { // (icmp pred (shl X, ShAmt), CI) ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1)); if (!ShAmt) break; @@ -1304,6 +1355,12 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), ConstantExpr::getLShr(RHS, ShAmt)); + // If the shift is NSW and we compare to 0, then it is just shifting out + // sign bits, no need for an AND either. + if (cast<BinaryOperator>(LHSI)->hasNoSignedWrap() && RHSV == 0) + return new ICmpInst(ICI.getPredicate(), LHSI->getOperand(0), + ConstantExpr::getLShr(RHS, ShAmt)); + if (LHSI->hasOneUse()) { // Otherwise strength reduce the shift into an and. uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits); @@ -1318,6 +1375,15 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } } + // If this is a signed comparison to 0 and the shift is sign preserving, + // use the shift LHS operand instead. + ICmpInst::Predicate pred = ICI.getPredicate(); + if (isSignTest(pred, RHS) && + cast<BinaryOperator>(LHSI)->hasNoSignedWrap()) + return new ICmpInst(pred, + LHSI->getOperand(0), + Constant::getNullValue(RHS->getType())); + // Otherwise, if this is a comparison of the sign bit, simplify to and/test. 
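The "magic bitvector" comment in the FoldCmpLoadFromIndexedGlobal hunk describes packing the per-element result of comparing a constant array against a constant into a single mask, so the load-plus-compare becomes a bit test. A self-contained plain-C++ illustration with an invented table and predicate:

  #include <cassert>
  #include <cstdint>

  static const int Table[8] = {3, 7, 7, 1, 7, 9, 7, 0};

  int main() {
    // Build the mask once; a compiler would fold this to a constant because
    // the table and the predicate are both known at compile time.
    uint32_t Magic = 0;
    for (unsigned i = 0; i != 8; ++i)
      if (Table[i] == 7)
        Magic |= 1u << i;

    // The per-element load + compare is now ((Magic >> i) & 1) != 0.
    for (unsigned i = 0; i != 8; ++i)
      assert((Table[i] == 7) == (((Magic >> i) & 1) != 0));
    return 0;
  }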
bool TrueIfSigned = false; if (LHSI->hasOneUse() && @@ -1333,18 +1399,19 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, } // Transform (icmp pred iM (shl iM %v, N), CI) - // -> (icmp pred i(M-N) (trunc %v iM to i(N-N)), (trunc (CI>>N)) - // Transform the shl to a trunc if (trunc (CI>>N)) has no loss. + // -> (icmp pred i(M-N) (trunc %v iM to i(M-N)), (trunc (CI>>N)) + // Transform the shl to a trunc if (trunc (CI>>N)) has no loss and M-N. // This enables to get rid of the shift in favor of a trunc which can be // free on the target. It has the additional benefit of comparing to a // smaller constant, which will be target friendly. unsigned Amt = ShAmt->getLimitedValue(TypeBits-1); - // @LOCALMOD-BEGIN - // We don't want to introduce non-power-of-two integer sizes for PNaCl's - // stable wire format, so modify this transformation for NaCl. - if (Amt != 0 && RHSV.countTrailingZeros() >= Amt && - isPowerOf2_32(TypeBits - Amt) && (TypeBits - Amt) >= 8) { - // @LOCALMOD-END + if (LHSI->hasOneUse() && + // @LOCALMOD-BEGIN + // We don't want to introduce non-power-of-two integer sizes for PNaCl's + // stable wire format, so modify this transformation for NaCl. + isPowerOf2_32(TypeBits - Amt) && (TypeBits - Amt) >= 8 && + // @LOCALMOD-END + Amt != 0 && RHSV.countTrailingZeros() >= Amt) { Type *NTy = IntegerType::get(ICI.getContext(), TypeBits - Amt); Constant *NCI = ConstantExpr::getTrunc( ConstantExpr::getAShr(RHS, @@ -1536,6 +1603,19 @@ Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI, return new ICmpInst(pred, X, NegX); } } + break; + case Instruction::Mul: + if (RHSV == 0 && BO->hasNoSignedWrap()) { + if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) { + // The trivial case (mul X, 0) is handled by InstSimplify + // General case : (mul X, C) != 0 iff X != 0 + // (mul X, C) == 0 iff X == 0 + if (!BOC->isZero()) + return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), + Constant::getNullValue(RHS->getType())); + } + } + break; default: break; } } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) { @@ -2416,6 +2496,55 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { return new ICmpInst(Pred, Y, Z); } + // icmp slt (X + -1), Y -> icmp sle X, Y + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLT && + match(B, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SLE, A, Op1); + + // icmp sge (X + -1), Y -> icmp sgt X, Y + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGE && + match(B, m_AllOnes())) + return new ICmpInst(CmpInst::ICMP_SGT, A, Op1); + + // icmp sle (X + 1), Y -> icmp slt X, Y + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SLE && + match(B, m_One())) + return new ICmpInst(CmpInst::ICMP_SLT, A, Op1); + + // icmp sgt (X + 1), Y -> icmp sge X, Y + if (A && NoOp0WrapProblem && Pred == CmpInst::ICMP_SGT && + match(B, m_One())) + return new ICmpInst(CmpInst::ICMP_SGE, A, Op1); + + // if C1 has greater magnitude than C2: + // icmp (X + C1), (Y + C2) -> icmp (X + C3), Y + // s.t. C3 = C1 - C2 + // + // if C2 has greater magnitude than C1: + // icmp (X + C1), (Y + C2) -> icmp X, (Y + C3) + // s.t. 
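The new add/compare folds above, e.g. icmp slt (X + -1), Y -> icmp sle X, Y, only hold when the add cannot wrap, hence the NoOp0WrapProblem guard in the code. A plain-C++ spot-check over a wrap-free range:

  #include <cassert>

  int main() {
    for (int x = -100; x <= 100; ++x)
      for (int y = -100; y <= 100; ++y) {
        assert(((x + -1) <  y) == (x <= y));   // slt (X + -1), Y  ==  sle X, Y
        assert(((x + -1) >= y) == (x >  y));   // sge (X + -1), Y  ==  sgt X, Y
        assert(((x +  1) <= y) == (x <  y));   // sle (X + 1), Y   ==  slt X, Y
        assert(((x +  1) >  y) == (x >= y));   // sgt (X + 1), Y   ==  sge X, Y
      }
    return 0;
  }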
C3 = C2 - C1 + if (A && C && NoOp0WrapProblem && NoOp1WrapProblem && + (BO0->hasOneUse() || BO1->hasOneUse()) && !I.isUnsigned()) + if (ConstantInt *C1 = dyn_cast<ConstantInt>(B)) + if (ConstantInt *C2 = dyn_cast<ConstantInt>(D)) { + const APInt &AP1 = C1->getValue(); + const APInt &AP2 = C2->getValue(); + if (AP1.isNegative() == AP2.isNegative()) { + APInt AP1Abs = C1->getValue().abs(); + APInt AP2Abs = C2->getValue().abs(); + if (AP1Abs.uge(AP2Abs)) { + ConstantInt *C3 = Builder->getInt(AP1 - AP2); + Value *NewAdd = Builder->CreateNSWAdd(A, C3); + return new ICmpInst(Pred, NewAdd, C); + } else { + ConstantInt *C3 = Builder->getInt(AP2 - AP1); + Value *NewAdd = Builder->CreateNSWAdd(C, C3); + return new ICmpInst(Pred, A, NewAdd); + } + } + } + + // Analyze the case when either Op0 or Op1 is a sub instruction. // Op0 = A - B (or A and B are null); Op1 = C - D (or C and D are null). A = 0; B = 0; C = 0; D = 0; @@ -2549,6 +2678,15 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) { } { Value *A, *B; + // Transform (A & ~B) == 0 --> (A & B) != 0 + // and (A & ~B) != 0 --> (A & B) == 0 + // if A is a power of 2. + if (match(Op0, m_And(m_Value(A), m_Not(m_Value(B)))) && + match(Op1, m_Zero()) && isKnownToBeAPowerOfTwo(A) && I.isEquality()) + return new ICmpInst(I.getInversePredicate(), + Builder->CreateAnd(A, B), + Op1); + // ~x < ~y --> y < x // ~x < cst --> ~cst < x if (match(Op0, m_Not(m_Value(A)))) { diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 337cfe32a8..e2d7966cb3 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -69,8 +69,8 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(U)) { // If the GEP has all zero indices, it doesn't offset the pointer. If it // doesn't, it does. - if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy, ToDelete, - IsOffset || !GEP->hasAllZeroIndices())) + if (!isOnlyCopiedFromConstantGlobal( + GEP, TheCopy, ToDelete, IsOffset || !GEP->hasAllZeroIndices())) return false; continue; } @@ -166,7 +166,7 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) { // Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1 if (AI.isArrayAllocation()) { // Check C != 1 if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) { - Type *NewTy = + Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue()); AllocaInst *New = Builder->CreateAlloca(NewTy, 0, AI.getName()); New->setAlignment(AI.getAlignment()); @@ -294,7 +294,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, Type *SrcPTy = SrcTy->getElementType(); - if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || + if (DestPTy->isIntegerTy() || DestPTy->isPointerTy() || DestPTy->isVectorTy()) { // If the source is an array, the code below will not succeed. Check to // see if a trivial 'gep P, 0, 0' will help matters. Only do this for @@ -311,7 +311,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, } if (IC.getDataLayout() && - (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || + (SrcPTy->isIntegerTy() || SrcPTy->isPointerTy() || SrcPTy->isVectorTy()) && // Do not allow turning this into a load of an integer, which is then // casted to a pointer, this pessimizes pointer analysis a lot. 
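The new equality fold (A & ~B) == 0 --> (A & B) != 0 depends on A being a power of two: its single bit is either inside B or outside it. A brute-force check in plain C++, independent of the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (unsigned Bit = 0; Bit < 16; ++Bit) {
      uint16_t A = uint16_t(1u << Bit);            // a power of two
      for (unsigned B = 0; B < 0x10000; ++B) {
        bool Lhs = (A & uint16_t(~B)) == 0;        // (A & ~B) == 0
        bool Rhs = (A & uint16_t(B)) != 0;         // (A & B) != 0
        assert(Lhs == Rhs);
      }
    }
    return 0;
  }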
@@ -322,7 +322,7 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, // Okay, we are casting from one integer or pointer type to another of // the same size. Instead of casting the pointer before the load, cast // the result of the loaded value. - LoadInst *NewLoad = + LoadInst *NewLoad = IC.Builder->CreateLoad(CastOp, LI.isVolatile(), CI->getName()); NewLoad->setAlignment(LI.getAlignment()); NewLoad->setAtomic(LI.getOrdering(), LI.getSynchScope()); @@ -359,7 +359,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { // None of the following transforms are legal for volatile/atomic loads. // FIXME: Some of it is okay for atomic loads; needs refactoring. if (!LI.isSimple()) return 0; - + // Do really simple store-to-load forwarding and load CSE, to catch cases // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. @@ -380,7 +380,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Constant::getNullValue(Op->getType()), &LI); return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType())); } - } + } // load null/undef -> unreachable // TODO: Consider a target hook for valid address spaces for this xform. @@ -399,7 +399,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { if (CE->isCast()) if (Instruction *Res = InstCombineLoadCast(*this, LI, TD)) return Res; - + if (Op->hasOneUse()) { // Change select and PHI nodes to select values instead of addresses: this // helps alias analysis out a lot, allows many others simplifications, and @@ -453,18 +453,18 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { Type *DestPTy = cast<PointerType>(CI->getType())->getElementType(); PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType()); if (SrcTy == 0) return 0; - + Type *SrcPTy = SrcTy->getElementType(); if (!DestPTy->isIntegerTy() && !DestPTy->isPointerTy()) return 0; - + /// NewGEPIndices - If SrcPTy is an aggregate type, we can emit a "noop gep" /// to its first element. This allows us to handle things like: /// store i32 xxx, (bitcast {foo*, float}* %P to i32*) /// on 32-bit hosts. SmallVector<Value*, 4> NewGEPIndices; - + // If the source is an array, the code below will not succeed. Check to // see if a trivial 'gep P, 0, 0' will help matters. Only do this for // constants. @@ -472,7 +472,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { // Index through pointer. Constant *Zero = Constant::getNullValue(Type::getInt32Ty(SI.getContext())); NewGEPIndices.push_back(Zero); - + while (1) { if (StructType *STy = dyn_cast<StructType>(SrcPTy)) { if (!STy->getNumElements()) /* Struct can be empty {} */ @@ -486,24 +486,24 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { break; } } - + SrcTy = PointerType::get(SrcPTy, SrcTy->getAddressSpace()); } if (!SrcPTy->isIntegerTy() && !SrcPTy->isPointerTy()) return 0; - + // If the pointers point into different address spaces or if they point to // values with different sizes, we can't do the transformation. if (!IC.getDataLayout() || - SrcTy->getAddressSpace() != + SrcTy->getAddressSpace() != cast<PointerType>(CI->getType())->getAddressSpace() || IC.getDataLayout()->getTypeSizeInBits(SrcPTy) != IC.getDataLayout()->getTypeSizeInBits(DestPTy)) return 0; // Okay, we are casting from one integer or pointer type to another of - // the same size. Instead of casting the pointer before + // the same size. 
Instead of casting the pointer before // the store, cast the value to be stored. Value *NewCast; Value *SIOp0 = SI.getOperand(0); @@ -517,12 +517,12 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { if (SIOp0->getType()->isPointerTy()) opcode = Instruction::PtrToInt; } - + // SIOp0 is a pointer to aggregate and this is a store to the first field, // emit a GEP to index into its first field. if (!NewGEPIndices.empty()) CastOp = IC.Builder->CreateInBoundsGEP(CastOp, NewGEPIndices); - + NewCast = IC.Builder->CreateCast(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"); SI.setOperand(0, NewCast); @@ -541,7 +541,7 @@ static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) { static bool equivalentAddressValues(Value *A, Value *B) { // Test if the values are trivially equivalent. if (A == B) return true; - + // Test if the values come form identical arithmetic instructions. // This uses isIdenticalToWhenDefined instead of isIdenticalTo because // its only used to compare two uses within the same basic block, which @@ -554,7 +554,7 @@ static bool equivalentAddressValues(Value *A, Value *B) { if (Instruction *BI = dyn_cast<Instruction>(B)) if (cast<Instruction>(A)->isIdenticalToWhenDefined(BI)) return true; - + // Otherwise they may not be equivalent. return false; } @@ -585,7 +585,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { // If the RHS is an alloca with a single use, zapify the store, making the // alloca dead. if (Ptr->hasOneUse()) { - if (isa<AllocaInst>(Ptr)) + if (isa<AllocaInst>(Ptr)) return EraseInstFromFunction(SI); if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) { if (isa<AllocaInst>(GEP->getOperand(0))) { @@ -608,8 +608,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy())) { ScanInsts++; continue; - } - + } + if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) { // Prev store isn't volatile, and stores to the same location? if (PrevSI->isSimple() && equivalentAddressValues(PrevSI->getOperand(1), @@ -621,7 +621,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { } break; } - + // If this is a load, we have to stop. However, if the loaded value is from // the pointer we're loading and is producing the pointer we're storing, // then *this* store is dead (X = load P; store X -> P). @@ -629,12 +629,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (LI == Val && equivalentAddressValues(LI->getOperand(0), Ptr) && LI->isSimple()) return EraseInstFromFunction(SI); - + // Otherwise, this is a load from some other location. Stores before it // may not be dead. break; } - + // Don't skip over loads or things that can modify memory. if (BBI->mayWriteToMemory() || BBI->mayReadFromMemory()) break; @@ -664,11 +664,11 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (Instruction *Res = InstCombineStoreToCast(*this, SI)) return Res; - + // If this store is the last instruction in the basic block (possibly // excepting debug info instructions), and if the block ends with an // unconditional branch, try to move it to the successor block. - BBI = &SI; + BBI = &SI; do { ++BBI; } while (isa<DbgInfoIntrinsic>(BBI) || @@ -677,7 +677,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { if (BI->isUnconditional()) if (SimplifyStoreAtEndOfBlock(SI)) return 0; // xform done! 
- + return 0; } @@ -691,12 +691,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) { /// bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { BasicBlock *StoreBB = SI.getParent(); - + // Check to see if the successor block has exactly two incoming edges. If // so, see if the other predecessor contains a store to the same location. // if so, insert a PHI node (if needed) and move the stores down. BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0); - + // Determine whether Dest has exactly two predecessors and, if so, compute // the other predecessor. pred_iterator PI = pred_begin(DestBB); @@ -708,7 +708,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { if (++PI == pred_end(DestBB)) return false; - + P = *PI; if (P != StoreBB) { if (OtherBB) @@ -728,7 +728,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { BranchInst *OtherBr = dyn_cast<BranchInst>(BBI); if (!OtherBr || BBI == OtherBB->begin()) return false; - + // If the other block ends in an unconditional branch, check for the 'if then // else' case. there is an instruction before the branch. StoreInst *OtherStore = 0; @@ -750,10 +750,10 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { } else { // Otherwise, the other block ended with a conditional branch. If one of the // destinations is StoreBB, then we have the if/then case. - if (OtherBr->getSuccessor(0) != StoreBB && + if (OtherBr->getSuccessor(0) != StoreBB && OtherBr->getSuccessor(1) != StoreBB) return false; - + // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an // if/then triangle. See if there is a store to the same ptr as SI that // lives in OtherBB. @@ -771,7 +771,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { BBI == OtherBB->begin()) return false; } - + // In order to eliminate the store in OtherBr, we have to // make sure nothing reads or overwrites the stored value in // StoreBB. @@ -781,7 +781,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { return false; } } - + // Insert a PHI node now if we need it. Value *MergedVal = OtherStore->getOperand(0); if (MergedVal != SI.getOperand(0)) { @@ -790,7 +790,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { PN->addIncoming(OtherStore->getOperand(0), OtherBB); MergedVal = InsertNewInstBefore(PN, DestBB->front()); } - + // Advance to a place where it is safe to insert the new store and // insert it. BBI = DestBB->getFirstInsertionPt(); @@ -800,7 +800,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { SI.getOrdering(), SI.getSynchScope()); InsertNewInstBefore(NewSI, *BBI); - NewSI->setDebugLoc(OtherStore->getDebugLoc()); + NewSI->setDebugLoc(OtherStore->getDebugLoc()); // If the two stores had the same TBAA tag, preserve it. if (MDNode *TBAATag = SI.getMetadata(LLVMContext::MD_tbaa)) @@ -808,7 +808,7 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) { OtherStore->getMetadata(LLVMContext::MD_tbaa)))) NewSI->setMetadata(LLVMContext::MD_tbaa, TBAATag); - + // Nuke the old stores. EraseInstFromFunction(SI); EraseInstFromFunction(*OtherStore); diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 8e4267f898..ecc9fc3e45 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -28,7 +28,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // if this is safe. 
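SimplifyStoreAtEndOfBlock, whose body is only reindented in these hunks, sinks two stores to the same address from the two predecessors of a join block into the join block itself, behind a PHI of the stored values. A source-level analogue in plain C++, with function names invented for illustration:

  #include <cassert>

  static void before(bool Cond, int *P, int A, int B) {
    if (Cond)
      *P = A;    // store in the "then" predecessor
    else
      *P = B;    // store in the "else" predecessor
  }

  static void after(bool Cond, int *P, int A, int B) {
    int Merged = Cond ? A : B;   // the PHI of the two stored values
    *P = Merged;                 // the single sunk store in the join block
  }

  int main() {
    int X = 0, Y = 0;
    before(true, &X, 1, 2);  after(true, &Y, 1, 2);  assert(X == Y);
    before(false, &X, 1, 2); after(false, &Y, 1, 2); assert(X == Y);
    return 0;
  }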
For example, the use could be in dynamically unreached // code. if (!V->hasOneUse()) return 0; - + bool MadeChange = false; // ((1 << A) >>u B) --> (1 << (A-B)) @@ -41,7 +41,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { A = IC.Builder->CreateSub(A, B); return IC.Builder->CreateShl(PowerOf2, A); } - + // (PowerOfTwo >>u B) --> isExact since shifting out the result would make it // inexact. Similarly for <<. if (BinaryOperator *I = dyn_cast<BinaryOperator>(V)) @@ -52,12 +52,12 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { I->setOperand(0, V2); MadeChange = true; } - + if (I->getOpcode() == Instruction::LShr && !I->isExact()) { I->setIsExact(); MadeChange = true; } - + if (I->getOpcode() == Instruction::Shl && !I->hasNoUnsignedWrap()) { I->setHasNoUnsignedWrap(); MadeChange = true; @@ -67,7 +67,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC) { // TODO: Lots more we could do here: // If V is a phi node, we can call this on each of its operands. // "select cond, X, 0" can simplify to "X". - + return MadeChange ? V : 0; } @@ -84,12 +84,12 @@ static bool MultiplyOverflows(ConstantInt *C1, ConstantInt *C2, bool sign) { LHSExt = LHSExt.zext(W * 2); RHSExt = RHSExt.zext(W * 2); } - + APInt MulExt = LHSExt * RHSExt; - + if (!sign) return MulExt.ugt(APInt::getLowBitsSet(W * 2, W)); - + APInt Min = APInt::getSignedMinValue(W).sext(W * 2); APInt Max = APInt::getSignedMaxValue(W).sext(W * 2); return MulExt.slt(Min) || MulExt.sgt(Max); @@ -107,16 +107,16 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (match(Op1, m_AllOnes())) // X * -1 == 0 - X return BinaryOperator::CreateNeg(Op0, I.getName()); - + if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - + // ((X << C1)*C2) == (X * (C2 << C1)) if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0)) if (SI->getOpcode() == Instruction::Shl) if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1))) return BinaryOperator::CreateMul(SI->getOperand(0), ConstantExpr::getShl(CI, ShOp)); - + const APInt &Val = CI->getValue(); if (Val.isPowerOf2()) { // Replace X*(2^C) with X << C Constant *NewCst = ConstantInt::get(Op0->getType(), Val.logBase2()); @@ -125,7 +125,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (I.hasNoUnsignedWrap()) Shl->setHasNoUnsignedWrap(); return Shl; } - + // Canonicalize (X+C1)*CI -> X*CI+C1*CI. { Value *X; ConstantInt *C1; if (Op0->hasOneUse() && @@ -158,9 +158,9 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { } } } - + // Simplify mul instructions with a constant RHS. - if (isa<Constant>(Op1)) { + if (isa<Constant>(Op1)) { // Try to fold constant mul into select arguments. if (SelectInst *SI = dyn_cast<SelectInst>(Op0)) if (Instruction *R = FoldOpIntoSelect(I, SI)) @@ -181,7 +181,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { Value *Op1C = Op1; BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0); if (!BO || - (BO->getOpcode() != Instruction::UDiv && + (BO->getOpcode() != Instruction::UDiv && BO->getOpcode() != Instruction::SDiv)) { Op1C = Op0; BO = dyn_cast<BinaryOperator>(Op1); @@ -227,14 +227,14 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (match(Op1, m_Shl(m_One(), m_Value(Y)))) return BinaryOperator::CreateShl(Op0, Y); } - + // If one of the operands of the multiply is a cast from a boolean value, then // we know the bool is either zero or one, so this is a 'masking' multiply. 
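Two of the multiply rewrites referenced in this file, X*(2^C) -> X << C and ((X << C1)*C2) -> X*(C2 << C1), hold in wrapping unsigned arithmetic. A quick standalone check:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X = 0; X < 1000; X += 7)
      for (unsigned C1 = 0; C1 < 8; ++C1)
        for (uint32_t C2 = 1; C2 < 100; C2 += 9) {
          assert(X * (uint32_t(1) << C1) == (X << C1));      // X*(2^C) == X << C
          assert(((X << C1) * C2) == (X * (C2 << C1)));      // reassociate the shift
        }
    return 0;
  }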
// X * Y (where Y is 0 or 1) -> X & (0-Y) if (!I.getType()->isVectorTy()) { // -2 is "-1 << 1" so it is all bits set except the low one. APInt Negative2(I.getType()->getPrimitiveSizeInBits(), (uint64_t)-2, true); - + Value *BoolCast = 0, *OtherOp = 0; if (MaskedValueIsZero(Op0, Negative2)) BoolCast = Op0, OtherOp = Op1; @@ -280,7 +280,7 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) { return; if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra()) return; - + ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0)); if (CFP && CFP->isExactlyValue(0.5)) { Y = I->getOperand(1); @@ -289,14 +289,14 @@ static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) { CFP = dyn_cast<ConstantFP>(I->getOperand(1)); if (CFP && CFP->isExactlyValue(0.5)) Y = I->getOperand(0); -} +} /// Helper function of InstCombiner::visitFMul(BinaryOperator(). It returns /// true iff the given value is FMul or FDiv with one and only one operand /// being a normal constant (i.e. not Zero/NaN/Infinity). static bool isFMulOrFDivWithConstant(Value *V) { Instruction *I = dyn_cast<Instruction>(V); - if (!I || (I->getOpcode() != Instruction::FMul && + if (!I || (I->getOpcode() != Instruction::FMul && I->getOpcode() != Instruction::FDiv)) return false; @@ -318,10 +318,10 @@ static bool isNormalFp(const ConstantFP *C) { /// foldFMulConst() is a helper routine of InstCombiner::visitFMul(). /// The input \p FMulOrDiv is a FMul/FDiv with one and only one operand /// being a constant (i.e. isFMulOrFDivWithConstant(FMulOrDiv) == true). -/// This function is to simplify "FMulOrDiv * C" and returns the +/// This function is to simplify "FMulOrDiv * C" and returns the /// resulting expression. Note that this function could return NULL in /// case the constants cannot be folded into a normal floating-point. -/// +/// Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, Instruction *InsertBefore) { assert(isFMulOrFDivWithConstant(FMulOrDiv) && "V is invalid"); @@ -351,7 +351,7 @@ Value *InstCombiner::foldFMulConst(Instruction *FMulOrDiv, ConstantFP *C, if (isNormalFp(F)) { R = BinaryOperator::CreateFMul(Opnd0, F); } else { - // (X / C1) * C => X / (C1/C) + // (X / C1) * C => X / (C1/C) Constant *F = ConstantExpr::getFDiv(C1, C); if (isNormalFp(cast<ConstantFP>(F))) R = BinaryOperator::CreateFDiv(Opnd0, F); @@ -402,7 +402,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { return ReplaceInstUsesWith(I, V); } - // (MDC +/- C1) * C2 => (MDC * C2) +/- (C1 * C2) + // (MDC +/- C1) * C => (MDC * C) +/- (C1 * C) Instruction *FAddSub = dyn_cast<Instruction>(Op0); if (FAddSub && (FAddSub->getOpcode() == Instruction::FAdd || @@ -415,13 +415,13 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { if (C0) { std::swap(C0, C1); std::swap(Opnd0, Opnd1); - Swap = true; + Swap = true; } if (C1 && C1->getValueAPF().isNormal() && isFMulOrFDivWithConstant(Opnd0)) { - Value *M0 = ConstantExpr::getFMul(C1, C); - Value *M1 = isNormalFp(cast<ConstantFP>(M0)) ? + Value *M1 = ConstantExpr::getFMul(C1, C); + Value *M0 = isNormalFp(cast<ConstantFP>(M1)) ? foldFMulConst(cast<Instruction>(Opnd0), C, &I) : 0; if (M0 && M1) { @@ -495,7 +495,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } // (X*Y) * X => (X*X) * Y where Y != X - // The purpose is two-fold: + // The purpose is two-fold: // 1) to form a power expression (of X). 
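The "masking multiply" described just above, X * Y -> X & (0-Y) when Y is known to be 0 or 1, works because 0-1 is the all-ones mask under unsigned wraparound. A small check outside the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X = 0; X < 100000; X += 123)
      for (uint32_t Y = 0; Y <= 1; ++Y)
        assert(X * Y == (X & (0u - Y)));   // 0-0 = 0 mask, 0-1 = all-ones mask
    return 0;
  }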
// 2) potentially shorten the critical path: After transformation, the // latency of the instruction Y is amortized by the expression of X*X, @@ -524,6 +524,35 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { } } + // B * (uitofp i1 C) -> select C, B, 0 + if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) { + Value *LHS = Op0, *RHS = Op1; + Value *B, *C; + if (!match(RHS, m_UIToFp(m_Value(C)))) + std::swap(LHS, RHS); + + if (match(RHS, m_UIToFp(m_Value(C))) && C->getType()->isIntegerTy(1)) { + B = LHS; + Value *Zero = ConstantFP::getNegativeZero(B->getType()); + return SelectInst::Create(C, B, Zero); + } + } + + // A * (1 - uitofp i1 C) -> select C, 0, A + if (I.hasNoNaNs() && I.hasNoInfs() && I.hasNoSignedZeros()) { + Value *LHS = Op0, *RHS = Op1; + Value *A, *C; + if (!match(RHS, m_FSub(m_FPOne(), m_UIToFp(m_Value(C))))) + std::swap(LHS, RHS); + + if (match(RHS, m_FSub(m_FPOne(), m_UIToFp(m_Value(C)))) && + C->getType()->isIntegerTy(1)) { + A = LHS; + Value *Zero = ConstantFP::getNegativeZero(A->getType()); + return SelectInst::Create(C, Zero, A); + } + } + if (!isa<Constant>(Op1)) std::swap(Opnd0, Opnd1); else @@ -537,7 +566,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) { /// instruction. bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { SelectInst *SI = cast<SelectInst>(I.getOperand(1)); - + // div/rem X, (Cond ? 0 : Y) -> div/rem X, Y int NonNullOperand = -1; if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1))) @@ -547,36 +576,36 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2))) if (ST->isNullValue()) NonNullOperand = 1; - + if (NonNullOperand == -1) return false; - + Value *SelectCond = SI->getOperand(0); - + // Change the div/rem to use 'Y' instead of the select. I.setOperand(1, SI->getOperand(NonNullOperand)); - + // Okay, we know we replace the operand of the div/rem with 'Y' with no // problem. However, the select, or the condition of the select may have // multiple uses. Based on our knowledge that the operand must be non-zero, // propagate the known value for the select into other uses of it, and // propagate a known value of the condition into its other users. - + // If the select and condition only have a single use, don't bother with this, // early exit. if (SI->use_empty() && SelectCond->hasOneUse()) return true; - + // Scan the current block backward, looking for other uses of SI. BasicBlock::iterator BBI = &I, BBFront = I.getParent()->begin(); - + while (BBI != BBFront) { --BBI; // If we found a call to a function, we can't assume it will return, so // information from below it cannot be propagated above it. if (isa<CallInst>(BBI) && !isa<IntrinsicInst>(BBI)) break; - + // Replace uses of the select or its condition with the known values. for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end(); I != E; ++I) { @@ -589,17 +618,17 @@ bool InstCombiner::SimplifyDivRemOfSelect(BinaryOperator &I) { Worklist.Add(BBI); } } - + // If we past the instruction, quit looking for it. if (&*BBI == SI) SI = 0; if (&*BBI == SelectCond) SelectCond = 0; - + // If we ran out of things to eliminate, break out of the loop. if (SelectCond == 0 && SI == 0) break; - + } return true; } @@ -617,7 +646,7 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) { I.setOperand(1, V); return &I; } - + // Handle cases involving: [su]div X, (select Cond, Y, Z) // This does not apply for fdiv. 
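The new B * (uitofp i1 C) -> select C, B, 0 fold is gated on nnan/ninf/nsz and uses negative zero as the select constant, since B * 0.0 yields -0.0 for negative B. A plain-C++ illustration of the value-level equivalence, over a few illustrative inputs only:

  #include <cassert>

  int main() {
    double Vals[] = {-3.5, -0.0, 0.0, 2.25, 1e9};
    for (double B : Vals)
      for (int C = 0; C <= 1; ++C) {
        double Mul = B * double(C);    // B * (uitofp i1 C)
        double Sel = C ? B : -0.0;     // select C, B, -0.0
        assert(Mul == Sel);            // equal as values; -0.0 == +0.0 compares true
      }
    return 0;
  }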
if (isa<SelectInst>(Op1) && SimplifyDivRemOfSelect(I)) @@ -683,16 +712,16 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { // Handle the integer div common cases if (Instruction *Common = commonIDivTransforms(I)) return Common; - - { + + { // X udiv 2^C -> X >> C // Check to see if this is an unsigned division with an exact power of 2, // if so, convert to a right shift. const APInt *C; if (match(Op1, m_Power2(C))) { BinaryOperator *LShr = - BinaryOperator::CreateLShr(Op0, - ConstantInt::get(Op0->getType(), + BinaryOperator::CreateLShr(Op0, + ConstantInt::get(Op0->getType(), C->logBase2())); if (I.isExact()) LShr->setIsExact(); return LShr; @@ -732,7 +761,7 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { return BinaryOperator::CreateLShr(Op0, N); } } - + // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2) // where C1&C2 are powers of two. { Value *Cond; const APInt *C1, *C2; @@ -740,11 +769,11 @@ Instruction *InstCombiner::visitUDiv(BinaryOperator &I) { // Construct the "on true" case of the select Value *TSI = Builder->CreateLShr(Op0, C1->logBase2(), Op1->getName()+".t", I.isExact()); - + // Construct the "on false" case of the select Value *FSI = Builder->CreateLShr(Op0, C2->logBase2(), Op1->getName()+".f", I.isExact()); - + // construct the select instruction and return it. return SelectInst::Create(Cond, TSI, FSI); } @@ -799,7 +828,7 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { // X sdiv Y -> X udiv Y, iff X and Y don't have sign bit set return BinaryOperator::CreateUDiv(Op0, Op1, I.getName()); } - + if (match(Op1, m_Shl(m_Power2(), m_Value()))) { // X sdiv (1 << Y) -> X udiv (1 << Y) ( -> X u>> Y) // Safe because the only negative value (1 << Y) can take on is @@ -809,13 +838,13 @@ Instruction *InstCombiner::visitSDiv(BinaryOperator &I) { } } } - + return 0; } /// CvtFDivConstToReciprocal tries to convert X/C into X*1/C if C not a special /// FP value and: -/// 1) 1/C is exact, or +/// 1) 1/C is exact, or /// 2) reciprocal is allowed. /// If the convertion was successful, the simplified expression "X * 1/C" is /// returned; otherwise, NULL is returned. @@ -826,7 +855,7 @@ static Instruction *CvtFDivConstToReciprocal(Value *Dividend, const APFloat &FpVal = Divisor->getValueAPF(); APFloat Reciprocal(FpVal.getSemantics()); bool Cvt = FpVal.getExactInverse(&Reciprocal); - + if (!Cvt && AllowReciprocal && FpVal.isNormal()) { Reciprocal = APFloat(FpVal.getSemantics(), 1.0f); (void)Reciprocal.divide(FpVal, APFloat::rmNearestTiesToEven); @@ -870,10 +899,10 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Constant *C = ConstantExpr::getFMul(C1, C2); const APFloat &F = cast<ConstantFP>(C)->getValueAPF(); if (F.isNormal() && !F.isDenormal()) { - Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), + Res = CvtFDivConstToReciprocal(X, cast<ConstantFP>(C), AllowReciprocal); if (!Res) - Res = BinaryOperator::CreateFDiv(X, C); + Res = BinaryOperator::CreateFDiv(X, C); } } @@ -911,7 +940,7 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { if (Fold) { const APFloat &FoldC = cast<ConstantFP>(Fold)->getValueAPF(); if (FoldC.isNormal() && !FoldC.isDenormal()) { - Instruction *R = CreateDiv ? + Instruction *R = CreateDiv ? 
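The X udiv 2^C -> X >> C rewrite handled in this hunk is the usual power-of-two strength reduction; a standalone check in plain C++:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X = 0; X < 100000; X += 97)
      for (unsigned C = 0; C < 16; ++C)
        assert(X / (uint32_t(1) << C) == (X >> C));   // udiv by 2^C == lshr by C
    return 0;
  }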
BinaryOperator::CreateFDiv(Fold, X) : BinaryOperator::CreateFMul(X, Fold); R->setFastMathFlags(I.getFastMathFlags()); @@ -997,7 +1026,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { if (Instruction *common = commonIRemTransforms(I)) return common; - + // X urem C^2 -> X and C-1 { const APInt *C; if (match(Op1, m_Power2(C))) @@ -1005,7 +1034,7 @@ Instruction *InstCombiner::visitURem(BinaryOperator &I) { ConstantInt::get(I.getType(), *C-1)); } - // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) + // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1) if (match(Op1, m_Shl(m_Power2(), m_Value()))) { Constant *N1 = Constant::getAllOnesValue(I.getType()); Value *Add = Builder->CreateAdd(Op1, N1); @@ -1041,7 +1070,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) { // Handle the integer rem common cases if (Instruction *Common = commonIRemTransforms(I)) return Common; - + if (Value *RHSNeg = dyn_castNegVal(Op1)) if (!isa<Constant>(RHSNeg) || (isa<ConstantInt>(RHSNeg) && diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp index b0a998cca7..bd14e81c3f 100644 --- a/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -27,10 +27,10 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { unsigned Opc = FirstInst->getOpcode(); Value *LHSVal = FirstInst->getOperand(0); Value *RHSVal = FirstInst->getOperand(1); - + Type *LHSType = LHSVal->getType(); Type *RHSType = RHSVal->getType(); - + bool isNUW = false, isNSW = false, isExact = false; if (OverflowingBinaryOperator *BO = dyn_cast<OverflowingBinaryOperator>(FirstInst)) { @@ -39,7 +39,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { } else if (PossiblyExactOperator *PEO = dyn_cast<PossiblyExactOperator>(FirstInst)) isExact = PEO->isExact(); - + // Scan to see if all operands are the same opcode, and all have one use. for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i)); @@ -54,14 +54,14 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { if (CmpInst *CI = dyn_cast<CmpInst>(I)) if (CI->getPredicate() != cast<CmpInst>(FirstInst)->getPredicate()) return 0; - + if (isNUW) isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap(); if (isNSW) isNSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap(); if (isExact) isExact = cast<PossiblyExactOperator>(I)->isExact(); - + // Keep track of which operand needs a phi node. if (I->getOperand(0) != LHSVal) LHSVal = 0; if (I->getOperand(1) != RHSVal) RHSVal = 0; @@ -73,9 +73,9 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { // bad when the PHIs are in the header of a loop. if (!LHSVal && !RHSVal) return 0; - + // Otherwise, this is safe to transform! - + Value *InLHS = FirstInst->getOperand(0); Value *InRHS = FirstInst->getOperand(1); PHINode *NewLHS = 0, *NewRHS = 0; @@ -86,7 +86,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { InsertNewInstBefore(NewLHS, PN); LHSVal = NewLHS; } - + if (RHSVal == 0) { NewRHS = PHINode::Create(RHSType, PN.getNumIncomingValues(), FirstInst->getOperand(1)->getName() + ".pn"); @@ -94,7 +94,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { InsertNewInstBefore(NewRHS, PN); RHSVal = NewRHS; } - + // Add all operands to the new PHIs. 
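Likewise for the remainder rewrites in visitURem above, X urem 2^k -> X & (2^k - 1), which also covers A % (C << N) when C is a power of two since the shifted divisor is again a power of two. A quick check separate from the patch:

  #include <cassert>
  #include <cstdint>

  int main() {
    for (uint32_t X = 0; X < 100000; X += 101)
      for (unsigned K = 0; K < 16; ++K) {
        uint32_t D = uint32_t(1) << K;        // a power-of-two divisor
        assert(X % D == (X & (D - 1)));       // urem == mask with D-1
      }
    return 0;
  }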
if (NewLHS || NewRHS) { for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { @@ -109,7 +109,7 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { } } } - + if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst)) { CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal, RHSVal); @@ -129,8 +129,8 @@ Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) { Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { GetElementPtrInst *FirstInst =cast<GetElementPtrInst>(PN.getIncomingValue(0)); - - SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), + + SmallVector<Value*, 16> FixedOperands(FirstInst->op_begin(), FirstInst->op_end()); // This is true if all GEP bases are allocas and if all indices into them are // constants. @@ -140,9 +140,9 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { // more than one phi, which leads to higher register pressure. This is // especially bad when the PHIs are in the header of a loop. bool NeededPhi = false; - + bool AllInBounds = true; - + // Scan to see if all operands are the same opcode, and all have one use. for (unsigned i = 1; i != PN.getNumIncomingValues(); ++i) { GetElementPtrInst *GEP= dyn_cast<GetElementPtrInst>(PN.getIncomingValue(i)); @@ -151,18 +151,18 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { return 0; AllInBounds &= GEP->isInBounds(); - + // Keep track of whether or not all GEPs are of alloca pointers. if (AllBasePointersAreAllocas && (!isa<AllocaInst>(GEP->getOperand(0)) || !GEP->hasAllConstantIndices())) AllBasePointersAreAllocas = false; - + // Compare the operand lists. for (unsigned op = 0, e = FirstInst->getNumOperands(); op != e; ++op) { if (FirstInst->getOperand(op) == GEP->getOperand(op)) continue; - + // Don't merge two GEPs when two operands differ (introducing phi nodes) // if one of the PHIs has a constant for the index. The index may be // substantially cheaper to compute for the constants, so making it a @@ -171,7 +171,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { if (isa<ConstantInt>(FirstInst->getOperand(op)) || isa<ConstantInt>(GEP->getOperand(op))) return 0; - + if (FirstInst->getOperand(op)->getType() !=GEP->getOperand(op)->getType()) return 0; @@ -186,7 +186,7 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { NeededPhi = true; } } - + // If all of the base pointers of the PHI'd GEPs are from allocas, don't // bother doing this transformation. At best, this will just save a bit of // offset calculation, but all the predecessors will have to materialize the @@ -195,11 +195,11 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { // which can usually all be folded into the load. if (AllBasePointersAreAllocas) return 0; - + // Otherwise, this is safe to transform. Insert PHI nodes for each operand // that is variable. SmallVector<PHINode*, 16> OperandPhis(FixedOperands.size()); - + bool HasAnyPHIs = false; for (unsigned i = 0, e = FixedOperands.size(); i != e; ++i) { if (FixedOperands[i]) continue; // operand doesn't need a phi. @@ -207,28 +207,28 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { PHINode *NewPN = PHINode::Create(FirstOp->getType(), e, FirstOp->getName()+".pn"); InsertNewInstBefore(NewPN, PN); - + NewPN->addIncoming(FirstOp, PN.getIncomingBlock(0)); OperandPhis[i] = NewPN; FixedOperands[i] = NewPN; HasAnyPHIs = true; } - + // Add all operands to the new PHIs. 
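FoldPHIArgBinOpIntoPHI and FoldPHIArgGEPIntoPHI, only reindented in these hunks, pull a repeated operation through a PHI when every incoming value is the same opcode with a shared operand. A source-level analogue in plain C++ with invented names:

  #include <cassert>

  static int before(bool Cond, int A, int B, int K) {
    return Cond ? (A + K) : (B + K);   // phi of two adds with a shared operand K
  }

  static int after(bool Cond, int A, int B, int K) {
    int Phi = Cond ? A : B;            // phi of the varying operands only
    return Phi + K;                    // one add after the phi
  }

  int main() {
    for (int A = -5; A <= 5; ++A)
      for (int B = -5; B <= 5; ++B)
        for (int K = -5; K <= 5; ++K) {
          assert(before(true, A, B, K)  == after(true, A, B, K));
          assert(before(false, A, B, K) == after(false, A, B, K));
        }
    return 0;
  }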
if (HasAnyPHIs) { for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { GetElementPtrInst *InGEP =cast<GetElementPtrInst>(PN.getIncomingValue(i)); BasicBlock *InBB = PN.getIncomingBlock(i); - + for (unsigned op = 0, e = OperandPhis.size(); op != e; ++op) if (PHINode *OpPhi = OperandPhis[op]) OpPhi->addIncoming(InGEP->getOperand(op), InBB); } } - + Value *Base = FixedOperands[0]; - GetElementPtrInst *NewGEP = + GetElementPtrInst *NewGEP = GetElementPtrInst::Create(Base, makeArrayRef(FixedOperands).slice(1)); if (AllInBounds) NewGEP->setIsInBounds(); NewGEP->setDebugLoc(FirstInst->getDebugLoc()); @@ -246,11 +246,11 @@ Instruction *InstCombiner::FoldPHIArgGEPIntoPHI(PHINode &PN) { /// to a register. static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { BasicBlock::iterator BBI = L, E = L->getParent()->end(); - + for (++BBI; BBI != E; ++BBI) if (BBI->mayWriteToMemory()) return false; - + // Check for non-address taken alloca. If not address-taken already, it isn't // profitable to do this xform. if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) { @@ -266,11 +266,11 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { isAddressTaken = true; break; } - + if (!isAddressTaken && AI->isStaticAlloca()) return false; } - + // If this load is a load from a GEP with a constant offset from an alloca, // then we don't want to sink it. In its present form, it will be // load [constant stack offset]. Sinking it will cause us to have to @@ -280,7 +280,7 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) { if (AllocaInst *AI = dyn_cast<AllocaInst>(GEP->getOperand(0))) if (AI->isStaticAlloca() && GEP->hasAllConstantIndices()) return false; - + return true; } @@ -300,41 +300,41 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { bool isVolatile = FirstLI->isVolatile(); unsigned LoadAlignment = FirstLI->getAlignment(); unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace(); - + // We can't sink the load if the loaded value could be modified between the // load and the PHI. if (FirstLI->getParent() != PN.getIncomingBlock(0) || !isSafeAndProfitableToSinkLoad(FirstLI)) return 0; - + // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from // the path through the other successor. - if (isVolatile && + if (isVolatile && FirstLI->getParent()->getTerminator()->getNumSuccessors() != 1) return 0; - + // Check to see if all arguments are the same operation. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { LoadInst *LI = dyn_cast<LoadInst>(PN.getIncomingValue(i)); if (!LI || !LI->hasOneUse()) return 0; - - // We can't sink the load if the loaded value could be modified between + + // We can't sink the load if the loaded value could be modified between // the load and the PHI. if (LI->isVolatile() != isVolatile || LI->getParent() != PN.getIncomingBlock(i) || LI->getPointerAddressSpace() != LoadAddrSpace || !isSafeAndProfitableToSinkLoad(LI)) return 0; - + // If some of the loads have an alignment specified but not all of them, // we can't do the transformation. if ((LoadAlignment != 0) != (LI->getAlignment() != 0)) return 0; - + LoadAlignment = std::min(LoadAlignment, LI->getAlignment()); - + // If the PHI is of volatile loads and the load block has multiple // successors, sinking it would remove a load of the volatile value from // the path through the other successor. 
@@ -342,16 +342,16 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { LI->getParent()->getTerminator()->getNumSuccessors() != 1) return 0; } - + // Okay, they are all the same operation. Create a new PHI node of the // correct type, and PHI together all of the LHS's of the instructions. PHINode *NewPN = PHINode::Create(FirstLI->getOperand(0)->getType(), PN.getNumIncomingValues(), PN.getName()+".in"); - + Value *InVal = FirstLI->getOperand(0); NewPN->addIncoming(InVal, PN.getIncomingBlock(0)); - + // Add all operands to the new PHI. for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) { Value *NewInVal = cast<LoadInst>(PN.getIncomingValue(i))->getOperand(0); @@ -359,7 +359,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { InVal = 0; NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i)); } - + Value *PhiVal; if (InVal) { // The new PHI unions all of the same values together. This is really @@ -370,14 +370,14 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) { InsertNewInstBefore(NewPN, PN); PhiVal = NewPN; } - + // If this was a volatile load that we are merging, make sure to loop through // and mark all the input loads as non-volatile. If we don't do this, we will // insert a new volatile load and the old ones will not be deletable. if (isVolatile) for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) cast<LoadInst>(PN.getIncomingValue(i))->setVolatile(false); - + LoadInst *NewLI = new LoadInst(PhiVal, "", isVolatile, LoadAlignment); NewLI->setDebugLoc(FirstLI->getDebugLoc()); return NewLI; @@ -395,7 +395,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { return FoldPHIArgGEPIntoPHI(PN); if (isa<LoadInst>(FirstInst)) return FoldPHIArgLoadIntoPHI(PN); - + // Scan the instruction, looking for input operations that can be folded away. // If all input operands to the phi are the same instruction (e.g. a cast from // the same type or "+42") we can pull the operation through the PHI, reducing @@ -403,7 +403,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { Constant *ConstantOp = 0; Type *CastSrcTy = 0; bool isNUW = false, isNSW = false, isExact = false; - + if (isa<CastInst>(FirstInst)) { CastSrcTy = FirstInst->getOperand(0)->getType(); @@ -414,12 +414,12 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { return 0; } } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) { - // Can fold binop, compare or shift here if the RHS is a constant, + // Can fold binop, compare or shift here if the RHS is a constant, // otherwise call FoldPHIArgBinOpIntoPHI. 
ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1)); if (ConstantOp == 0) return FoldPHIArgBinOpIntoPHI(PN); - + if (OverflowingBinaryOperator *BO = dyn_cast<OverflowingBinaryOperator>(FirstInst)) { isNUW = BO->hasNoUnsignedWrap(); @@ -442,7 +442,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { } else if (I->getOperand(1) != ConstantOp) { return 0; } - + if (isNUW) isNUW = cast<OverflowingBinaryOperator>(I)->hasNoUnsignedWrap(); if (isNSW) @@ -486,7 +486,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { NewCI->setDebugLoc(FirstInst->getDebugLoc()); return NewCI; } - + if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst)) { BinOp = BinaryOperator::Create(BinOp->getOpcode(), PhiVal, ConstantOp); if (isNUW) BinOp->setHasNoUnsignedWrap(); @@ -495,7 +495,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) { BinOp->setDebugLoc(FirstInst->getDebugLoc()); return BinOp; } - + CmpInst *CIOp = cast<CmpInst>(FirstInst); CmpInst *NewCI = CmpInst::Create(CIOp->getOpcode(), CIOp->getPredicate(), PhiVal, ConstantOp); @@ -513,7 +513,7 @@ static bool DeadPHICycle(PHINode *PN, // Remember this node, and if we find the cycle, return. if (!PotentiallyDeadPHIs.insert(PN)) return true; - + // Don't scan crazily complex things. if (PotentiallyDeadPHIs.size() == 16) return false; @@ -527,16 +527,16 @@ static bool DeadPHICycle(PHINode *PN, /// PHIsEqualValue - Return true if this phi node is always equal to /// NonPhiInVal. This happens with mutually cyclic phi nodes like: /// z = some value; x = phi (y, z); y = phi (x, z) -static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, +static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, SmallPtrSet<PHINode*, 16> &ValueEqualPHIs) { // See if we already saw this PHI node. if (!ValueEqualPHIs.insert(PN)) return true; - + // Don't scan crazily complex things. if (ValueEqualPHIs.size() == 16) return false; - + // Scan the operands to see if they are either phi nodes or are equal to // the value. for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { @@ -547,7 +547,7 @@ static bool PHIsEqualValue(PHINode *PN, Value *NonPhiInVal, } else if (Op != NonPhiInVal) return false; } - + return true; } @@ -557,10 +557,10 @@ struct PHIUsageRecord { unsigned PHIId; // The ID # of the PHI (something determinstic to sort on) unsigned Shift; // The amount shifted. Instruction *Inst; // The trunc instruction. - + PHIUsageRecord(unsigned pn, unsigned Sh, Instruction *User) : PHIId(pn), Shift(Sh), Inst(User) {} - + bool operator<(const PHIUsageRecord &RHS) const { if (PHIId < RHS.PHIId) return true; if (PHIId > RHS.PHIId) return false; @@ -570,15 +570,15 @@ struct PHIUsageRecord { RHS.Inst->getType()->getPrimitiveSizeInBits(); } }; - + struct LoweredPHIRecord { PHINode *PN; // The PHI that was lowered. unsigned Shift; // The amount shifted. unsigned Width; // The width extracted. - + LoweredPHIRecord(PHINode *pn, unsigned Sh, Type *Ty) : PN(pn), Shift(Sh), Width(Ty->getPrimitiveSizeInBits()) {} - + // Ctor form used by DenseMap. LoweredPHIRecord(PHINode *pn, unsigned Sh) : PN(pn), Shift(Sh), Width(0) {} @@ -621,20 +621,20 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { // PHIUsers - Keep track of all of the truncated values extracted from a set // of PHIs, along with their offset. These are the things we want to rewrite. SmallVector<PHIUsageRecord, 16> PHIUsers; - + // PHIs are often mutually cyclic, so we keep track of a whole set of PHI // nodes which are extracted from. 
PHIsToSlice is a set we use to avoid // revisiting PHIs, PHIsInspected is a ordered list of PHIs that we need to // check the uses of (to ensure they are all extracts). SmallVector<PHINode*, 8> PHIsToSlice; SmallPtrSet<PHINode*, 8> PHIsInspected; - + PHIsToSlice.push_back(&FirstPhi); PHIsInspected.insert(&FirstPhi); - + for (unsigned PHIId = 0; PHIId != PHIsToSlice.size(); ++PHIId) { PHINode *PN = PHIsToSlice[PHIId]; - + // Scan the input list of the PHI. If any input is an invoke, and if the // input is defined in the predecessor, then we won't be split the critical // edge which is required to insert a truncate. Because of this, we have to @@ -644,85 +644,85 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { if (II == 0) continue; if (II->getParent() != PN->getIncomingBlock(i)) continue; - + // If we have a phi, and if it's directly in the predecessor, then we have // a critical edge where we need to put the truncate. Since we can't // split the edge in instcombine, we have to bail out. return 0; } - - + + for (Value::use_iterator UI = PN->use_begin(), E = PN->use_end(); UI != E; ++UI) { Instruction *User = cast<Instruction>(*UI); - + // If the user is a PHI, inspect its uses recursively. if (PHINode *UserPN = dyn_cast<PHINode>(User)) { if (PHIsInspected.insert(UserPN)) PHIsToSlice.push_back(UserPN); continue; } - + // Truncates are always ok. if (isa<TruncInst>(User)) { PHIUsers.push_back(PHIUsageRecord(PHIId, 0, User)); continue; } - + // Otherwise it must be a lshr which can only be used by one trunc. if (User->getOpcode() != Instruction::LShr || !User->hasOneUse() || !isa<TruncInst>(User->use_back()) || !isa<ConstantInt>(User->getOperand(1))) return 0; - + unsigned Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue(); PHIUsers.push_back(PHIUsageRecord(PHIId, Shift, User->use_back())); } } - + // If we have no users, they must be all self uses, just nuke the PHI. if (PHIUsers.empty()) return ReplaceInstUsesWith(FirstPhi, UndefValue::get(FirstPhi.getType())); - + // If this phi node is transformable, create new PHIs for all the pieces // extracted out of it. First, sort the users by their offset and size. array_pod_sort(PHIUsers.begin(), PHIUsers.end()); - + DEBUG(errs() << "SLICING UP PHI: " << FirstPhi << '\n'; for (unsigned i = 1, e = PHIsToSlice.size(); i != e; ++i) errs() << "AND USER PHI #" << i << ": " << *PHIsToSlice[i] <<'\n'; ); - + // PredValues - This is a temporary used when rewriting PHI nodes. It is // hoisted out here to avoid construction/destruction thrashing. DenseMap<BasicBlock*, Value*> PredValues; - + // ExtractedVals - Each new PHI we introduce is saved here so we don't // introduce redundant PHIs. DenseMap<LoweredPHIRecord, PHINode*> ExtractedVals; - + for (unsigned UserI = 0, UserE = PHIUsers.size(); UserI != UserE; ++UserI) { unsigned PHIId = PHIUsers[UserI].PHIId; PHINode *PN = PHIsToSlice[PHIId]; unsigned Offset = PHIUsers[UserI].Shift; Type *Ty = PHIUsers[UserI].Inst->getType(); - + PHINode *EltPHI; - + // If we've already lowered a user like this, reuse the previously lowered // value. if ((EltPHI = ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)]) == 0) { - + // Otherwise, Create the new PHI node for this user. 
EltPHI = PHINode::Create(Ty, PN->getNumIncomingValues(), PN->getName()+".off"+Twine(Offset), PN); assert(EltPHI->getType() != PN->getType() && "Truncate didn't shrink phi?"); - + for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { BasicBlock *Pred = PN->getIncomingBlock(i); Value *&PredVal = PredValues[Pred]; - + // If we already have a value for this predecessor, reuse it. if (PredVal) { EltPHI->addIncoming(PredVal, Pred); @@ -736,7 +736,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { EltPHI->addIncoming(PredVal, Pred); continue; } - + if (PHINode *InPHI = dyn_cast<PHINode>(PN)) { // If the incoming value was a PHI, and if it was one of the PHIs we // already rewrote it, just use the lowered value. @@ -746,7 +746,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { continue; } } - + // Otherwise, do an extract in the predecessor. Builder->SetInsertPoint(Pred, Pred->getTerminator()); Value *Res = InVal; @@ -756,7 +756,7 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { Res = Builder->CreateTrunc(Res, Ty, "extract.t"); PredVal = Res; EltPHI->addIncoming(Res, Pred); - + // If the incoming value was a PHI, and if it was one of the PHIs we are // rewriting, we will ultimately delete the code we inserted. This // means we need to revisit that PHI to make sure we extract out the @@ -765,22 +765,22 @@ Instruction *InstCombiner::SliceUpIllegalIntegerPHI(PHINode &FirstPhi) { if (PHIsInspected.count(OldInVal)) { unsigned RefPHIId = std::find(PHIsToSlice.begin(),PHIsToSlice.end(), OldInVal)-PHIsToSlice.begin(); - PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset, + PHIUsers.push_back(PHIUsageRecord(RefPHIId, Offset, cast<Instruction>(Res))); ++UserE; } } PredValues.clear(); - + DEBUG(errs() << " Made element PHI for offset " << Offset << ": " << *EltPHI << '\n'); ExtractedVals[LoweredPHIRecord(PN, Offset, Ty)] = EltPHI; } - + // Replace the use of this piece with the PHI node. ReplaceInstUsesWith(*PHIUsers[UserI].Inst, EltPHI); } - + // Replace all the remaining uses of the PHI nodes (self uses and the lshrs) // with undefs. Value *Undef = UndefValue::get(FirstPhi.getType()); @@ -818,7 +818,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (DeadPHICycle(PU, PotentiallyDeadPHIs)) return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType())); } - + // If this phi has a single use, and if that use just computes a value for // the next iteration of a loop, delete the phi. This occurs with unused // induction variables, e.g. "for (int j = 0; ; ++j);". Detecting this @@ -847,7 +847,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (InValNo != NumIncomingVals) { Value *NonPhiInVal = PN.getIncomingValue(InValNo); - + // Scan the rest of the operands to see if there are any conflicts, if so // there is no need to recursively scan other phis. for (++InValNo; InValNo != NumIncomingVals; ++InValNo) { @@ -855,7 +855,7 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { if (OpVal != NonPhiInVal && !isa<PHINode>(OpVal)) break; } - + // If we scanned over all operands, then we have one unique value plus // phi values. Scan PHI nodes to see if they all merge in each other or // the value. 
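The "one unique value plus phi values" case handled above is the same property PHIsEqualValue checks for mutually cyclic PHIs. A source-level analogue (an illustration only, not IR; the function name is invented):

  // In SSA form, x and y become PHIs that merge z from the entry edge and each
  // other from the back edge, so every incoming value is either z or a PHI
  // already known to equal z.
  int phi_equal_value_example(int z, int n) {
    int x = z, y = z;
    for (int i = 0; i < n; ++i) {
      int t = x;
      x = y;                 // the loop-carried values only shuffle z around
      y = t;
    }
    return x;                // always equal to z, so the whole PHI cycle folds to z
  }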
@@ -899,6 +899,6 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) { !TD->isLegalInteger(PN.getType()->getPrimitiveSizeInBits())) if (Instruction *Res = SliceUpIllegalIntegerPHI(PN)) return Res; - + return 0; } diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp index a262d711d3..59502fb988 100644 --- a/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -127,13 +127,14 @@ Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI, // If this is a non-volatile load or a cast from the same type, // merge. if (TI->isCast()) { - if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType()) + Type *FIOpndTy = FI->getOperand(0)->getType(); + if (TI->getOperand(0)->getType() != FIOpndTy) return 0; // The select condition may be a vector. We may only change the operand // type if the vector width remains the same (and matches the condition). Type *CondTy = SI.getCondition()->getType(); - if (CondTy->isVectorTy() && CondTy->getVectorNumElements() != - FI->getOperand(0)->getType()->getVectorNumElements()) + if (CondTy->isVectorTy() && (!FIOpndTy->isVectorTy() || + CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())) return 0; } else { return 0; // unknown unary op. @@ -349,6 +350,68 @@ static Value *SimplifyWithOpReplaced(Value *V, Value *Op, Value *RepOp, return 0; } +/// foldSelectICmpAndOr - We want to turn: +/// (select (icmp eq (and X, C1), 0), Y, (or Y, C2)) +/// into: +/// (or (shl (and X, C1), C3), y) +/// iff: +/// C1 and C2 are both powers of 2 +/// where: +/// C3 = Log(C2) - Log(C1) +/// +/// This transform handles cases where: +/// 1. The icmp predicate is inverted +/// 2. The select operands are reversed +/// 3. The magnitude of C2 and C1 are flipped +static Value *foldSelectICmpAndOr(const SelectInst &SI, Value *TrueVal, + Value *FalseVal, + InstCombiner::BuilderTy *Builder) { + const ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition()); + if (!IC || !IC->isEquality()) + return 0; + + Value *CmpLHS = IC->getOperand(0); + Value *CmpRHS = IC->getOperand(1); + + if (!match(CmpRHS, m_Zero())) + return 0; + + Value *X; + const APInt *C1; + if (!match(CmpLHS, m_And(m_Value(X), m_Power2(C1)))) + return 0; + + const APInt *C2; + bool OrOnTrueVal = false; + bool OrOnFalseVal = match(FalseVal, m_Or(m_Specific(TrueVal), m_Power2(C2))); + if (!OrOnFalseVal) + OrOnTrueVal = match(TrueVal, m_Or(m_Specific(FalseVal), m_Power2(C2))); + + if (!OrOnFalseVal && !OrOnTrueVal) + return 0; + + Value *V = CmpLHS; + Value *Y = OrOnFalseVal ? TrueVal : FalseVal; + + unsigned C1Log = C1->logBase2(); + unsigned C2Log = C2->logBase2(); + if (C2Log > C1Log) { + V = Builder->CreateZExtOrTrunc(V, Y->getType()); + V = Builder->CreateShl(V, C2Log - C1Log); + } else if (C1Log > C2Log) { + V = Builder->CreateLShr(V, C1Log - C2Log); + V = Builder->CreateZExtOrTrunc(V, Y->getType()); + } else + V = Builder->CreateZExtOrTrunc(V, Y->getType()); + + ICmpInst::Predicate Pred = IC->getPredicate(); + if ((Pred == ICmpInst::ICMP_NE && OrOnFalseVal) || + (Pred == ICmpInst::ICMP_EQ && OrOnTrueVal)) + V = Builder->CreateXor(V, *C2); + + return Builder->CreateOr(V, Y); +} + /// visitSelectInstWithICmp - Visit a SelectInst that has an /// ICmpInst as its first operand. 
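To make the foldSelectICmpAndOr rewrite above concrete, here is a minimal check of the equivalence with the assumed constants C1 = 1 and C2 = 8, giving C3 = log2(8) - log2(1) = 3 (a sketch only; the helper names are invented):

  #include <cassert>
  #include <cstdint>

  // select (icmp eq (and X, 1), 0), Y, (or Y, 8)
  uint32_t before(uint32_t x, uint32_t y) { return ((x & 1) == 0) ? y : (y | 8); }

  // or (shl (and X, 1), 3), Y
  uint32_t after(uint32_t x, uint32_t y) { return ((x & 1) << 3) | y; }

  int main() {
    for (uint32_t x = 0; x < 4; ++x)
      for (uint32_t y = 0; y < 16; ++y)
        assert(before(x, y) == after(x, y));
  }

The inverted-predicate and swapped-operand variants listed in the comment only add the CreateXor compensation seen at the end of the helper.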
/// @@ -520,6 +583,9 @@ Instruction *InstCombiner::visitSelectInstWithICmp(SelectInst &SI, } } + if (Value *V = foldSelectICmpAndOr(SI, TrueVal, FalseVal, Builder)) + return ReplaceInstUsesWith(SI, V); + return Changed ? &SI : 0; } @@ -675,7 +741,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // Change: A = select B, false, C --> A = and !B, C Value *NotCond = Builder->CreateNot(CondVal, "not."+CondVal->getName()); return BinaryOperator::CreateAnd(NotCond, FalseVal); - } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) { + } + if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) { if (C->getZExtValue() == false) { // Change: A = select B, C, false --> A = and B, C return BinaryOperator::CreateAnd(CondVal, TrueVal); @@ -689,14 +756,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { // select a, a, b -> a|b if (CondVal == TrueVal) return BinaryOperator::CreateOr(CondVal, FalseVal); - else if (CondVal == FalseVal) + if (CondVal == FalseVal) return BinaryOperator::CreateAnd(CondVal, TrueVal); // select a, ~a, b -> (~a)&b // select a, b, ~a -> (~a)|b if (match(TrueVal, m_Not(m_Specific(CondVal)))) return BinaryOperator::CreateAnd(TrueVal, FalseVal); - else if (match(FalseVal, m_Not(m_Specific(CondVal)))) + if (match(FalseVal, m_Not(m_Specific(CondVal)))) return BinaryOperator::CreateOr(TrueVal, FalseVal); } @@ -837,7 +904,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *NewFalseOp = NegVal; if (AddOp != TI) std::swap(NewTrueOp, NewFalseOp); - Value *NewSel = + Value *NewSel = Builder->CreateSelect(CondVal, NewTrueOp, NewFalseOp, SI.getName() + ".p"); @@ -861,7 +928,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { Value *LHS, *RHS, *LHS2, *RHS2; if (SelectPatternFlavor SPF = MatchSelectPattern(&SI, LHS, RHS)) { if (SelectPatternFlavor SPF2 = MatchSelectPattern(LHS, LHS2, RHS2)) - if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, + if (Instruction *R = FoldSPFofSPF(cast<Instruction>(LHS),SPF2,LHS2,RHS2, SI, SPF, RHS)) return R; if (SelectPatternFlavor SPF2 = MatchSelectPattern(RHS, LHS2, RHS2)) @@ -907,7 +974,7 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (VectorType *VecTy = dyn_cast<VectorType>(SI.getType())) { + if (VectorType* VecTy = dyn_cast<VectorType>(SI.getType())) { unsigned VWidth = VecTy->getNumElements(); APInt UndefElts(VWidth, 0); APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth)); @@ -917,24 +984,6 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) { return &SI; } - if (ConstantVector *CV = dyn_cast<ConstantVector>(CondVal)) { - // Form a shufflevector instruction. - SmallVector<Constant *, 8> Mask(VWidth); - Type *Int32Ty = Type::getInt32Ty(CV->getContext()); - for (unsigned i = 0; i != VWidth; ++i) { - Constant *Elem = cast<Constant>(CV->getOperand(i)); - if (ConstantInt *E = dyn_cast<ConstantInt>(Elem)) - Mask[i] = ConstantInt::get(Int32Ty, i + (E->isZero() ? 
VWidth : 0)); - else if (isa<UndefValue>(Elem)) - Mask[i] = UndefValue::get(Int32Ty); - else - return 0; - } - Constant *MaskVal = ConstantVector::get(Mask); - Value *V = Builder->CreateShuffleVector(TrueVal, FalseVal, MaskVal); - return ReplaceInstUsesWith(SI, V); - } - if (isa<ConstantAggregateZero>(CondVal)) { return ReplaceInstUsesWith(SI, FalseVal); } diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp index 4f71db1a4b..4301ddb5aa 100644 --- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp +++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp @@ -105,6 +105,75 @@ static Value *FindScalarElement(Value *V, unsigned EltNo) { return 0; } +// If we have a PHI node with a vector type that has only 2 uses: feed +// itself and be an operand of extractelemnt at a constant location, +// try to replace the PHI of the vector type with a PHI of a scalar type +Instruction *InstCombiner::scalarizePHI(ExtractElementInst &EI, PHINode *PN) { + // Verify that the PHI node has exactly 2 uses. Otherwise return NULL. + if (!PN->hasNUses(2)) + return NULL; + + // If so, it's known at this point that one operand is PHI and the other is + // an extractelement node. Find the PHI user that is not the extractelement + // node. + Value::use_iterator iu = PN->use_begin(); + Instruction *PHIUser = dyn_cast<Instruction>(*iu); + if (PHIUser == cast<Instruction>(&EI)) + PHIUser = cast<Instruction>(*(++iu)); + + // Verify that this PHI user has one use, which is the PHI itself, + // and that it is a binary operation which is cheap to scalarize. + // otherwise return NULL. + if (!PHIUser->hasOneUse() || !(PHIUser->use_back() == PN) || + !(isa<BinaryOperator>(PHIUser)) || + !CheapToScalarize(PHIUser, true)) + return NULL; + + // Create a scalar PHI node that will replace the vector PHI node + // just before the current PHI node. + PHINode * scalarPHI = cast<PHINode>( + InsertNewInstWith(PHINode::Create(EI.getType(), + PN->getNumIncomingValues(), ""), *PN)); + // Scalarize each PHI operand. + for (unsigned i=0; i < PN->getNumIncomingValues(); i++) { + Value *PHIInVal = PN->getIncomingValue(i); + BasicBlock *inBB = PN->getIncomingBlock(i); + Value *Elt = EI.getIndexOperand(); + // If the operand is the PHI induction variable: + if (PHIInVal == PHIUser) { + // Scalarize the binary operation. Its first operand is the + // scalar PHI and the second operand is extracted from the other + // vector operand. + BinaryOperator *B0 = cast<BinaryOperator>(PHIUser); + unsigned opId = (B0->getOperand(0) == PN) ? 1: 0; + Value *Op = Builder->CreateExtractElement( + B0->getOperand(opId), Elt, B0->getOperand(opId)->getName()+".Elt"); + Value *newPHIUser = InsertNewInstWith( + BinaryOperator::Create(B0->getOpcode(), scalarPHI,Op), + *B0); + scalarPHI->addIncoming(newPHIUser, inBB); + } else { + // Scalarize PHI input: + Instruction *newEI = + ExtractElementInst::Create(PHIInVal, Elt, ""); + // Insert the new instruction into the predecessor basic block. 
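A rough source-level picture of the pattern scalarizePHI targets (an illustration, not IR; the types and names here are assumptions): a loop-carried vector value whose only uses are feeding itself through one cheap binary operation and a single extract at a constant lane.

  #include <array>
  using Vec4 = std::array<float, 4>;

  // Before: the accumulator is a vector PHI, but only lane 2 is ever read.
  float before(const Vec4 *data, int n) {
    Vec4 acc{};
    for (int i = 0; i < n; ++i)
      for (int lane = 0; lane < 4; ++lane)
        acc[lane] += data[i][lane];
    return acc[2];
  }

  // After: the PHI is rebuilt as a scalar and the extract moves to the other
  // operand of the binary op, so the vector value disappears entirely.
  float after(const Vec4 *data, int n) {
    float acc2 = 0;
    for (int i = 0; i < n; ++i)
      acc2 += data[i][2];
    return acc2;
  }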
+ Instruction *pos = dyn_cast<Instruction>(PHIInVal); + BasicBlock::iterator InsertPos; + if (pos && !isa<PHINode>(pos)) { + InsertPos = pos; + ++InsertPos; + } else { + InsertPos = inBB->getFirstInsertionPt(); + } + + InsertNewInstWith(newEI, *InsertPos); + + scalarPHI->addIncoming(newEI, inBB); + } + } + return ReplaceInstUsesWith(EI, scalarPHI); +} + Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { // If vector val is constant with all elements the same, replace EI with // that element. We handle a known element # below. @@ -149,6 +218,14 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal)) return new BitCastInst(Elt, EI.getType()); } + + // If there's a vector PHI feeding a scalar use through this extractelement + // instruction, try to scalarize the PHI. + if (PHINode *PN = dyn_cast<PHINode>(EI.getOperand(0))) { + Instruction *scalarPHI = scalarizePHI(EI, PN); + if (scalarPHI) + return (scalarPHI); + } } if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) { @@ -201,10 +278,10 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) { } else if (CastInst *CI = dyn_cast<CastInst>(I)) { // Canonicalize extractelement(cast) -> cast(extractelement) // bitcasts can change the number of vector elements and they cost nothing - if (CI->hasOneUse() && EI.hasOneUse() && - (CI->getOpcode() != Instruction::BitCast)) { + if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) { Value *EE = Builder->CreateExtractElement(CI->getOperand(0), EI.getIndexOperand()); + Worklist.AddValue(EE); return CastInst::Create(CI->getOpcode(), EE, EI.getType()); } } @@ -336,6 +413,10 @@ static Value *CollectShuffleElements(Value *V, SmallVectorImpl<Constant*> &Mask, if (VecOp == RHS) { Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS); + // Update Mask to reflect that `ScalarOp' has been inserted at + // position `InsertedIdx' within the vector returned by IEI. + Mask[InsertedIdx % NumElts] = Mask[ExtractedIdx]; + // Everything but the extracted element is replaced with the RHS. 
for (unsigned i = 0; i != NumElts; ++i) { if (i != InsertedIdx) diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index c6115e3e91..ec10751202 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1483,7 +1483,7 @@ Instruction *InstCombiner::visitAllocSite(Instruction &MI) { Module *M = II->getParent()->getParent()->getParent(); Function *F = Intrinsic::getDeclaration(M, Intrinsic::donothing); InvokeInst::Create(F, II->getNormalDest(), II->getUnwindDest(), - ArrayRef<Value *>(), "", II->getParent()); + None, "", II->getParent()); } return EraseInstFromFunction(MI); } diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 6877475b1d..623c470506 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -71,7 +71,7 @@ static const char *kAsanRegisterGlobalsName = "__asan_register_globals"; static const char *kAsanUnregisterGlobalsName = "__asan_unregister_globals"; static const char *kAsanPoisonGlobalsName = "__asan_before_dynamic_init"; static const char *kAsanUnpoisonGlobalsName = "__asan_after_dynamic_init"; -static const char *kAsanInitName = "__asan_init_v1"; +static const char *kAsanInitName = "__asan_init_v3"; static const char *kAsanHandleNoReturnName = "__asan_handle_no_return"; static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; static const char *kAsanMappingScaleName = "__asan_mapping_scale"; @@ -244,7 +244,7 @@ static size_t RedzoneSizeForScale(int MappingScale) { /// AddressSanitizer: instrument the code in module to find memory bugs. struct AddressSanitizer : public FunctionPass { - AddressSanitizer(bool CheckInitOrder = false, + AddressSanitizer(bool CheckInitOrder = true, bool CheckUseAfterReturn = false, bool CheckLifetime = false, StringRef BlacklistFile = StringRef(), @@ -274,8 +274,6 @@ struct AddressSanitizer : public FunctionPass { Instruction *InsertBefore, bool IsWrite); Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); bool runOnFunction(Function &F); - void createInitializerPoisonCalls(Module &M, - Value *FirstAddr, Value *LastAddr); bool maybeInsertAsanInitAtFunctionEntry(Function &F); void emitShadowMapping(Module &M, IRBuilder<> &IRB) const; virtual bool doInitialization(Module &M); @@ -315,7 +313,7 @@ struct AddressSanitizer : public FunctionPass { class AddressSanitizerModule : public ModulePass { public: - AddressSanitizerModule(bool CheckInitOrder = false, + AddressSanitizerModule(bool CheckInitOrder = true, StringRef BlacklistFile = StringRef(), bool ZeroBaseShadow = false) : ModulePass(ID), @@ -333,8 +331,7 @@ class AddressSanitizerModule : public ModulePass { void initializeCallbacks(Module &M); bool ShouldInstrumentGlobal(GlobalVariable *G); - void createInitializerPoisonCalls(Module &M, Value *FirstAddr, - Value *LastAddr); + void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName); size_t RedzoneSize() const { return RedzoneSizeForScale(Mapping.Scale); } @@ -531,9 +528,12 @@ static size_t TypeSizeToSizeIndex(uint32_t TypeSize) { // Create a constant for Str so that we can pass it to the run-time lib. 
static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str); - return new GlobalVariable(M, StrConst->getType(), true, + GlobalVariable *GV = new GlobalVariable(M, StrConst->getType(), true, GlobalValue::PrivateLinkage, StrConst, kAsanGenPrefix); + GV->setUnnamedAddr(true); // Ok to merge these. + GV->setAlignment(1); // Strings may not be merged w/o setting align 1. + return GV; } static bool GlobalWasGeneratedByAsan(GlobalVariable *G) { @@ -750,7 +750,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, } void AddressSanitizerModule::createInitializerPoisonCalls( - Module &M, Value *FirstAddr, Value *LastAddr) { + Module &M, GlobalValue *ModuleName) { // We do all of our poisoning and unpoisoning within _GLOBAL__I_a. Function *GlobalInit = M.getFunction("_GLOBAL__I_a"); // If that function is not present, this TU contains no globals, or they have @@ -762,7 +762,8 @@ void AddressSanitizerModule::createInitializerPoisonCalls( IRBuilder<> IRB(GlobalInit->begin()->getFirstInsertionPt()); // Add a call to poison all external globals before the given function starts. - IRB.CreateCall2(AsanPoisonGlobals, FirstAddr, LastAddr); + Value *ModuleNameAddr = ConstantExpr::getPointerCast(ModuleName, IntptrTy); + IRB.CreateCall(AsanPoisonGlobals, ModuleNameAddr); // Add calls to unpoison all globals before each return instruction. for (Function::iterator I = GlobalInit->begin(), E = GlobalInit->end(); @@ -836,7 +837,7 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) { IRBuilder<> IRB(*C); // Declare our poisoning and unpoisoning functions. AsanPoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( - kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, NULL)); AsanPoisonGlobals->setLinkage(Function::ExternalLinkage); AsanUnpoisonGlobals = checkInterfaceFunction(M.getOrInsertFunction( kAsanUnpoisonGlobalsName, IRB.getVoidTy(), NULL)); @@ -885,11 +886,12 @@ bool AddressSanitizerModule::runOnModule(Module &M) { // size_t size; // size_t size_with_redzone; // const char *name; + // const char *module_name; // size_t has_dynamic_init; // We initialize an array of such structures and pass it to a run-time call. StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, - IntptrTy, NULL); + IntptrTy, IntptrTy, NULL); SmallVector<Constant *, 16> Initializers(n), DynamicInit; @@ -897,9 +899,13 @@ bool AddressSanitizerModule::runOnModule(Module &M) { assert(CtorFunc); IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator()); - // The addresses of the first and last dynamically initialized globals in - // this TU. Used in initialization order checking. - Value *FirstDynamic = 0, *LastDynamic = 0; + bool HasDynamicallyInitializedGlobals = false; + + GlobalVariable *ModuleName = createPrivateGlobalForString( + M, M.getModuleIdentifier()); + // We shouldn't merge same module names, as this string serves as unique + // module ID in runtime. 
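Restated as a plain struct, the per-global record built above now carries the module name as a field of its own (the struct and field names are an assumption for illustration; in the IR every field is an IntptrTy, with the two strings pointer-cast into it):

  typedef unsigned long uptr;   // stand-in for the target's pointer-sized integer
  struct AsanGlobalDescriptor {
    uptr beg;                   // address of the instrumented global
    uptr size;                  // original size
    uptr size_with_redzone;     // size including the right redzone
    const char *name;           // now just the global's name
    const char *module_name;    // new field: per-module string, also the unique module ID
    uptr has_dynamic_init;      // nonzero if the global has a dynamic initializer
  };

An array of these records is what gets handed to __asan_register_globals.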
+ ModuleName->setUnnamedAddr(false); for (size_t i = 0; i < n; i++) { static const uint64_t kMaxGlobalRedzone = 1 << 18; @@ -930,11 +936,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy), NULL); - SmallString<2048> DescriptionOfGlobal = G->getName(); - DescriptionOfGlobal += " ("; - DescriptionOfGlobal += M.getModuleIdentifier(); - DescriptionOfGlobal += ")"; - GlobalVariable *Name = createPrivateGlobalForString(M, DescriptionOfGlobal); + GlobalVariable *Name = createPrivateGlobalForString(M, G->getName()); // Create a new global variable with enough space for a redzone. GlobalVariable *NewGlobal = new GlobalVariable( @@ -958,15 +960,13 @@ bool AddressSanitizerModule::runOnModule(Module &M) { ConstantInt::get(IntptrTy, SizeInBytes), ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), ConstantExpr::getPointerCast(Name, IntptrTy), + ConstantExpr::getPointerCast(ModuleName, IntptrTy), ConstantInt::get(IntptrTy, GlobalHasDynamicInitializer), NULL); // Populate the first and last globals declared in this TU. - if (CheckInitOrder && GlobalHasDynamicInitializer) { - LastDynamic = ConstantExpr::getPointerCast(NewGlobal, IntptrTy); - if (FirstDynamic == 0) - FirstDynamic = LastDynamic; - } + if (CheckInitOrder && GlobalHasDynamicInitializer) + HasDynamicallyInitializedGlobals = true; DEBUG(dbgs() << "NEW GLOBAL: " << *NewGlobal << "\n"); } @@ -977,8 +977,8 @@ bool AddressSanitizerModule::runOnModule(Module &M) { ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); // Create calls for poisoning before initializers run and unpoisoning after. - if (CheckInitOrder && FirstDynamic && LastDynamic) - createInitializerPoisonCalls(M, FirstDynamic, LastDynamic); + if (CheckInitOrder && HasDynamicallyInitializedGlobals) + createInitializerPoisonCalls(M, ModuleName); IRB.CreateCall2(AsanRegisterGlobals, IRB.CreatePointerCast(AllGlobals, IntptrTy), ConstantInt::get(IntptrTy, n)); @@ -1095,6 +1095,7 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) { bool AddressSanitizer::runOnFunction(Function &F) { if (BL->isIn(F)) return false; if (&F == AsanCtorFunction) return false; + if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false; DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n"); initializeCallbacks(*F.getParent()); @@ -1312,10 +1313,10 @@ void FunctionStackPoisoner::poisonStack() { ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); } - // This string will be parsed by the run-time (DescribeStackAddress). + // This string will be parsed by the run-time (DescribeAddressIfStack). SmallString<2048> StackDescriptionStorage; raw_svector_ostream StackDescription(StackDescriptionStorage); - StackDescription << F.getName() << " " << AllocaVec.size() << " "; + StackDescription << AllocaVec.size() << " "; // Insert poison calls for lifetime intrinsics for alloca. bool HavePoisonedAllocas = false; @@ -1348,19 +1349,26 @@ void FunctionStackPoisoner::poisonStack() { } assert(Pos == LocalStackSize); - // Write the Magic value and the frame description constant to the redzone. + // The left-most redzone has enough space for at least 4 pointers. + // Write the Magic value to redzone[0]. 
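Laid out as a record, the frame header that the poisonStack hunk below writes into the left redzone looks roughly like this (the struct and slot names are assumptions; each slot is pointer-sized):

  typedef unsigned long uptr;
  struct AsanStackFrameHeader {
    uptr magic;                 // redzone[0]: kCurrentStackFrameMagic
    const char *description;    // redzone[1]: frame description string, which now starts
                                //             with the alloca count instead of the function name
    uptr pc;                    // redzone[2]: the function's address, standing in for the
                                //             name that was dropped from the string
    uptr spare;                 // redzone[3]: unused; "space for at least 4 pointers"
  };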
Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy); IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic), BasePlus0); - Value *BasePlus1 = IRB.CreateAdd(LocalStackBase, - ConstantInt::get(IntptrTy, - ASan.LongSize/8)); - BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy); + // Write the frame description constant to redzone[1]. + Value *BasePlus1 = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, ASan.LongSize/8)), + IntptrPtrTy); GlobalVariable *StackDescriptionGlobal = createPrivateGlobalForString(*F.getParent(), StackDescription.str()); Value *Description = IRB.CreatePointerCast(StackDescriptionGlobal, IntptrTy); IRB.CreateStore(Description, BasePlus1); + // Write the PC to redzone[2]. + Value *BasePlus2 = IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, + 2 * ASan.LongSize/8)), + IntptrPtrTy); + IRB.CreateStore(IRB.CreatePointerCast(&F, IntptrTy), BasePlus2); // Poison the stack redzones at the entry. Value *ShadowBase = ASan.memToShadow(LocalStackBase, IRB); diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp index 927982d2af..39de4b0401 100644 --- a/lib/Transforms/Instrumentation/BlackList.cpp +++ b/lib/Transforms/Instrumentation/BlackList.cpp @@ -110,7 +110,8 @@ static StringRef GetGVTypeString(const GlobalVariable &G) { bool BlackList::isInInit(const GlobalVariable &G) const { return (isIn(*G.getParent()) || inSection("global-init", G.getName()) || - inSection("global-init-type", GetGVTypeString(G))); + inSection("global-init-type", GetGVTypeString(G)) || + inSection("global-init-src", G.getParent()->getModuleIdentifier())); } bool BlackList::inSection(const StringRef Section, diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 095b852d93..2edd151869 100644 --- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -29,8 +29,10 @@ #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/DebugLoc.h" +#include "llvm/Support/FileSystem.h" #include "llvm/Support/InstIterator.h" #include "llvm/Support/PathV2.h" #include "llvm/Support/raw_ostream.h" @@ -39,33 +41,57 @@ #include <utility> using namespace llvm; +static cl::opt<std::string> +DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden, + cl::ValueRequired); + +GCOVOptions GCOVOptions::getDefault() { + GCOVOptions Options; + Options.EmitNotes = true; + Options.EmitData = true; + Options.UseCfgChecksum = false; + Options.NoRedZone = false; + Options.FunctionNamesInData = true; + + if (DefaultGCOVVersion.size() != 4) { + llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") + + DefaultGCOVVersion); + } + memcpy(Options.Version, DefaultGCOVVersion.c_str(), 4); + return Options; +} + namespace { class GCOVProfiler : public ModulePass { public: static char ID; - GCOVProfiler() - : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false), - UseExtraChecksum(false), NoRedZone(false), - NoFunctionNamesInData(false) { + GCOVProfiler() : ModulePass(ID), Options(GCOVOptions::getDefault()) { + ReversedVersion[0] = Options.Version[3]; + ReversedVersion[1] = Options.Version[2]; + ReversedVersion[2] = Options.Version[1]; + ReversedVersion[3] = Options.Version[0]; + ReversedVersion[4] = '\0'; 
initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } - GCOVProfiler(bool EmitNotes, bool EmitData, bool Use402Format, - bool UseExtraChecksum, bool NoRedZone, - bool NoFunctionNamesInData) - : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData), - Use402Format(Use402Format), UseExtraChecksum(UseExtraChecksum), - NoRedZone(NoRedZone), NoFunctionNamesInData(NoFunctionNamesInData) { - assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?"); + GCOVProfiler(const GCOVOptions &Options) : ModulePass(ID), Options(Options){ + assert((Options.EmitNotes || Options.EmitData) && + "GCOVProfiler asked to do nothing?"); + ReversedVersion[0] = Options.Version[3]; + ReversedVersion[1] = Options.Version[2]; + ReversedVersion[2] = Options.Version[1]; + ReversedVersion[3] = Options.Version[0]; + ReversedVersion[4] = '\0'; initializeGCOVProfilerPass(*PassRegistry::getPassRegistry()); } virtual const char *getPassName() const { return "GCOV Profiler"; } + private: bool runOnModule(Module &M); - // Create the GCNO files for the Module based on DebugInfo. - void emitGCNO(); + // Create the .gcno files for the Module based on DebugInfo. + void emitProfileNotes(); // Modify the program to track transitions along edges and call into the // profiling runtime to emit .gcda files when run. @@ -76,6 +102,8 @@ namespace { Constant *getIncrementIndirectCounterFunc(); Constant *getEmitFunctionFunc(); Constant *getEmitArcsFunc(); + Constant *getDeleteWriteoutFunctionListFunc(); + Constant *getDeleteFlushFunctionListFunc(); Constant *getEndFileFunc(); // Create or retrieve an i32 state value that is used to represent the @@ -86,23 +114,22 @@ namespace { // block number. GlobalVariable *buildEdgeLookupTable(Function *F, GlobalVariable *Counter, - const UniqueVector<BasicBlock *> &Preds, - const UniqueVector<BasicBlock *> &Succs); + const UniqueVector<BasicBlock *>&Preds, + const UniqueVector<BasicBlock*>&Succs); // Add the function to write out all our counters to the global destructor // list. - void insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); + Function *insertCounterWriteout(ArrayRef<std::pair<GlobalVariable*, + MDNode*> >); + Function *insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); void insertIndirectCounterIncrement(); - void insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> >); std::string mangleName(DICompileUnit CU, const char *NewStem); - bool EmitNotes; - bool EmitData; - bool Use402Format; - bool UseExtraChecksum; - bool NoRedZone; - bool NoFunctionNamesInData; + GCOVOptions Options; + + // Reversed, NUL-terminated copy of Options.Version. + char ReversedVersion[5]; Module *M; LLVMContext *Ctx; @@ -113,13 +140,14 @@ char GCOVProfiler::ID = 0; INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling", "Insert instrumentation for GCOV profiling", false, false) -ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData, - bool Use402Format, - bool UseExtraChecksum, - bool NoRedZone, - bool NoFunctionNamesInData) { - return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum, - NoRedZone, NoFunctionNamesInData); +ModulePass *llvm::createGCOVProfilerPass(const GCOVOptions &Options) { + return new GCOVProfiler(Options); +} + +static std::string getFunctionName(DISubprogram SP) { + if (!SP.getLinkageName().empty()) + return SP.getLinkageName(); + return SP.getName(); } namespace { @@ -257,8 +285,8 @@ namespace { // object users can construct, the blocks and lines will be rooted here. 
class GCOVFunction : public GCOVRecord { public: - GCOVFunction(DISubprogram SP, raw_ostream *os, - bool Use402Format, bool UseExtraChecksum) { + GCOVFunction(DISubprogram SP, raw_ostream *os, uint32_t Ident, + bool UseCfgChecksum) { this->os = os; Function *F = SP.getFunction(); @@ -270,17 +298,16 @@ namespace { ReturnBlock = new GCOVBlock(i++, os); writeBytes(FunctionTag, 4); - uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(SP.getName()) + + uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) + 1 + lengthOfGCOVString(SP.getFilename()) + 1; - if (UseExtraChecksum) + if (UseCfgChecksum) ++BlockLen; write(BlockLen); - uint32_t Ident = reinterpret_cast<intptr_t>((MDNode*)SP); write(Ident); write(0); // lineno checksum - if (UseExtraChecksum) + if (UseCfgChecksum) write(0); // cfg checksum - writeGCOVString(SP.getName()); + writeGCOVString(getFunctionName(SP)); writeGCOVString(SP.getFilename()); write(SP.getLineNumber()); } @@ -355,19 +382,23 @@ std::string GCOVProfiler::mangleName(DICompileUnit CU, const char *NewStem) { SmallString<128> Filename = CU.getFilename(); sys::path::replace_extension(Filename, NewStem); - return sys::path::filename(Filename.str()); + StringRef FName = sys::path::filename(Filename); + SmallString<128> CurPath; + if (sys::fs::current_path(CurPath)) return FName; + sys::path::append(CurPath, FName.str()); + return CurPath.str(); } bool GCOVProfiler::runOnModule(Module &M) { this->M = &M; Ctx = &M.getContext(); - if (EmitNotes) emitGCNO(); - if (EmitData) return emitProfileArcs(); + if (Options.EmitNotes) emitProfileNotes(); + if (Options.EmitData) return emitProfileArcs(); return false; } -void GCOVProfiler::emitGCNO() { +void GCOVProfiler::emitProfileNotes() { NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu"); if (!CU_Nodes) return; @@ -380,10 +411,9 @@ void GCOVProfiler::emitGCNO() { std::string ErrorInfo; raw_fd_ostream out(mangleName(CU, "gcno").c_str(), ErrorInfo, raw_fd_ostream::F_Binary); - if (!Use402Format) - out.write("oncg*404MVLL", 12); - else - out.write("oncg*204MVLL", 12); + out.write("oncg", 4); + out.write(ReversedVersion, 4); + out.write("MVLL", 4); DIArray SPs = CU.getSubprograms(); for (unsigned i = 0, e = SPs.getNumElements(); i != e; ++i) { @@ -392,7 +422,7 @@ void GCOVProfiler::emitGCNO() { Function *F = SP.getFunction(); if (!F) continue; - GCOVFunction Func(SP, &out, Use402Format, UseExtraChecksum); + GCOVFunction Func(SP, &out, i, Options.UseCfgChecksum); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { GCOVBlock &Block = Func.getBlock(BB); @@ -522,8 +552,38 @@ bool GCOVProfiler::emitProfileArcs() { } } - insertCounterWriteout(CountersBySP); - insertFlush(CountersBySP); + Function *WriteoutF = insertCounterWriteout(CountersBySP); + Function *FlushF = insertFlush(CountersBySP); + + // Create a small bit of code that registers the "__llvm_gcov_writeout" to + // be executed at exit and the "__llvm_gcov_flush" function to be executed + // when "__gcov_flush" is called. 
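The registration described in the comment above amounts to roughly the following at the C level (a sketch; llvm_gcov_init is the run-time hook requested via getOrInsertFunction in the hunk below, and the constructor attribute merely illustrates what appendToGlobalCtors arranges):

  extern "C" void llvm_gcov_init(void (*writeout)(void), void (*flush)(void));
  extern "C" void __llvm_gcov_writeout(void);   // runs at exit, writes the .gcda files
  extern "C" void __llvm_gcov_flush(void);      // runs whenever __gcov_flush() is called
  __attribute__((constructor))
  static void gcov_init_stub(void) {            // emitted as "__llvm_gcov_init", internal linkage
    llvm_gcov_init(__llvm_gcov_writeout, __llvm_gcov_flush);
  }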
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, + "__llvm_gcov_init", M); + F->setUnnamedAddr(true); + F->setLinkage(GlobalValue::InternalLinkage); + F->addFnAttr(Attribute::NoInline); + if (Options.NoRedZone) + F->addFnAttr(Attribute::NoRedZone); + + BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", F); + IRBuilder<> Builder(BB); + + FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + Type *Params[] = { + PointerType::get(FTy, 0), + PointerType::get(FTy, 0) + }; + FTy = FunctionType::get(Builder.getVoidTy(), Params, false); + + // Inialize the environment and register the local writeout and flush + // functions. + Constant *GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy); + Builder.CreateCall2(GCOVInit, WriteoutF, FlushF); + Builder.CreateRetVoid(); + + appendToGlobalCtors(*M, F, 0); } if (InsertIndCounterIncrCode) @@ -581,8 +641,11 @@ GlobalVariable *GCOVProfiler::buildEdgeLookupTable( } Constant *GCOVProfiler::getStartFileFunc() { - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Type::getInt8PtrTy(*Ctx), false); + Type *Args[] = { + Type::getInt8PtrTy(*Ctx), // const char *orig_filename + Type::getInt8PtrTy(*Ctx), // const char version[4] + }; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_start_file", FTy); } @@ -598,9 +661,10 @@ Constant *GCOVProfiler::getIncrementIndirectCounterFunc() { } Constant *GCOVProfiler::getEmitFunctionFunc() { - Type *Args[2] = { + Type *Args[3] = { Type::getInt32Ty(*Ctx), // uint32_t ident Type::getInt8PtrTy(*Ctx), // const char *function_name + Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum }; FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_function", FTy); @@ -611,11 +675,20 @@ Constant *GCOVProfiler::getEmitArcsFunc() { Type::getInt32Ty(*Ctx), // uint32_t num_counters Type::getInt64PtrTy(*Ctx), // uint64_t *counters }; - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), - Args, false); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false); return M->getOrInsertFunction("llvm_gcda_emit_arcs", FTy); } +Constant *GCOVProfiler::getDeleteWriteoutFunctionListFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_delete_writeout_function_list", FTy); +} + +Constant *GCOVProfiler::getDeleteFlushFunctionListFunc() { + FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); + return M->getOrInsertFunction("llvm_delete_flush_function_list", FTy); +} + Constant *GCOVProfiler::getEndFileFunc() { FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); return M->getOrInsertFunction("llvm_gcda_end_file", FTy); @@ -634,7 +707,7 @@ GlobalVariable *GCOVProfiler::getEdgeStateValue() { return GV; } -void GCOVProfiler::insertCounterWriteout( +Function *GCOVProfiler::insertCounterWriteout( ArrayRef<std::pair<GlobalVariable *, MDNode *> > CountersBySP) { FunctionType *WriteoutFTy = FunctionType::get(Type::getVoidTy(*Ctx), false); Function *WriteoutF = M->getFunction("__llvm_gcov_writeout"); @@ -643,7 +716,7 @@ void GCOVProfiler::insertCounterWriteout( "__llvm_gcov_writeout", M); WriteoutF->setUnnamedAddr(true); WriteoutF->addFnAttr(Attribute::NoInline); - if (NoRedZone) + if (Options.NoRedZone) WriteoutF->addFnAttr(Attribute::NoRedZone); BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", 
WriteoutF); @@ -659,20 +732,19 @@ void GCOVProfiler::insertCounterWriteout( for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) { DICompileUnit CU(CU_Nodes->getOperand(i)); std::string FilenameGcda = mangleName(CU, "gcda"); - Builder.CreateCall(StartFile, - Builder.CreateGlobalStringPtr(FilenameGcda)); - for (ArrayRef<std::pair<GlobalVariable *, MDNode *> >::iterator - I = CountersBySP.begin(), E = CountersBySP.end(); - I != E; ++I) { - DISubprogram SP(I->second); - intptr_t ident = reinterpret_cast<intptr_t>(I->second); - Builder.CreateCall2(EmitFunction, - Builder.getInt32(ident), - NoFunctionNamesInData ? - Constant::getNullValue(Builder.getInt8PtrTy()) : - Builder.CreateGlobalStringPtr(SP.getName())); - - GlobalVariable *GV = I->first; + Builder.CreateCall2(StartFile, + Builder.CreateGlobalStringPtr(FilenameGcda), + Builder.CreateGlobalStringPtr(ReversedVersion)); + for (unsigned j = 0, e = CountersBySP.size(); j != e; ++j) { + DISubprogram SP(CountersBySP[j].second); + Builder.CreateCall3( + EmitFunction, Builder.getInt32(j), + Options.FunctionNamesInData ? + Builder.CreateGlobalStringPtr(getFunctionName(SP)) : + Constant::getNullValue(Builder.getInt8PtrTy()), + Builder.getInt8(Options.UseCfgChecksum)); + + GlobalVariable *GV = CountersBySP[j].first; unsigned Arcs = cast<ArrayType>(GV->getType()->getElementType())->getNumElements(); Builder.CreateCall2(EmitArcs, @@ -682,29 +754,9 @@ void GCOVProfiler::insertCounterWriteout( Builder.CreateCall(EndFile); } } - Builder.CreateRetVoid(); - // Create a small bit of code that registers the "__llvm_gcov_writeout" - // function to be executed at exit. - FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); - Function *F = Function::Create(FTy, GlobalValue::InternalLinkage, - "__llvm_gcov_init", M); - F->setUnnamedAddr(true); - F->setLinkage(GlobalValue::InternalLinkage); - F->addFnAttr(Attribute::NoInline); - if (NoRedZone) - F->addFnAttr(Attribute::NoRedZone); - - BB = BasicBlock::Create(*Ctx, "entry", F); - Builder.SetInsertPoint(BB); - - FTy = FunctionType::get(Builder.getInt32Ty(), - PointerType::get(FTy, 0), false); - Constant *AtExitFn = M->getOrInsertFunction("atexit", FTy); - Builder.CreateCall(AtExitFn, WriteoutF); Builder.CreateRetVoid(); - - appendToGlobalCtors(*M, F, 0); + return WriteoutF; } void GCOVProfiler::insertIndirectCounterIncrement() { @@ -713,7 +765,7 @@ void GCOVProfiler::insertIndirectCounterIncrement() { Fn->setUnnamedAddr(true); Fn->setLinkage(GlobalValue::InternalLinkage); Fn->addFnAttr(Attribute::NoInline); - if (NoRedZone) + if (Options.NoRedZone) Fn->addFnAttr(Attribute::NoRedZone); // Create basic blocks for function. 
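Per compile unit, the generated writeout body above boils down to this call sequence (a sketch; the run-time entry points match the getOrInsertFunction declarations earlier in the file, while the file name, counts, and counter array are illustrative):

  #include <stdint.h>
  extern "C" {
  void llvm_gcda_start_file(const char *filename, const char version[4]);
  void llvm_gcda_emit_function(uint32_t ident, const char *name, uint8_t use_extra_checksum);
  void llvm_gcda_emit_arcs(uint32_t num_counters, uint64_t *counters);
  void llvm_gcda_end_file(void);
  }

  static uint64_t counters_main[3];              // one slot per instrumented edge

  static void writeout_sketch(void) {
    llvm_gcda_start_file("foo.gcda", "*204");    // reversed copy of the default "402*" version
    llvm_gcda_emit_function(0, "main", 0);       // ident is now a small pass-assigned index
    llvm_gcda_emit_arcs(3, counters_main);
    llvm_gcda_end_file();
  }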
@@ -758,18 +810,18 @@ void GCOVProfiler::insertIndirectCounterIncrement() { Builder.CreateRetVoid(); } -void GCOVProfiler:: +Function *GCOVProfiler:: insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) { FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false); - Function *FlushF = M->getFunction("__gcov_flush"); + Function *FlushF = M->getFunction("__llvm_gcov_flush"); if (!FlushF) FlushF = Function::Create(FTy, GlobalValue::InternalLinkage, - "__gcov_flush", M); + "__llvm_gcov_flush", M); else FlushF->setLinkage(GlobalValue::InternalLinkage); FlushF->setUnnamedAddr(true); FlushF->addFnAttr(Attribute::NoInline); - if (NoRedZone) + if (Options.NoRedZone) FlushF->addFnAttr(Attribute::NoRedZone); BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF); @@ -794,8 +846,10 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) { if (RetTy == Type::getVoidTy(*Ctx)) Builder.CreateRetVoid(); else if (RetTy->isIntegerTy()) - // Used if __gcov_flush was implicitly declared. + // Used if __llvm_gcov_flush was implicitly declared. Builder.CreateRet(ConstantInt::get(RetTy, 0)); else - report_fatal_error("invalid return type for __gcov_flush"); + report_fatal_error("invalid return type for __llvm_gcov_flush"); + + return FlushF; } diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 8ba102559b..9f353967f3 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" #include "llvm-c/Initialization.h" using namespace llvm; diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index fce6513a97..4e75904ded 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -122,6 +122,9 @@ static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call", static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern", cl::desc("poison uninitialized stack variables with the given patter"), cl::Hidden, cl::init(0xff)); +static cl::opt<bool> ClPoisonUndef("msan-poison-undef", + cl::desc("poison undef temps"), + cl::Hidden, cl::init(true)); static cl::opt<bool> ClHandleICmp("msan-handle-icmp", cl::desc("propagate shadow through ICmpEQ and ICmpNE"), @@ -690,7 +693,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { /// /// Clean shadow (all zeroes) means all bits of the value are defined /// (initialized). - Value *getCleanShadow(Value *V) { + Constant *getCleanShadow(Value *V) { Type *ShadowTy = getShadowTy(V); if (!ShadowTy) return 0; @@ -709,6 +712,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return ConstantStruct::get(ST, Vals); } + /// \brief Create a dirty shadow for a given value. + Constant *getPoisonedShadow(Value *V) { + Type *ShadowTy = getShadowTy(V); + if (!ShadowTy) + return 0; + return getPoisonedShadow(ShadowTy); + } + /// \brief Create a clean (zero) origin. 
Value *getCleanOrigin() { return Constant::getNullValue(MS.OriginTy); @@ -730,7 +741,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> { return Shadow; } if (UndefValue *U = dyn_cast<UndefValue>(V)) { - Value *AllOnes = getPoisonedShadow(getShadowTy(V)); + Value *AllOnes = ClPoisonUndef ? getPoisonedShadow(V) : getCleanShadow(V); DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n"); (void)U; return AllOnes; diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index f93c5ab4c8..299060a42f 100644 --- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -30,6 +30,7 @@ #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -56,6 +57,9 @@ static cl::opt<bool> ClInstrumentFuncEntryExit( static cl::opt<bool> ClInstrumentAtomics( "tsan-instrument-atomics", cl::init(true), cl::desc("Instrument atomics"), cl::Hidden); +static cl::opt<bool> ClInstrumentMemIntrinsics( + "tsan-instrument-memintrinsics", cl::init(true), + cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden); STATISTIC(NumInstrumentedReads, "Number of instrumented reads"); STATISTIC(NumInstrumentedWrites, "Number of instrumented writes"); @@ -63,6 +67,7 @@ STATISTIC(NumOmittedReadsBeforeWrite, "Number of reads ignored due to following writes"); STATISTIC(NumAccessesWithBadSize, "Number of accesses with bad size"); STATISTIC(NumInstrumentedVtableWrites, "Number of vtable ptr writes"); +STATISTIC(NumInstrumentedVtableReads, "Number of vtable ptr reads"); STATISTIC(NumOmittedReadsFromConstantGlobals, "Number of reads from constant globals"); STATISTIC(NumOmittedReadsFromVtable, "Number of vtable reads"); @@ -85,12 +90,14 @@ struct ThreadSanitizer : public FunctionPass { void initializeCallbacks(Module &M); bool instrumentLoadOrStore(Instruction *I); bool instrumentAtomic(Instruction *I); + bool instrumentMemIntrinsic(Instruction *I); void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local, SmallVectorImpl<Instruction*> &All); bool addrPointsToConstantData(Value *Addr); int getMemoryAccessFuncIndex(Value *Addr); DataLayout *TD; + Type *IntptrTy; SmallString<64> BlacklistFile; OwningPtr<BlackList> BL; IntegerType *OrdTy; @@ -108,6 +115,8 @@ struct ThreadSanitizer : public FunctionPass { Function *TsanAtomicThreadFence; Function *TsanAtomicSignalFence; Function *TsanVptrUpdate; + Function *TsanVptrLoad; + Function *MemmoveFn, *MemcpyFn, *MemsetFn; }; } // namespace @@ -196,10 +205,22 @@ void ThreadSanitizer::initializeCallbacks(Module &M) { TsanVptrUpdate = checkInterfaceFunction(M.getOrInsertFunction( "__tsan_vptr_update", IRB.getVoidTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), NULL)); + TsanVptrLoad = checkInterfaceFunction(M.getOrInsertFunction( + "__tsan_vptr_read", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL)); TsanAtomicThreadFence = checkInterfaceFunction(M.getOrInsertFunction( "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL)); TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction( "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL)); + + MemmoveFn = checkInterfaceFunction(M.getOrInsertFunction( + "memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IRB.getInt8PtrTy(), IntptrTy, NULL)); + MemcpyFn = 
checkInterfaceFunction(M.getOrInsertFunction( + "memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), + IntptrTy, NULL)); + MemsetFn = checkInterfaceFunction(M.getOrInsertFunction( + "memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(), + IntptrTy, NULL)); } bool ThreadSanitizer::doInitialization(Module &M) { @@ -210,6 +231,7 @@ bool ThreadSanitizer::doInitialization(Module &M) { // Always insert a call to __tsan_init into the module's CTORs. IRBuilder<> IRB(M.getContext()); + IntptrTy = IRB.getIntPtrTy(TD); Value *TsanInit = M.getOrInsertFunction("__tsan_init", IRB.getVoidTy(), NULL); appendToGlobalCtors(M, cast<Function>(TsanInit), 0); @@ -309,6 +331,7 @@ bool ThreadSanitizer::runOnFunction(Function &F) { SmallVector<Instruction*, 8> AllLoadsAndStores; SmallVector<Instruction*, 8> LocalLoadsAndStores; SmallVector<Instruction*, 8> AtomicAccesses; + SmallVector<Instruction*, 8> MemIntrinCalls; bool Res = false; bool HasCalls = false; @@ -325,6 +348,8 @@ bool ThreadSanitizer::runOnFunction(Function &F) { else if (isa<ReturnInst>(BI)) RetVec.push_back(BI); else if (isa<CallInst>(BI) || isa<InvokeInst>(BI)) { + if (isa<MemIntrinsic>(BI)) + MemIntrinCalls.push_back(BI); HasCalls = true; chooseInstructionsToInstrument(LocalLoadsAndStores, AllLoadsAndStores); } @@ -348,6 +373,11 @@ bool ThreadSanitizer::runOnFunction(Function &F) { Res |= instrumentAtomic(AtomicAccesses[i]); } + if (ClInstrumentMemIntrinsics) + for (size_t i = 0, n = MemIntrinCalls.size(); i < n; ++i) { + Res |= instrumentMemIntrinsic(MemIntrinCalls[i]); + } + // Instrument function entry/exit points if there were instrumented accesses. if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI()); @@ -386,6 +416,12 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I) { NumInstrumentedVtableWrites++; return true; } + if (!IsWrite && isVtableAccess(I)) { + IRB.CreateCall(TsanVptrLoad, + IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); + NumInstrumentedVtableReads++; + return true; + } Value *OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx]; IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy())); if (IsWrite) NumInstrumentedWrites++; @@ -423,6 +459,32 @@ static ConstantInt *createFailOrdering(IRBuilder<> *IRB, AtomicOrdering ord) { return IRB->getInt32(v); } +// If a memset intrinsic gets inlined by the code gen, we will miss races on it. +// So, we either need to ensure the intrinsic is not inlined, or instrument it. +// We do not instrument memset/memmove/memcpy intrinsics (too complicated), +// instead we simply replace them with regular function calls, which are then +// intercepted by the run-time. +// Since tsan is running after everyone else, the calls should not be +// replaced back with intrinsics. If that becomes wrong at some point, +// we will need to call e.g. __tsan_memset to avoid the intrinsics. +bool ThreadSanitizer::instrumentMemIntrinsic(Instruction *I) { + IRBuilder<> IRB(I); + if (MemSetInst *M = dyn_cast<MemSetInst>(I)) { + IRB.CreateCall3(MemsetFn, + IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(1), IRB.getInt32Ty(), false), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)); + I->eraseFromParent(); + } else if (MemTransferInst *M = dyn_cast<MemTransferInst>(I)) { + IRB.CreateCall3(isa<MemCpyInst>(M) ? 
MemcpyFn : MemmoveFn, + IRB.CreatePointerCast(M->getArgOperand(0), IRB.getInt8PtrTy()), + IRB.CreatePointerCast(M->getArgOperand(1), IRB.getInt8PtrTy()), + IRB.CreateIntCast(M->getArgOperand(2), IntptrTy, false)); + I->eraseFromParent(); + } + return false; +} + // Both llvm and ThreadSanitizer atomic operations are based on C++11/C1x // standards. For background see C++11 standard. A slightly older, publically // available draft of the standard (not entirely up-to-date, but close enough diff --git a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp index 5aada9c373..8f917aeb37 100644 --- a/lib/Transforms/ObjCARC/DependencyAnalysis.cpp +++ b/lib/Transforms/ObjCARC/DependencyAnalysis.cpp @@ -38,6 +38,7 @@ llvm::objcarc::CanAlterRefCount(const Instruction *Inst, const Value *Ptr, switch (Class) { case IC_Autorelease: case IC_AutoreleaseRV: + case IC_IntrinsicUser: case IC_User: // These operations never directly modify a reference count. return false; diff --git a/lib/Transforms/ObjCARC/ObjCARC.cpp b/lib/Transforms/ObjCARC/ObjCARC.cpp index 53a31b0de1..373168e898 100644 --- a/lib/Transforms/ObjCARC/ObjCARC.cpp +++ b/lib/Transforms/ObjCARC/ObjCARC.cpp @@ -30,6 +30,7 @@ using namespace llvm::objcarc; bool llvm::objcarc::EnableARCOpts; static cl::opt<bool, true> EnableARCOptimizations("enable-objc-arc-opts", + cl::desc("enable/disable all ARC Optimizations"), cl::location(EnableARCOpts), cl::init(true)); diff --git a/lib/Transforms/ObjCARC/ObjCARC.h b/lib/Transforms/ObjCARC/ObjCARC.h index e062b66555..39670f339e 100644 --- a/lib/Transforms/ObjCARC/ObjCARC.h +++ b/lib/Transforms/ObjCARC/ObjCARC.h @@ -64,7 +64,8 @@ static inline bool ModuleHasARC(const Module &M) { M.getNamedValue("objc_copyWeak") || M.getNamedValue("objc_retainedObject") || M.getNamedValue("objc_unretainedObject") || - M.getNamedValue("objc_unretainedPointer"); + M.getNamedValue("objc_unretainedPointer") || + M.getNamedValue("clang.arc.use"); } /// \enum InstructionClass @@ -89,6 +90,7 @@ enum InstructionClass { IC_CopyWeak, ///< objc_copyWeak (derived) IC_DestroyWeak, ///< objc_destroyWeak (derived) IC_StoreStrong, ///< objc_storeStrong (derived) + IC_IntrinsicUser, ///< clang.arc.use IC_CallOrUser, ///< could call objc_release and/or "use" pointers IC_Call, ///< could call objc_release IC_User, ///< could "use" a pointer @@ -97,6 +99,13 @@ enum InstructionClass { raw_ostream &operator<<(raw_ostream &OS, const InstructionClass Class); +/// \brief Test if the given class is a kind of user. +inline static bool IsUser(InstructionClass Class) { + return Class == IC_User || + Class == IC_CallOrUser || + Class == IC_IntrinsicUser; +} + /// \brief Test if the given class is objc_retain or equivalent. static inline bool IsRetain(InstructionClass Class) { return Class == IC_Retain || @@ -112,13 +121,10 @@ static inline bool IsAutorelease(InstructionClass Class) { /// \brief Test if the given class represents instructions which return their /// argument verbatim. static inline bool IsForwarding(InstructionClass Class) { - // objc_retainBlock technically doesn't always return its argument - // verbatim, but it doesn't matter for our purposes here. 
return Class == IC_Retain || Class == IC_RetainRV || Class == IC_Autorelease || Class == IC_AutoreleaseRV || - Class == IC_RetainBlock || Class == IC_NoopCast; } @@ -256,11 +262,11 @@ static inline Value *GetObjCArg(Value *Inst) { return StripPointerCastsAndObjCCalls(cast<CallInst>(Inst)->getArgOperand(0)); } -static inline bool isNullOrUndef(const Value *V) { +static inline bool IsNullOrUndef(const Value *V) { return isa<ConstantPointerNull>(V) || isa<UndefValue>(V); } -static inline bool isNoopInstruction(const Instruction *I) { +static inline bool IsNoopInstruction(const Instruction *I) { return isa<BitCastInst>(I) || (isa<GetElementPtrInst>(I) && cast<GetElementPtrInst>(I)->hasAllZeroIndices()); diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp index 1c13d1cbea..c43f4f4a44 100644 --- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp @@ -66,6 +66,8 @@ namespace { Constant *RetainAutoreleaseCallee; /// Declaration for objc_retainAutoreleaseReturnValue(). Constant *RetainAutoreleaseRVCallee; + /// Declaration for objc_retainAutoreleasedReturnValue(). + Constant *RetainRVCallee; /// The inline asm string to insert between calls and RetainRV calls to make /// the optimization work on targets which need it. @@ -77,9 +79,12 @@ namespace { SmallPtrSet<CallInst *, 8> StoreStrongCalls; Constant *getStoreStrongCallee(Module *M); + Constant *getRetainRVCallee(Module *M); Constant *getRetainAutoreleaseCallee(Module *M); Constant *getRetainAutoreleaseRVCallee(Module *M); + bool OptimizeRetainCall(Function &F, Instruction *Retain); + bool ContractAutorelease(Function &F, Instruction *Autorelease, InstructionClass Class, SmallPtrSet<Instruction *, 4> @@ -172,6 +177,57 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) { return RetainAutoreleaseRVCallee; } +Constant *ObjCARCContract::getRetainRVCallee(Module *M) { + if (!RetainRVCallee) { + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *Params[] = { I8X }; + FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); + AttributeSet Attribute = + AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, + Attribute::NoUnwind); + RetainRVCallee = + M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, + Attribute); + } + return RetainRVCallee; +} + +/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a +/// return value. We do this late so we do not disrupt the dataflow analysis in +/// ObjCARCOpt. +bool +ObjCARCContract::OptimizeRetainCall(Function &F, Instruction *Retain) { + ImmutableCallSite CS(GetObjCArg(Retain)); + const Instruction *Call = CS.getInstruction(); + if (!Call) + return false; + if (Call->getParent() != Retain->getParent()) + return false; + + // Check that the call is next to the retain. + BasicBlock::const_iterator I = Call; + ++I; + while (IsNoopInstruction(I)) ++I; + if (&*I != Retain) + return false; + + // Turn it to an objc_retainAutoreleasedReturnValue. + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "Transforming objc_retain => " + "objc_retainAutoreleasedReturnValue since the operand is a " + "return value.\nOld: "<< *Retain << "\n"); + + // We do not have to worry about tail calls/does not throw since + // retain/retainRV have the same properties. 
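In source terms, the pattern OptimizeRetainCall rewrites is a retain applied directly to the return value of the immediately preceding call (a C-level sketch with void* standing in for id; make_object is a hypothetical callee):

  extern "C" void *objc_retain(void *);
  extern "C" void *objc_retainAutoreleasedReturnValue(void *);
  extern "C" void *make_object(void);

  void *pattern_before(void) {
    void *obj = make_object();
    return objc_retain(obj);                         // retain of a just-returned value ...
  }

  void *pattern_after(void) {
    void *obj = make_object();
    return objc_retainAutoreleasedReturnValue(obj);  // ... becomes retainRV; rewritten here in
  }                                                  // ObjCARCContract so ObjCARCOpt's dataflow
                                                     // analysis is left undisturbed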
+ cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); + + DEBUG(dbgs() << "New: " << *Retain << "\n"); + return true; +} + /// Merge an autorelease with a retain into a fused call. bool ObjCARCContract::ContractAutorelease(Function &F, Instruction *Autorelease, @@ -329,6 +385,7 @@ bool ObjCARCContract::doInitialization(Module &M) { StoreStrongCallee = 0; RetainAutoreleaseCallee = 0; RetainAutoreleaseRVCallee = 0; + RetainRVCallee = 0; // Initialize RetainRVMarker. RetainRVMarker = 0; @@ -380,7 +437,6 @@ bool ObjCARCContract::runOnFunction(Function &F) { // objc_retainBlock does not necessarily return its argument. InstructionClass Class = GetBasicInstructionClass(Inst); switch (Class) { - case IC_Retain: case IC_FusedRetainAutorelease: case IC_FusedRetainAutoreleaseRV: break; @@ -389,6 +445,13 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (ContractAutorelease(F, Inst, Class, DependingInstructions, Visited)) continue; break; + case IC_Retain: + // Attempt to convert retains to retainrvs if they are next to function + // calls. + if (!OptimizeRetainCall(F, Inst)) + break; + // If we succeed in our optimization, fall through. + // FALLTHROUGH case IC_RetainRV: { // If we're compiling for a target which needs a special inline-asm // marker to do the retainAutoreleasedReturnValue optimization, @@ -410,7 +473,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { break; } --BBI; - } while (isNoopInstruction(BBI)); + } while (IsNoopInstruction(BBI)); if (&*BBI == GetObjCArg(Inst)) { DEBUG(dbgs() << "ObjCARCContract: Adding inline asm marker for " @@ -429,7 +492,7 @@ bool ObjCARCContract::runOnFunction(Function &F) { case IC_InitWeak: { // objc_initWeak(p, null) => *p = null CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(1))) { + if (IsNullOrUndef(CI->getArgOperand(1))) { Value *Null = ConstantPointerNull::get(cast<PointerType>(CI->getType())); Changed = true; @@ -453,6 +516,10 @@ bool ObjCARCContract::runOnFunction(Function &F) { if (isa<AllocaInst>(Inst)) TailOkForStoreStrongs = false; continue; + case IC_IntrinsicUser: + // Remove calls to @clang.arc.use(...). + Inst->eraseFromParent(); + continue; default: continue; } diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp index 9c14949877..43e2e20035 100644 --- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp @@ -33,6 +33,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/LLVMContext.h" #include "llvm/Support/CFG.h" #include "llvm/Support/Debug.h" @@ -190,13 +191,13 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) { do { const Value *V = Worklist.pop_back_val(); - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Visiting: " << *V << "\n"); + DEBUG(dbgs() << "Visiting: " << *V << "\n"); for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE; ++UI) { const User *UUser = *UI; - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User: " << *UUser << "\n"); + DEBUG(dbgs() << "User: " << *UUser << "\n"); // Special - Use by a call (callee or argument) is not considered // to be an escape. @@ -206,11 +207,13 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) { case IC_StoreStrong: case IC_Autorelease: case IC_AutoreleaseRV: { - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies pointer " - "arguments. Pointer Escapes!\n"); + DEBUG(dbgs() << "User copies pointer arguments. 
Pointer Escapes!\n"); // These special functions make copies of their pointer arguments. return true; } + case IC_IntrinsicUser: + // Use by the use intrinsic is not an escape. + continue; case IC_User: case IC_None: // Use by an instruction which copies the value is an escape if the @@ -219,12 +222,11 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) { isa<PHINode>(UUser) || isa<SelectInst>(UUser)) { if (VisitedSet.insert(UUser)) { - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: User copies value. " - "Ptr escapes if result escapes. Adding to list.\n"); + DEBUG(dbgs() << "User copies value. Ptr escapes if result escapes." + " Adding to list.\n"); Worklist.push_back(UUser); } else { - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Already visited node." - "\n"); + DEBUG(dbgs() << "Already visited node.\n"); } continue; } @@ -241,13 +243,13 @@ static bool DoesRetainableObjPtrEscape(const User *Ptr) { continue; } // Otherwise, conservatively assume an escape. - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Assuming ptr escapes.\n"); + DEBUG(dbgs() << "Assuming ptr escapes.\n"); return true; } } while (!Worklist.empty()); // No escapes found. - DEBUG(dbgs() << "DoesRetainableObjPtrEscape: Ptr does not escape.\n"); + DEBUG(dbgs() << "Ptr does not escape.\n"); return false; } @@ -301,6 +303,16 @@ STATISTIC(NumRets, "Number of return value forwarding " "retain+autoreleaes eliminated"); STATISTIC(NumRRs, "Number of retain+release paths eliminated"); STATISTIC(NumPeeps, "Number of calls peephole-optimized"); +STATISTIC(NumRetainsBeforeOpt, + "Number of retains before optimization."); +STATISTIC(NumReleasesBeforeOpt, + "Number of releases before optimization."); +#ifndef NDEBUG +STATISTIC(NumRetainsAfterOpt, + "Number of retains after optimization."); +STATISTIC(NumReleasesAfterOpt, + "Number of releases after optimization."); +#endif namespace { /// \enum Sequence @@ -371,7 +383,7 @@ static Sequence MergeSeqs(Sequence A, Sequence B, bool TopDown) { namespace { /// \brief Unidirectional information about either a /// retain-decrement-use-release sequence or release-use-decrement-retain - /// reverese sequence. + /// reverse sequence. struct RRInfo { /// After an objc_retain, the reference count of the referenced /// object is known to be positive. Similarly, before an objc_release, the @@ -387,10 +399,6 @@ namespace { /// KnownSafe is true when either of these conditions is satisfied. bool KnownSafe; - /// True if the Calls are objc_retainBlock calls (as opposed to objc_retain - /// calls). - bool IsRetainBlock; - /// True of the objc_release calls are all marked with the "tail" keyword. bool IsTailCallRelease; @@ -407,17 +415,18 @@ namespace { SmallPtrSet<Instruction *, 2> ReverseInsertPts; RRInfo() : - KnownSafe(false), IsRetainBlock(false), - IsTailCallRelease(false), - ReleaseMetadata(0) {} + KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {} void clear(); + + bool IsTrackingImpreciseReleases() { + return ReleaseMetadata != 0; + } }; } void RRInfo::clear() { KnownSafe = false; - IsRetainBlock = false; IsTailCallRelease = false; ReleaseMetadata = 0; Calls.clear(); @@ -431,7 +440,7 @@ namespace { /// True if the reference count is known to be incremented. bool KnownPositiveRefCount; - /// True of we've seen an opportunity for partial RR elimination, such as + /// True if we've seen an opportunity for partial RR elimination, such as /// pushing calls into a CFG triangle or into one side of a CFG diamond. 
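// A minimal, standalone sketch of the retain/release bookkeeping above, with
// stand-in types instead of llvm::Instruction/MDNode (illustrative only, not
// the pass's real API). It shows the two points of this hunk: "imprecise"
// (movable) releases are simply those carrying release metadata, and the old
// IsRetainBlock flag is gone now that objc_retainBlock calls are dealt with
// before this bookkeeping ever sees them.
#include <set>

struct InstructionStub;                 // stands in for llvm::Instruction
struct MetadataStub;                    // stands in for llvm::MDNode

struct RRInfoSketch {
  bool KnownSafe;                       // retain/release proven balanced
  bool IsTailCallRelease;               // every release carries "tail"
  const MetadataStub *ReleaseMetadata;  // clang.imprecise_release, if present
  std::set<const InstructionStub *> Calls;            // the retains/releases
  std::set<const InstructionStub *> ReverseInsertPts; // where moved calls go

  RRInfoSketch()
    : KnownSafe(false), IsTailCallRelease(false), ReleaseMetadata(0) {}

  // Mirrors the new predicate: imprecise means "has release metadata".
  bool IsTrackingImpreciseReleases() const { return ReleaseMetadata != 0; }

  void clear() {
    KnownSafe = false;
    IsTailCallRelease = false;
    ReleaseMetadata = 0;
    Calls.clear();
    ReverseInsertPts.clear();
  }
};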
bool Partial; @@ -451,15 +460,16 @@ namespace { KnownPositiveRefCount = true; } - void ClearRefCount() { + void ClearKnownPositiveRefCount() { KnownPositiveRefCount = false; } - bool IsKnownIncremented() const { + bool HasKnownPositiveRefCount() const { return KnownPositiveRefCount; } void SetSeq(Sequence NewSeq) { + DEBUG(dbgs() << "Old: " << Seq << "; New: " << NewSeq << "\n"); Seq = NewSeq; } @@ -472,7 +482,8 @@ namespace { } void ResetSequenceProgress(Sequence NewSeq) { - Seq = NewSeq; + DEBUG(dbgs() << "Resetting sequence progress.\n"); + SetSeq(NewSeq); Partial = false; RRI.clear(); } @@ -486,10 +497,6 @@ PtrState::Merge(const PtrState &Other, bool TopDown) { Seq = MergeSeqs(Seq, Other.Seq, TopDown); KnownPositiveRefCount = KnownPositiveRefCount && Other.KnownPositiveRefCount; - // We can't merge a plain objc_retain with an objc_retainBlock. - if (RRI.IsRetainBlock != Other.RRI.IsRetainBlock) - Seq = S_None; - // If we're not in a sequence (anymore), drop all associated state. if (Seq == S_None) { Partial = false; @@ -698,6 +705,287 @@ void BBState::MergeSucc(const BBState &Other) { MI->second.Merge(PtrState(), /*TopDown=*/false); } +// Only enable ARC Annotations if we are building a debug version of +// libObjCARCOpts. +#ifndef NDEBUG +#define ARC_ANNOTATIONS +#endif + +// Define some macros along the lines of DEBUG and some helper functions to make +// it cleaner to create annotations in the source code and to no-op when not +// building in debug mode. +#ifdef ARC_ANNOTATIONS + +#include "llvm/Support/CommandLine.h" + +/// Enable/disable ARC sequence annotations. +static cl::opt<bool> +EnableARCAnnotations("enable-objc-arc-annotations", cl::init(false), + cl::desc("Enable emission of arc data flow analysis " + "annotations")); +static cl::opt<bool> +DisableCheckForCFGHazards("disable-objc-arc-checkforcfghazards", cl::init(false), + cl::desc("Disable check for cfg hazards when " + "annotating")); +static cl::opt<std::string> +ARCAnnotationTargetIdentifier("objc-arc-annotation-target-identifier", + cl::init(""), + cl::desc("filter out all data flow annotations " + "but those that apply to the given " + "target llvm identifier.")); + +/// This function appends a unique ARCAnnotationProvenanceSourceMDKind id to an +/// instruction so that we can track backwards when post processing via the llvm +/// arc annotation processor tool. If the function is an +static MDString *AppendMDNodeToSourcePtr(unsigned NodeId, + Value *Ptr) { + MDString *Hash = 0; + + // If pointer is a result of an instruction and it does not have a source + // MDNode it, attach a new MDNode onto it. If pointer is a result of + // an instruction and does have a source MDNode attached to it, return a + // reference to said Node. Otherwise just return 0. + if (Instruction *Inst = dyn_cast<Instruction>(Ptr)) { + MDNode *Node; + if (!(Node = Inst->getMetadata(NodeId))) { + // We do not have any node. Generate and attatch the hash MDString to the + // instruction. + + // We just use an MDString to ensure that this metadata gets written out + // of line at the module level and to provide a very simple format + // encoding the information herein. Both of these makes it simpler to + // parse the annotations by a simple external program. 
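// The provenance "hash" attached above is just a readable string of the form
// "(<function>,%<value>)". A standalone sketch of that formatting, using
// plain std::string names in place of llvm::Function/Value (an assumption
// made for illustration; the real code wraps the result in an MDString):
#include <sstream>
#include <string>

std::string makeProvenanceTag(const std::string &FuncName,
                              const std::string &ValueName) {
  std::ostringstream OS;
  OS << "(" << FuncName << ",%" << ValueName << ")";
  return OS.str();  // e.g. "(main,%x)" for a value named %x defined in main
}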
+ std::string Str; + raw_string_ostream os(Str); + os << "(" << Inst->getParent()->getParent()->getName() << ",%" + << Inst->getName() << ")"; + + Hash = MDString::get(Inst->getContext(), os.str()); + Inst->setMetadata(NodeId, MDNode::get(Inst->getContext(),Hash)); + } else { + // We have a node. Grab its hash and return it. + assert(Node->getNumOperands() == 1 && + "An ARCAnnotationProvenanceSourceMDKind can only have 1 operand."); + Hash = cast<MDString>(Node->getOperand(0)); + } + } else if (Argument *Arg = dyn_cast<Argument>(Ptr)) { + std::string str; + raw_string_ostream os(str); + os << "(" << Arg->getParent()->getName() << ",%" << Arg->getName() + << ")"; + Hash = MDString::get(Arg->getContext(), os.str()); + } + + return Hash; +} + +static std::string SequenceToString(Sequence A) { + std::string str; + raw_string_ostream os(str); + os << A; + return os.str(); +} + +/// Helper function to change a Sequence into a String object using our overload +/// for raw_ostream so we only have printing code in one location. +static MDString *SequenceToMDString(LLVMContext &Context, + Sequence A) { + return MDString::get(Context, SequenceToString(A)); +} + +/// A simple function to generate a MDNode which describes the change in state +/// for Value *Ptr caused by Instruction *Inst. +static void AppendMDNodeToInstForPtr(unsigned NodeId, + Instruction *Inst, + Value *Ptr, + MDString *PtrSourceMDNodeID, + Sequence OldSeq, + Sequence NewSeq) { + MDNode *Node = 0; + Value *tmp[3] = {PtrSourceMDNodeID, + SequenceToMDString(Inst->getContext(), + OldSeq), + SequenceToMDString(Inst->getContext(), + NewSeq)}; + Node = MDNode::get(Inst->getContext(), + ArrayRef<Value*>(tmp, 3)); + + Inst->setMetadata(NodeId, Node); +} + +/// Add to the beginning of the basic block llvm.ptr.annotations which show the +/// state of a pointer at the entrance to a basic block. +static void GenerateARCBBEntranceAnnotation(const char *Name, BasicBlock *BB, + Value *Ptr, Sequence Seq) { + // If we have a target identifier, make sure that we match it before + // continuing. + if(!ARCAnnotationTargetIdentifier.empty() && + !Ptr->getName().equals(ARCAnnotationTargetIdentifier)) + return; + + Module *M = BB->getParent()->getParent(); + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + Type *Params[] = {I8XX, I8XX}; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), + ArrayRef<Type*>(Params, 2), + /*isVarArg=*/false); + Constant *Callee = M->getOrInsertFunction(Name, FTy); + + IRBuilder<> Builder(BB, BB->getFirstInsertionPt()); + + Value *PtrName; + StringRef Tmp = Ptr->getName(); + if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) { + Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, + Tmp + "_STR"); + PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, + cast<Constant>(ActualPtrName), Tmp); + } + + Value *S; + std::string SeqStr = SequenceToString(Seq); + if (0 == (S = M->getGlobalVariable(SeqStr, true))) { + Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, + SeqStr + "_STR"); + S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, + cast<Constant>(ActualPtrName), SeqStr); + } + + Builder.CreateCall2(Callee, PtrName, S); +} + +/// Add to the end of the basic block llvm.ptr.annotations which show the state +/// of the pointer at the bottom of the basic block. 
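// Both annotation emitters (the entrance one above and the terminator one
// below) use the same lookup-or-create idiom: the pointer name and the
// sequence name become module-level string constants once and are reused by
// every later annotation call. A standalone sketch of that idiom, with a
// std::map standing in for the module's global symbol table (a hypothetical
// helper, not the real GlobalVariable machinery):
#include <map>
#include <string>
#include <utility>

class StringPoolSketch {
  std::map<std::string, std::string> Pool;
public:
  // Returns a stable pointer for Name, creating the entry on first use.
  const std::string *getOrCreate(const std::string &Name) {
    std::map<std::string, std::string>::iterator It = Pool.find(Name);
    if (It == Pool.end())
      It = Pool.insert(std::make_pair(Name, Name)).first;
    return &It->second;  // std::map nodes never move, so the pointer is stable
  }
};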
+static void GenerateARCBBTerminatorAnnotation(const char *Name, BasicBlock *BB, + Value *Ptr, Sequence Seq) { + // If we have a target identifier, make sure that we match it before emitting + // an annotation. + if(!ARCAnnotationTargetIdentifier.empty() && + !Ptr->getName().equals(ARCAnnotationTargetIdentifier)) + return; + + Module *M = BB->getParent()->getParent(); + LLVMContext &C = M->getContext(); + Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); + Type *I8XX = PointerType::getUnqual(I8X); + Type *Params[] = {I8XX, I8XX}; + FunctionType *FTy = FunctionType::get(Type::getVoidTy(C), + ArrayRef<Type*>(Params, 2), + /*isVarArg=*/false); + Constant *Callee = M->getOrInsertFunction(Name, FTy); + + IRBuilder<> Builder(BB, llvm::prior(BB->end())); + + Value *PtrName; + StringRef Tmp = Ptr->getName(); + if (0 == (PtrName = M->getGlobalVariable(Tmp, true))) { + Value *ActualPtrName = Builder.CreateGlobalStringPtr(Tmp, + Tmp + "_STR"); + PtrName = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, + cast<Constant>(ActualPtrName), Tmp); + } + + Value *S; + std::string SeqStr = SequenceToString(Seq); + if (0 == (S = M->getGlobalVariable(SeqStr, true))) { + Value *ActualPtrName = Builder.CreateGlobalStringPtr(SeqStr, + SeqStr + "_STR"); + S = new GlobalVariable(*M, I8X, true, GlobalVariable::InternalLinkage, + cast<Constant>(ActualPtrName), SeqStr); + } + Builder.CreateCall2(Callee, PtrName, S); +} + +/// Adds a source annotation to pointer and a state change annotation to Inst +/// referencing the source annotation and the old/new state of pointer. +static void GenerateARCAnnotation(unsigned InstMDId, + unsigned PtrMDId, + Instruction *Inst, + Value *Ptr, + Sequence OldSeq, + Sequence NewSeq) { + if (EnableARCAnnotations) { + // If we have a target identifier, make sure that we match it before + // emitting an annotation. + if(!ARCAnnotationTargetIdentifier.empty() && + !Ptr->getName().equals(ARCAnnotationTargetIdentifier)) + return; + + // First generate the source annotation on our pointer. This will return an + // MDString* if Ptr actually comes from an instruction implying we can put + // in a source annotation. If AppendMDNodeToSourcePtr returns 0 (i.e. NULL), + // then we know that our pointer is from an Argument so we put a reference + // to the argument number. + // + // The point of this is to make it easy for the + // llvm-arc-annotation-processor tool to cross reference where the source + // pointer is in the LLVM IR since the LLVM IR parser does not submit such + // information via debug info for backends to use (since why would anyone + // need such a thing from LLVM IR besides in non standard cases + // [i.e. this]). + MDString *SourcePtrMDNode = + AppendMDNodeToSourcePtr(PtrMDId, Ptr); + AppendMDNodeToInstForPtr(InstMDId, Inst, Ptr, SourcePtrMDNode, OldSeq, + NewSeq); + } +} + +// The actual interface for accessing the above functionality is defined via +// some simple macros which are defined below. We do this so that the user does +// not need to pass in what metadata id is needed resulting in cleaner code and +// additionally since it provides an easy way to conditionally no-op all +// annotation support in a non-debug build. 
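// The macros that follow are the only interface the rest of the pass touches;
// when annotations are disabled they expand to nothing, so non-debug builds
// pay no cost. A stripped-down, standalone illustration of that conditional
// no-op pattern (ENABLE_TRACING and TRACE_STATE are made-up names used only
// in this sketch):
#include <cstdio>

#ifdef ENABLE_TRACING
#define TRACE_STATE(ptr, oldSeq, newSeq)                                      \
  std::printf("ptr %p: %d -> %d\n", static_cast<const void *>(ptr),          \
              static_cast<int>(oldSeq), static_cast<int>(newSeq))
#else
// With tracing off, the macro compiles away entirely.
#define TRACE_STATE(ptr, oldSeq, newSeq) do { } while (0)
#endif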
+ +/// Use this macro to annotate a sequence state change when processing +/// instructions bottom up, +#define ANNOTATE_BOTTOMUP(inst, ptr, old, new) \ + GenerateARCAnnotation(ARCAnnotationBottomUpMDKind, \ + ARCAnnotationProvenanceSourceMDKind, (inst), \ + const_cast<Value*>(ptr), (old), (new)) +/// Use this macro to annotate a sequence state change when processing +/// instructions top down. +#define ANNOTATE_TOPDOWN(inst, ptr, old, new) \ + GenerateARCAnnotation(ARCAnnotationTopDownMDKind, \ + ARCAnnotationProvenanceSourceMDKind, (inst), \ + const_cast<Value*>(ptr), (old), (new)) + +#define ANNOTATE_BB(_states, _bb, _name, _type, _direction) \ + do { \ + if (EnableARCAnnotations) { \ + for(BBState::ptr_const_iterator I = (_states)._direction##_ptr_begin(), \ + E = (_states)._direction##_ptr_end(); I != E; ++I) { \ + Value *Ptr = const_cast<Value*>(I->first); \ + Sequence Seq = I->second.GetSeq(); \ + GenerateARCBB ## _type ## Annotation(_name, (_bb), Ptr, Seq); \ + } \ + } \ + } while (0) + +#define ANNOTATE_BOTTOMUP_BBSTART(_states, _basicblock) \ + ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbstart", \ + Entrance, bottom_up) +#define ANNOTATE_BOTTOMUP_BBEND(_states, _basicblock) \ + ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.bottomup.bbend", \ + Terminator, bottom_up) +#define ANNOTATE_TOPDOWN_BBSTART(_states, _basicblock) \ + ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbstart", \ + Entrance, top_down) +#define ANNOTATE_TOPDOWN_BBEND(_states, _basicblock) \ + ANNOTATE_BB(_states, _basicblock, "llvm.arc.annotation.topdown.bbend", \ + Terminator, top_down) + +#else // !ARC_ANNOTATION +// If annotations are off, noop. +#define ANNOTATE_BOTTOMUP(inst, ptr, old, new) +#define ANNOTATE_TOPDOWN(inst, ptr, old, new) +#define ANNOTATE_BOTTOMUP_BBSTART(states, basicblock) +#define ANNOTATE_BOTTOMUP_BBEND(states, basicblock) +#define ANNOTATE_TOPDOWN_BBSTART(states, basicblock) +#define ANNOTATE_TOPDOWN_BBEND(states, basicblock) +#endif // !ARC_ANNOTATION + namespace { /// \brief The main ARC optimization pass. class ObjCARCOpt : public FunctionPass { @@ -711,9 +999,6 @@ namespace { /// them. These are initialized lazily to avoid cluttering up the Module /// with unused declarations. - /// Declaration for ObjC runtime function - /// objc_retainAutoreleasedReturnValue. - Constant *RetainRVCallee; /// Declaration for ObjC runtime function objc_autoreleaseReturnValue. Constant *AutoreleaseRVCallee; /// Declaration for ObjC runtime function objc_release. @@ -738,7 +1023,15 @@ namespace { /// The Metadata Kind for clang.arc.no_objc_arc_exceptions metadata. unsigned NoObjCARCExceptionsMDKind; - Constant *getRetainRVCallee(Module *M); +#ifdef ARC_ANNOTATIONS + /// The Metadata Kind for llvm.arc.annotation.bottomup metadata. + unsigned ARCAnnotationBottomUpMDKind; + /// The Metadata Kind for llvm.arc.annotation.topdown metadata. + unsigned ARCAnnotationTopDownMDKind; + /// The Metadata Kind for llvm.arc.annotation.provenancesource metadata. 
+ unsigned ARCAnnotationProvenanceSourceMDKind; +#endif // ARC_ANNOATIONS + Constant *getAutoreleaseRVCallee(Module *M); Constant *getReleaseCallee(Module *M); Constant *getRetainCallee(Module *M); @@ -747,10 +1040,11 @@ namespace { bool IsRetainBlockOptimizable(const Instruction *Inst); - void OptimizeRetainCall(Function &F, Instruction *Retain); bool OptimizeRetainRVCall(Function &F, Instruction *RetainRV); void OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, InstructionClass &Class); + bool OptimizeRetainBlockCall(Function &F, Instruction *RetainBlock, + InstructionClass &Class); void OptimizeIndividualCalls(Function &F); void CheckForCFGHazards(const BasicBlock *BB, @@ -804,6 +1098,10 @@ namespace { void OptimizeReturns(Function &F); +#ifndef NDEBUG + void GatherStatistics(Function &F, bool AfterOptimization = false); +#endif + virtual void getAnalysisUsage(AnalysisUsage &AU) const; virtual bool doInitialization(Module &M); virtual bool runOnFunction(Function &F); @@ -851,22 +1149,6 @@ bool ObjCARCOpt::IsRetainBlockOptimizable(const Instruction *Inst) { return true; } -Constant *ObjCARCOpt::getRetainRVCallee(Module *M) { - if (!RetainRVCallee) { - LLVMContext &C = M->getContext(); - Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C)); - Type *Params[] = { I8X }; - FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false); - AttributeSet Attribute = - AttributeSet().addAttribute(M->getContext(), AttributeSet::FunctionIndex, - Attribute::NoUnwind); - RetainRVCallee = - M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy, - Attribute); - } - return RetainRVCallee; -} - Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) { if (!AutoreleaseRVCallee) { LLVMContext &C = M->getContext(); @@ -946,38 +1228,6 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) { return AutoreleaseCallee; } -/// Turn objc_retain into objc_retainAutoreleasedReturnValue if the operand is a -/// return value. -void -ObjCARCOpt::OptimizeRetainCall(Function &F, Instruction *Retain) { - ImmutableCallSite CS(GetObjCArg(Retain)); - const Instruction *Call = CS.getInstruction(); - if (!Call) return; - if (Call->getParent() != Retain->getParent()) return; - - // Check that the call is next to the retain. - BasicBlock::const_iterator I = Call; - ++I; - while (isNoopInstruction(I)) ++I; - if (&*I != Retain) - return; - - // Turn it to an objc_retainAutoreleasedReturnValue.. - Changed = true; - ++NumPeeps; - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainCall: Transforming " - "objc_retain => objc_retainAutoreleasedReturnValue" - " since the operand is a return value.\n" - " Old: " - << *Retain << "\n"); - - cast<CallInst>(Retain)->setCalledFunction(getRetainRVCallee(F.getParent())); - - DEBUG(dbgs() << " New: " - << *Retain << "\n"); -} - /// Turn objc_retainAutoreleasedReturnValue into objc_retain if the operand is /// not a return value. Or, if it can be paired with an /// objc_autoreleaseReturnValue, delete the pair and return true. 
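// Several of these peepholes reduce to one question: after skipping "no-op"
// instructions (bitcasts and all-zero GEPs), is the retain the very next
// instruction after the call that produced its argument? A standalone sketch
// of that adjacency test over a simplified instruction list; the Inst struct
// and its isNoop flag are stand-ins for llvm::Instruction and
// IsNoopInstruction, not real API:
#include <cstddef>
#include <vector>

struct Inst {
  bool isNoop;  // models IsNoopInstruction: a bitcast or an all-zero GEP
};

// True if the instruction at RetainIdx immediately follows the one at CallIdx
// once intervening no-op instructions are ignored.
bool retainFollowsCall(const std::vector<Inst> &Block,
                       std::size_t CallIdx, std::size_t RetainIdx) {
  std::size_t I = CallIdx + 1;
  while (I < Block.size() && Block[I].isNoop)
    ++I;                      // skip bitcasts / zero GEPs between the two
  return I == RetainIdx;
}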
@@ -990,14 +1240,14 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { if (Call->getParent() == RetainRV->getParent()) { BasicBlock::const_iterator I = Call; ++I; - while (isNoopInstruction(I)) ++I; + while (IsNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; } else if (const InvokeInst *II = dyn_cast<InvokeInst>(Call)) { BasicBlock *RetainRVParent = RetainRV->getParent(); if (II->getNormalDest() == RetainRVParent) { BasicBlock::const_iterator I = RetainRVParent->begin(); - while (isNoopInstruction(I)) ++I; + while (IsNoopInstruction(I)) ++I; if (&*I == RetainRV) return false; } @@ -1008,15 +1258,14 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { // pointer. In this case, we can delete the pair. BasicBlock::iterator I = RetainRV, Begin = RetainRV->getParent()->begin(); if (I != Begin) { - do --I; while (I != Begin && isNoopInstruction(I)); + do --I; while (I != Begin && IsNoopInstruction(I)); if (GetBasicInstructionClass(I) == IC_AutoreleaseRV && GetObjCArg(I) == Arg) { Changed = true; ++NumPeeps; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Erasing " << *I << "\n" - << " Erasing " << *RetainRV - << "\n"); + DEBUG(dbgs() << "Erasing autoreleaseRV,retainRV pair: " << *I << "\n" + << "Erasing " << *RetainRV << "\n"); EraseInstruction(I); EraseInstruction(RetainRV); @@ -1028,16 +1277,13 @@ ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) { Changed = true; ++NumPeeps; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeRetainRVCall: Transforming " - "objc_retainAutoreleasedReturnValue => " + DEBUG(dbgs() << "Transforming objc_retainAutoreleasedReturnValue => " "objc_retain since the operand is not a return value.\n" - " Old: " - << *RetainRV << "\n"); + "Old = " << *RetainRV << "\n"); cast<CallInst>(RetainRV)->setCalledFunction(getRetainCallee(F.getParent())); - DEBUG(dbgs() << " New: " - << *RetainRV << "\n"); + DEBUG(dbgs() << "New = " << *RetainRV << "\n"); return false; } @@ -1066,12 +1312,10 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, Changed = true; ++NumPeeps; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeAutoreleaseRVCall: Transforming " - "objc_autoreleaseReturnValue => " + DEBUG(dbgs() << "Transforming objc_autoreleaseReturnValue => " "objc_autorelease since its operand is not used as a return " "value.\n" - " Old: " - << *AutoreleaseRV << "\n"); + "Old = " << *AutoreleaseRV << "\n"); CallInst *AutoreleaseRVCI = cast<CallInst>(AutoreleaseRV); AutoreleaseRVCI-> @@ -1079,14 +1323,48 @@ ObjCARCOpt::OptimizeAutoreleaseRVCall(Function &F, Instruction *AutoreleaseRV, AutoreleaseRVCI->setTailCall(false); // Never tail call objc_autorelease. Class = IC_Autorelease; - DEBUG(dbgs() << " New: " - << *AutoreleaseRV << "\n"); + DEBUG(dbgs() << "New: " << *AutoreleaseRV << "\n"); + +} + +// \brief Attempt to strength reduce objc_retainBlock calls to objc_retain +// calls. +// +// Specifically: If an objc_retainBlock call has the copy_on_escape metadata and +// does not escape (following the rules of block escaping), strength reduce the +// objc_retainBlock to an objc_retain. +// +// TODO: If an objc_retainBlock call is dominated period by a previous +// objc_retainBlock call, strength reduce the objc_retainBlock to an +// objc_retain. +bool +ObjCARCOpt::OptimizeRetainBlockCall(Function &F, Instruction *Inst, + InstructionClass &Class) { + assert(GetBasicInstructionClass(Inst) == Class); + assert(IC_RetainBlock == Class); + + // If we can not optimize Inst, return false. 
+ if (!IsRetainBlockOptimizable(Inst)) + return false; + Changed = true; + ++NumPeeps; + + DEBUG(dbgs() << "Strength reduced retainBlock => retain.\n"); + DEBUG(dbgs() << "Old: " << *Inst << "\n"); + CallInst *RetainBlock = cast<CallInst>(Inst); + RetainBlock->setCalledFunction(getRetainCallee(F.getParent())); + // Remove copy_on_escape metadata. + RetainBlock->setMetadata(CopyOnEscapeMDKind, 0); + Class = IC_Retain; + DEBUG(dbgs() << "New: " << *Inst << "\n"); + return true; } /// Visit each call, one at a time, and make simplifications without doing any /// additional analysis. void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { + DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeIndividualCalls ==\n"); // Reset all the flags in preparation for recomputing them. UsedInThisFunction = 0; @@ -1096,8 +1374,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { InstructionClass Class = GetBasicInstructionClass(Inst); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Visiting: Class: " - << Class << "; " << *Inst << "\n"); + DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n"); switch (Class) { default: break; @@ -1113,8 +1390,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_NoopCast: Changed = true; ++NumNoops; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Erasing no-op cast:" - " " << *Inst << "\n"); + DEBUG(dbgs() << "Erasing no-op cast: " << *Inst << "\n"); EraseInstruction(Inst); continue; @@ -1125,18 +1401,15 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_InitWeak: case IC_DestroyWeak: { CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(0))) { + if (IsNullOrUndef(CI->getArgOperand(0))) { Changed = true; Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), Constant::getNullValue(Ty), CI); llvm::Value *NewValue = UndefValue::get(CI->getType()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " - "pointer-to-weak-pointer is undefined behavior.\n" - " Old = " << *CI << - "\n New = " << - *NewValue << "\n"); + DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " << *CI << "\nNew = " << *NewValue << "\n"); CI->replaceAllUsesWith(NewValue); CI->eraseFromParent(); continue; @@ -1146,8 +1419,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_CopyWeak: case IC_MoveWeak: { CallInst *CI = cast<CallInst>(Inst); - if (isNullOrUndef(CI->getArgOperand(0)) || - isNullOrUndef(CI->getArgOperand(1))) { + if (IsNullOrUndef(CI->getArgOperand(0)) || + IsNullOrUndef(CI->getArgOperand(1))) { Changed = true; Type *Ty = CI->getArgOperand(0)->getType(); new StoreInst(UndefValue::get(cast<PointerType>(Ty)->getElementType()), @@ -1155,11 +1428,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CI); llvm::Value *NewValue = UndefValue::get(CI->getType()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: A null " - "pointer-to-weak-pointer is undefined behavior.\n" - " Old = " << *CI << - "\n New = " << - *NewValue << "\n"); + DEBUG(dbgs() << "A null pointer-to-weak-pointer is undefined behavior." + "\nOld = " << *CI << "\nNew = " << *NewValue << "\n"); CI->replaceAllUsesWith(NewValue); CI->eraseFromParent(); @@ -1167,8 +1437,14 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } break; } + case IC_RetainBlock: + // If we strength reduce an objc_retainBlock to an objc_retain, continue + // onto the objc_retain peephole optimizations. Otherwise break. 
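// The strength reduction above comes down to a small gate: an objc_retainBlock
// may be rewritten to a plain objc_retain only when it carries the
// copy_on_escape marker and its block pointer provably does not escape. A
// standalone sketch of that decision; the struct fields are hypothetical
// stand-ins for the pass's metadata and escape queries:
struct RetainBlockCallSketch {
  bool hasCopyOnEscapeMD;  // models Inst->getMetadata(CopyOnEscapeMDKind)
  bool pointerEscapes;     // models an escape analysis of the block pointer
  bool loweredToRetain;    // set once the call has been rewritten
};

// Returns true and marks the call rewritten only when both conditions hold;
// the real code swaps the callee to objc_retain and drops the metadata.
bool tryStrengthReduceRetainBlock(RetainBlockCallSketch &Call) {
  if (!Call.hasCopyOnEscapeMD || Call.pointerEscapes)
    return false;
  Call.loweredToRetain = true;
  Call.hasCopyOnEscapeMD = false;
  return true;
}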
+ if (!OptimizeRetainBlockCall(F, Inst, Class)) + break; + // FALLTHROUGH case IC_Retain: - OptimizeRetainCall(F, Inst); + ++NumRetainsBeforeOpt; break; case IC_RetainRV: if (OptimizeRetainRVCall(F, Inst)) @@ -1177,6 +1453,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { case IC_AutoreleaseRV: OptimizeAutoreleaseRVCall(F, Inst, Class); break; + case IC_Release: + ++NumReleasesBeforeOpt; + break; } // objc_autorelease(x) -> objc_release(x) if x is otherwise unused. @@ -1193,15 +1472,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { CallInst *NewCall = CallInst::Create(getReleaseCallee(F.getParent()), Call->getArgOperand(0), "", Call); - NewCall->setMetadata(ImpreciseReleaseMDKind, - MDNode::get(C, ArrayRef<Value *>())); + NewCall->setMetadata(ImpreciseReleaseMDKind, MDNode::get(C, None)); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Replacing " - "objc_autorelease(x) with objc_release(x) since x is " - "otherwise unused.\n" - " Old: " << *Call << - "\n New: " << - *NewCall << "\n"); + DEBUG(dbgs() << "Replacing autorelease{,RV}(x) with objc_release(x) " + "since x is otherwise unused.\nOld: " << *Call << "\nNew: " + << *NewCall << "\n"); EraseInstruction(Call); Inst = NewCall; @@ -1213,9 +1488,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // a tail keyword. if (IsAlwaysTail(Class)) { Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Adding tail keyword" - " to function since it can never be passed stack args: " << *Inst << - "\n"); + DEBUG(dbgs() << "Adding tail keyword to function since it can never be " + "passed stack args: " << *Inst << "\n"); cast<CallInst>(Inst)->setTailCall(); } @@ -1223,8 +1497,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // semantics of ARC truly do not do so. if (IsNeverTail(Class)) { Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Removing tail " - "keyword from function: " << *Inst << + DEBUG(dbgs() << "Removing tail keyword from function: " << *Inst << "\n"); cast<CallInst>(Inst)->setTailCall(false); } @@ -1232,8 +1505,8 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { // Set nounwind as needed. if (IsNoThrow(Class)) { Changed = true; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Found no throw" - " class. Setting nounwind on: " << *Inst << "\n"); + DEBUG(dbgs() << "Found no throw class. Setting nounwind on: " << *Inst + << "\n"); cast<CallInst>(Inst)->setDoesNotThrow(); } @@ -1245,11 +1518,11 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { const Value *Arg = GetObjCArg(Inst); // ARC calls with null are no-ops. Delete them. - if (isNullOrUndef(Arg)) { + if (IsNullOrUndef(Arg)) { Changed = true; ++NumNoops; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: ARC calls with " - " null are no-ops. Erasing: " << *Inst << "\n"); + DEBUG(dbgs() << "ARC calls with null are no-ops. 
Erasing: " << *Inst + << "\n"); EraseInstruction(Inst); continue; } @@ -1280,7 +1553,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); - if (isNullOrUndef(Incoming)) + if (IsNullOrUndef(Incoming)) HasNull = true; else if (cast<TerminatorInst>(PN->getIncomingBlock(i)->back()) .getNumSuccessors() != 1) { @@ -1334,7 +1607,7 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) { Value *Incoming = StripPointerCastsAndObjCCalls(PN->getIncomingValue(i)); - if (!isNullOrUndef(Incoming)) { + if (!IsNullOrUndef(Incoming)) { CallInst *Clone = cast<CallInst>(CInst->clone()); Value *Op = PN->getIncomingValue(i); Instruction *InsertPos = &PN->getIncomingBlock(i)->back(); @@ -1343,10 +1616,9 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { Clone->setArgOperand(0, Op); Clone->insertBefore(InsertPos); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Cloning " + DEBUG(dbgs() << "Cloning " << *CInst << "\n" - " And inserting " - "clone at " << *InsertPos << "\n"); + "And inserting clone at " << *InsertPos << "\n"); Worklist.push_back(std::make_pair(Clone, Incoming)); } } @@ -1358,7 +1630,65 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) { } } while (!Worklist.empty()); } - DEBUG(dbgs() << "ObjCARCOpt::OptimizeIndividualCalls: Finished List.\n"); +} + +/// If we have a top down pointer in the S_Use state, make sure that there are +/// no CFG hazards by checking the states of various bottom up pointers. +static void CheckForUseCFGHazard(const Sequence SuccSSeq, + const bool SuccSRRIKnownSafe, + PtrState &S, + bool &SomeSuccHasSame, + bool &AllSuccsHaveSame, + bool &ShouldContinue) { + switch (SuccSSeq) { + case S_CanRelease: { + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { + S.ClearSequenceProgress(); + break; + } + ShouldContinue = true; + break; + } + case S_Use: + SomeSuccHasSame = true; + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + AllSuccsHaveSame = false; + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + case S_None: + llvm_unreachable("This should have been handled earlier."); + } +} + +/// If we have a Top Down pointer in the S_CanRelease state, make sure that +/// there are no CFG hazards by checking the states of various bottom up +/// pointers. +static void CheckForCanReleaseCFGHazard(const Sequence SuccSSeq, + const bool SuccSRRIKnownSafe, + PtrState &S, + bool &SomeSuccHasSame, + bool &AllSuccsHaveSame) { + switch (SuccSSeq) { + case S_CanRelease: + SomeSuccHasSame = true; + break; + case S_Stop: + case S_Release: + case S_MovableRelease: + case S_Use: + if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) + AllSuccsHaveSame = false; + break; + case S_Retain: + llvm_unreachable("bottom-up pointer in retain state!"); + case S_None: + llvm_unreachable("This should have been handled earlier."); + } } /// Check for critical edges, loop boundaries, irreducible control flow, or @@ -1371,106 +1701,82 @@ ObjCARCOpt::CheckForCFGHazards(const BasicBlock *BB, // If any top-down local-use or possible-dec has a succ which is earlier in // the sequence, forget it. 
for (BBState::ptr_iterator I = MyStates.top_down_ptr_begin(), - E = MyStates.top_down_ptr_end(); I != E; ++I) - switch (I->second.GetSeq()) { - default: break; - case S_Use: { - const Value *Arg = I->first; - const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - bool SomeSuccHasSame = false; - bool AllSuccsHaveSame = true; - PtrState &S = I->second; - succ_const_iterator SI(TI), SE(TI, false); - - for (; SI != SE; ++SI) { - Sequence SuccSSeq = S_None; - bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has pointer information for this successor, take - // what we know about it. - DenseMap<const BasicBlock *, BBState>::iterator BBI = - BBStates.find(*SI); - assert(BBI != BBStates.end()); - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - switch (SuccSSeq) { - case S_None: - case S_CanRelease: { - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { - S.ClearSequenceProgress(); - break; - } - continue; - } - case S_Use: - SomeSuccHasSame = true; - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) - AllSuccsHaveSame = false; - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } - } - // If the state at the other end of any of the successor edges - // matches the current state, require all edges to match. This - // guards against loops in the middle of a sequence. - if (SomeSuccHasSame && !AllSuccsHaveSame) + E = MyStates.top_down_ptr_end(); I != E; ++I) { + PtrState &S = I->second; + const Sequence Seq = I->second.GetSeq(); + + // We only care about S_Retain, S_CanRelease, and S_Use. + if (Seq == S_None) + continue; + + // Make sure that if extra top down states are added in the future that this + // code is updated to handle it. + assert((Seq == S_Retain || Seq == S_CanRelease || Seq == S_Use) && + "Unknown top down sequence state."); + + const Value *Arg = I->first; + const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); + bool SomeSuccHasSame = false; + bool AllSuccsHaveSame = true; + + succ_const_iterator SI(TI), SE(TI, false); + + for (; SI != SE; ++SI) { + // If VisitBottomUp has pointer information for this successor, take + // what we know about it. + const DenseMap<const BasicBlock *, BBState>::iterator BBI = + BBStates.find(*SI); + assert(BBI != BBStates.end()); + const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); + const Sequence SuccSSeq = SuccS.GetSeq(); + + // If bottom up, the pointer is in an S_None state, clear the sequence + // progress since the sequence in the bottom up state finished + // suggesting a mismatch in between retains/releases. This is true for + // all three cases that we are handling here: S_Retain, S_Use, and + // S_CanRelease. + if (SuccSSeq == S_None) { S.ClearSequenceProgress(); - break; - } - case S_CanRelease: { - const Value *Arg = I->first; - const TerminatorInst *TI = cast<TerminatorInst>(&BB->back()); - bool SomeSuccHasSame = false; - bool AllSuccsHaveSame = true; - PtrState &S = I->second; - succ_const_iterator SI(TI), SE(TI, false); - - for (; SI != SE; ++SI) { - Sequence SuccSSeq = S_None; - bool SuccSRRIKnownSafe = false; - // If VisitBottomUp has pointer information for this successor, take - // what we know about it. 
- DenseMap<const BasicBlock *, BBState>::iterator BBI = - BBStates.find(*SI); - assert(BBI != BBStates.end()); - const PtrState &SuccS = BBI->second.getPtrBottomUpState(Arg); - SuccSSeq = SuccS.GetSeq(); - SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; - switch (SuccSSeq) { - case S_None: { - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) { - S.ClearSequenceProgress(); - break; - } + continue; + } + + // If we have S_Use or S_CanRelease, perform our check for cfg hazard + // checks. + const bool SuccSRRIKnownSafe = SuccS.RRI.KnownSafe; + + // *NOTE* We do not use Seq from above here since we are allowing for + // S.GetSeq() to change while we are visiting basic blocks. + switch(S.GetSeq()) { + case S_Use: { + bool ShouldContinue = false; + CheckForUseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, S, + SomeSuccHasSame, AllSuccsHaveSame, + ShouldContinue); + if (ShouldContinue) continue; - } - case S_CanRelease: - SomeSuccHasSame = true; - break; - case S_Stop: - case S_Release: - case S_MovableRelease: - case S_Use: - if (!S.RRI.KnownSafe && !SuccSRRIKnownSafe) - AllSuccsHaveSame = false; - break; - case S_Retain: - llvm_unreachable("bottom-up pointer in retain state!"); - } + break; + } + case S_CanRelease: { + CheckForCanReleaseCFGHazard(SuccSSeq, SuccSRRIKnownSafe, + S, SomeSuccHasSame, + AllSuccsHaveSame); + break; + } + case S_Retain: + case S_None: + case S_Stop: + case S_Release: + case S_MovableRelease: + break; } - // If the state at the other end of any of the successor edges - // matches the current state, require all edges to match. This - // guards against loops in the middle of a sequence. - if (SomeSuccHasSame && !AllSuccsHaveSame) - S.ClearSequenceProgress(); - break; - } } + + // If the state at the other end of any of the successor edges + // matches the current state, require all edges to match. This + // guards against loops in the middle of a sequence. + if (SomeSuccHasSame && !AllSuccsHaveSame) + S.ClearSequenceProgress(); + } } bool @@ -1482,6 +1788,8 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, InstructionClass Class = GetInstructionClass(Inst); const Value *Arg = 0; + DEBUG(dbgs() << "Class: " << Class << "\n"); + switch (Class) { case IC_Release: { Arg = GetObjCArg(Inst); @@ -1496,27 +1804,26 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // pairs by making PtrState hold a stack of states, but this is // simple and avoids adding overhead for the non-nested case. if (S.GetSeq() == S_Release || S.GetSeq() == S_MovableRelease) { - DEBUG(dbgs() << "ObjCARCOpt::VisitInstructionBottomUp: Found nested " - "releases (i.e. a release pair)\n"); + DEBUG(dbgs() << "Found nested releases (i.e. a release pair)\n"); NestingDetected = true; } MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); - S.ResetSequenceProgress(ReleaseMetadata ? S_MovableRelease : S_Release); + Sequence NewSeq = ReleaseMetadata ? S_MovableRelease : S_Release; + ANNOTATE_BOTTOMUP(Inst, Arg, S.GetSeq(), NewSeq); + S.ResetSequenceProgress(NewSeq); S.RRI.ReleaseMetadata = ReleaseMetadata; - S.RRI.KnownSafe = S.IsKnownIncremented(); + S.RRI.KnownSafe = S.HasKnownPositiveRefCount(); S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); S.RRI.Calls.insert(Inst); - S.SetKnownPositiveRefCount(); break; } case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. 
- if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH + // In OptimizeIndividualCalls, we have strength reduced all optimizable + // objc_retainBlocks to objc_retains. Thus at this point any + // objc_retainBlocks that we see are not optimizable. + break; case IC_Retain: case IC_RetainRV: { Arg = GetObjCArg(Inst); @@ -1524,20 +1831,22 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, PtrState &S = MyStates.getPtrBottomUpState(Arg); S.SetKnownPositiveRefCount(); - switch (S.GetSeq()) { + Sequence OldSeq = S.GetSeq(); + switch (OldSeq) { case S_Stop: case S_Release: case S_MovableRelease: case S_Use: - S.RRI.ReverseInsertPts.clear(); + // If OldSeq is not S_Use or OldSeq is S_Use and we are tracking an + // imprecise release, clear our reverse insertion points. + if (OldSeq != S_Use || S.RRI.IsTrackingImpreciseReleases()) + S.RRI.ReverseInsertPts.clear(); // FALL THROUGH case S_CanRelease: // Don't do retain+release tracking for IC_RetainRV, because it's // better to let it remain as the first instruction after a call. - if (Class != IC_RetainRV) { - S.RRI.IsRetainBlock = Class == IC_RetainBlock; + if (Class != IC_RetainRV) Retains[Inst] = S.RRI; - } S.ClearSequenceProgress(); break; case S_None: @@ -1545,7 +1854,9 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, case S_Retain: llvm_unreachable("bottom-up pointer in retain state!"); } - return NestingDetected; + ANNOTATE_BOTTOMUP(Inst, Arg, OldSeq, S.GetSeq()); + // A retain moving bottom up can be a use. + break; } case IC_AutoreleasepoolPop: // Conservatively, clear MyStates for all known pointers. @@ -1571,10 +1882,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.ClearRefCount(); + DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr + << "\n"); + S.ClearKnownPositiveRefCount(); switch (Seq) { case S_Use: S.SetSeq(S_CanRelease); + ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S.GetSeq()); continue; case S_CanRelease: case S_Release: @@ -1592,6 +1906,8 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, case S_Release: case S_MovableRelease: if (CanUse(Inst, Ptr, PA, Class)) { + DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr + << "\n"); assert(S.RRI.ReverseInsertPts.empty()); // If this is an invoke instruction, we're scanning it as part of // one of its successor blocks, since we can't insert code after it @@ -1601,10 +1917,13 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, else S.RRI.ReverseInsertPts.insert(llvm::next(BasicBlock::iterator(Inst))); S.SetSeq(S_Use); - } else if (Seq == S_Release && - (Class == IC_User || Class == IC_CallOrUser)) { + ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use); + } else if (Seq == S_Release && IsUser(Class)) { + DEBUG(dbgs() << "PreciseReleaseUse: Seq: " << Seq << "; " << *Ptr + << "\n"); // Non-movable releases depend on any possible objc pointer use. S.SetSeq(S_Stop); + ANNOTATE_BOTTOMUP(Inst, Ptr, S_Release, S_Stop); assert(S.RRI.ReverseInsertPts.empty()); // As above; handle invoke specially. 
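// Both branches above record a "reverse insertion point": the place where a
// retain would have to be re-inserted if the pair is later moved. For an
// ordinary instruction that is simply the next instruction; an invoke is a
// block terminator, so nothing can be inserted after it and the point is
// taken at the top of the successor block being scanned instead. A standalone
// sketch of that choice, with index-based stand-ins for LLVM's iterators:
#include <cstddef>

struct BlockSketch { std::size_t firstInsertionPt; };
struct InstSketch  { bool isInvoke; std::size_t indexInBlock; };

// Where code would be re-inserted when a retain is moved past Inst.
std::size_t reverseInsertPoint(const InstSketch &Inst,
                               const BlockSketch &SuccBeingScanned) {
  if (Inst.isInvoke)
    return SuccBeingScanned.firstInsertionPt; // cannot insert after a terminator
  return Inst.indexInBlock + 1;               // immediately after the instruction
}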
if (isa<InvokeInst>(Inst)) @@ -1614,8 +1933,12 @@ ObjCARCOpt::VisitInstructionBottomUp(Instruction *Inst, } break; case S_Stop: - if (CanUse(Inst, Ptr, PA, Class)) + if (CanUse(Inst, Ptr, PA, Class)) { + DEBUG(dbgs() << "PreciseStopUse: Seq: " << Seq << "; " << *Ptr + << "\n"); S.SetSeq(S_Use); + ANNOTATE_BOTTOMUP(Inst, Ptr, Seq, S_Use); + } break; case S_CanRelease: case S_Use: @@ -1633,6 +1956,9 @@ bool ObjCARCOpt::VisitBottomUp(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, MapVector<Value *, RRInfo> &Retains) { + + DEBUG(dbgs() << "\n== ObjCARCOpt::VisitBottomUp ==\n"); + bool NestingDetected = false; BBState &MyStates = BBStates[BB]; @@ -1654,6 +1980,10 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, } } + // If ARC Annotations are enabled, output the current state of pointers at the + // bottom of the basic block. + ANNOTATE_BOTTOMUP_BBEND(MyStates, BB); + // Visit all the instructions, bottom-up. for (BasicBlock::iterator I = BB->end(), E = BB->begin(); I != E; --I) { Instruction *Inst = llvm::prior(I); @@ -1662,7 +1992,7 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, if (isa<InvokeInst>(Inst)) continue; - DEBUG(dbgs() << "ObjCARCOpt::VisitButtonUp: Visiting " << *Inst << "\n"); + DEBUG(dbgs() << "Visiting " << *Inst << "\n"); NestingDetected |= VisitInstructionBottomUp(Inst, BB, Retains, MyStates); } @@ -1677,6 +2007,10 @@ ObjCARCOpt::VisitBottomUp(BasicBlock *BB, NestingDetected |= VisitInstructionBottomUp(II, BB, Retains, MyStates); } + // If ARC Annotations are enabled, output the current state of pointers at the + // top of the basic block. + ANNOTATE_BOTTOMUP_BBSTART(MyStates, BB); + return NestingDetected; } @@ -1690,11 +2024,10 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, switch (Class) { case IC_RetainBlock: - // An objc_retainBlock call with just a use may need to be kept, - // because it may be copying a block from the stack to the heap. - if (!IsRetainBlockOptimizable(Inst)) - break; - // FALLTHROUGH + // In OptimizeIndividualCalls, we have strength reduced all optimizable + // objc_retainBlocks to objc_retains. Thus at this point any + // objc_retainBlocks that we see are not optimizable. 
+ break; case IC_Retain: case IC_RetainRV: { Arg = GetObjCArg(Inst); @@ -1714,9 +2047,9 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, if (S.GetSeq() == S_Retain) NestingDetected = true; + ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_Retain); S.ResetSequenceProgress(S_Retain); - S.RRI.IsRetainBlock = Class == IC_RetainBlock; - S.RRI.KnownSafe = S.IsKnownIncremented(); + S.RRI.KnownSafe = S.HasKnownPositiveRefCount(); S.RRI.Calls.insert(Inst); } @@ -1730,17 +2063,23 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, Arg = GetObjCArg(Inst); PtrState &S = MyStates.getPtrTopDownState(Arg); - S.ClearRefCount(); + S.ClearKnownPositiveRefCount(); + + Sequence OldSeq = S.GetSeq(); - switch (S.GetSeq()) { + MDNode *ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + + switch (OldSeq) { case S_Retain: case S_CanRelease: - S.RRI.ReverseInsertPts.clear(); + if (OldSeq == S_Retain || ReleaseMetadata != 0) + S.RRI.ReverseInsertPts.clear(); // FALL THROUGH case S_Use: - S.RRI.ReleaseMetadata = Inst->getMetadata(ImpreciseReleaseMDKind); + S.RRI.ReleaseMetadata = ReleaseMetadata; S.RRI.IsTailCallRelease = cast<CallInst>(Inst)->isTailCall(); Releases[Inst] = S.RRI; + ANNOTATE_TOPDOWN(Inst, Arg, S.GetSeq(), S_None); S.ClearSequenceProgress(); break; case S_None: @@ -1776,10 +2115,13 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, // Check for possible releases. if (CanAlterRefCount(Inst, Ptr, PA, Class)) { - S.ClearRefCount(); + DEBUG(dbgs() << "CanAlterRefCount: Seq: " << Seq << "; " << *Ptr + << "\n"); + S.ClearKnownPositiveRefCount(); switch (Seq) { case S_Retain: S.SetSeq(S_CanRelease); + ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_CanRelease); assert(S.RRI.ReverseInsertPts.empty()); S.RRI.ReverseInsertPts.insert(Inst); @@ -1801,8 +2143,12 @@ ObjCARCOpt::VisitInstructionTopDown(Instruction *Inst, // Check for possible direct uses. switch (Seq) { case S_CanRelease: - if (CanUse(Inst, Ptr, PA, Class)) + if (CanUse(Inst, Ptr, PA, Class)) { + DEBUG(dbgs() << "CanUse: Seq: " << Seq << "; " << *Ptr + << "\n"); S.SetSeq(S_Use); + ANNOTATE_TOPDOWN(Inst, Ptr, Seq, S_Use); + } break; case S_Retain: case S_Use: @@ -1822,6 +2168,7 @@ bool ObjCARCOpt::VisitTopDown(BasicBlock *BB, DenseMap<const BasicBlock *, BBState> &BBStates, DenseMap<Value *, RRInfo> &Releases) { + DEBUG(dbgs() << "\n== ObjCARCOpt::VisitTopDown ==\n"); bool NestingDetected = false; BBState &MyStates = BBStates[BB]; @@ -1843,15 +2190,26 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB, } } + // If ARC Annotations are enabled, output the current state of pointers at the + // top of the basic block. + ANNOTATE_TOPDOWN_BBSTART(MyStates, BB); + // Visit all the instructions, top-down. for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { Instruction *Inst = I; - DEBUG(dbgs() << "ObjCARCOpt::VisitTopDown: Visiting " << *Inst << "\n"); + DEBUG(dbgs() << "Visiting " << *Inst << "\n"); NestingDetected |= VisitInstructionTopDown(Inst, Releases, MyStates); } + // If ARC Annotations are enabled, output the current state of pointers at the + // bottom of the basic block. 
+ ANNOTATE_TOPDOWN_BBEND(MyStates, BB); + +#ifdef ARC_ANNOTATIONS + if (!(EnableARCAnnotations && DisableCheckForCFGHazards)) +#endif CheckForCFGHazards(BB, BBStates, MyStates); return NestingDetected; } @@ -1983,6 +2341,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Type *ArgTy = Arg->getType(); Type *ParamTy = PointerType::getUnqual(Type::getInt8Ty(ArgTy->getContext())); + DEBUG(dbgs() << "== ObjCARCOpt::MoveCalls ==\n"); + // Insert the new retain and release calls. for (SmallPtrSet<Instruction *, 2>::const_iterator PI = ReleasesToMove.ReverseInsertPts.begin(), @@ -1991,20 +2351,12 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Value *MyArg = ArgTy == ParamTy ? Arg : new BitCastInst(Arg, ParamTy, "", InsertPt); CallInst *Call = - CallInst::Create(RetainsToMove.IsRetainBlock ? - getRetainBlockCallee(M) : getRetainCallee(M), - MyArg, "", InsertPt); + CallInst::Create(getRetainCallee(M), MyArg, "", InsertPt); Call->setDoesNotThrow(); - if (RetainsToMove.IsRetainBlock) - Call->setMetadata(CopyOnEscapeMDKind, - MDNode::get(M->getContext(), ArrayRef<Value *>())); - else - Call->setTailCall(); + Call->setTailCall(); - DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Release: " << *Call - << "\n" - " At insertion point: " << *InsertPt - << "\n"); + DEBUG(dbgs() << "Inserting new Retain: " << *Call << "\n" + "At insertion point: " << *InsertPt << "\n"); } for (SmallPtrSet<Instruction *, 2>::const_iterator PI = RetainsToMove.ReverseInsertPts.begin(), @@ -2021,10 +2373,8 @@ void ObjCARCOpt::MoveCalls(Value *Arg, if (ReleasesToMove.IsTailCallRelease) Call->setTailCall(); - DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Inserting new Retain: " << *Call - << "\n" - " At insertion point: " << *InsertPt - << "\n"); + DEBUG(dbgs() << "Inserting new Release: " << *Call << "\n" + "At insertion point: " << *InsertPt << "\n"); } // Delete the original retain and release calls. @@ -2034,8 +2384,7 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *OrigRetain = *AI; Retains.blot(OrigRetain); DeadInsts.push_back(OrigRetain); - DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting retain: " << *OrigRetain << - "\n"); + DEBUG(dbgs() << "Deleting retain: " << *OrigRetain << "\n"); } for (SmallPtrSet<Instruction *, 2>::const_iterator AI = ReleasesToMove.Calls.begin(), @@ -2043,9 +2392,9 @@ void ObjCARCOpt::MoveCalls(Value *Arg, Instruction *OrigRelease = *AI; Releases.erase(OrigRelease); DeadInsts.push_back(OrigRelease); - DEBUG(dbgs() << "ObjCARCOpt::MoveCalls: Deleting release: " << *OrigRelease - << "\n"); + DEBUG(dbgs() << "Deleting release: " << *OrigRelease << "\n"); } + } bool @@ -2075,7 +2424,6 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> unsigned OldCount = 0; unsigned NewCount = 0; bool FirstRelease = true; - bool FirstRetain = true; for (;;) { for (SmallVectorImpl<Instruction *>::const_iterator NI = NewRetains.begin(), NE = NewRetains.end(); NI != NE; ++NI) { @@ -2156,16 +2504,6 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> OldDelta += PathCount; OldCount += PathCount; - // Merge the IsRetainBlock values. - if (FirstRetain) { - RetainsToMove.IsRetainBlock = NewReleaseRetainRRI.IsRetainBlock; - FirstRetain = false; - } else if (ReleasesToMove.IsRetainBlock != - NewReleaseRetainRRI.IsRetainBlock) - // It's not possible to merge the sequences if one uses - // objc_retain and the other uses objc_retainBlock. - return false; - // Collect the optimal insertion points. 
if (!KnownSafe) for (SmallPtrSet<Instruction *, 2>::const_iterator @@ -2210,6 +2548,12 @@ ObjCARCOpt::ConnectTDBUTraversals(DenseMap<const BasicBlock *, BBState> if (OldDelta != 0) return false; +#ifdef ARC_ANNOTATIONS + // Do not move calls if ARC annotations are requested. + if (EnableARCAnnotations) + return false; +#endif // ARC_ANNOTATIONS + Changed = true; assert(OldCount != 0 && "Unreachable code?"); NumRRs += OldCount - NewCount; @@ -2228,6 +2572,8 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> MapVector<Value *, RRInfo> &Retains, DenseMap<Value *, RRInfo> &Releases, Module *M) { + DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n"); + bool AnyPairsCompletelyEliminated = false; RRInfo RetainsToMove; RRInfo ReleasesToMove; @@ -2243,8 +2589,7 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> Instruction *Retain = cast<Instruction>(V); - DEBUG(dbgs() << "ObjCARCOpt::PerformCodePlacement: Visiting: " << *Retain - << "\n"); + DEBUG(dbgs() << "Visiting: " << *Retain << "\n"); Value *Arg = GetObjCArg(Retain); @@ -2295,14 +2640,15 @@ ObjCARCOpt::PerformCodePlacement(DenseMap<const BasicBlock *, BBState> /// Weak pointer optimizations. void ObjCARCOpt::OptimizeWeakCalls(Function &F) { + DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeWeakCalls ==\n"); + // First, do memdep-style RLE and S2L optimizations. We can't use memdep // itself because it uses AliasAnalysis and we need to do provenance // queries instead. for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { Instruction *Inst = &*I++; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Visiting: " << *Inst << - "\n"); + DEBUG(dbgs() << "Visiting: " << *Inst << "\n"); InstructionClass Class = GetBasicInstructionClass(Inst); if (Class != IC_LoadWeak && Class != IC_LoadWeakRetained) @@ -2392,6 +2738,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { goto clobbered; case IC_AutoreleasepoolPush: case IC_None: + case IC_IntrinsicUser: case IC_User: // Weak pointers are only modified through the weak entry points // (and arbitrary calls, which could call the weak entry points). @@ -2449,9 +2796,6 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) { done:; } } - - DEBUG(dbgs() << "ObjCARCOpt::OptimizeWeakCalls: Finished List.\n\n"); - } /// Identify program paths which execute sequences of retains and releases which @@ -2476,6 +2820,88 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { NestingDetected; } +/// Check if there is a dependent call earlier that does not have anything in +/// between the Retain and the call that can affect the reference count of their +/// shared pointer argument. Note that Retain need not be in BB. +static bool +HasSafePathToPredecessorCall(const Value *Arg, Instruction *Retain, + SmallPtrSet<Instruction *, 4> &DepInsts, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA) { + FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, + DepInsts, Visited, PA); + if (DepInsts.size() != 1) + return false; + + CallInst *Call = + dyn_cast_or_null<CallInst>(*DepInsts.begin()); + + // Check that the pointer is the return value of the call. + if (!Call || Arg != Call) + return false; + + // Check that the call is a regular call. 
+ InstructionClass Class = GetBasicInstructionClass(Call); + if (Class != IC_CallOrUser && Class != IC_Call) + return false; + + return true; +} + +/// Find a dependent retain that precedes the given autorelease for which there +/// is nothing in between the two instructions that can affect the ref count of +/// Arg. +static CallInst * +FindPredecessorRetainWithSafePath(const Value *Arg, BasicBlock *BB, + Instruction *Autorelease, + SmallPtrSet<Instruction *, 4> &DepInsts, + SmallPtrSet<const BasicBlock *, 4> &Visited, + ProvenanceAnalysis &PA) { + FindDependencies(CanChangeRetainCount, Arg, + BB, Autorelease, DepInsts, Visited, PA); + if (DepInsts.size() != 1) + return 0; + + CallInst *Retain = + dyn_cast_or_null<CallInst>(*DepInsts.begin()); + + // Check that we found a retain with the same argument. + if (!Retain || + !IsRetain(GetBasicInstructionClass(Retain)) || + GetObjCArg(Retain) != Arg) { + return 0; + } + + return Retain; +} + +/// Look for an ``autorelease'' instruction dependent on Arg such that there are +/// no instructions dependent on Arg that need a positive ref count in between +/// the autorelease and the ret. +static CallInst * +FindPredecessorAutoreleaseWithSafePath(const Value *Arg, BasicBlock *BB, + ReturnInst *Ret, + SmallPtrSet<Instruction *, 4> &DepInsts, + SmallPtrSet<const BasicBlock *, 4> &V, + ProvenanceAnalysis &PA) { + FindDependencies(NeedsPositiveRetainCount, Arg, + BB, Ret, DepInsts, V, PA); + if (DepInsts.size() != 1) + return 0; + + CallInst *Autorelease = + dyn_cast_or_null<CallInst>(*DepInsts.begin()); + if (!Autorelease) + return 0; + InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); + if (!IsAutorelease(AutoreleaseClass)) + return 0; + if (GetObjCArg(Autorelease) != Arg) + return 0; + + return Autorelease; +} + /// Look for this pattern: /// \code /// %call = call i8* @something(...) @@ -2484,122 +2910,91 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) { /// ret i8* %3 /// \endcode /// And delete the retain and autorelease. -/// -/// Otherwise if it's just this: -/// \code -/// %3 = call i8* @objc_autorelease(i8* %2) -/// ret i8* %3 -/// \endcode -/// convert the autorelease to autoreleaseRV. void ObjCARCOpt::OptimizeReturns(Function &F) { if (!F.getReturnType()->isPointerTy()) return; + DEBUG(dbgs() << "\n== ObjCARCOpt::OptimizeReturns ==\n"); + SmallPtrSet<Instruction *, 4> DependingInstructions; SmallPtrSet<const BasicBlock *, 4> Visited; for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { BasicBlock *BB = FI; ReturnInst *Ret = dyn_cast<ReturnInst>(&BB->back()); - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Visiting: " << *Ret << "\n"); + DEBUG(dbgs() << "Visiting: " << *Ret << "\n"); - if (!Ret) continue; + if (!Ret) + continue; const Value *Arg = StripPointerCastsAndObjCCalls(Ret->getOperand(0)); - FindDependencies(NeedsPositiveRetainCount, Arg, - BB, Ret, DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Autorelease = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - if (!Autorelease) - goto next_block; - InstructionClass AutoreleaseClass = GetBasicInstructionClass(Autorelease); - if (!IsAutorelease(AutoreleaseClass)) - goto next_block; - if (GetObjCArg(Autorelease) != Arg) - goto next_block; - - DependingInstructions.clear(); - Visited.clear(); - - // Check that there is nothing that can affect the reference - // count between the autorelease and the retain. 
- FindDependencies(CanChangeRetainCount, Arg, - BB, Autorelease, DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - - { - CallInst *Retain = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); - - // Check that we found a retain with the same argument. - if (!Retain || - !IsRetain(GetBasicInstructionClass(Retain)) || - GetObjCArg(Retain) != Arg) - goto next_block; - - DependingInstructions.clear(); - Visited.clear(); - - // Convert the autorelease to an autoreleaseRV, since it's - // returning the value. - if (AutoreleaseClass == IC_Autorelease) { - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Converting autorelease " - "=> autoreleaseRV since it's returning a value.\n" - " In: " << *Autorelease - << "\n"); - Autorelease->setCalledFunction(getAutoreleaseRVCallee(F.getParent())); - DEBUG(dbgs() << " Out: " << *Autorelease - << "\n"); - Autorelease->setTailCall(); // Always tail call autoreleaseRV. - AutoreleaseClass = IC_AutoreleaseRV; - } - - // Check that there is nothing that can affect the reference - // count between the retain and the call. - // Note that Retain need not be in BB. - FindDependencies(CanChangeRetainCount, Arg, Retain->getParent(), Retain, - DependingInstructions, Visited, PA); - if (DependingInstructions.size() != 1) - goto next_block; - { - CallInst *Call = - dyn_cast_or_null<CallInst>(*DependingInstructions.begin()); + // Look for an ``autorelease'' instruction that is a predecessor of Ret and + // dependent on Arg such that there are no instructions dependent on Arg + // that need a positive ref count in between the autorelease and Ret. + CallInst *Autorelease = + FindPredecessorAutoreleaseWithSafePath(Arg, BB, Ret, + DependingInstructions, Visited, + PA); + DependingInstructions.clear(); + Visited.clear(); - // Check that the pointer is the return value of the call. - if (!Call || Arg != Call) - goto next_block; + if (!Autorelease) + continue; - // Check that the call is a regular call. - InstructionClass Class = GetBasicInstructionClass(Call); - if (Class != IC_CallOrUser && Class != IC_Call) - goto next_block; + CallInst *Retain = + FindPredecessorRetainWithSafePath(Arg, BB, Autorelease, + DependingInstructions, Visited, PA); + DependingInstructions.clear(); + Visited.clear(); - // If so, we can zap the retain and autorelease. - Changed = true; - ++NumRets; - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Erasing: " << *Retain - << "\n Erasing: " - << *Autorelease << "\n"); - EraseInstruction(Retain); - EraseInstruction(Autorelease); - } - } - } + if (!Retain) + continue; - next_block: + // Check that there is nothing that can affect the reference count + // between the retain and the call. Note that Retain need not be in BB. + bool HasSafePathToCall = HasSafePathToPredecessorCall(Arg, Retain, + DependingInstructions, + Visited, PA); DependingInstructions.clear(); Visited.clear(); + + if (!HasSafePathToCall) + continue; + + // If so, we can zap the retain and autorelease. + Changed = true; + ++NumRets; + DEBUG(dbgs() << "Erasing: " << *Retain << "\nErasing: " + << *Autorelease << "\n"); + EraseInstruction(Retain); + EraseInstruction(Autorelease); } +} - DEBUG(dbgs() << "ObjCARCOpt::OptimizeReturns: Finished List.\n\n"); +#ifndef NDEBUG +void +ObjCARCOpt::GatherStatistics(Function &F, bool AfterOptimization) { + llvm::Statistic &NumRetains = + AfterOptimization? NumRetainsAfterOpt : NumRetainsBeforeOpt; + llvm::Statistic &NumReleases = + AfterOptimization? 
NumReleasesAfterOpt : NumReleasesBeforeOpt; + for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E; ) { + Instruction *Inst = &*I++; + switch (GetBasicInstructionClass(Inst)) { + default: + break; + case IC_Retain: + ++NumRetains; + break; + case IC_Release: + ++NumReleases; + break; + } + } } +#endif bool ObjCARCOpt::doInitialization(Module &M) { if (!EnableARCOpts) @@ -2617,13 +3012,20 @@ bool ObjCARCOpt::doInitialization(Module &M) { M.getContext().getMDKindID("clang.arc.copy_on_escape"); NoObjCARCExceptionsMDKind = M.getContext().getMDKindID("clang.arc.no_objc_arc_exceptions"); +#ifdef ARC_ANNOTATIONS + ARCAnnotationBottomUpMDKind = + M.getContext().getMDKindID("llvm.arc.annotation.bottomup"); + ARCAnnotationTopDownMDKind = + M.getContext().getMDKindID("llvm.arc.annotation.topdown"); + ARCAnnotationProvenanceSourceMDKind = + M.getContext().getMDKindID("llvm.arc.annotation.provenancesource"); +#endif // ARC_ANNOTATIONS // Intuitively, objc_retain and others are nocapture, however in practice // they are not, because they return their argument value. And objc_release // calls finalizers which can have arbitrary side effects. // These are initialized lazily. - RetainRVCallee = 0; AutoreleaseRVCallee = 0; ReleaseCallee = 0; RetainCallee = 0; @@ -2643,7 +3045,8 @@ bool ObjCARCOpt::runOnFunction(Function &F) { Changed = false; - DEBUG(dbgs() << "ObjCARCOpt: Visiting Function: " << F.getName() << "\n"); + DEBUG(dbgs() << "<<< ObjCARCOpt: Visiting Function: " << F.getName() << " >>>" + "\n"); PA.setAA(&getAnalysis<AliasAnalysis>()); @@ -2651,7 +3054,7 @@ bool ObjCARCOpt::runOnFunction(Function &F) { // when compiling code that isn't ObjC, skip these if the relevant ObjC // library functions aren't declared. - // Preliminary optimizations. This also computs UsedInThisFunction. + // Preliminary optimizations. This also computes UsedInThisFunction. OptimizeIndividualCalls(F); // Optimizations for weak pointers. @@ -2678,6 +3081,13 @@ bool ObjCARCOpt::runOnFunction(Function &F) { (1 << IC_AutoreleaseRV))) OptimizeReturns(F); + // Gather statistics after optimization. +#ifndef NDEBUG + if (AreStatisticsEnabled()) { + GatherStatistics(F, true); + } +#endif + DEBUG(dbgs() << "\n"); return Changed; diff --git a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp index a841c64a9f..03e12d4fd7 100644 --- a/lib/Transforms/ObjCARC/ObjCARCUtil.cpp +++ b/lib/Transforms/ObjCARC/ObjCARCUtil.cpp @@ -72,6 +72,8 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, return OS << "IC_Call"; case IC_User: return OS << "IC_User"; + case IC_IntrinsicUser: + return OS << "IC_IntrinsicUser"; case IC_None: return OS << "IC_None"; } @@ -81,10 +83,11 @@ raw_ostream &llvm::objcarc::operator<<(raw_ostream &OS, InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end(); - // No arguments. + // No (mandatory) arguments. if (AI == AE) return StringSwitch<InstructionClass>(F->getName()) .Case("objc_autoreleasePoolPush", IC_AutoreleasepoolPush) + .Case("clang.arc.use", IC_IntrinsicUser) .Default(IC_CallOrUser); // One argument. @@ -142,6 +145,14 @@ InstructionClass llvm::objcarc::GetFunctionClass(const Function *F) { return StringSwitch<InstructionClass>(F->getName()) .Case("objc_moveWeak", IC_MoveWeak) .Case("objc_copyWeak", IC_CopyWeak) + // Ignore annotation calls. 
This is important to stop the + // optimizer from treating annotations as uses which would + // make the state of the pointers they are attempting to + // elucidate to be incorrect. + .Case("llvm.arc.annotation.topdown.bbstart", IC_None) + .Case("llvm.arc.annotation.topdown.bbend", IC_None) + .Case("llvm.arc.annotation.bottomup.bbstart", IC_None) + .Case("llvm.arc.annotation.bottomup.bbend", IC_None) .Default(IC_CallOrUser); } diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp index d71dd5dec6..f0d29c88a8 100644 --- a/lib/Transforms/Scalar/CodeGenPrepare.cpp +++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" +#include "llvm/ADT/ValueMap.h" #include "llvm/Analysis/DominatorInternals.h" #include "llvm/Analysis/Dominators.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -88,7 +89,7 @@ namespace { /// Keeps track of non-local addresses that have been sunk into a block. /// This allows us to avoid inserting duplicate code for blocks with /// multiple load/stores of the same address. - DenseMap<Value*, Value*> SunkAddrs; + ValueMap<Value*, Value*> SunkAddrs; /// ModifiedDT - If CFG is modified in anyway, dominator tree may need to /// be updated. @@ -154,7 +155,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { /// This optimization identifies DIV instructions that can be /// profitably bypassed and carried out with a shorter, faster divide. - if (TLI && TLI->isSlowDivBypassed()) { + if (!OptSize && TLI && TLI->isSlowDivBypassed()) { const DenseMap<unsigned int, unsigned int> &BypassWidths = TLI->getBypassSlowDivWidths(); for (Function::iterator I = F.begin(); I != F.end(); I++) @@ -1653,10 +1654,6 @@ bool CodeGenPrepare::OptimizeMemoryInst(Instruction *MemoryInst, Value *Addr, // start of the block. CurInstIterator = BB->begin(); SunkAddrs.clear(); - } else { - // This address is now available for reassignment, so erase the table - // entry; we don't want to match some completely different instruction. - SunkAddrs[Addr] = 0; } } ++NumMemoryInsts; @@ -1761,7 +1758,7 @@ bool CodeGenPrepare::OptimizeExtUses(Instruction *I) { if (!DefIsLiveOut) return false; - // Make sure non of the uses are PHI nodes. + // Make sure none of the uses are PHI nodes. for (Value::use_iterator UI = Src->use_begin(), E = Src->use_end(); UI != E; ++UI) { Instruction *User = cast<Instruction>(*UI); diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp index c04b447f1c..f350b9bbde 100644 --- a/lib/Transforms/Scalar/GVN.cpp +++ b/lib/Transforms/Scalar/GVN.cpp @@ -498,6 +498,75 @@ void ValueTable::verifyRemoved(const Value *V) const { //===----------------------------------------------------------------------===// namespace { + class GVN; + struct AvailableValueInBlock { + /// BB - The basic block in question. + BasicBlock *BB; + enum ValType { + SimpleVal, // A simple offsetted value that is accessed. + LoadVal, // A value produced by a load. + MemIntrin // A memory intrinsic which is loaded from. + }; + + /// V - The value that is live out of the block. + PointerIntPair<Value *, 2, ValType> Val; + + /// Offset - The byte offset in Val that is interesting for the load query. 
+ unsigned Offset; + + static AvailableValueInBlock get(BasicBlock *BB, Value *V, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(V); + Res.Val.setInt(SimpleVal); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(MI); + Res.Val.setInt(MemIntrin); + Res.Offset = Offset; + return Res; + } + + static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, + unsigned Offset = 0) { + AvailableValueInBlock Res; + Res.BB = BB; + Res.Val.setPointer(LI); + Res.Val.setInt(LoadVal); + Res.Offset = Offset; + return Res; + } + + bool isSimpleValue() const { return Val.getInt() == SimpleVal; } + bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } + bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } + + Value *getSimpleValue() const { + assert(isSimpleValue() && "Wrong accessor"); + return Val.getPointer(); + } + + LoadInst *getCoercedLoadValue() const { + assert(isCoercedLoadValue() && "Wrong accessor"); + return cast<LoadInst>(Val.getPointer()); + } + + MemIntrinsic *getMemIntrinValue() const { + assert(isMemIntrinValue() && "Wrong accessor"); + return cast<MemIntrinsic>(Val.getPointer()); + } + + /// MaterializeAdjustedValue - Emit code into this block to adjust the value + /// defined here to the specified type. This handles various coercion cases. + Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const; + }; class GVN : public FunctionPass { bool NoLoads; @@ -519,6 +588,11 @@ namespace { BumpPtrAllocator TableAllocator; SmallVector<Instruction*, 8> InstrsToErase; + + typedef SmallVector<NonLocalDepResult, 64> LoadDepVect; + typedef SmallVector<AvailableValueInBlock, 64> AvailValInBlkVect; + typedef SmallVector<BasicBlock*, 64> UnavailBlkVect; + public: static char ID; // Pass identification, replacement for typeid explicit GVN(bool noloads = false) @@ -599,11 +673,17 @@ namespace { } - // Helper fuctions - // FIXME: eliminate or document these better + // Helper fuctions of redundant load elimination bool processLoad(LoadInst *L); - bool processInstruction(Instruction *I); bool processNonLocalLoad(LoadInst *L); + void AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + bool PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks); + + // Other helper routines + bool processInstruction(Instruction *I); bool processBlock(BasicBlock *BB); void dump(DenseMap<uint32_t, Value*> &d); bool iterateOnFunction(Function &F); @@ -1159,114 +1239,6 @@ static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset, return ConstantFoldLoadFromConstPtr(Src, &TD); } -namespace { - -struct AvailableValueInBlock { - /// BB - The basic block in question. - BasicBlock *BB; - enum ValType { - SimpleVal, // A simple offsetted value that is accessed. - LoadVal, // A value produced by a load. - MemIntrin // A memory intrinsic which is loaded from. - }; - - /// V - The value that is live out of the block. - PointerIntPair<Value *, 2, ValType> Val; - - /// Offset - The byte offset in Val that is interesting for the load query. 
- unsigned Offset; - - static AvailableValueInBlock get(BasicBlock *BB, Value *V, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(V); - Res.Val.setInt(SimpleVal); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getMI(BasicBlock *BB, MemIntrinsic *MI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(MI); - Res.Val.setInt(MemIntrin); - Res.Offset = Offset; - return Res; - } - - static AvailableValueInBlock getLoad(BasicBlock *BB, LoadInst *LI, - unsigned Offset = 0) { - AvailableValueInBlock Res; - Res.BB = BB; - Res.Val.setPointer(LI); - Res.Val.setInt(LoadVal); - Res.Offset = Offset; - return Res; - } - - bool isSimpleValue() const { return Val.getInt() == SimpleVal; } - bool isCoercedLoadValue() const { return Val.getInt() == LoadVal; } - bool isMemIntrinValue() const { return Val.getInt() == MemIntrin; } - - Value *getSimpleValue() const { - assert(isSimpleValue() && "Wrong accessor"); - return Val.getPointer(); - } - - LoadInst *getCoercedLoadValue() const { - assert(isCoercedLoadValue() && "Wrong accessor"); - return cast<LoadInst>(Val.getPointer()); - } - - MemIntrinsic *getMemIntrinValue() const { - assert(isMemIntrinValue() && "Wrong accessor"); - return cast<MemIntrinsic>(Val.getPointer()); - } - - /// MaterializeAdjustedValue - Emit code into this block to adjust the value - /// defined here to the specified type. This handles various coercion cases. - Value *MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { - Value *Res; - if (isSimpleValue()) { - Res = getSimpleValue(); - if (Res->getType() != LoadTy) { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), - *TD); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " - << *getSimpleValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else if (isCoercedLoadValue()) { - LoadInst *Load = getCoercedLoadValue(); - if (Load->getType() == LoadTy && Offset == 0) { - Res = Load; - } else { - Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), - gvn); - - DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " - << *getCoercedLoadValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - } else { - const DataLayout *TD = gvn.getDataLayout(); - assert(TD && "Need target data to handle type mismatch case"); - Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, - LoadTy, BB->getTerminator(), *TD); - DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset - << " " << *getMemIntrinValue() << '\n' - << *Res << '\n' << "\n\n\n"); - } - return Res; - } -}; - -} // end anonymous namespace /// ConstructSSAForLoadSet - Given a set of loads specified by ValuesPerBlock, /// construct SSA form, allowing us to eliminate LI. 
This returns the value @@ -1323,48 +1295,59 @@ static Value *ConstructSSAForLoadSet(LoadInst *LI, return V; } +Value *AvailableValueInBlock::MaterializeAdjustedValue(Type *LoadTy, GVN &gvn) const { + Value *Res; + if (isSimpleValue()) { + Res = getSimpleValue(); + if (Res->getType() != LoadTy) { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetStoreValueForLoad(Res, Offset, LoadTy, BB->getTerminator(), + *TD); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << " " + << *getSimpleValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else if (isCoercedLoadValue()) { + LoadInst *Load = getCoercedLoadValue(); + if (Load->getType() == LoadTy && Offset == 0) { + Res = Load; + } else { + Res = GetLoadValueForLoad(Load, Offset, LoadTy, BB->getTerminator(), + gvn); + + DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << " " + << *getCoercedLoadValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + } else { + const DataLayout *TD = gvn.getDataLayout(); + assert(TD && "Need target data to handle type mismatch case"); + Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, + LoadTy, BB->getTerminator(), *TD); + DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset + << " " << *getMemIntrinValue() << '\n' + << *Res << '\n' << "\n\n\n"); + } + return Res; +} + static bool isLifetimeStart(const Instruction *Inst) { if (const IntrinsicInst* II = dyn_cast<IntrinsicInst>(Inst)) return II->getIntrinsicID() == Intrinsic::lifetime_start; return false; } -/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are -/// non-local by performing PHI construction. -bool GVN::processNonLocalLoad(LoadInst *LI) { - // Find the non-local dependencies of the load. - SmallVector<NonLocalDepResult, 64> Deps; - AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); - MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); - //DEBUG(dbgs() << "INVESTIGATING NONLOCAL LOAD: " - // << Deps.size() << *LI << '\n'); - - // If we had to process more than one hundred blocks to find the - // dependencies, this load isn't worth worrying about. Optimizing - // it will be too expensive. - unsigned NumDeps = Deps.size(); - if (NumDeps > 100) - return false; - - // If we had a phi translation failure, we'll have a single entry which is a - // clobber in the current block. Reject this early. - if (NumDeps == 1 && - !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { - DEBUG( - dbgs() << "GVN: non-local load "; - WriteAsOperand(dbgs(), LI); - dbgs() << " has unknown dependencies\n"; - ); - return false; - } +void GVN::AnalyzeLoadAvailability(LoadInst *LI, LoadDepVect &Deps, + AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Filter out useless results (non-locals, etc). Keep track of the blocks // where we have a value available in repl, also keep track of whether we see // dependencies that produce an unknown value for the load (such as a call // that could potentially clobber the load). 
- SmallVector<AvailableValueInBlock, 64> ValuesPerBlock; - SmallVector<BasicBlock*, 64> UnavailableBlocks; - + unsigned NumDeps = Deps.size(); for (unsigned i = 0, e = NumDeps; i != e; ++i) { BasicBlock *DepBB = Deps[i].getBB(); MemDepResult DepInfo = Deps[i].getResult(); @@ -1480,35 +1463,11 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { } UnavailableBlocks.push_back(DepBB); - continue; } +} - // If we have no predecessors that produce a known value for this load, exit - // early. - if (ValuesPerBlock.empty()) return false; - - // If all of the instructions we depend on produce a known value for this - // load, then it is fully redundant and we can use PHI insertion to compute - // its value. Insert PHIs and remove the fully redundant value now. - if (UnavailableBlocks.empty()) { - DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); - - // Perform PHI construction. - Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); - LI->replaceAllUsesWith(V); - - if (isa<PHINode>(V)) - V->takeName(LI); - if (V->getType()->getScalarType()->isPointerTy()) - MD->invalidateCachedPointerInfo(V); - markInstructionForDeletion(LI); - ++NumGVNLoad; - return true; - } - - if (!EnablePRE || !EnableLoadPRE) - return false; - +bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock, + UnavailBlkVect &UnavailableBlocks) { // Okay, we have *some* definitions of the value. This means that the value // is available in some of our (transitive) predecessors. Lets think about // doing PRE of this load. This will involve inserting a new load into the @@ -1526,7 +1485,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { BasicBlock *LoadBB = LI->getParent(); BasicBlock *TmpBB = LoadBB; - bool allSingleSucc = true; while (TmpBB->getSinglePredecessor()) { TmpBB = TmpBB->getSinglePredecessor(); if (TmpBB == LoadBB) // Infinite (unreachable) loop. @@ -1615,13 +1573,8 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { // pointer if it is not available. PHITransAddr Address(LI->getPointerOperand(), TD); Value *LoadPtr = 0; - if (allSingleSucc) { - LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, - *DT, NewInsts); - } else { - Address.PHITranslateValue(LoadBB, UnavailablePred, DT); - LoadPtr = Address.getAddr(); - } + LoadPtr = Address.PHITranslateWithInsertion(LoadBB, UnavailablePred, + *DT, NewInsts); // If we couldn't find or insert a computation of this phi translated value, // we fail PRE. @@ -1632,24 +1585,6 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { break; } - // Make sure it is valid to move this load here. We have to watch out for: - // @1 = getelementptr (i8* p, ... - // test p and branch if == 0 - // load @1 - // It is valid to have the getelementptr before the test, even if p can - // be 0, as getelementptr only does address arithmetic. - // If we are not pushing the value through any multiple-successor blocks - // we do not have this case. Otherwise, check that the load is safe to - // put anywhere; this can be improved, but should be conservatively safe. - if (!allSingleSucc && - // FIXME: REEVALUTE THIS. - !isSafeToLoadUnconditionally(LoadPtr, - UnavailablePred->getTerminator(), - LI->getAlignment(), TD)) { - CanDoPRE = false; - break; - } - I->second = LoadPtr; } @@ -1714,7 +1649,73 @@ bool GVN::processNonLocalLoad(LoadInst *LI) { return true; } -static void patchReplacementInstruction(Value *Repl, Instruction *I) { +/// processNonLocalLoad - Attempt to eliminate a load whose dependencies are +/// non-local by performing PHI construction. 
+bool GVN::processNonLocalLoad(LoadInst *LI) { + // Step 1: Find the non-local dependencies of the load. + LoadDepVect Deps; + AliasAnalysis::Location Loc = VN.getAliasAnalysis()->getLocation(LI); + MD->getNonLocalPointerDependency(Loc, true, LI->getParent(), Deps); + + // If we had to process more than one hundred blocks to find the + // dependencies, this load isn't worth worrying about. Optimizing + // it will be too expensive. + unsigned NumDeps = Deps.size(); + if (NumDeps > 100) + return false; + + // If we had a phi translation failure, we'll have a single entry which is a + // clobber in the current block. Reject this early. + if (NumDeps == 1 && + !Deps[0].getResult().isDef() && !Deps[0].getResult().isClobber()) { + DEBUG( + dbgs() << "GVN: non-local load "; + WriteAsOperand(dbgs(), LI); + dbgs() << " has unknown dependencies\n"; + ); + return false; + } + + // Step 2: Analyze the availability of the load + AvailValInBlkVect ValuesPerBlock; + UnavailBlkVect UnavailableBlocks; + AnalyzeLoadAvailability(LI, Deps, ValuesPerBlock, UnavailableBlocks); + + // If we have no predecessors that produce a known value for this load, exit + // early. + if (ValuesPerBlock.empty()) + return false; + + // Step 3: Eliminate fully redundancy. + // + // If all of the instructions we depend on produce a known value for this + // load, then it is fully redundant and we can use PHI insertion to compute + // its value. Insert PHIs and remove the fully redundant value now. + if (UnavailableBlocks.empty()) { + DEBUG(dbgs() << "GVN REMOVING NONLOCAL LOAD: " << *LI << '\n'); + + // Perform PHI construction. + Value *V = ConstructSSAForLoadSet(LI, ValuesPerBlock, *this); + LI->replaceAllUsesWith(V); + + if (isa<PHINode>(V)) + V->takeName(LI); + if (V->getType()->getScalarType()->isPointerTy()) + MD->invalidateCachedPointerInfo(V); + markInstructionForDeletion(LI); + ++NumGVNLoad; + return true; + } + + // Step 4: Eliminate partial redundancy. + if (!EnablePRE || !EnableLoadPRE) + return false; + + return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks); +} + + +static void patchReplacementInstruction(Instruction *I, Value *Repl) { // Patch the replacement so that it is not more restrictive than the value // being replaced. BinaryOperator *Op = dyn_cast<BinaryOperator>(I); @@ -1756,8 +1757,8 @@ static void patchReplacementInstruction(Value *Repl, Instruction *I) { } } -static void patchAndReplaceAllUsesWith(Value *Repl, Instruction *I) { - patchReplacementInstruction(Repl, I); +static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) { + patchReplacementInstruction(I, Repl); I->replaceAllUsesWith(Repl); } @@ -1919,7 +1920,7 @@ bool GVN::processLoad(LoadInst *L) { } // Remove it! - patchAndReplaceAllUsesWith(AvailableVal, L); + patchAndReplaceAllUsesWith(L, AvailableVal); if (DepLI->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(DepLI); markInstructionForDeletion(L); @@ -2260,7 +2261,7 @@ bool GVN::processInstruction(Instruction *I) { } // Remove it! 
- patchAndReplaceAllUsesWith(repl, I); + patchAndReplaceAllUsesWith(I, repl); if (MD && repl->getType()->getScalarType()->isPointerTy()) MD->invalidateCachedPointerInfo(repl); markInstructionForDeletion(I); diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp index 1601a8d646..4796eb2953 100644 --- a/lib/Transforms/Scalar/GlobalMerge.cpp +++ b/lib/Transforms/Scalar/GlobalMerge.cpp @@ -53,6 +53,7 @@ #define DEBUG_TYPE "global-merge" #include "llvm/Transforms/Scalar.h" +#include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/Statistic.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" @@ -64,10 +65,16 @@ #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Module.h" #include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Target/TargetLowering.h" #include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; +static cl::opt<bool> +EnableGlobalMergeOnConst("global-merge-on-const", cl::Hidden, + cl::desc("Enable global merge pass on constants"), + cl::init(false)); + STATISTIC(NumMerged , "Number of globals merged"); namespace { class GlobalMerge : public FunctionPass { @@ -78,6 +85,23 @@ namespace { bool doMerge(SmallVectorImpl<GlobalVariable*> &Globals, Module &M, bool isConst, unsigned AddrSpace) const; + /// \brief Check if the given variable has been identified as must keep + /// \pre setMustKeepGlobalVariables must have been called on the Module that + /// contains GV + bool isMustKeepGlobalVariable(const GlobalVariable *GV) const { + return MustKeepGlobalVariables.count(GV); + } + + /// Collect every variables marked as "used" or used in a landing pad + /// instruction for this Module. + void setMustKeepGlobalVariables(Module &M); + + /// Collect every variables marked as "used" + void collectUsedGlobalVariables(Module &M); + + /// Keep track of the GlobalVariable that must not be merged away + SmallPtrSet<const GlobalVariable *, 16> MustKeepGlobalVariables; + public: static char ID; // Pass identification, replacement for typeid. explicit GlobalMerge(const TargetLowering *tli = 0) @@ -87,6 +111,7 @@ namespace { virtual bool doInitialization(Module &M); virtual bool runOnFunction(Function &F); + virtual bool doFinalization(Module &M); const char *getPassName() const { return "Merge internal globals"; @@ -169,6 +194,42 @@ bool GlobalMerge::doMerge(SmallVectorImpl<GlobalVariable*> &Globals, return true; } +void GlobalMerge::collectUsedGlobalVariables(Module &M) { + // Extract global variables from llvm.used array + const GlobalVariable *GV = M.getGlobalVariable("llvm.used"); + if (!GV || !GV->hasInitializer()) return; + + // Should be an array of 'i8*'. 
+ const ConstantArray *InitList = cast<ConstantArray>(GV->getInitializer()); + + for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) + if (const GlobalVariable *G = + dyn_cast<GlobalVariable>(InitList->getOperand(i)->stripPointerCasts())) + MustKeepGlobalVariables.insert(G); +} + +void GlobalMerge::setMustKeepGlobalVariables(Module &M) { + collectUsedGlobalVariables(M); + + for (Module::iterator IFn = M.begin(), IEndFn = M.end(); IFn != IEndFn; + ++IFn) { + for (Function::iterator IBB = IFn->begin(), IEndBB = IFn->end(); + IBB != IEndBB; ++IBB) { + // Follow the inwoke link to find the landing pad instruction + const InvokeInst *II = dyn_cast<InvokeInst>(IBB->getTerminator()); + if (!II) continue; + + const LandingPadInst *LPInst = II->getUnwindDest()->getLandingPadInst(); + // Look for globals in the clauses of the landing pad instruction + for (unsigned Idx = 0, NumClauses = LPInst->getNumClauses(); + Idx != NumClauses; ++Idx) + if (const GlobalVariable *GV = + dyn_cast<GlobalVariable>(LPInst->getClause(Idx) + ->stripPointerCasts())) + MustKeepGlobalVariables.insert(GV); + } + } +} bool GlobalMerge::doInitialization(Module &M) { DenseMap<unsigned, SmallVector<GlobalVariable*, 16> > Globals, ConstGlobals, @@ -176,6 +237,7 @@ bool GlobalMerge::doInitialization(Module &M) { const DataLayout *TD = TLI->getDataLayout(); unsigned MaxOffset = TLI->getMaximalGlobalOffset(); bool Changed = false; + setMustKeepGlobalVariables(M); // Grab all non-const globals. for (Module::global_iterator I = M.global_begin(), @@ -200,6 +262,10 @@ bool GlobalMerge::doInitialization(Module &M) { I->getName().startswith(".llvm.")) continue; + // Ignore all "required" globals: + if (isMustKeepGlobalVariable(I)) + continue; + if (TD->getTypeAllocSize(Ty) < MaxOffset) { if (TargetLoweringObjectFile::getKindForGlobal(I, TLI->getTargetMachine()) .isBSSLocal()) @@ -221,11 +287,11 @@ bool GlobalMerge::doInitialization(Module &M) { if (I->second.size() > 1) Changed |= doMerge(I->second, M, false, I->first); - // FIXME: This currently breaks the EH processing due to way how the - // typeinfo detection works. We might want to detect the TIs and ignore - // them in the future. - // if (ConstGlobals.size() > 1) - // Changed |= doMerge(ConstGlobals, M, true); + if (EnableGlobalMergeOnConst) + for (DenseMap<unsigned, SmallVector<GlobalVariable*, 16> >::iterator + I = ConstGlobals.begin(), E = ConstGlobals.end(); I != E; ++I) + if (I->second.size() > 1) + Changed |= doMerge(I->second, M, true, I->first); return Changed; } @@ -234,6 +300,11 @@ bool GlobalMerge::runOnFunction(Function &F) { return false; } +bool GlobalMerge::doFinalization(Module &M) { + MustKeepGlobalVariables.clear(); + return false; +} + Pass *llvm::createGlobalMergePass(const TargetLowering *tli) { return new GlobalMerge(tli); } diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp index 97fff7e782..8e76c78f5a 100644 --- a/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -535,6 +535,45 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) { if (!SE->isLoopInvariant(ExitValue, L)) continue; + // Computing the value outside of the loop brings no benefit if : + // - it is definitely used inside the loop in a way which can not be + // optimized away. 
+ // - no use outside of the loop can take advantage of hoisting the + // computation out of the loop + if (ExitValue->getSCEVType()>=scMulExpr) { + unsigned NumHardInternalUses = 0; + unsigned NumSoftExternalUses = 0; + unsigned NumUses = 0; + for (Value::use_iterator IB=Inst->use_begin(), IE=Inst->use_end(); + IB!=IE && NumUses<=6 ; ++IB) { + Instruction *UseInstr = cast<Instruction>(*IB); + unsigned Opc = UseInstr->getOpcode(); + NumUses++; + if (L->contains(UseInstr)) { + if (Opc == Instruction::Call || Opc == Instruction::Ret) + NumHardInternalUses++; + } else { + if (Opc == Instruction::PHI) { + // Do not count the Phi as a use. LCSSA may have inserted + // plenty of trivial ones. + NumUses--; + for (Value::use_iterator PB=UseInstr->use_begin(), + PE=UseInstr->use_end(); + PB!=PE && NumUses<=6 ; ++PB, ++NumUses) { + unsigned PhiOpc = cast<Instruction>(*PB)->getOpcode(); + if (PhiOpc != Instruction::Call && PhiOpc != Instruction::Ret) + NumSoftExternalUses++; + } + continue; + } + if (Opc != Instruction::Call && Opc != Instruction::Ret) + NumSoftExternalUses++; + } + } + if (NumUses <= 6 && NumHardInternalUses && !NumSoftExternalUses) + continue; + } + Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst); DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal << '\n' diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp index 9c67e327e2..0b62050b17 100644 --- a/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/lib/Transforms/Scalar/LoopDeletion.cpp @@ -34,13 +34,9 @@ namespace { } // Possibly eliminate loop L if it is dead. - bool runOnLoop(Loop* L, LPPassManager& LPM); + bool runOnLoop(Loop *L, LPPassManager &LPM); - bool IsLoopDead(Loop* L, SmallVector<BasicBlock*, 4>& exitingBlocks, - SmallVector<BasicBlock*, 4>& exitBlocks, - bool &Changed, BasicBlock *Preheader); - - virtual void getAnalysisUsage(AnalysisUsage& AU) const { + virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<DominatorTree>(); AU.addRequired<LoopInfo>(); AU.addRequired<ScalarEvolution>(); @@ -53,6 +49,12 @@ namespace { AU.addPreservedID(LoopSimplifyID); AU.addPreservedID(LCSSAID); } + + private: + bool isLoopDead(Loop *L, SmallVector<BasicBlock*, 4> &exitingBlocks, + SmallVector<BasicBlock*, 4> &exitBlocks, + bool &Changed, BasicBlock *Preheader); + }; } @@ -67,18 +69,18 @@ INITIALIZE_PASS_DEPENDENCY(LCSSA) INITIALIZE_PASS_END(LoopDeletion, "loop-deletion", "Delete dead loops", false, false) -Pass* llvm::createLoopDeletionPass() { +Pass *llvm::createLoopDeletionPass() { return new LoopDeletion(); } -/// IsLoopDead - Determined if a loop is dead. This assumes that we've already +/// isLoopDead - Determined if a loop is dead. This assumes that we've already /// checked for unique exit and exiting blocks, and that the code is in LCSSA /// form. -bool LoopDeletion::IsLoopDead(Loop* L, - SmallVector<BasicBlock*, 4>& exitingBlocks, - SmallVector<BasicBlock*, 4>& exitBlocks, +bool LoopDeletion::isLoopDead(Loop *L, + SmallVector<BasicBlock*, 4> &exitingBlocks, + SmallVector<BasicBlock*, 4> &exitBlocks, bool &Changed, BasicBlock *Preheader) { - BasicBlock* exitBlock = exitBlocks[0]; + BasicBlock *exitBlock = exitBlocks[0]; // Make sure that all PHI entries coming from the loop are loop invariant. // Because the code is in LCSSA form, any values used outside of the loop @@ -86,19 +88,19 @@ bool LoopDeletion::IsLoopDead(Loop* L, // sufficient to guarantee that no loop-variant values are used outside // of the loop. 
BasicBlock::iterator BI = exitBlock->begin(); - while (PHINode* P = dyn_cast<PHINode>(BI)) { - Value* incoming = P->getIncomingValueForBlock(exitingBlocks[0]); + while (PHINode *P = dyn_cast<PHINode>(BI)) { + Value *incoming = P->getIncomingValueForBlock(exitingBlocks[0]); // Make sure all exiting blocks produce the same incoming value for the exit // block. If there are different incoming values for different exiting // blocks, then it is impossible to statically determine which value should // be used. - for (unsigned i = 1; i < exitingBlocks.size(); ++i) { + for (unsigned i = 1, e = exitingBlocks.size(); i < e; ++i) { if (incoming != P->getIncomingValueForBlock(exitingBlocks[i])) return false; } - if (Instruction* I = dyn_cast<Instruction>(incoming)) + if (Instruction *I = dyn_cast<Instruction>(incoming)) if (!L->makeLoopInvariant(I, Changed, Preheader->getTerminator())) return false; @@ -127,10 +129,10 @@ bool LoopDeletion::IsLoopDead(Loop* L, /// so could change the halting/non-halting nature of a program. /// NOTE: This entire process relies pretty heavily on LoopSimplify and LCSSA /// in order to make various safety checks work. -bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { +bool LoopDeletion::runOnLoop(Loop *L, LPPassManager &LPM) { // We can only remove the loop if there is a preheader that we can // branch from after removing it. - BasicBlock* preheader = L->getLoopPreheader(); + BasicBlock *preheader = L->getLoopPreheader(); if (!preheader) return false; @@ -158,19 +160,19 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // Finally, we have to check that the loop really is dead. bool Changed = false; - if (!IsLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) + if (!isLoopDead(L, exitingBlocks, exitBlocks, Changed, preheader)) return Changed; // Don't remove loops for which we can't solve the trip count. // They could be infinite, in which case we'd be changing program behavior. - ScalarEvolution& SE = getAnalysis<ScalarEvolution>(); + ScalarEvolution &SE = getAnalysis<ScalarEvolution>(); const SCEV *S = SE.getMaxBackedgeTakenCount(L); if (isa<SCEVCouldNotCompute>(S)) return Changed; // Now that we know the removal is safe, remove the loop by changing the // branch from the preheader to go to the single exit block. - BasicBlock* exitBlock = exitBlocks[0]; + BasicBlock *exitBlock = exitBlocks[0]; // Because we're deleting a large chunk of code at once, the sequence in which // we remove things is very important to avoid invalidation issues. Don't @@ -182,14 +184,14 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { SE.forgetLoop(L); // Connect the preheader directly to the exit block. - TerminatorInst* TI = preheader->getTerminator(); + TerminatorInst *TI = preheader->getTerminator(); TI->replaceUsesOfWith(L->getHeader(), exitBlock); // Rewrite phis in the exit block to get their inputs from // the preheader instead of the exiting block. - BasicBlock* exitingBlock = exitingBlocks[0]; + BasicBlock *exitingBlock = exitingBlocks[0]; BasicBlock::iterator BI = exitBlock->begin(); - while (PHINode* P = dyn_cast<PHINode>(BI)) { + while (PHINode *P = dyn_cast<PHINode>(BI)) { int j = P->getBasicBlockIndex(exitingBlock); assert(j >= 0 && "Can't find exiting block in exit block's phi node!"); P->setIncomingBlock(j, preheader); @@ -200,7 +202,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // Update the dominator tree and remove the instructions and blocks that will // be deleted from the reference counting scheme. 
- DominatorTree& DT = getAnalysis<DominatorTree>(); + DominatorTree &DT = getAnalysis<DominatorTree>(); SmallVector<DomTreeNode*, 8> ChildNodes; for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end(); LI != LE; ++LI) { @@ -230,7 +232,7 @@ bool LoopDeletion::runOnLoop(Loop* L, LPPassManager& LPM) { // Finally, the blocks from loopinfo. This has to happen late because // otherwise our loop iterators won't work. - LoopInfo& loopInfo = getAnalysis<LoopInfo>(); + LoopInfo &loopInfo = getAnalysis<LoopInfo>(); SmallPtrSet<BasicBlock*, 8> blocks; blocks.insert(L->block_begin(), L->block_end()); for (SmallPtrSet<BasicBlock*,8>::iterator I = blocks.begin(), diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp index e98ae953e5..14c5655f08 100644 --- a/lib/Transforms/Scalar/LoopRotation.cpp +++ b/lib/Transforms/Scalar/LoopRotation.cpp @@ -56,8 +56,8 @@ namespace { } bool runOnLoop(Loop *L, LPPassManager &LPM); - void simplifyLoopLatch(Loop *L); - bool rotateLoop(Loop *L); + bool simplifyLoopLatch(Loop *L); + bool rotateLoop(Loop *L, bool SimplifiedLatch); private: LoopInfo *LI; @@ -84,13 +84,14 @@ bool LoopRotate::runOnLoop(Loop *L, LPPassManager &LPM) { // Simplify the loop latch before attempting to rotate the header // upward. Rotation may not be needed if the loop tail can be folded into the // loop exit. - simplifyLoopLatch(L); + bool SimplifiedLatch = simplifyLoopLatch(L); // One loop can be rotated multiple times. bool MadeChange = false; - while (rotateLoop(L)) + while (rotateLoop(L, SimplifiedLatch)) { MadeChange = true; - + SimplifiedLatch = false; + } return MadeChange; } @@ -212,25 +213,25 @@ static bool shouldSpeculateInstrs(BasicBlock::iterator Begin, /// canonical form so downstream passes can handle it. /// /// I don't believe this invalidates SCEV. -void LoopRotate::simplifyLoopLatch(Loop *L) { +bool LoopRotate::simplifyLoopLatch(Loop *L) { BasicBlock *Latch = L->getLoopLatch(); if (!Latch || Latch->hasAddressTaken()) - return; + return false; BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator()); if (!Jmp || !Jmp->isUnconditional()) - return; + return false; BasicBlock *LastExit = Latch->getSinglePredecessor(); if (!LastExit || !L->isLoopExiting(LastExit)) - return; + return false; BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator()); if (!BI) - return; + return false; if (!shouldSpeculateInstrs(Latch->begin(), Jmp)) - return; + return false; DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into " << LastExit->getName() << "\n"); @@ -253,10 +254,20 @@ void LoopRotate::simplifyLoopLatch(Loop *L) { if (DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>()) DT->eraseNode(Latch); Latch->eraseFromParent(); + return true; } /// Rotate loop LP. Return true if the loop is rotated. -bool LoopRotate::rotateLoop(Loop *L) { +/// +/// \param SimplifiedLatch is true if the latch was just folded into the final +/// loop exit. In this case we may want to rotate even though the new latch is +/// now an exiting branch. This rotation would have happened had the latch not +/// been simplified. However, if SimplifiedLatch is false, then we avoid +/// rotating loops in which the latch exits to avoid excessive or endless +/// rotation. LoopRotate should be repeatable and converge to a canonical +/// form. This property is satisfied because simplifying the loop latch can only +/// happen once across multiple invocations of the LoopRotate pass. 
+bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) { // If the loop has only one block then there is not much to rotate. if (L->getBlocks().size() == 1) return false; @@ -276,7 +287,12 @@ bool LoopRotate::rotateLoop(Loop *L) { // If the loop latch already contains a branch that leaves the loop then the // loop is already rotated. - if (OrigLatch == 0 || L->isLoopExiting(OrigLatch)) + if (OrigLatch == 0) + return false; + + // Rotate if either the loop latch does *not* exit the loop, or if the loop + // latch was just simplified. + if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch) return false; // Check size of original header and reject loop if it is very big or we can't @@ -505,4 +521,3 @@ bool LoopRotate::rotateLoop(Loop *L) { ++NumRotated; return true; } - diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 4e4cb86464..73e44d7edf 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -895,7 +895,7 @@ void Cost::RatePrimaryRegister(const SCEV *Reg, } if (Regs.insert(Reg)) { RateRegister(Reg, Regs, L, SE, DT); - if (isLoser()) + if (LoserRegs && isLoser()) LoserRegs->insert(Reg); } } @@ -1895,15 +1895,13 @@ ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse) { if (ICmpInst::isTrueWhenEqual(Pred)) { // Look for n+1, and grab n. if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(1))) - if (isa<ConstantInt>(BO->getOperand(1)) && - cast<ConstantInt>(BO->getOperand(1))->isOne() && - SE.getSCEV(BO->getOperand(0)) == MaxRHS) - NewRHS = BO->getOperand(0); + if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) + if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); if (AddOperator *BO = dyn_cast<AddOperator>(Sel->getOperand(2))) - if (isa<ConstantInt>(BO->getOperand(1)) && - cast<ConstantInt>(BO->getOperand(1))->isOne() && - SE.getSCEV(BO->getOperand(0)) == MaxRHS) - NewRHS = BO->getOperand(0); + if (ConstantInt *BO1 = dyn_cast<ConstantInt>(BO->getOperand(1))) + if (BO1->isOne() && SE.getSCEV(BO->getOperand(0)) == MaxRHS) + NewRHS = BO->getOperand(0); if (!NewRHS) return Cond; } else if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) @@ -2716,6 +2714,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter, // by LSR. const IVInc &Head = Chain.Incs[0]; User::op_iterator IVOpEnd = Head.UserInst->op_end(); + // findIVOperand returns IVOpEnd if it can no longer find a valid IV user. User::op_iterator IVOpIter = findIVOperand(Head.UserInst->op_begin(), IVOpEnd, L, SE); Value *IVSrc = 0; diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp index 0da3746950..a3c241dc0f 100644 --- a/lib/Transforms/Scalar/Reassociate.cpp +++ b/lib/Transforms/Scalar/Reassociate.cpp @@ -110,6 +110,51 @@ namespace { } }; }; + + /// Utility class representing a non-constant Xor-operand. We classify + /// non-constant Xor-Operands into two categories: + /// C1) The operand is in the form "X & C", where C is a constant and C != ~0 + /// C2) + /// C2.1) The operand is in the form of "X | C", where C is a non-zero + /// constant. 
+ /// C2.2) Any operand E which doesn't fall into C1 and C2.1, we view this + /// operand as "E | 0" + class XorOpnd { + public: + XorOpnd(Value *V); + const XorOpnd &operator=(const XorOpnd &That); + + bool isInvalid() const { return SymbolicPart == 0; } + bool isOrExpr() const { return isOr; } + Value *getValue() const { return OrigVal; } + Value *getSymbolicPart() const { return SymbolicPart; } + unsigned getSymbolicRank() const { return SymbolicRank; } + const APInt &getConstPart() const { return ConstPart; } + + void Invalidate() { SymbolicPart = OrigVal = 0; } + void setSymbolicRank(unsigned R) { SymbolicRank = R; } + + // Sort the XorOpnd-Pointer in ascending order of symbolic-value-rank. + // The purpose is twofold: + // 1) Cluster together the operands sharing the same symbolic-value. + // 2) Operand having smaller symbolic-value-rank is permuted earlier, which + // could potentially shorten crital path, and expose more loop-invariants. + // Note that values' rank are basically defined in RPO order (FIXME). + // So, if Rank(X) < Rank(Y) < Rank(Z), it means X is defined earlier + // than Y which is defined earlier than Z. Permute "x | 1", "Y & 2", + // "z" in the order of X-Y-Z is better than any other orders. + struct PtrSortFunctor { + bool operator()(XorOpnd * const &LHS, XorOpnd * const &RHS) { + return LHS->getSymbolicRank() < RHS->getSymbolicRank(); + } + }; + private: + Value *OrigVal; + Value *SymbolicPart; + APInt ConstPart; + unsigned SymbolicRank; + bool isOr; + }; } namespace { @@ -137,6 +182,11 @@ namespace { Value *OptimizeExpression(BinaryOperator *I, SmallVectorImpl<ValueEntry> &Ops); Value *OptimizeAdd(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); + Value *OptimizeXor(Instruction *I, SmallVectorImpl<ValueEntry> &Ops); + bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, APInt &ConstOpnd, + Value *&Res); + bool CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2, + APInt &ConstOpnd, Value *&Res); bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops, SmallVectorImpl<Factor> &Factors); Value *buildMinimalMultiplyDAG(IRBuilder<> &Builder, @@ -148,6 +198,42 @@ namespace { }; } +XorOpnd::XorOpnd(Value *V) { + assert(!isa<ConstantInt>(V) && "No ConstantInt"); + OrigVal = V; + Instruction *I = dyn_cast<Instruction>(V); + SymbolicRank = 0; + + if (I && (I->getOpcode() == Instruction::Or || + I->getOpcode() == Instruction::And)) { + Value *V0 = I->getOperand(0); + Value *V1 = I->getOperand(1); + if (isa<ConstantInt>(V0)) + std::swap(V0, V1); + + if (ConstantInt *C = dyn_cast<ConstantInt>(V1)) { + ConstPart = C->getValue(); + SymbolicPart = V0; + isOr = (I->getOpcode() == Instruction::Or); + return; + } + } + + // view the operand as "V | 0" + SymbolicPart = V; + ConstPart = APInt::getNullValue(V->getType()->getIntegerBitWidth()); + isOr = true; +} + +const XorOpnd &XorOpnd::operator=(const XorOpnd &That) { + OrigVal = That.OrigVal; + SymbolicPart = That.SymbolicPart; + ConstPart = That.ConstPart; + SymbolicRank = That.SymbolicRank; + isOr = That.isOr; + return *this; +} + char Reassociate::ID = 0; INITIALIZE_PASS(Reassociate, "reassociate", "Reassociate expressions", false, false) @@ -1040,6 +1126,250 @@ static Value *OptimizeAndOrXor(unsigned Opcode, return 0; } +/// Helper funciton of CombineXorOpnd(). It creates a bitwise-and +/// instruction with the given two operands, and return the resulting +/// instruction. There are two special cases: 1) if the constant operand is 0, +/// it will return NULL. 
2) if the constant is ~0, the symbolic operand will +/// be returned. +static Value *createAndInstr(Instruction *InsertBefore, Value *Opnd, + const APInt &ConstOpnd) { + if (ConstOpnd != 0) { + if (!ConstOpnd.isAllOnesValue()) { + LLVMContext &Ctx = Opnd->getType()->getContext(); + Instruction *I; + I = BinaryOperator::CreateAnd(Opnd, ConstantInt::get(Ctx, ConstOpnd), + "and.ra", InsertBefore); + I->setDebugLoc(InsertBefore->getDebugLoc()); + return I; + } + return Opnd; + } + return 0; +} + +// Helper function of OptimizeXor(). It tries to simplify "Opnd1 ^ ConstOpnd" +// into "R ^ C", where C would be 0, and R is a symbolic value. +// +// If it was successful, true is returned, and the "R" and "C" is returned +// via "Res" and "ConstOpnd", respectively; otherwise, false is returned, +// and both "Res" and "ConstOpnd" remain unchanged. +// +bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, + APInt &ConstOpnd, Value *&Res) { + // Xor-Rule 1: (x | c1) ^ c2 = (x | c1) ^ (c1 ^ c1) ^ c2 + // = ((x | c1) ^ c1) ^ (c1 ^ c2) + // = (x & ~c1) ^ (c1 ^ c2) + // It is useful only when c1 == c2. + if (Opnd1->isOrExpr() && Opnd1->getConstPart() != 0) { + if (!Opnd1->getValue()->hasOneUse()) + return false; + + const APInt &C1 = Opnd1->getConstPart(); + if (C1 != ConstOpnd) + return false; + + Value *X = Opnd1->getSymbolicPart(); + Res = createAndInstr(I, X, ~C1); + // ConstOpnd was C2, now C1 ^ C2. + ConstOpnd ^= C1; + + if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) + RedoInsts.insert(T); + return true; + } + return false; +} + + +// Helper function of OptimizeXor(). It tries to simplify +// "Opnd1 ^ Opnd2 ^ ConstOpnd" into "R ^ C", where C would be 0, and R is a +// symbolic value. +// +// If it was successful, true is returned, and the "R" and "C" is returned +// via "Res" and "ConstOpnd", respectively (If the entire expression is +// evaluated to a constant, the Res is set to NULL); otherwise, false is +// returned, and both "Res" and "ConstOpnd" remain unchanged. +bool Reassociate::CombineXorOpnd(Instruction *I, XorOpnd *Opnd1, XorOpnd *Opnd2, + APInt &ConstOpnd, Value *&Res) { + Value *X = Opnd1->getSymbolicPart(); + if (X != Opnd2->getSymbolicPart()) + return false; + + // This many instruction become dead.(At least "Opnd1 ^ Opnd2" will die.) + int DeadInstNum = 1; + if (Opnd1->getValue()->hasOneUse()) + DeadInstNum++; + if (Opnd2->getValue()->hasOneUse()) + DeadInstNum++; + + // Xor-Rule 2: + // (x | c1) ^ (x & c2) + // = (x|c1) ^ (x&c2) ^ (c1 ^ c1) = ((x|c1) ^ c1) ^ (x & c2) ^ c1 + // = (x & ~c1) ^ (x & c2) ^ c1 // Xor-Rule 1 + // = (x & c3) ^ c1, where c3 = ~c1 ^ c2 // Xor-rule 3 + // + if (Opnd1->isOrExpr() != Opnd2->isOrExpr()) { + if (Opnd2->isOrExpr()) + std::swap(Opnd1, Opnd2); + + const APInt &C1 = Opnd1->getConstPart(); + const APInt &C2 = Opnd2->getConstPart(); + APInt C3((~C1) ^ C2); + + // Do not increase code size! + if (C3 != 0 && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd != 0 ? 1 : 2; + if (NewInstNum > DeadInstNum) + return false; + } + + Res = createAndInstr(I, X, C3); + ConstOpnd ^= C1; + + } else if (Opnd1->isOrExpr()) { + // Xor-Rule 3: (x | c1) ^ (x | c2) = (x & c3) ^ c3 where c3 = c1 ^ c2 + // + const APInt &C1 = Opnd1->getConstPart(); + const APInt &C2 = Opnd2->getConstPart(); + APInt C3 = C1 ^ C2; + + // Do not increase code size + if (C3 != 0 && !C3.isAllOnesValue()) { + int NewInstNum = ConstOpnd != 0 ? 
1 : 2; + if (NewInstNum > DeadInstNum) + return false; + } + + Res = createAndInstr(I, X, C3); + ConstOpnd ^= C3; + } else { + // Xor-Rule 4: (x & c1) ^ (x & c2) = (x & (c1^c2)) + // + const APInt &C1 = Opnd1->getConstPart(); + const APInt &C2 = Opnd2->getConstPart(); + APInt C3 = C1 ^ C2; + Res = createAndInstr(I, X, C3); + } + + // Put the original operands in the Redo list; hope they will be deleted + // as dead code. + if (Instruction *T = dyn_cast<Instruction>(Opnd1->getValue())) + RedoInsts.insert(T); + if (Instruction *T = dyn_cast<Instruction>(Opnd2->getValue())) + RedoInsts.insert(T); + + return true; +} + +/// Optimize a series of operands to an 'xor' instruction. If it can be reduced +/// to a single Value, it is returned, otherwise the Ops list is mutated as +/// necessary. +Value *Reassociate::OptimizeXor(Instruction *I, + SmallVectorImpl<ValueEntry> &Ops) { + if (Value *V = OptimizeAndOrXor(Instruction::Xor, Ops)) + return V; + + if (Ops.size() == 1) + return 0; + + SmallVector<XorOpnd, 8> Opnds; + SmallVector<XorOpnd*, 8> OpndPtrs; + Type *Ty = Ops[0].Op->getType(); + APInt ConstOpnd(Ty->getIntegerBitWidth(), 0); + + // Step 1: Convert ValueEntry to XorOpnd + for (unsigned i = 0, e = Ops.size(); i != e; ++i) { + Value *V = Ops[i].Op; + if (!isa<ConstantInt>(V)) { + XorOpnd O(V); + O.setSymbolicRank(getRank(O.getSymbolicPart())); + Opnds.push_back(O); + } else + ConstOpnd ^= cast<ConstantInt>(V)->getValue(); + } + + // NOTE: From this point on, do *NOT* add/delete element to/from "Opnds". + // It would otherwise invalidate the "Opnds"'s iterator, and hence invalidate + // the "OpndPtrs" as well. For the similar reason, do not fuse this loop + // with the previous loop --- the iterator of the "Opnds" may be invalidated + // when new elements are added to the vector. + for (unsigned i = 0, e = Opnds.size(); i != e; ++i) + OpndPtrs.push_back(&Opnds[i]); + + // Step 2: Sort the Xor-Operands in a way such that the operands containing + // the same symbolic value cluster together. For instance, the input operand + // sequence ("x | 123", "y & 456", "x & 789") will be sorted into: + // ("x | 123", "x & 789", "y & 456"). 
+ std::sort(OpndPtrs.begin(), OpndPtrs.end(), XorOpnd::PtrSortFunctor()); + + // Step 3: Combine adjacent operands + XorOpnd *PrevOpnd = 0; + bool Changed = false; + for (unsigned i = 0, e = Opnds.size(); i < e; i++) { + XorOpnd *CurrOpnd = OpndPtrs[i]; + // The combined value + Value *CV; + + // Step 3.1: Try simplifying "CurrOpnd ^ ConstOpnd" + if (ConstOpnd != 0 && CombineXorOpnd(I, CurrOpnd, ConstOpnd, CV)) { + Changed = true; + if (CV) + *CurrOpnd = XorOpnd(CV); + else { + CurrOpnd->Invalidate(); + continue; + } + } + + if (!PrevOpnd || CurrOpnd->getSymbolicPart() != PrevOpnd->getSymbolicPart()) { + PrevOpnd = CurrOpnd; + continue; + } + + // step 3.2: When previous and current operands share the same symbolic + // value, try to simplify "PrevOpnd ^ CurrOpnd ^ ConstOpnd" + // + if (CombineXorOpnd(I, CurrOpnd, PrevOpnd, ConstOpnd, CV)) { + // Remove previous operand + PrevOpnd->Invalidate(); + if (CV) { + *CurrOpnd = XorOpnd(CV); + PrevOpnd = CurrOpnd; + } else { + CurrOpnd->Invalidate(); + PrevOpnd = 0; + } + Changed = true; + } + } + + // Step 4: Reassemble the Ops + if (Changed) { + Ops.clear(); + for (unsigned int i = 0, e = Opnds.size(); i < e; i++) { + XorOpnd &O = Opnds[i]; + if (O.isInvalid()) + continue; + ValueEntry VE(getRank(O.getValue()), O.getValue()); + Ops.push_back(VE); + } + if (ConstOpnd != 0) { + Value *C = ConstantInt::get(Ty->getContext(), ConstOpnd); + ValueEntry VE(getRank(C), C); + Ops.push_back(VE); + } + int Sz = Ops.size(); + if (Sz == 1) + return Ops.back().Op; + else if (Sz == 0) { + assert(ConstOpnd == 0); + return ConstantInt::get(Ty->getContext(), ConstOpnd); + } + } + + return 0; +} + /// OptimizeAdd - Optimize a series of operands to an 'add' instruction. This /// optimizes based on identities. If it can be reduced to a single Value, it /// is returned, otherwise the Ops list is mutated as necessary. 
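The OptimizeXor additions above rest on the four bitwise identities the comments call Xor-Rules 1 through 4. The following is a minimal standalone C++ sketch, separate from the patch itself and using the illustrative helper name checkXorRules, that exhaustively verifies those identities over 8-bit values; it shows the algebra the pass relies on, not the pass implementation.

// Standalone sanity check of the four xor-reassociation identities,
// checked exhaustively over all 8-bit (x, c1, c2) triples.
#include <cassert>
#include <cstdint>
#include <iostream>

static void checkXorRules() {
  for (unsigned x = 0; x < 256; ++x)
    for (unsigned c1 = 0; c1 < 256; ++c1)
      for (unsigned c2 = 0; c2 < 256; ++c2) {
        uint8_t X = x, C1 = c1, C2 = c2;
        // Rule 1 (used only when c1 == c2): (x | c1) ^ c1 == x & ~c1
        assert(uint8_t((X | C1) ^ C1) == uint8_t(X & uint8_t(~C1)));
        // Rule 2: (x | c1) ^ (x & c2) == (x & (~c1 ^ c2)) ^ c1
        assert(uint8_t((X | C1) ^ (X & C2)) ==
               uint8_t((X & uint8_t(uint8_t(~C1) ^ C2)) ^ C1));
        // Rule 3: (x | c1) ^ (x | c2) == (x & (c1 ^ c2)) ^ (c1 ^ c2)
        assert(uint8_t((X | C1) ^ (X | C2)) ==
               uint8_t((X & uint8_t(C1 ^ C2)) ^ uint8_t(C1 ^ C2)));
        // Rule 4: (x & c1) ^ (x & c2) == x & (c1 ^ c2)
        assert(uint8_t((X & C1) ^ (X & C2)) == uint8_t(X & uint8_t(C1 ^ C2)));
      }
  std::cout << "All four xor-reassociation identities hold for 8-bit values\n";
}

int main() { checkXorRules(); }

Each rule rewrites a pair of xor operands into at most one new AND plus an adjusted constant, which is why the patch guards Rules 2 and 3 with the "do not increase code size" check: the rewrite is only taken when the number of newly created instructions does not exceed the number of operand instructions that become dead.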
@@ -1431,11 +1761,15 @@ Value *Reassociate::OptimizeExpression(BinaryOperator *I, default: break; case Instruction::And: case Instruction::Or: - case Instruction::Xor: if (Value *Result = OptimizeAndOrXor(Opcode, Ops)) return Result; break; + case Instruction::Xor: + if (Value *Result = OptimizeXor(I, Ops)) + return Result; + break; + case Instruction::Add: if (Value *Result = OptimizeAdd(I, Ops)) return Result; diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp index e90fe907d5..d073e789dc 100644 --- a/lib/Transforms/Scalar/SROA.cpp +++ b/lib/Transforms/Scalar/SROA.cpp @@ -57,11 +57,15 @@ using namespace llvm; STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); -STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); -STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); +STATISTIC(NumAllocaPartitions, "Number of alloca partitions formed"); +STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions"); +STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses found"); +STATISTIC(MaxPartitionUsesPerAlloca, "Maximum number of partition uses"); +STATISTIC(NumNewAllocas, "Number of new, smaller allocas introduced"); +STATISTIC(NumPromoted, "Number of allocas promoted to SSA values"); STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion"); -STATISTIC(NumDeleted, "Number of instructions deleted"); -STATISTIC(NumVectorized, "Number of vectorized aggregates"); +STATISTIC(NumDeleted, "Number of instructions deleted"); +STATISTIC(NumVectorized, "Number of vectorized aggregates"); /// Hidden option to force the pass to not use DomTree and mem2reg, instead /// forming SSA values through the SSAUpdater infrastructure. @@ -69,112 +73,167 @@ static cl::opt<bool> ForceSSAUpdater("force-ssa-updater", cl::init(false), cl::Hidden); namespace { -/// \brief Alloca partitioning representation. -/// -/// This class represents a partitioning of an alloca into slices, and -/// information about the nature of uses of each slice of the alloca. The goal -/// is that this information is sufficient to decide if and how to split the -/// alloca apart and replace slices with scalars. It is also intended that this -/// structure can capture the relevant information needed both to decide about -/// and to enact these transformations. -class AllocaPartitioning { +/// \brief A custom IRBuilder inserter which prefixes all names if they are +/// preserved. +template <bool preserveNames = true> +class IRBuilderPrefixedInserter : + public IRBuilderDefaultInserter<preserveNames> { + std::string Prefix; + public: - /// \brief A common base class for representing a half-open byte range. - struct ByteRange { - /// \brief The beginning offset of the range. - uint64_t BeginOffset; + void SetNamePrefix(const Twine &P) { Prefix = P.str(); } - /// \brief The ending offset, not included in the range. - uint64_t EndOffset; +protected: + void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB, + BasicBlock::iterator InsertPt) const { + IRBuilderDefaultInserter<preserveNames>::InsertHelper( + I, Name.isTriviallyEmpty() ? Name : Prefix + Name, BB, InsertPt); + } +}; - ByteRange() : BeginOffset(), EndOffset() {} - ByteRange(uint64_t BeginOffset, uint64_t EndOffset) - : BeginOffset(BeginOffset), EndOffset(EndOffset) {} +// Specialization for not preserving the name is trivial. 
+template <> +class IRBuilderPrefixedInserter<false> : + public IRBuilderDefaultInserter<false> { +public: + void SetNamePrefix(const Twine &P) {} +}; - /// \brief Support for ordering ranges. - /// - /// This provides an ordering over ranges such that start offsets are - /// always increasing, and within equal start offsets, the end offsets are - /// decreasing. Thus the spanning range comes first in a cluster with the - /// same start position. - bool operator<(const ByteRange &RHS) const { - if (BeginOffset < RHS.BeginOffset) return true; - if (BeginOffset > RHS.BeginOffset) return false; - if (EndOffset > RHS.EndOffset) return true; - return false; - } +/// \brief Provide a typedef for IRBuilder that drops names in release builds. +#ifndef NDEBUG +typedef llvm::IRBuilder<true, ConstantFolder, + IRBuilderPrefixedInserter<true> > IRBuilderTy; +#else +typedef llvm::IRBuilder<false, ConstantFolder, + IRBuilderPrefixedInserter<false> > IRBuilderTy; +#endif +} - /// \brief Support comparison with a single offset to allow binary searches. - friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { - return LHS.BeginOffset < RHSOffset; - } +namespace { +/// \brief A common base class for representing a half-open byte range. +struct ByteRange { + /// \brief The beginning offset of the range. + uint64_t BeginOffset; - friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, - const ByteRange &RHS) { - return LHSOffset < RHS.BeginOffset; - } + /// \brief The ending offset, not included in the range. + uint64_t EndOffset; - bool operator==(const ByteRange &RHS) const { - return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; - } - bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } - }; + ByteRange() : BeginOffset(), EndOffset() {} + ByteRange(uint64_t BeginOffset, uint64_t EndOffset) + : BeginOffset(BeginOffset), EndOffset(EndOffset) {} - /// \brief A partition of an alloca. + /// \brief Support for ordering ranges. /// - /// This structure represents a contiguous partition of the alloca. These are - /// formed by examining the uses of the alloca. During formation, they may - /// overlap but once an AllocaPartitioning is built, the Partitions within it - /// are all disjoint. - struct Partition : public ByteRange { - /// \brief Whether this partition is splittable into smaller partitions. - /// - /// We flag partitions as splittable when they are formed entirely due to - /// accesses by trivially splittable operations such as memset and memcpy. - bool IsSplittable; + /// This provides an ordering over ranges such that start offsets are + /// always increasing, and within equal start offsets, the end offsets are + /// decreasing. Thus the spanning range comes first in a cluster with the + /// same start position. + bool operator<(const ByteRange &RHS) const { + if (BeginOffset < RHS.BeginOffset) return true; + if (BeginOffset > RHS.BeginOffset) return false; + if (EndOffset > RHS.EndOffset) return true; + return false; + } - /// \brief Test whether a partition has been marked as dead. - bool isDead() const { - if (BeginOffset == UINT64_MAX) { - assert(EndOffset == UINT64_MAX); - return true; - } - return false; - } + /// \brief Support comparison with a single offset to allow binary searches. 
+ friend bool operator<(const ByteRange &LHS, uint64_t RHSOffset) { + return LHS.BeginOffset < RHSOffset; + } + + friend LLVM_ATTRIBUTE_UNUSED bool operator<(uint64_t LHSOffset, + const ByteRange &RHS) { + return LHSOffset < RHS.BeginOffset; + } + + bool operator==(const ByteRange &RHS) const { + return BeginOffset == RHS.BeginOffset && EndOffset == RHS.EndOffset; + } + bool operator!=(const ByteRange &RHS) const { return !operator==(RHS); } +}; - /// \brief Kill a partition. - /// This is accomplished by setting both its beginning and end offset to - /// the maximum possible value. - void kill() { - assert(!isDead() && "He's Dead, Jim!"); - BeginOffset = EndOffset = UINT64_MAX; +/// \brief A partition of an alloca. +/// +/// This structure represents a contiguous partition of the alloca. These are +/// formed by examining the uses of the alloca. During formation, they may +/// overlap but once an AllocaPartitioning is built, the Partitions within it +/// are all disjoint. +struct Partition : public ByteRange { + /// \brief Whether this partition is splittable into smaller partitions. + /// + /// We flag partitions as splittable when they are formed entirely due to + /// accesses by trivially splittable operations such as memset and memcpy. + bool IsSplittable; + + /// \brief Test whether a partition has been marked as dead. + bool isDead() const { + if (BeginOffset == UINT64_MAX) { + assert(EndOffset == UINT64_MAX); + return true; } + return false; + } - Partition() : ByteRange(), IsSplittable() {} - Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) - : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} - }; + /// \brief Kill a partition. + /// This is accomplished by setting both its beginning and end offset to + /// the maximum possible value. + void kill() { + assert(!isDead() && "He's Dead, Jim!"); + BeginOffset = EndOffset = UINT64_MAX; + } - /// \brief A particular use of a partition of the alloca. + Partition() : ByteRange(), IsSplittable() {} + Partition(uint64_t BeginOffset, uint64_t EndOffset, bool IsSplittable) + : ByteRange(BeginOffset, EndOffset), IsSplittable(IsSplittable) {} +}; + +/// \brief A particular use of a partition of the alloca. +/// +/// This structure is used to associate uses of a partition with it. They +/// mark the range of bytes which are referenced by a particular instruction, +/// and includes a handle to the user itself and the pointer value in use. +/// The bounds of these uses are determined by intersecting the bounds of the +/// memory use itself with a particular partition. As a consequence there is +/// intentionally overlap between various uses of the same partition. +class PartitionUse : public ByteRange { + /// \brief Combined storage for both the Use* and split state. + PointerIntPair<Use*, 1, bool> UsePtrAndIsSplit; + +public: + PartitionUse() : ByteRange(), UsePtrAndIsSplit() {} + PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U, + bool IsSplit) + : ByteRange(BeginOffset, EndOffset), UsePtrAndIsSplit(U, IsSplit) {} + + /// \brief The use in question. Provides access to both user and used value. /// - /// This structure is used to associate uses of a partition with it. They - /// mark the range of bytes which are referenced by a particular instruction, - /// and includes a handle to the user itself and the pointer value in use. - /// The bounds of these uses are determined by intersecting the bounds of the - /// memory use itself with a particular partition. 
As a consequence there is - /// intentionally overlap between various uses of the same partition. - struct PartitionUse : public ByteRange { - /// \brief The use in question. Provides access to both user and used value. - /// - /// Note that this may be null if the partition use is *dead*, that is, it - /// should be ignored. - Use *U; + /// Note that this may be null if the partition use is *dead*, that is, it + /// should be ignored. + Use *getUse() const { return UsePtrAndIsSplit.getPointer(); } - PartitionUse() : ByteRange(), U() {} - PartitionUse(uint64_t BeginOffset, uint64_t EndOffset, Use *U) - : ByteRange(BeginOffset, EndOffset), U(U) {} - }; + /// \brief Set the use for this partition use range. + void setUse(Use *U) { UsePtrAndIsSplit.setPointer(U); } + /// \brief Whether this use is split across multiple partitions. + bool isSplit() const { return UsePtrAndIsSplit.getInt(); } +}; +} + +namespace llvm { +template <> struct isPodLike<Partition> : llvm::true_type {}; +template <> struct isPodLike<PartitionUse> : llvm::true_type {}; +} + +namespace { +/// \brief Alloca partitioning representation. +/// +/// This class represents a partitioning of an alloca into slices, and +/// information about the nature of uses of each slice of the alloca. The goal +/// is that this information is sufficient to decide if and how to split the +/// alloca apart and replace slices with scalars. It is also intended that this +/// structure can capture the relevant information needed both to decide about +/// and to enact these transformations. +class AllocaPartitioning { +public: /// \brief Construct a partitioning of a particular alloca. /// /// Construction does most of the work for partitioning the alloca. This @@ -456,10 +515,10 @@ private: // Clamp the end offset to the end of the allocation. Note that this is // formulated to handle even the case where "BeginOffset + Size" overflows. - // NOTE! This may appear superficially to be something we could ignore - // entirely, but that is not so! There may be PHI-node uses where some - // instructions are dead but not others. We can't completely ignore the - // PHI node, and so have to record at least the information here. + // This may appear superficially to be something we could ignore entirely, + // but that is not so! There may be widened loads or PHI-node uses where + // some instructions are dead but not others. We can't completely ignore + // them, and so have to record at least the information here. assert(AllocSize >= BeginOffset); // Established above. if (Size > AllocSize - BeginOffset) { DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset @@ -474,33 +533,17 @@ private: } void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset, - bool IsVolatile) { - uint64_t Size = DL.getTypeStoreSize(Ty); - - // If this memory access can be shown to *statically* extend outside the - // bounds of of the allocation, it's behavior is undefined, so simply - // ignore it. Note that this is more strict than the generic clamping - // behavior of insertUse. We also try to handle cases which might run the - // risk of overflow. - // FIXME: We should instead consider the pointer to have escaped if this - // function is being instrumented for addressing bugs or race conditions. - if (Offset.isNegative() || Size > AllocSize || - Offset.ugt(AllocSize - Size)) { - DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte " - << (isa<LoadInst>(I) ? 
"load" : "store") << " @" << Offset - << " which extends past the end of the " << AllocSize - << " byte alloca:\n" - << " alloca: " << P.AI << "\n" - << " use: " << I << "\n"); - return; - } - + uint64_t Size, bool IsVolatile) { // We allow splitting of loads and stores where the type is an integer type - // and which cover the entire alloca. Such integer loads and stores - // often require decomposition into fine grained loads and stores. - bool IsSplittable = false; - if (IntegerType *ITy = dyn_cast<IntegerType>(Ty)) - IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8; + // and cover the entire alloca. This prevents us from splitting over + // eagerly. + // FIXME: In the great blue eventually, we should eagerly split all integer + // loads and stores, and then have a separate step that merges adjacent + // alloca partitions into a single partition suitable for integer widening. + // Or we should skip the merge step and rely on GVN and other passes to + // merge adjacent loads and stores that survive mem2reg. + bool IsSplittable = + Ty->isIntegerTy() && !IsVolatile && Offset == 0 && Size >= AllocSize; insertUse(I, Offset, Size, IsSplittable); } @@ -512,7 +555,8 @@ private: if (!IsOffsetKnown) return PI.setAborted(&LI); - return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile()); + uint64_t Size = DL.getTypeStoreSize(LI.getType()); + return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile()); } void visitStoreInst(StoreInst &SI) { @@ -522,9 +566,28 @@ private: if (!IsOffsetKnown) return PI.setAborted(&SI); + uint64_t Size = DL.getTypeStoreSize(ValOp->getType()); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. We also try to handle cases which might run the + // risk of overflow. + // FIXME: We should instead consider the pointer to have escaped if this + // function is being instrumented for addressing bugs or race conditions. + if (Offset.isNegative() || Size > AllocSize || + Offset.ugt(AllocSize - Size)) { + DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte store @" << Offset + << " which extends past the end of the " << AllocSize + << " byte alloca:\n" + << " alloca: " << P.AI << "\n" + << " use: " << SI << "\n"); + return; + } + assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) && "All simple FCA stores should have been pre-split"); - handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile()); + handleLoadOrStore(ValOp->getType(), SI, Offset, Size, SI.isVolatile()); } @@ -795,13 +858,14 @@ private: EndOffset = AllocSize; // NB: This only works if we have zero overlapping partitions. 
- iterator B = std::lower_bound(P.begin(), P.end(), BeginOffset); - if (B != P.begin() && llvm::prior(B)->EndOffset > BeginOffset) - B = llvm::prior(B); - for (iterator I = B, E = P.end(); I != E && I->BeginOffset < EndOffset; - ++I) { + iterator I = std::lower_bound(P.begin(), P.end(), BeginOffset); + if (I != P.begin() && llvm::prior(I)->EndOffset > BeginOffset) + I = llvm::prior(I); + iterator E = P.end(); + bool IsSplit = llvm::next(I) != E && llvm::next(I)->BeginOffset < EndOffset; + for (; I != E && I->BeginOffset < EndOffset; ++I) { PartitionUse NewPU(std::max(I->BeginOffset, BeginOffset), - std::min(I->EndOffset, EndOffset), U); + std::min(I->EndOffset, EndOffset), U, IsSplit); P.use_push_back(I, NewPU); if (isa<PHINode>(U->getUser()) || isa<SelectInst>(U->getUser())) P.PHIOrSelectOpMap[U] @@ -809,20 +873,6 @@ private: } } - void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) { - uint64_t Size = DL.getTypeStoreSize(Ty); - - // If this memory access can be shown to *statically* extend outside the - // bounds of of the allocation, it's behavior is undefined, so simply - // ignore it. Note that this is more strict than the generic clamping - // behavior of insertUse. - if (Offset.isNegative() || Size > AllocSize || - Offset.ugt(AllocSize - Size)) - return markAsDead(I); - - insertUse(I, Offset, Size); - } - void visitBitCastInst(BitCastInst &BC) { if (BC.use_empty()) return markAsDead(BC); @@ -839,12 +889,23 @@ private: void visitLoadInst(LoadInst &LI) { assert(IsOffsetKnown); - handleLoadOrStore(LI.getType(), LI, Offset); + uint64_t Size = DL.getTypeStoreSize(LI.getType()); + insertUse(LI, Offset, Size); } void visitStoreInst(StoreInst &SI) { assert(IsOffsetKnown); - handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset); + uint64_t Size = DL.getTypeStoreSize(SI.getOperand(0)->getType()); + + // If this memory access can be shown to *statically* extend outside the + // bounds of of the allocation, it's behavior is undefined, so simply + // ignore it. Note that this is more strict than the generic clamping + // behavior of insertUse. + if (Offset.isNegative() || Size > AllocSize || + Offset.ugt(AllocSize - Size)) + return markAsDead(SI); + + insertUse(SI, Offset, Size); } void visitMemSetInst(MemSetInst &II) { @@ -868,7 +929,7 @@ private: uint64_t Size = Length ? Length->getLimitedValue() : AllocSize - Offset.getLimitedValue(); - MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; + const MemTransferOffsets &Offsets = P.MemTransferInstData[&II]; if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd && Offsets.DestBegin == Offsets.SourceBegin) return markAsDead(II); // Skip identity transfers without side-effects. @@ -1077,6 +1138,10 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) splitAndMergePartitions(); } + // Record how many partitions we end up with. + NumAllocaPartitions += Partitions.size(); + MaxPartitionsPerAlloca = std::max<unsigned>(Partitions.size(), MaxPartitionsPerAlloca); + // Now build up the user lists for each of these disjoint partitions by // re-walking the recursive users of the alloca. 
Uses.resize(Partitions.size()); @@ -1084,26 +1149,34 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI) PtrI = UB.visitPtr(AI); assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!"); assert(!PtrI.isAborted() && "Early aborted the visit of the pointer."); + + unsigned NumUses = 0; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_STATS) + for (unsigned Idx = 0, Size = Uses.size(); Idx != Size; ++Idx) + NumUses += Uses[Idx].size(); +#endif + NumAllocaPartitionUses += NumUses; + MaxPartitionUsesPerAlloca = std::max<unsigned>(NumUses, MaxPartitionUsesPerAlloca); } Type *AllocaPartitioning::getCommonType(iterator I) const { Type *Ty = 0; for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - if (!UI->U) + Use *U = UI->getUse(); + if (!U) continue; // Skip dead uses. - if (isa<IntrinsicInst>(*UI->U->getUser())) + if (isa<IntrinsicInst>(*U->getUser())) continue; if (UI->BeginOffset != I->BeginOffset || UI->EndOffset != I->EndOffset) continue; Type *UserTy = 0; - if (LoadInst *LI = dyn_cast<LoadInst>(UI->U->getUser())) { + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) UserTy = LI->getType(); - } else if (StoreInst *SI = dyn_cast<StoreInst>(UI->U->getUser())) { + else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) UserTy = SI->getValueOperand()->getType(); - } else { + else return 0; // Bail if we have weird uses. - } if (IntegerType *ITy = dyn_cast<IntegerType>(UserTy)) { // If the type is larger than the partition, skip it. We only encounter @@ -1140,11 +1213,12 @@ void AllocaPartitioning::print(raw_ostream &OS, const_iterator I, void AllocaPartitioning::printUsers(raw_ostream &OS, const_iterator I, StringRef Indent) const { for (const_use_iterator UI = use_begin(I), UE = use_end(I); UI != UE; ++UI) { - if (!UI->U) + if (!UI->getUse()) continue; // Skip dead uses. OS << Indent << " [" << UI->BeginOffset << "," << UI->EndOffset << ") " - << "used by: " << *UI->U->getUser() << "\n"; - if (MemTransferInst *II = dyn_cast<MemTransferInst>(UI->U->getUser())) { + << "used by: " << *UI->getUse()->getUser() << "\n"; + if (MemTransferInst *II = + dyn_cast<MemTransferInst>(UI->getUse()->getUser())) { const MemTransferOffsets &MTO = MemTransferInstData.lookup(II); bool IsDest; if (!MTO.IsSplittable) @@ -1244,12 +1318,12 @@ public: // may be zapped by an optimization pass in future. if (ZExtInst *ZExt = dyn_cast<ZExtInst>(SI->getOperand(0))) Arg = dyn_cast<Argument>(ZExt->getOperand(0)); - if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) + else if (SExtInst *SExt = dyn_cast<SExtInst>(SI->getOperand(0))) Arg = dyn_cast<Argument>(SExt->getOperand(0)); if (!Arg) - Arg = SI->getOperand(0); + Arg = SI->getValueOperand(); } else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { - Arg = LI->getOperand(0); + Arg = LI->getPointerOperand(); } else { continue; } @@ -1375,11 +1449,11 @@ public: // may be grown during speculation. However, we never need to re-visit the // new uses, and so we can use the initial size bound. for (unsigned Idx = 0, Size = P.use_size(PI); Idx != Size; ++Idx) { - const AllocaPartitioning::PartitionUse &PU = P.getUse(PI, Idx); - if (!PU.U) + const PartitionUse &PU = P.getUse(PI, Idx); + if (!PU.getUse()) continue; // Skip dead use. 
- visit(cast<Instruction>(PU.U->getUser())); + visit(cast<Instruction>(PU.getUse()->getUser())); } } @@ -1473,7 +1547,7 @@ private: assert(!Loads.empty()); Type *LoadTy = cast<PointerType>(PN.getType())->getElementType(); - IRBuilder<> PHIBuilder(&PN); + IRBuilderTy PHIBuilder(&PN); PHINode *NewPN = PHIBuilder.CreatePHI(LoadTy, PN.getNumIncomingValues(), PN.getName() + ".sroa.speculated"); @@ -1496,7 +1570,7 @@ private: TerminatorInst *TI = Pred->getTerminator(); Use *InUse = &PN.getOperandUse(PN.getOperandNumForIncomingValue(Idx)); Value *InVal = PN.getIncomingValue(Idx); - IRBuilder<> PredBuilder(TI); + IRBuilderTy PredBuilder(TI); LoadInst *Load = PredBuilder.CreateLoad(InVal, (PN.getName() + ".sroa.speculate.load." + @@ -1523,8 +1597,8 @@ private: // inside the load. AllocaPartitioning::use_iterator UI = P.findPartitionUseForPHIOrSelectOperand(InUse); - assert(isa<PHINode>(*UI->U->getUser())); - UI->U = &Load->getOperandUse(Load->getPointerOperandIndex()); + assert(isa<PHINode>(*UI->getUse()->getUser())); + UI->setUse(&Load->getOperandUse(Load->getPointerOperandIndex())); } DEBUG(dbgs() << " speculated to: " << *NewPN << "\n"); } @@ -1571,16 +1645,16 @@ private: void visitSelectInst(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); - IRBuilder<> IRB(&SI); // If the select isn't safe to speculate, just use simple logic to emit it. SmallVector<LoadInst *, 4> Loads; if (!isSafeSelectToSpeculate(SI, Loads)) return; + IRBuilderTy IRB(&SI); Use *Ops[2] = { &SI.getOperandUse(1), &SI.getOperandUse(2) }; AllocaPartitioning::iterator PIs[2]; - AllocaPartitioning::PartitionUse PUs[2]; + PartitionUse PUs[2]; for (unsigned i = 0, e = 2; i != e; ++i) { PIs[i] = P.findPartitionForPHIOrSelectOperand(Ops[i]); if (PIs[i] != P.end()) { @@ -1591,7 +1665,7 @@ private: PUs[i] = *UI; // Clear out the use here so that the offsets into the use list remain // stable but this use is ignored when rewriting. - UI->U = 0; + UI->setUse(0); } } @@ -1623,8 +1697,8 @@ private: for (unsigned i = 0, e = 2; i != e; ++i) { if (PIs[i] != P.end()) { Use *LoadUse = &Loads[i]->getOperandUse(0); - assert(PUs[i].U->get() == LoadUse->get()); - PUs[i].U = LoadUse; + assert(PUs[i].getUse()->get() == LoadUse->get()); + PUs[i].setUse(LoadUse); P.use_push_back(PIs[i], PUs[i]); } } @@ -1641,9 +1715,8 @@ private: /// /// This will return the BasePtr if that is valid, or build a new GEP /// instruction using the IRBuilder if GEP-ing is needed. -static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, - SmallVectorImpl<Value *> &Indices, - const Twine &Prefix) { +static Value *buildGEP(IRBuilderTy &IRB, Value *BasePtr, + SmallVectorImpl<Value *> &Indices) { if (Indices.empty()) return BasePtr; @@ -1652,7 +1725,7 @@ static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, if (Indices.size() == 1 && cast<ConstantInt>(Indices.back())->isZero()) return BasePtr; - return IRB.CreateInBoundsGEP(BasePtr, Indices, Prefix + ".idx"); + return IRB.CreateInBoundsGEP(BasePtr, Indices, "idx"); } /// \brief Get a natural GEP off of the BasePtr walking through Ty toward @@ -1664,12 +1737,11 @@ static Value *buildGEP(IRBuilder<> &IRB, Value *BasePtr, /// TargetTy. If we can't find one with the same type, we at least try to use /// one with the same size. If none of that works, we just produce the GEP as /// indicated by Indices to have the correct offset. 
-static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithType(IRBuilderTy &IRB, const DataLayout &TD, Value *BasePtr, Type *Ty, Type *TargetTy, - SmallVectorImpl<Value *> &Indices, - const Twine &Prefix) { + SmallVectorImpl<Value *> &Indices) { if (Ty == TargetTy) - return buildGEP(IRB, BasePtr, Indices, Prefix); + return buildGEP(IRB, BasePtr, Indices); // See if we can descend into a struct and locate a field with the correct // type. @@ -1696,20 +1768,19 @@ static Value *getNaturalGEPWithType(IRBuilder<> &IRB, const DataLayout &TD, if (ElementTy != TargetTy) Indices.erase(Indices.end() - NumLayers, Indices.end()); - return buildGEP(IRB, BasePtr, Indices, Prefix); + return buildGEP(IRB, BasePtr, Indices); } /// \brief Recursively compute indices for a natural GEP. /// /// This is the recursive step for getNaturalGEPWithOffset that walks down the /// element types adding appropriate indices for the GEP. -static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, +static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &TD, Value *Ptr, Type *Ty, APInt &Offset, Type *TargetTy, - SmallVectorImpl<Value *> &Indices, - const Twine &Prefix) { + SmallVectorImpl<Value *> &Indices) { if (Offset == 0) - return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices, Prefix); + return getNaturalGEPWithType(IRB, TD, Ptr, Ty, TargetTy, Indices); // We can't recurse through pointer types. if (Ty->isPointerTy()) @@ -1729,7 +1800,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, TD, Ptr, VecTy->getElementType(), - Offset, TargetTy, Indices, Prefix); + Offset, TargetTy, Indices); } if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) { @@ -1742,7 +1813,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, - Indices, Prefix); + Indices); } StructType *STy = dyn_cast<StructType>(Ty); @@ -1761,7 +1832,7 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, Indices.push_back(IRB.getInt32(Index)); return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, - Indices, Prefix); + Indices); } /// \brief Get a natural GEP from a base pointer to a particular offset and @@ -1774,10 +1845,9 @@ static Value *getNaturalGEPRecursively(IRBuilder<> &IRB, const DataLayout &TD, /// Indices, and setting Ty to the result subtype. /// /// If no natural GEP can be constructed, this function returns null. 
-static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, +static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &TD, Value *Ptr, APInt Offset, Type *TargetTy, - SmallVectorImpl<Value *> &Indices, - const Twine &Prefix) { + SmallVectorImpl<Value *> &Indices) { PointerType *Ty = cast<PointerType>(Ptr->getType()); // Don't consider any GEPs through an i8* as natural unless the TargetTy is @@ -1796,7 +1866,7 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, Offset -= NumSkippedElements * ElementSize; Indices.push_back(IRB.getInt(NumSkippedElements)); return getNaturalGEPRecursively(IRB, TD, Ptr, ElementTy, Offset, TargetTy, - Indices, Prefix); + Indices); } /// \brief Compute an adjusted pointer from Ptr by Offset bytes where the @@ -1814,9 +1884,8 @@ static Value *getNaturalGEPWithOffset(IRBuilder<> &IRB, const DataLayout &TD, /// properties. The algorithm tries to fold as many constant indices into /// a single GEP as possible, thus making each GEP more independent of the /// surrounding code. -static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, - Value *Ptr, APInt Offset, Type *PointerTy, - const Twine &Prefix) { +static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &TD, + Value *Ptr, APInt Offset, Type *PointerTy) { // Even though we don't look through PHI nodes, we could be called on an // instruction in an unreachable block, which may be on a cycle. SmallPtrSet<Value *, 4> Visited; @@ -1850,7 +1919,7 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, // See if we can perform a natural GEP here. Indices.clear(); if (Value *P = getNaturalGEPWithOffset(IRB, TD, Ptr, Offset, TargetTy, - Indices, Prefix)) { + Indices)) { if (P->getType() == PointerTy) { // Zap any offset pointer that we ended up computing in previous rounds. if (OffsetPtr && OffsetPtr->use_empty()) @@ -1885,19 +1954,19 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, if (!OffsetPtr) { if (!Int8Ptr) { Int8Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy(), - Prefix + ".raw_cast"); + "raw_cast"); Int8PtrOffset = Offset; } OffsetPtr = Int8PtrOffset == 0 ? Int8Ptr : IRB.CreateInBoundsGEP(Int8Ptr, IRB.getInt(Int8PtrOffset), - Prefix + ".raw_idx"); + "raw_idx"); } Ptr = OffsetPtr; // On the off chance we were targeting i8*, guard the bitcast here. if (Ptr->getType() != PointerTy) - Ptr = IRB.CreateBitCast(Ptr, PointerTy, Prefix + ".cast"); + Ptr = IRB.CreateBitCast(Ptr, PointerTy, "cast"); return Ptr; } @@ -1911,6 +1980,10 @@ static Value *getAdjustedPtr(IRBuilder<> &IRB, const DataLayout &TD, static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { if (OldTy == NewTy) return true; + if (IntegerType *OldITy = dyn_cast<IntegerType>(OldTy)) + if (IntegerType *NewITy = dyn_cast<IntegerType>(NewTy)) + if (NewITy->getBitWidth() >= OldITy->getBitWidth()) + return true; if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy)) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) @@ -1933,12 +2006,16 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { /// This will try various different casting techniques, such as bitcasts, /// inttoptr, and ptrtoint casts. Use the \c canConvertValue predicate to test /// two types for viability with this routine. 
-static Value *convertValue(const DataLayout &DL, IRBuilder<> &IRB, Value *V, +static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, Type *Ty) { assert(canConvertValue(DL, V->getType(), Ty) && "Value not convertable to type"); if (V->getType() == Ty) return V; + if (IntegerType *OldITy = dyn_cast<IntegerType>(V->getType())) + if (IntegerType *NewITy = dyn_cast<IntegerType>(Ty)) + if (NewITy->getBitWidth() > OldITy->getBitWidth()) + return IRB.CreateZExt(V, NewITy); if (V->getType()->isIntegerTy() && Ty->isPointerTy()) return IRB.CreateIntToPtr(V, Ty); if (V->getType()->isPointerTy() && Ty->isIntegerTy()) @@ -1977,7 +2054,8 @@ static bool isVectorPromotionViable(const DataLayout &TD, ElementSize /= 8; for (; I != E; ++I) { - if (!I->U) + Use *U = I->getUse(); + if (!U) continue; // Skip dead use. uint64_t BeginOffset = I->BeginOffset - PartitionBeginOffset; @@ -1997,24 +2075,24 @@ static bool isVectorPromotionViable(const DataLayout &TD, = (NumElements == 1) ? Ty->getElementType() : VectorType::get(Ty->getElementType(), NumElements); - if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { + if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { if (MI->isVolatile()) return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { const AllocaPartitioning::MemTransferOffsets &MTO = P.getMemTransferOffsets(*MTI); if (!MTO.IsSplittable) return false; } - } else if (I->U->get()->getType()->getPointerElementType()->isStructTy()) { + } else if (U->get()->getType()->getPointerElementType()->isStructTy()) { // Disable vector promotion when there are loads or stores of an FCA. return false; - } else if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { + } else if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; if (!canConvertValue(TD, PartitionTy, LI->getType())) return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { if (SI->isVolatile()) return false; if (!canConvertValue(TD, SI->getValueOperand()->getType(), PartitionTy)) @@ -2063,7 +2141,8 @@ static bool isIntegerWideningViable(const DataLayout &TD, // unsplittable entry (which we may make splittable later). bool WholeAllocaOp = false; for (; I != E; ++I) { - if (!I->U) + Use *U = I->getUse(); + if (!U) continue; // Skip dead use. 
uint64_t RelBegin = I->BeginOffset - AllocBeginOffset; @@ -2074,7 +2153,7 @@ static bool isIntegerWideningViable(const DataLayout &TD, if (RelEnd > Size) return false; - if (LoadInst *LI = dyn_cast<LoadInst>(I->U->getUser())) { + if (LoadInst *LI = dyn_cast<LoadInst>(U->getUser())) { if (LI->isVolatile()) return false; if (RelBegin == 0 && RelEnd == Size) @@ -2089,7 +2168,7 @@ static bool isIntegerWideningViable(const DataLayout &TD, if (RelBegin != 0 || RelEnd != Size || !canConvertValue(TD, AllocaTy, LI->getType())) return false; - } else if (StoreInst *SI = dyn_cast<StoreInst>(I->U->getUser())) { + } else if (StoreInst *SI = dyn_cast<StoreInst>(U->getUser())) { Type *ValueTy = SI->getValueOperand()->getType(); if (SI->isVolatile()) return false; @@ -2105,16 +2184,16 @@ static bool isIntegerWideningViable(const DataLayout &TD, if (RelBegin != 0 || RelEnd != Size || !canConvertValue(TD, ValueTy, AllocaTy)) return false; - } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I->U->getUser())) { + } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(U->getUser())) { if (MI->isVolatile() || !isa<Constant>(MI->getLength())) return false; - if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(I->U->getUser())) { + if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(U->getUser())) { const AllocaPartitioning::MemTransferOffsets &MTO = P.getMemTransferOffsets(*MTI); if (!MTO.IsSplittable) return false; } - } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->U->getUser())) { + } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(U->getUser())) { if (II->getIntrinsicID() != Intrinsic::lifetime_start && II->getIntrinsicID() != Intrinsic::lifetime_end) return false; @@ -2125,7 +2204,7 @@ static bool isIntegerWideningViable(const DataLayout &TD, return WholeAllocaOp; } -static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, +static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V, IntegerType *Ty, uint64_t Offset, const Twine &Name) { DEBUG(dbgs() << " start: " << *V << "\n"); @@ -2148,7 +2227,7 @@ static Value *extractInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *V, return V; } -static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, +static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old, Value *V, uint64_t Offset, const Twine &Name) { IntegerType *IntTy = cast<IntegerType>(Old->getType()); IntegerType *Ty = cast<IntegerType>(V->getType()); @@ -2179,7 +2258,7 @@ static Value *insertInteger(const DataLayout &DL, IRBuilder<> &IRB, Value *Old, return V; } -static Value *extractVector(IRBuilder<> &IRB, Value *V, +static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex, unsigned EndIndex, const Twine &Name) { VectorType *VecTy = cast<VectorType>(V->getType()); @@ -2207,7 +2286,7 @@ static Value *extractVector(IRBuilder<> &IRB, Value *V, return V; } -static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V, +static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V, unsigned BeginIndex, const Twine &Name) { VectorType *VecTy = cast<VectorType>(Old->getType()); assert(VecTy && "Can only insert a vector into a vector"); @@ -2243,17 +2322,15 @@ static Value *insertVector(IRBuilder<> &IRB, Value *Old, Value *V, V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), ConstantVector::get(Mask), Name + ".expand"); - DEBUG(dbgs() << " shuffle1: " << *V << "\n"); + DEBUG(dbgs() << " shuffle: " << *V << "\n"); Mask.clear(); for (unsigned i = 0; i != 
VecTy->getNumElements(); ++i) - if (i >= BeginIndex && i < EndIndex) - Mask.push_back(IRB.getInt32(i)); - else - Mask.push_back(IRB.getInt32(i + VecTy->getNumElements())); - V = IRB.CreateShuffleVector(V, Old, ConstantVector::get(Mask), - Name + "insert"); - DEBUG(dbgs() << " shuffle2: " << *V << "\n"); + Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex)); + + V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend"); + + DEBUG(dbgs() << " blend: " << *V << "\n"); return V; } @@ -2297,11 +2374,13 @@ class AllocaPartitionRewriter : public InstVisitor<AllocaPartitionRewriter, // The offset of the partition user currently being rewritten. uint64_t BeginOffset, EndOffset; + bool IsSplit; Use *OldUse; Instruction *OldPtr; - // The name prefix to use when rewriting instructions for this alloca. - std::string NamePrefix; + // Utility IR builder, whose name prefix is setup for each visited use, and + // the insertion point is set to point to the user. + IRBuilderTy IRB; public: AllocaPartitionRewriter(const DataLayout &TD, AllocaPartitioning &P, @@ -2314,7 +2393,8 @@ public: NewAllocaEndOffset(NewEndOffset), NewAllocaTy(NewAI.getAllocatedType()), VecTy(), ElementTy(), ElementSize(), IntTy(), - BeginOffset(), EndOffset() { + BeginOffset(), EndOffset(), IsSplit(), OldUse(), OldPtr(), + IRB(NewAI.getContext(), ConstantFolder()) { } /// \brief Visit the users of the alloca partition and rewrite them. @@ -2336,14 +2416,21 @@ public: } bool CanSROA = true; for (; I != E; ++I) { - if (!I->U) + if (!I->getUse()) continue; // Skip dead uses. BeginOffset = I->BeginOffset; EndOffset = I->EndOffset; - OldUse = I->U; - OldPtr = cast<Instruction>(I->U->get()); - NamePrefix = (Twine(NewAI.getName()) + "." + Twine(BeginOffset)).str(); - CanSROA &= visit(cast<Instruction>(I->U->getUser())); + IsSplit = I->isSplit(); + OldUse = I->getUse(); + OldPtr = cast<Instruction>(OldUse->get()); + + Instruction *OldUserI = cast<Instruction>(OldUse->getUser()); + IRB.SetInsertPoint(OldUserI); + IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc()); + IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + + "."); + + CanSROA &= visit(cast<Instruction>(OldUse->getUser())); } if (VecTy) { assert(CanSROA); @@ -2365,14 +2452,10 @@ private: llvm_unreachable("No rewrite rule for this instruction!"); } - Twine getName(const Twine &Suffix) { - return NamePrefix + Suffix; - } - - Value *getAdjustedAllocaPtr(IRBuilder<> &IRB, Type *PointerTy) { + Value *getAdjustedAllocaPtr(IRBuilderTy &IRB, Type *PointerTy) { assert(BeginOffset >= NewAllocaBeginOffset); APInt Offset(TD.getPointerSizeInBits(), BeginOffset - NewAllocaBeginOffset); - return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy, getName("")); + return getAdjustedPtr(IRB, TD, &NewAI, Offset, PointerTy); } /// \brief Compute suitable alignment to access an offset into the new alloca. 
@@ -2422,27 +2505,27 @@ private: Pass.DeadInsts.insert(I); } - Value *rewriteVectorizedLoadInst(IRBuilder<> &IRB) { + Value *rewriteVectorizedLoadInst() { unsigned BeginIndex = getIndex(BeginOffset); unsigned EndIndex = getIndex(EndOffset); assert(EndIndex > BeginIndex && "Empty vector!"); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); - return extractVector(IRB, V, BeginIndex, EndIndex, getName(".vec")); + "load"); + return extractVector(IRB, V, BeginIndex, EndIndex, "vec"); } - Value *rewriteIntegerLoad(IRBuilder<> &IRB, LoadInst &LI) { + Value *rewriteIntegerLoad(LoadInst &LI) { assert(IntTy && "We cannot insert an integer to the alloca"); assert(!LI.isVolatile()); Value *V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); + "load"); V = convertValue(TD, IRB, V, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; if (Offset > 0 || EndOffset < NewAllocaEndOffset) V = extractInteger(TD, IRB, V, cast<IntegerType>(LI.getType()), Offset, - getName(".extract")); + "extract"); return V; } @@ -2450,58 +2533,39 @@ private: DEBUG(dbgs() << " original: " << LI << "\n"); Value *OldOp = LI.getOperand(0); assert(OldOp == OldPtr); - IRBuilder<> IRB(&LI); uint64_t Size = EndOffset - BeginOffset; - bool IsSplitIntLoad = Size < TD.getTypeStoreSize(LI.getType()); - - // If this memory access can be shown to *statically* extend outside the - // bounds of the original allocation it's behavior is undefined. Rather - // than trying to transform it, just replace it with undef. - // FIXME: We should do something more clever for functions being - // instrumented by asan. - // FIXME: Eventually, once ASan and friends can flush out bugs here, this - // should be transformed to a load of null making it unreachable. - uint64_t OldAllocSize = TD.getTypeAllocSize(OldAI.getAllocatedType()); - if (TD.getTypeStoreSize(LI.getType()) > OldAllocSize) { - LI.replaceAllUsesWith(UndefValue::get(LI.getType())); - Pass.DeadInsts.insert(&LI); - deleteIfTriviallyDead(OldOp); - DEBUG(dbgs() << " to: undef!!\n"); - return true; - } - Type *TargetTy = IsSplitIntLoad ? Type::getIntNTy(LI.getContext(), Size * 8) - : LI.getType(); + Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), Size * 8) + : LI.getType(); bool IsPtrAdjusted = false; Value *V; if (VecTy) { - V = rewriteVectorizedLoadInst(IRB); + V = rewriteVectorizedLoadInst(); } else if (IntTy && LI.getType()->isIntegerTy()) { - V = rewriteIntegerLoad(IRB, LI); + V = rewriteIntegerLoad(LI); } else if (BeginOffset == NewAllocaBeginOffset && canConvertValue(TD, NewAllocaTy, LI.getType())) { V = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - LI.isVolatile(), getName(".load")); + LI.isVolatile(), "load"); } else { Type *LTy = TargetTy->getPointerTo(); V = IRB.CreateAlignedLoad(getAdjustedAllocaPtr(IRB, LTy), getPartitionTypeAlign(TargetTy), - LI.isVolatile(), getName(".load")); + LI.isVolatile(), "load"); IsPtrAdjusted = true; } V = convertValue(TD, IRB, V, TargetTy); - if (IsSplitIntLoad) { + if (IsSplit) { assert(!LI.isVolatile()); assert(LI.getType()->isIntegerTy() && "Only integer type loads and stores are split"); + assert(Size < TD.getTypeStoreSize(LI.getType()) && + "Split load isn't smaller than original load"); assert(LI.getType()->getIntegerBitWidth() == TD.getTypeStoreSizeInBits(LI.getType()) && "Non-byte-multiple bit width"); - assert(LI.getType()->getIntegerBitWidth() == - TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && - "Only alloca-wide loads can be split and recomposed"); // Move the insertion point just past the load so that we can refer to it. IRB.SetInsertPoint(llvm::next(BasicBlock::iterator(&LI))); // Create a placeholder value with the same type as LI to use as the @@ -2511,7 +2575,7 @@ private: Value *Placeholder = new LoadInst(UndefValue::get(LI.getType()->getPointerTo())); V = insertInteger(TD, IRB, Placeholder, V, BeginOffset, - getName(".insert")); + "insert"); LI.replaceAllUsesWith(V); Placeholder->replaceAllUsesWith(&LI); delete Placeholder; @@ -2525,7 +2589,7 @@ private: return !LI.isVolatile() && !IsPtrAdjusted; } - bool rewriteVectorizedStoreInst(IRBuilder<> &IRB, Value *V, + bool rewriteVectorizedStoreInst(Value *V, StoreInst &SI, Value *OldOp) { unsigned BeginIndex = getIndex(BeginOffset); unsigned EndIndex = getIndex(EndOffset); @@ -2540,8 +2604,8 @@ private: // Mix in the existing elements. 
Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); - V = insertVector(IRB, Old, V, BeginIndex, getName(".vec")); + "load"); + V = insertVector(IRB, Old, V, BeginIndex, "vec"); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); Pass.DeadInsts.insert(&SI); @@ -2551,17 +2615,17 @@ private: return true; } - bool rewriteIntegerStore(IRBuilder<> &IRB, Value *V, StoreInst &SI) { + bool rewriteIntegerStore(Value *V, StoreInst &SI) { assert(IntTy && "We cannot extract an integer from the alloca"); assert(!SI.isVolatile()); if (TD.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); + "oldload"); Old = convertValue(TD, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; V = insertInteger(TD, IRB, Old, SI.getValueOperand(), Offset, - getName(".insert")); + "insert"); } V = convertValue(TD, IRB, V, NewAllocaTy); StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment()); @@ -2575,7 +2639,6 @@ private: DEBUG(dbgs() << " original: " << SI << "\n"); Value *OldOp = SI.getOperand(1); assert(OldOp == OldPtr); - IRBuilder<> IRB(&SI); Value *V = SI.getValueOperand(); @@ -2588,26 +2651,25 @@ private: uint64_t Size = EndOffset - BeginOffset; if (Size < TD.getTypeStoreSize(V->getType())) { assert(!SI.isVolatile()); + assert(IsSplit && "A seemingly split store isn't splittable"); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); assert(V->getType()->getIntegerBitWidth() == TD.getTypeStoreSizeInBits(V->getType()) && "Non-byte-multiple bit width"); - assert(V->getType()->getIntegerBitWidth() == - TD.getTypeAllocSizeInBits(OldAI.getAllocatedType()) && - "Only alloca-wide stores can be split and recomposed"); IntegerType *NarrowTy = Type::getIntNTy(SI.getContext(), Size * 8); V = extractInteger(TD, IRB, V, NarrowTy, BeginOffset, - getName(".extract")); + "extract"); } if (VecTy) - return rewriteVectorizedStoreInst(IRB, V, SI, OldOp); + return rewriteVectorizedStoreInst(V, SI, OldOp); if (IntTy && V->getType()->isIntegerTy()) - return rewriteIntegerStore(IRB, V, SI); + return rewriteIntegerStore(V, SI); StoreInst *NewSI; if (BeginOffset == NewAllocaBeginOffset && + EndOffset == NewAllocaEndOffset && canConvertValue(TD, V->getType(), NewAllocaTy)) { V = convertValue(TD, IRB, V, NewAllocaTy); NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(), @@ -2635,7 +2697,7 @@ private: /// /// \param V The i8 value to splat. /// \param Size The number of bytes in the output (assuming i8 is one byte) - Value *getIntegerSplat(IRBuilder<> &IRB, Value *V, unsigned Size) { + Value *getIntegerSplat(Value *V, unsigned Size) { assert(Size > 0 && "Expected a positive number of bytes."); IntegerType *VTy = cast<IntegerType>(V->getType()); assert(VTy->getBitWidth() == 8 && "Expected an i8 value for the byte"); @@ -2643,26 +2705,25 @@ private: return V; Type *SplatIntTy = Type::getIntNTy(VTy->getContext(), Size*8); - V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, getName(".zext")), + V = IRB.CreateMul(IRB.CreateZExt(V, SplatIntTy, "zext"), ConstantExpr::getUDiv( Constant::getAllOnesValue(SplatIntTy), ConstantExpr::getZExt( Constant::getAllOnesValue(V->getType()), SplatIntTy)), - getName(".isplat")); + "isplat"); return V; } /// \brief Compute a vector splat for a given element value. 
- Value *getVectorSplat(IRBuilder<> &IRB, Value *V, unsigned NumElements) { - V = IRB.CreateVectorSplat(NumElements, V, NamePrefix); + Value *getVectorSplat(Value *V, unsigned NumElements) { + V = IRB.CreateVectorSplat(NumElements, V, "vsplat"); DEBUG(dbgs() << " splat: " << *V << "\n"); return V; } bool visitMemSetInst(MemSetInst &II) { DEBUG(dbgs() << " original: " << II << "\n"); - IRBuilder<> IRB(&II); assert(II.getRawDest() == OldPtr); // If the memset has a variable size, it cannot be split, just adjust the @@ -2719,31 +2780,31 @@ private: unsigned NumElements = EndIndex - BeginIndex; assert(NumElements <= VecTy->getNumElements() && "Too many elements!"); - Value *Splat = getIntegerSplat(IRB, II.getValue(), - TD.getTypeSizeInBits(ElementTy)/8); + Value *Splat = + getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ElementTy) / 8); Splat = convertValue(TD, IRB, Splat, ElementTy); if (NumElements > 1) - Splat = getVectorSplat(IRB, Splat, NumElements); + Splat = getVectorSplat(Splat, NumElements); Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); - V = insertVector(IRB, Old, Splat, BeginIndex, getName(".vec")); + "oldload"); + V = insertVector(IRB, Old, Splat, BeginIndex, "vec"); } else if (IntTy) { // If this is a memset on an alloca where we can widen stores, insert the // set integer. assert(!II.isVolatile()); uint64_t Size = EndOffset - BeginOffset; - V = getIntegerSplat(IRB, II.getValue(), Size); + V = getIntegerSplat(II.getValue(), Size); if (IntTy && (BeginOffset != NewAllocaBeginOffset || EndOffset != NewAllocaBeginOffset)) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); + "oldload"); Old = convertValue(TD, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - V = insertInteger(TD, IRB, Old, V, Offset, getName(".insert")); + V = insertInteger(TD, IRB, Old, V, Offset, "insert"); } else { assert(V->getType() == IntTy && "Wrong type for an alloca wide integer!"); @@ -2754,10 +2815,9 @@ private: assert(BeginOffset == NewAllocaBeginOffset); assert(EndOffset == NewAllocaEndOffset); - V = getIntegerSplat(IRB, II.getValue(), - TD.getTypeSizeInBits(ScalarTy)/8); + V = getIntegerSplat(II.getValue(), TD.getTypeSizeInBits(ScalarTy) / 8); if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy)) - V = getVectorSplat(IRB, V, AllocaVecTy->getNumElements()); + V = getVectorSplat(V, AllocaVecTy->getNumElements()); V = convertValue(TD, IRB, V, AllocaTy); } @@ -2774,7 +2834,6 @@ private: // them into two categories: split intrinsics and unsplit intrinsics. DEBUG(dbgs() << " original: " << II << "\n"); - IRBuilder<> IRB(&II); assert(II.getRawSource() == OldPtr || II.getRawDest() == OldPtr); bool IsDest = II.getRawDest() == OldPtr; @@ -2858,8 +2917,7 @@ private: // Compute the other pointer, folding as much as possible to produce // a single, simple GEP in most cases. - OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, - getName("." + OtherPtr->getName())); + OtherPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); Value *OurPtr = getAdjustedAllocaPtr(IRB, IsDest ? II.getRawDest()->getType() @@ -2902,8 +2960,7 @@ private: OtherPtrTy = SubIntTy->getPointerTo(); } - Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy, - getName("." 
+ OtherPtr->getName())); + Value *SrcPtr = getAdjustedPtr(IRB, TD, OtherPtr, RelOffset, OtherPtrTy); Value *DstPtr = &NewAI; if (!IsDest) std::swap(SrcPtr, DstPtr); @@ -2911,31 +2968,31 @@ private: Value *Src; if (VecTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); - Src = extractVector(IRB, Src, BeginIndex, EndIndex, getName(".vec")); + "load"); + Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec"); } else if (IntTy && !IsWholeAlloca && !IsDest) { Src = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".load")); + "load"); Src = convertValue(TD, IRB, Src, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, getName(".extract")); + Src = extractInteger(TD, IRB, Src, SubIntTy, Offset, "extract"); } else { Src = IRB.CreateAlignedLoad(SrcPtr, Align, II.isVolatile(), - getName(".copyload")); + "copyload"); } if (VecTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); - Src = insertVector(IRB, Old, Src, BeginIndex, getName(".vec")); + "oldload"); + Src = insertVector(IRB, Old, Src, BeginIndex, "vec"); } else if (IntTy && !IsWholeAlloca && IsDest) { Value *Old = IRB.CreateAlignedLoad(&NewAI, NewAI.getAlignment(), - getName(".oldload")); + "oldload"); Old = convertValue(TD, IRB, Old, IntTy); assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset"); uint64_t Offset = BeginOffset - NewAllocaBeginOffset; - Src = insertInteger(TD, IRB, Old, Src, Offset, getName(".insert")); + Src = insertInteger(TD, IRB, Old, Src, Offset, "insert"); Src = convertValue(TD, IRB, Src, NewAllocaTy); } @@ -2950,7 +3007,6 @@ private: assert(II.getIntrinsicID() == Intrinsic::lifetime_start || II.getIntrinsicID() == Intrinsic::lifetime_end); DEBUG(dbgs() << " original: " << II << "\n"); - IRBuilder<> IRB(&II); assert(II.getArgOperand(1) == OldPtr); // Record this instruction for deletion. @@ -2978,7 +3034,9 @@ private: // as local as possible to the PHI. To do that, we re-use the location of // the old pointer, which necessarily must be in the right position to // dominate the PHI. - IRBuilder<> PtrBuilder(cast<Instruction>(OldPtr)); + IRBuilderTy PtrBuilder(cast<Instruction>(OldPtr)); + PtrBuilder.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + + "."); Value *NewPtr = getAdjustedAllocaPtr(PtrBuilder, OldPtr->getType()); // Replace the operands which were using the old pointer. @@ -2991,17 +3049,16 @@ private: bool visitSelectInst(SelectInst &SI) { DEBUG(dbgs() << " original: " << SI << "\n"); - IRBuilder<> IRB(&SI); - - // Find the operand we need to rewrite here. - bool IsTrueVal = SI.getTrueValue() == OldPtr; - if (IsTrueVal) - assert(SI.getFalseValue() != OldPtr && "Pointer is both operands!"); - else - assert(SI.getFalseValue() == OldPtr && "Pointer isn't an operand!"); + assert((SI.getTrueValue() == OldPtr || SI.getFalseValue() == OldPtr) && + "Pointer isn't an operand!"); Value *NewPtr = getAdjustedAllocaPtr(IRB, OldPtr->getType()); - SI.setOperand(IsTrueVal ? 1 : 2, NewPtr); + // Replace the operands which were using the old pointer. 
+ if (SI.getOperand(1) == OldPtr) + SI.setOperand(1, NewPtr); + if (SI.getOperand(2) == OldPtr) + SI.setOperand(2, NewPtr); + DEBUG(dbgs() << " to: " << SI << "\n"); deleteIfTriviallyDead(OldPtr); return false; @@ -3066,7 +3123,7 @@ private: class OpSplitter { protected: /// The builder used to form new instructions. - IRBuilder<> IRB; + IRBuilderTy IRB; /// The indices which to be used with insert- or extractvalue to select the /// appropriate value within the aggregate. SmallVector<unsigned, 4> Indices; @@ -3278,12 +3335,13 @@ static Type *getTypePartition(const DataLayout &TD, Type *Ty, Type *ElementTy = SeqTy->getElementType(); uint64_t ElementSize = TD.getTypeAllocSize(ElementTy); uint64_t NumSkippedElements = Offset / ElementSize; - if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) + if (ArrayType *ArrTy = dyn_cast<ArrayType>(SeqTy)) { if (NumSkippedElements >= ArrTy->getNumElements()) return 0; - if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) + } else if (VectorType *VecTy = dyn_cast<VectorType>(SeqTy)) { if (NumSkippedElements >= VecTy->getNumElements()) return 0; + } Offset -= NumSkippedElements * ElementSize; // First check if we need to recurse. @@ -3381,7 +3439,7 @@ bool SROA::rewriteAllocaPartition(AllocaInst &AI, for (AllocaPartitioning::use_iterator UI = P.use_begin(PI), UE = P.use_end(PI); UI != UE && !IsLive; ++UI) - if (UI->U) + if (UI->getUse()) IsLive = true; if (!IsLive) return false; // No live uses left of this partition. diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index e590a374ea..bfde334c36 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -1462,8 +1462,8 @@ bool SROA::ShouldAttemptScalarRepl(AllocaInst *AI) { } // performScalarRepl - This algorithm is a simple worklist driven algorithm, -// which runs on all of the alloca instructions in the function, removing them -// if they are only used by getelementptr instructions. +// which runs on all of the alloca instructions in the entry block, removing +// them if they are only used by getelementptr instructions. // bool SROA::performScalarRepl(Function &F) { std::vector<AllocaInst*> WorkList; @@ -1724,17 +1724,8 @@ void SROA::isSafeGEP(GetElementPtrInst *GEPI, continue; ConstantInt *IdxVal = dyn_cast<ConstantInt>(GEPIt.getOperand()); - if (!IdxVal) { - // Non constant GEPs are only a problem on arrays, structs, and pointers - // Vectors can be dynamically indexed. - // FIXME: Add support for dynamic indexing on arrays. This should be - // ok on any subarrays of the alloca array, eg, a[0][i] is ok, but a[i][0] - // isn't. - if (!(*GEPIt)->isVectorTy()) - return MarkUnsafe(Info, GEPI); - NonConstant = true; - NonConstantIdxSize = TD->getTypeAllocSize(*GEPIt); - } + if (!IdxVal) + return MarkUnsafe(Info, GEPI); } // Compute the offset due to this GEP and check if the alloca has a diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 916b37d4a8..3514e6c2aa 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -19,7 +19,6 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/Config/config.h" // FIXME: Shouldn't depend on host! 
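The SROA hunks above rewrite a memset over a promoted alloca into an integer splat (getIntegerSplat) that is widened into a vector splat when the rewritten slice covers several elements. A minimal standalone sketch of the scalar value such a splat carries, assuming a plain helper rather than the pass's actual APInt-based code:

#include <cstdint>

// Hypothetical helper, not LLVM's getIntegerSplat: replicate the memset byte into
// every byte lane of an element that is `Bytes` wide. This is the scalar value the
// rewritten store splats across the alloca's vector elements.
static uint64_t splatByte(uint8_t Byte, unsigned Bytes) {
  uint64_t V = 0;
  for (unsigned I = 0; I != Bytes; ++I)
    V = (V << 8) | Byte;
  return V;
}
// splatByte(0xAB, 4) == 0xABABABAB: the value memset(ptr, 0xAB, ...) leaves in a
// 4-byte element, and the value getVectorSplat then broadcasts per element.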
@@ -35,7 +34,6 @@ #include "llvm/Transforms/Utils/BuildLibCalls.h" using namespace llvm; -STATISTIC(NumAnnotated, "Number of attributes added to library functions"); //===----------------------------------------------------------------------===// // Optimizer Base Class @@ -91,8 +89,6 @@ namespace { TargetLibraryInfo *TLI; StringMap<LibCallOptimization*> Optimizations; - - bool Modified; // This is only used by doInitialization. public: static char ID; // Pass identification SimplifyLibCalls() : FunctionPass(ID) { @@ -104,14 +100,6 @@ namespace { void InitOptimizations(); bool runOnFunction(Function &F); - void setDoesNotAccessMemory(Function &F); - void setOnlyReadsMemory(Function &F); - void setDoesNotThrow(Function &F); - void setDoesNotCapture(Function &F, unsigned n); - void setDoesNotAlias(Function &F, unsigned n); - bool doInitialization(Module &M); - - void inferPrototypeAttributes(Function &F); virtual void getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired<TargetLibraryInfo>(); } @@ -208,697 +196,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) { return Changed; } -// Utility methods for doInitialization. - -void SimplifyLibCalls::setDoesNotAccessMemory(Function &F) { - if (!F.doesNotAccessMemory()) { - F.setDoesNotAccessMemory(); - ++NumAnnotated; - Modified = true; - } -} -void SimplifyLibCalls::setOnlyReadsMemory(Function &F) { - if (!F.onlyReadsMemory()) { - F.setOnlyReadsMemory(); - ++NumAnnotated; - Modified = true; - } -} -void SimplifyLibCalls::setDoesNotThrow(Function &F) { - if (!F.doesNotThrow()) { - F.setDoesNotThrow(); - ++NumAnnotated; - Modified = true; - } -} -void SimplifyLibCalls::setDoesNotCapture(Function &F, unsigned n) { - if (!F.doesNotCapture(n)) { - F.setDoesNotCapture(n); - ++NumAnnotated; - Modified = true; - } -} -void SimplifyLibCalls::setDoesNotAlias(Function &F, unsigned n) { - if (!F.doesNotAlias(n)) { - F.setDoesNotAlias(n); - ++NumAnnotated; - Modified = true; - } -} - - -void SimplifyLibCalls::inferPrototypeAttributes(Function &F) { - FunctionType *FTy = F.getFunctionType(); - - StringRef Name = F.getName(); - switch (Name[0]) { - case 's': - if (Name == "strlen") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "strchr" || - Name == "strrchr") { - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isIntegerTy()) - return; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - } else if (Name == "strcpy" || - Name == "stpcpy" || - Name == "strcat" || - Name == "strtol" || - Name == "strtod" || - Name == "strtof" || - Name == "strtoul" || - Name == "strtoll" || - Name == "strtold" || - Name == "strncat" || - Name == "strncpy" || - Name == "stpncpy" || - Name == "strtoull") { - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "strxfrm") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "strcmp" || - Name == "strspn" || - Name == "strncmp" || - Name == "strcspn" || - Name == "strcoll" || - Name == "strcasecmp" || - Name == "strncasecmp") { - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setOnlyReadsMemory(F); - 
setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "strstr" || - Name == "strpbrk") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "strtok" || - Name == "strtok_r") { - if (FTy->getNumParams() < 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "scanf" || - Name == "setbuf" || - Name == "setvbuf") { - if (FTy->getNumParams() < 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "strdup" || - Name == "strndup") { - if (FTy->getNumParams() < 1 || !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - } else if (Name == "stat" || - Name == "sscanf" || - Name == "sprintf" || - Name == "statvfs") { - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "snprintf") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - } else if (Name == "setitimer") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - setDoesNotCapture(F, 3); - } else if (Name == "system") { - if (FTy->getNumParams() != 1 || - !FTy->getParamType(0)->isPointerTy()) - return; - // May throw; "system" is a valid pthread cancellation point. - setDoesNotCapture(F, 1); - } - break; - case 'm': - if (Name == "malloc") { - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - } else if (Name == "memcmp") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "memchr" || - Name == "memrchr") { - if (FTy->getNumParams() != 3) - return; - setOnlyReadsMemory(F); - setDoesNotThrow(F); - } else if (Name == "modf" || - Name == "modff" || - Name == "modfl" || - Name == "memcpy" || - Name == "memccpy" || - Name == "memmove") { - if (FTy->getNumParams() < 2 || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "memalign") { - if (!FTy->getReturnType()->isPointerTy()) - return; - setDoesNotAlias(F, 0); - } else if (Name == "mkdir" || - Name == "mktime") { - if (FTy->getNumParams() == 0 || - !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'r': - if (Name == "realloc") { - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getReturnType()->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - } else if (Name == "read") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return; - // May throw; "read" is a valid pthread cancellation point. 
- setDoesNotCapture(F, 2); - } else if (Name == "rmdir" || - Name == "rewind" || - Name == "remove" || - Name == "realpath") { - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "rename" || - Name == "readlink") { - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } - break; - case 'w': - if (Name == "write") { - if (FTy->getNumParams() != 3 || !FTy->getParamType(1)->isPointerTy()) - return; - // May throw; "write" is a valid pthread cancellation point. - setDoesNotCapture(F, 2); - } - break; - case 'b': - if (Name == "bcopy") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "bcmp") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "bzero") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'c': - if (Name == "calloc") { - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - } else if (Name == "chmod" || - Name == "chown" || - Name == "ctermid" || - Name == "clearerr" || - Name == "closedir") { - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'a': - if (Name == "atoi" || - Name == "atol" || - Name == "atof" || - Name == "atoll") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - } else if (Name == "access") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'f': - if (Name == "fopen") { - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "fdopen") { - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 2); - } else if (Name == "feof" || - Name == "free" || - Name == "fseek" || - Name == "ftell" || - Name == "fgetc" || - Name == "fseeko" || - Name == "ftello" || - Name == "fileno" || - Name == "fflush" || - Name == "fclose" || - Name == "fsetpos" || - Name == "flockfile" || - Name == "funlockfile" || - Name == "ftrylockfile") { - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "ferror") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setOnlyReadsMemory(F); - } else if (Name == "fputc" || - Name == "fstat" || - Name == "frexp" || - 
Name == "frexpf" || - Name == "frexpl" || - Name == "fstatvfs") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "fgets") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 3); - } else if (Name == "fread" || - Name == "fwrite") { - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(3)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 4); - } else if (Name == "fputs" || - Name == "fscanf" || - Name == "fprintf" || - Name == "fgetpos") { - if (FTy->getNumParams() < 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } - break; - case 'g': - if (Name == "getc" || - Name == "getlogin_r" || - Name == "getc_unlocked") { - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "getenv") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setOnlyReadsMemory(F); - setDoesNotCapture(F, 1); - } else if (Name == "gets" || - Name == "getchar") { - setDoesNotThrow(F); - } else if (Name == "getitimer") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "getpwnam") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'u': - if (Name == "ungetc") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "uname" || - Name == "unlink" || - Name == "unsetenv") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "utime" || - Name == "utimes") { - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } - break; - case 'p': - if (Name == "putc") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "puts" || - Name == "printf" || - Name == "perror") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "pread" || - Name == "pwrite") { - if (FTy->getNumParams() != 4 || !FTy->getParamType(1)->isPointerTy()) - return; - // May throw; these are valid pthread cancellation points. 
- setDoesNotCapture(F, 2); - } else if (Name == "putchar") { - setDoesNotThrow(F); - } else if (Name == "popen") { - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "pclose") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'v': - if (Name == "vscanf") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "vsscanf" || - Name == "vfscanf") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "valloc") { - if (!FTy->getReturnType()->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - } else if (Name == "vprintf") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "vfprintf" || - Name == "vsprintf") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "vsnprintf") { - if (FTy->getNumParams() != 4 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(2)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 3); - } - break; - case 'o': - if (Name == "open") { - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) - return; - // May throw; "open" is a valid pthread cancellation point. - setDoesNotCapture(F, 1); - } else if (Name == "opendir") { - if (FTy->getNumParams() != 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - } - break; - case 't': - if (Name == "tmpfile") { - if (!FTy->getReturnType()->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - } else if (Name == "times") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'h': - if (Name == "htonl" || - Name == "htons") { - setDoesNotThrow(F); - setDoesNotAccessMemory(F); - } - break; - case 'n': - if (Name == "ntohl" || - Name == "ntohs") { - setDoesNotThrow(F); - setDoesNotAccessMemory(F); - } - break; - case 'l': - if (Name == "lstat") { - if (FTy->getNumParams() != 2 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "lchown") { - if (FTy->getNumParams() != 3 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } - break; - case 'q': - if (Name == "qsort") { - if (FTy->getNumParams() != 4 || !FTy->getParamType(3)->isPointerTy()) - return; - // May throw; places call through function pointer. 
- setDoesNotCapture(F, 4); - } - break; - case '_': - if (Name == "__strdup" || - Name == "__strndup") { - if (FTy->getNumParams() < 1 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - } else if (Name == "__strtok_r") { - if (FTy->getNumParams() != 3 || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "_IO_getc") { - if (FTy->getNumParams() != 1 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "_IO_putc") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } - break; - case 1: - if (Name == "\1__isoc99_scanf") { - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "\1stat64" || - Name == "\1lstat64" || - Name == "\1statvfs64" || - Name == "\1__isoc99_sscanf") { - if (FTy->getNumParams() < 1 || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "\1fopen64") { - if (FTy->getNumParams() != 2 || - !FTy->getReturnType()->isPointerTy() || - !FTy->getParamType(0)->isPointerTy() || - !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - setDoesNotCapture(F, 1); - setDoesNotCapture(F, 2); - } else if (Name == "\1fseeko64" || - Name == "\1ftello64") { - if (FTy->getNumParams() == 0 || !FTy->getParamType(0)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 1); - } else if (Name == "\1tmpfile64") { - if (!FTy->getReturnType()->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotAlias(F, 0); - } else if (Name == "\1fstat64" || - Name == "\1fstatvfs64") { - if (FTy->getNumParams() != 2 || !FTy->getParamType(1)->isPointerTy()) - return; - setDoesNotThrow(F); - setDoesNotCapture(F, 2); - } else if (Name == "\1open64") { - if (FTy->getNumParams() < 2 || !FTy->getParamType(0)->isPointerTy()) - return; - // May throw; "open" is a valid pthread cancellation point. - setDoesNotCapture(F, 1); - } - break; - } -} - -/// doInitialization - Add attributes to well-known functions. 
-/// -bool SimplifyLibCalls::doInitialization(Module &M) { - Modified = false; - for (Module::iterator I = M.begin(), E = M.end(); I != E; ++I) { - Function &F = *I; - if (F.isDeclaration() && F.hasName()) - inferPrototypeAttributes(F); - } - return Modified; -} - // TODO: // Additional cases that we need to add to this file: // diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index 00cda8e034..1f517d038d 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -163,7 +163,7 @@ static bool insertFastDiv(Function &F, Value *AndV = MainBuilder.CreateAnd(OrV, BitMask); // Compare operand values and branch - Value *ZeroV = MainBuilder.getInt32(0); + Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0); Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV); MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB); @@ -244,7 +244,7 @@ bool llvm::bypassSlowDivision(Function &F, // Get bitwidth of div/rem instruction IntegerType *T = cast<IntegerType>(J->getType()); - int bitwidth = T->getBitWidth(); + unsigned int bitwidth = T->getBitWidth(); // Continue if bitwidth is not bypassed DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth); diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index a309bce544..be8d39e128 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -87,26 +87,26 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, assert(VMap.count(I) && "No mapping from source argument specified!"); #endif - // Clone any attributes. - if (NewFunc->arg_size() == OldFunc->arg_size()) - NewFunc->copyAttributesFrom(OldFunc); - else { - //Some arguments were deleted with the VMap. Copy arguments one by one - for (Function::const_arg_iterator I = OldFunc->arg_begin(), - E = OldFunc->arg_end(); I != E; ++I) - if (Argument* Anew = dyn_cast<Argument>(VMap[I])) - Anew->addAttr(OldFunc->getAttributes() - .getParamAttributes(I->getArgNo() + 1)); - NewFunc->setAttributes(NewFunc->getAttributes() - .addAttributes(NewFunc->getContext(), - AttributeSet::ReturnIndex, - OldFunc->getAttributes())); - NewFunc->setAttributes(NewFunc->getAttributes() - .addAttributes(NewFunc->getContext(), - AttributeSet::FunctionIndex, - OldFunc->getAttributes())); + AttributeSet OldAttrs = OldFunc->getAttributes(); + // Clone any argument attributes that are present in the VMap. + for (Function::const_arg_iterator I = OldFunc->arg_begin(), + E = OldFunc->arg_end(); + I != E; ++I) + if (Argument *Anew = dyn_cast<Argument>(VMap[I])) { + AttributeSet attrs = + OldAttrs.getParamAttributes(I->getArgNo() + 1); + if (attrs.getNumSlots() > 0) + Anew->addAttr(attrs); + } - } + NewFunc->setAttributes(NewFunc->getAttributes() + .addAttributes(NewFunc->getContext(), + AttributeSet::ReturnIndex, + OldAttrs.getRetAttributes())); + NewFunc->setAttributes(NewFunc->getAttributes() + .addAttributes(NewFunc->getContext(), + AttributeSet::FunctionIndex, + OldAttrs.getFnAttributes())); // Loop over all of the basic blocks in the function, cloning them as // appropriate. Note that we save BE this way in order to handle cloning of diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 0d2598a221..dabb67b921 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -82,7 +82,8 @@ namespace { /// a simple branch. 
When there is more than one predecessor, we need to /// split the landing pad block after the landingpad instruction and jump /// to there. - void forwardResume(ResumeInst *RI); + void forwardResume(ResumeInst *RI, + SmallPtrSet<LandingPadInst*, 16> &InlinedLPads); /// addIncomingPHIValuesFor - Add incoming-PHI values to the unwind /// destination block for the given basic block, using the values for the @@ -140,8 +141,10 @@ BasicBlock *InvokeInliningInfo::getInnerResumeDest() { /// block. When the landing pad block has only one predecessor, this is a simple /// branch. When there is more than one predecessor, we need to split the /// landing pad block after the landingpad instruction and jump to there. -void InvokeInliningInfo::forwardResume(ResumeInst *RI) { +void InvokeInliningInfo::forwardResume(ResumeInst *RI, + SmallPtrSet<LandingPadInst*, 16> &InlinedLPads) { BasicBlock *Dest = getInnerResumeDest(); + LandingPadInst *OuterLPad = getLandingPadInst(); BasicBlock *Src = RI->getParent(); BranchInst::Create(Dest, Src); @@ -152,6 +155,16 @@ void InvokeInliningInfo::forwardResume(ResumeInst *RI) { InnerEHValuesPHI->addIncoming(RI->getOperand(0), Src); RI->eraseFromParent(); + + // Append the clauses from the outer landing pad instruction into the inlined + // landing pad instructions. + for (SmallPtrSet<LandingPadInst*, 16>::iterator I = InlinedLPads.begin(), + E = InlinedLPads.end(); I != E; ++I) { + LandingPadInst *InlinedLPad = *I; + for (unsigned OuterIdx = 0, OuterNum = OuterLPad->getNumClauses(); + OuterIdx != OuterNum; ++OuterIdx) + InlinedLPad->addClause(OuterLPad->getClause(OuterIdx)); + } } /// HandleCallsInBlockInlinedThroughInvoke - When we inline a basic block into @@ -229,19 +242,15 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, // The inlined code is currently at the end of the function, scan from the // start of the inlined code to its end, checking for stuff we need to - // rewrite. If the code doesn't have calls or unwinds, we know there is - // nothing to rewrite. - if (!InlinedCodeInfo.ContainsCalls) { - // Now that everything is happy, we have one final detail. The PHI nodes in - // the exception destination block still have entries due to the original - // invoke instruction. Eliminate these entries (which might even delete the - // PHI node) now. - InvokeDest->removePredecessor(II->getParent()); - return; - } - + // rewrite. InvokeInliningInfo Invoke(II); - + + // Get all of the inlined landing pad instructions. + SmallPtrSet<LandingPadInst*, 16> InlinedLPads; + for (Function::iterator I = FirstNewBlock, E = Caller->end(); I != E; ++I) + if (InvokeInst *II = dyn_cast<InvokeInst>(I->getTerminator())) + InlinedLPads.insert(II->getLandingPadInst()); + for (Function::iterator BB = FirstNewBlock, E = Caller->end(); BB != E; ++BB){ if (InlinedCodeInfo.ContainsCalls) if (HandleCallsInBlockInlinedThroughInvoke(BB, Invoke)) { @@ -250,13 +259,14 @@ static void HandleInlinedInvoke(InvokeInst *II, BasicBlock *FirstNewBlock, continue; } + // Forward any resumes that are remaining here. if (ResumeInst *RI = dyn_cast<ResumeInst>(BB->getTerminator())) - Invoke.forwardResume(RI); + Invoke.forwardResume(RI, InlinedLPads); } // Now that everything is happy, we have one final detail. The PHI nodes in // the exception destination block still have entries due to the original - // invoke instruction. Eliminate these entries (which might even delete the + // invoke instruction. Eliminate these entries (which might even delete the // PHI node) now. 
InvokeDest->removePredecessor(II->getParent()); } @@ -748,8 +758,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // If the call site was an invoke instruction, add a branch to the normal // destination. - if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) - BranchInst::Create(II->getNormalDest(), TheCall); + if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) { + BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall); + NewBr->setDebugLoc(Returns[0]->getDebugLoc()); + } // If the return instruction returned a value, replace uses of the call with // uses of the returned value. @@ -777,15 +789,16 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // "starter" and "ender" blocks. How we accomplish this depends on whether // this is an invoke instruction or a call instruction. BasicBlock *AfterCallBB; + BranchInst *CreatedBranchToNormalDest = NULL; if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) { // Add an unconditional branch to make this look like the CallInst case... - BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall); + CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), TheCall); // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more // symmetric to the call case. - AfterCallBB = OrigBB->splitBasicBlock(NewBr, + AfterCallBB = OrigBB->splitBasicBlock(CreatedBranchToNormalDest, CalledFunc->getName()+".exit"); } else { // It's a call @@ -840,11 +853,20 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, // Add a branch to the merge points and remove return instructions. + DebugLoc Loc; for (unsigned i = 0, e = Returns.size(); i != e; ++i) { ReturnInst *RI = Returns[i]; - BranchInst::Create(AfterCallBB, RI); + BranchInst* BI = BranchInst::Create(AfterCallBB, RI); + Loc = RI->getDebugLoc(); + BI->setDebugLoc(Loc); RI->eraseFromParent(); } + // We need to set the debug location to *somewhere* inside the + // inlined function. The line number may be nonsensical, but the + // instruction will at least be associated with the right + // function. + if (CreatedBranchToNormalDest) + CreatedBranchToNormalDest->setDebugLoc(Loc); } else if (!Returns.empty()) { // Otherwise, if there is exactly one return value, just replace anything // using the return value of the call with the computed value. @@ -864,6 +886,9 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, AfterCallBB->getInstList().splice(AfterCallBB->begin(), ReturnBB->getInstList()); + if (CreatedBranchToNormalDest) + CreatedBranchToNormalDest->setDebugLoc(Returns[0]->getDebugLoc()); + // Delete the return instruction now and empty ReturnBB now. Returns[0]->eraseFromParent(); ReturnBB->eraseFromParent(); diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index a54ee08b67..12e5b3e9d2 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -832,7 +832,24 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, /// Dbg Intrinsic utilities /// -/// Inserts a llvm.dbg.value instrinsic before the stores to an alloca'd value +/// See if there is a dbg.value intrinsic for DIVar before I. +static bool LdStHasDebugValue(DIVariable &DIVar, Instruction *I) { + // Since we can't guarantee that the original dbg.declare instrinsic + // is removed by LowerDbgDeclare(), we need to make sure that we are + // not inserting the same dbg.value intrinsic over and over. 
+ llvm::BasicBlock::InstListType::iterator PrevI(I); + if (PrevI != I->getParent()->getInstList().begin()) { + --PrevI; + if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI)) + if (DVI->getValue() == I->getOperand(0) && + DVI->getOffset() == 0 && + DVI->getVariable() == DIVar) + return true; + } + return false; +} + +/// Inserts a llvm.dbg.value intrinsic before a store to an alloca'd value /// that has an associated llvm.dbg.decl intrinsic. bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder) { @@ -840,6 +857,9 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (!DIVar.Verify()) return false; + if (LdStHasDebugValue(DIVar, SI)) + return true; + Instruction *DbgVal = NULL; // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -863,7 +883,7 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, return true; } -/// Inserts a llvm.dbg.value instrinsic before the stores to an alloca'd value +/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value /// that has an associated llvm.dbg.decl intrinsic. bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, LoadInst *LI, DIBuilder &Builder) { @@ -871,6 +891,9 @@ bool llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (!DIVar.Verify()) return false; + if (LdStHasDebugValue(DIVar, LI)) + return true; + Instruction *DbgVal = Builder.insertDbgValueIntrinsic(LI->getOperand(0), 0, DIVar, LI); @@ -902,6 +925,8 @@ bool llvm::LowerDbgDeclare(Function &F) { E = Dbgs.end(); I != E; ++I) { DbgDeclareInst *DDI = *I; if (AllocaInst *AI = dyn_cast_or_null<AllocaInst>(DDI->getAddress())) { + // We only remove the dbg.declare intrinsic if all uses are + // converted to dbg.value intrinsics. bool RemoveDDI = true; for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end(); UI != E; ++UI) @@ -985,22 +1010,17 @@ bool llvm::removeUnreachableBlocks(Function &F) { if (Reachable.count(I)) continue; - // Remove the block as predecessor of all its reachable successors. - // Unreachable successors don't matter as they'll soon be removed, too. for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI) if (Reachable.count(*SI)) (*SI)->removePredecessor(I); + I->dropAllReferences(); + } - // Zap all instructions in this basic block. 
- while (!I->empty()) { - Instruction &Inst = I->back(); - if (!Inst.use_empty()) - Inst.replaceAllUsesWith(UndefValue::get(Inst.getType())); - I->getInstList().pop_back(); - } + for (Function::iterator I = llvm::next(F.begin()), E=F.end(); I != E;) + if (!Reachable.count(I)) + I = F.getBasicBlockList().erase(I); + else + ++I; - --I; - llvm::next(I)->eraseFromParent(); - } return true; } diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index a63d31d5af..052ad85551 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -59,6 +59,10 @@ static cl::opt<bool> SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true), cl::desc("Sink common instructions down to the end block")); +static cl::opt<bool> +HoistCondStores("simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), + cl::desc("Hoist conditional stores if an unconditional store preceeds")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block"); @@ -1332,6 +1336,66 @@ static bool SinkThenElseCodeToEnd(BranchInst *BI1) { return Changed; } +/// \brief Determine if we can hoist sink a sole store instruction out of a +/// conditional block. +/// +/// We are looking for code like the following: +/// BrBB: +/// store i32 %add, i32* %arrayidx2 +/// ... // No other stores or function calls (we could be calling a memory +/// ... // function). +/// %cmp = icmp ult %x, %y +/// br i1 %cmp, label %EndBB, label %ThenBB +/// ThenBB: +/// store i32 %add5, i32* %arrayidx2 +/// br label EndBB +/// EndBB: +/// ... +/// We are going to transform this into: +/// BrBB: +/// store i32 %add, i32* %arrayidx2 +/// ... // +/// %cmp = icmp ult %x, %y +/// %add.add5 = select i1 %cmp, i32 %add, %add5 +/// store i32 %add.add5, i32* %arrayidx2 +/// ... +/// +/// \return The pointer to the value of the previous store if the store can be +/// hoisted into the predecessor block. 0 otherwise. +Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB, + BasicBlock *StoreBB, BasicBlock *EndBB) { + StoreInst *StoreToHoist = dyn_cast<StoreInst>(I); + if (!StoreToHoist) + return 0; + + // Volatile or atomic. + if (!StoreToHoist->isSimple()) + return 0; + + Value *StorePtr = StoreToHoist->getPointerOperand(); + + // Look for a store to the same pointer in BrBB. + unsigned MaxNumInstToLookAt = 10; + for (BasicBlock::reverse_iterator RI = BrBB->rbegin(), + RE = BrBB->rend(); RI != RE && (--MaxNumInstToLookAt); ++RI) { + Instruction *CurI = &*RI; + + // Could be calling an instruction that effects memory like free(). + if (CurI->mayHaveSideEffects() && !isa<StoreInst>(CurI)) + return 0; + + StoreInst *SI = dyn_cast<StoreInst>(CurI); + // Found the previous store make sure it stores to the same location. + if (SI && SI->getPointerOperand() == StorePtr) + // Found the previous store, return its value operand. + return SI->getValueOperand(); + else if (SI) + return 0; // Unknown store. + } + + return 0; +} + /// \brief Speculate a conditional basic block flattening the CFG. /// /// Note that this is a very risky transform currently. 
Speculating @@ -1395,6 +1459,8 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { SmallDenseMap<Instruction *, unsigned, 4> SinkCandidateUseCounts; unsigned SpeculationCost = 0; + Value *SpeculatedStoreValue = 0; + StoreInst *SpeculatedStore = 0; for (BasicBlock::iterator BBI = ThenBB->begin(), BBE = llvm::prior(ThenBB->end()); BBI != BBE; ++BBI) { @@ -1410,13 +1476,21 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { return false; // Don't hoist the instruction if it's unsafe or expensive. - if (!isSafeToSpeculativelyExecute(I)) + if (!isSafeToSpeculativelyExecute(I) && + !(HoistCondStores && + (SpeculatedStoreValue = isSafeToSpeculateStore(I, BB, ThenBB, + EndBB)))) return false; - if (ComputeSpeculationCost(I) > PHINodeFoldingThreshold) + if (!SpeculatedStoreValue && + ComputeSpeculationCost(I) > PHINodeFoldingThreshold) return false; + // Store the store speculation candidate. + if (SpeculatedStoreValue) + SpeculatedStore = cast<StoreInst>(I); + // Do not hoist the instruction if any of its operands are defined but not - // used in this BB. The transformation will prevent the operand from + // used in BB. The transformation will prevent the operand from // being sunk into the use block. for (User::op_iterator i = I->op_begin(), e = I->op_end(); i != e; ++i) { @@ -1473,12 +1547,24 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB) { // If there are no PHIs to process, bail early. This helps ensure idempotence // as well. - if (!HaveRewritablePHIs) + if (!HaveRewritablePHIs && !(HoistCondStores && SpeculatedStoreValue)) return false; // If we get here, we can hoist the instruction and if-convert. DEBUG(dbgs() << "SPECULATIVELY EXECUTING BB" << *ThenBB << "\n";); + // Insert a select of the value of the speculated store. + if (SpeculatedStoreValue) { + IRBuilder<true, NoFolder> Builder(BI); + Value *TrueV = SpeculatedStore->getValueOperand(); + Value *FalseV = SpeculatedStoreValue; + if (Invert) + std::swap(TrueV, FalseV); + Value *S = Builder.CreateSelect(BrCond, TrueV, FalseV, TrueV->getName() + + "." + FalseV->getName()); + SpeculatedStore->setOperand(0, S); + } + // Hoist the instructions. BB->getInstList().splice(BI, ThenBB->getInstList(), ThenBB->begin(), llvm::prior(ThenBB->end())); @@ -2789,9 +2875,20 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { return false; // Turn all invokes that unwind here into calls and delete the basic block. + bool InvokeRequiresTableEntry = false; + bool Changed = false; for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) { InvokeInst *II = cast<InvokeInst>((*PI++)->getTerminator()); + + if (II->hasFnAttr(Attribute::UWTable)) { + // Don't remove an `invoke' instruction if the ABI requires an entry into + // the table. + InvokeRequiresTableEntry = true; + continue; + } + SmallVector<Value*, 8> Args(II->op_begin(), II->op_end() - 3); + // Insert a call instruction before the invoke. CallInst *Call = CallInst::Create(II->getCalledValue(), Args, "", II); Call->takeName(II); @@ -2811,11 +2908,14 @@ bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) { // Finally, delete the invoke instruction! II->eraseFromParent(); + Changed = true; } - // The landingpad is now unreachable. Zap it. - BB->eraseFromParent(); - return true; + if (!InvokeRequiresTableEntry) + // The landingpad is now unreachable. Zap it. 
+ BB->eraseFromParent(); + + return Changed; } bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) { @@ -3059,7 +3159,12 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) { Value *Sub = SI->getCondition(); if (!Offset->isNullValue()) Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off"); - Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); + Value *Cmp; + // If NumCases overflowed, then all possible values jump to the successor. + if (NumCases->isNullValue() && SI->getNumCases() != 0) + Cmp = ConstantInt::getTrue(SI->getContext()); + else + Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch"); BranchInst *NewBI = Builder.CreateCondBr( Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest()); @@ -3944,11 +4049,13 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { // Load from null is undefined. if (LoadInst *LI = dyn_cast<LoadInst>(Use)) - return LI->getPointerAddressSpace() == 0; + if (!LI->isVolatile()) + return LI->getPointerAddressSpace() == 0; // Store to null is undefined. if (StoreInst *SI = dyn_cast<StoreInst>(Use)) - return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I; + if (!SI->isVolatile()) + return SI->getPointerAddressSpace() == 0 && SI->getPointerOperand() == I; } return false; } diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index b25febaa14..cadec21c50 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -15,11 +15,13 @@ //===----------------------------------------------------------------------===// #include "llvm/Transforms/Utils/SimplifyLibCalls.h" +#include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" @@ -798,8 +800,7 @@ struct StrToOpt : public LibCallOptimization { if (isa<ConstantPointerNull>(EndPtr)) { // With a null EndPtr, this function won't capture the main argument. // It would be readonly too, except that it still may write to errno. - CI->addAttribute(1, Attribute::get(Callee->getContext(), - Attribute::NoCapture)); + CI->addAttribute(1, Attribute::NoCapture); } return 0; @@ -1517,6 +1518,12 @@ struct FPrintFOpt : public LibCallOptimization { if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr)) return 0; + // Do not do any of the following transformations if the fprintf return + // value is used, in general the fprintf return value is not compatible + // with fwrite(), fputc() or fputs(). + if (!CI->use_empty()) + return 0; + // fprintf(F, "foo") --> fwrite("foo", 3, 1, F) if (CI->getNumArgOperands() == 2) { for (unsigned i = 0, e = FormatStr.size(); i != e; ++i) @@ -1526,11 +1533,10 @@ struct FPrintFOpt : public LibCallOptimization { // These optimizations require DataLayout. if (!TD) return 0; - Value *NewCI = EmitFWrite(CI->getArgOperand(1), - ConstantInt::get(TD->getIntPtrType(*Context), - FormatStr.size()), - CI->getArgOperand(0), B, TD, TLI); - return NewCI ? 
ConstantInt::get(CI->getType(), FormatStr.size()) : 0; + return EmitFWrite(CI->getArgOperand(1), + ConstantInt::get(TD->getIntPtrType(*Context), + FormatStr.size()), + CI->getArgOperand(0), B, TD, TLI); } // The remaining optimizations require the format string to be "%s" or "%c" @@ -1543,14 +1549,12 @@ struct FPrintFOpt : public LibCallOptimization { if (FormatStr[1] == 'c') { // fprintf(F, "%c", chr) --> fputc(chr, F) if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0; - Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, - TD, TLI); - return NewCI ? ConstantInt::get(CI->getType(), 1) : 0; + return EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); } if (FormatStr[1] == 's') { // fprintf(F, "%s", str) --> fputs(str, F) - if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) + if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0; return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); } @@ -1673,67 +1677,17 @@ class LibCallSimplifierImpl { const TargetLibraryInfo *TLI; const LibCallSimplifier *LCS; bool UnsafeFPShrink; - StringMap<LibCallOptimization*, BumpPtrAllocator> Optimizations; - - // Fortified library call optimizations. - MemCpyChkOpt MemCpyChk; - MemMoveChkOpt MemMoveChk; - MemSetChkOpt MemSetChk; - StrCpyChkOpt StrCpyChk; - StpCpyChkOpt StpCpyChk; - StrNCpyChkOpt StrNCpyChk; - - // String library call optimizations. - StrCatOpt StrCat; - StrNCatOpt StrNCat; - StrChrOpt StrChr; - StrRChrOpt StrRChr; - StrCmpOpt StrCmp; - StrNCmpOpt StrNCmp; - StrCpyOpt StrCpy; - StpCpyOpt StpCpy; - StrNCpyOpt StrNCpy; - StrLenOpt StrLen; - StrPBrkOpt StrPBrk; - StrToOpt StrTo; - StrSpnOpt StrSpn; - StrCSpnOpt StrCSpn; - StrStrOpt StrStr; - - // Memory library call optimizations. - MemCmpOpt MemCmp; - MemCpyOpt MemCpy; - MemMoveOpt MemMove; - MemSetOpt MemSet; - // Math library call optimizations. - UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP; - CosOpt Cos; PowOpt Pow; Exp2Opt Exp2; - // Integer library call optimizations. - FFSOpt FFS; - AbsOpt Abs; - IsDigitOpt IsDigit; - IsAsciiOpt IsAscii; - ToAsciiOpt ToAscii; - - // Formatting and IO library call optimizations. - PrintFOpt PrintF; - SPrintFOpt SPrintF; - FPrintFOpt FPrintF; - FWriteOpt FWrite; - FPutsOpt FPuts; - PutsOpt Puts; - - void initOptimizations(); - void addOpt(LibFunc::Func F, LibCallOptimization* Opt); - void addOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt); + // Math library call optimizations. + CosOpt Cos; + PowOpt Pow; + Exp2Opt Exp2; public: LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI, const LibCallSimplifier *LCS, bool UnsafeFPShrink = false) - : UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true), - Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) { + : Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) { this->TD = TD; this->TLI = TLI; this->LCS = LCS; @@ -1741,123 +1695,234 @@ public: } Value *optimizeCall(CallInst *CI); + LibCallOptimization *lookupOptimization(CallInst *CI); + bool hasFloatVersion(StringRef FuncName); }; -void LibCallSimplifierImpl::initOptimizations() { - // Fortified library call optimizations. 
- Optimizations["__memcpy_chk"] = &MemCpyChk; - Optimizations["__memmove_chk"] = &MemMoveChk; - Optimizations["__memset_chk"] = &MemSetChk; - Optimizations["__strcpy_chk"] = &StrCpyChk; - Optimizations["__stpcpy_chk"] = &StpCpyChk; - Optimizations["__strncpy_chk"] = &StrNCpyChk; - Optimizations["__stpncpy_chk"] = &StrNCpyChk; - - // String library call optimizations. - addOpt(LibFunc::strcat, &StrCat); - addOpt(LibFunc::strncat, &StrNCat); - addOpt(LibFunc::strchr, &StrChr); - addOpt(LibFunc::strrchr, &StrRChr); - addOpt(LibFunc::strcmp, &StrCmp); - addOpt(LibFunc::strncmp, &StrNCmp); - addOpt(LibFunc::strcpy, &StrCpy); - addOpt(LibFunc::stpcpy, &StpCpy); - addOpt(LibFunc::strncpy, &StrNCpy); - addOpt(LibFunc::strlen, &StrLen); - addOpt(LibFunc::strpbrk, &StrPBrk); - addOpt(LibFunc::strtol, &StrTo); - addOpt(LibFunc::strtod, &StrTo); - addOpt(LibFunc::strtof, &StrTo); - addOpt(LibFunc::strtoul, &StrTo); - addOpt(LibFunc::strtoll, &StrTo); - addOpt(LibFunc::strtold, &StrTo); - addOpt(LibFunc::strtoull, &StrTo); - addOpt(LibFunc::strspn, &StrSpn); - addOpt(LibFunc::strcspn, &StrCSpn); - addOpt(LibFunc::strstr, &StrStr); - - // Memory library call optimizations. - addOpt(LibFunc::memcmp, &MemCmp); - addOpt(LibFunc::memcpy, &MemCpy); - addOpt(LibFunc::memmove, &MemMove); - addOpt(LibFunc::memset, &MemSet); +bool LibCallSimplifierImpl::hasFloatVersion(StringRef FuncName) { + LibFunc::Func Func; + SmallString<20> FloatFuncName = FuncName; + FloatFuncName += 'f'; + if (TLI->getLibFunc(FloatFuncName, Func)) + return TLI->has(Func); + return false; +} - // Math library call optimizations. - addOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP); - addOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP); - addOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP); - addOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP); - addOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP); - addOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP); - addOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP); - - if(UnsafeFPShrink) { - addOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP); - addOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP); +// Fortified library call optimizations. 
+static MemCpyChkOpt MemCpyChk; +static MemMoveChkOpt MemMoveChk; +static MemSetChkOpt MemSetChk; +static StrCpyChkOpt StrCpyChk; +static StpCpyChkOpt StpCpyChk; +static StrNCpyChkOpt StrNCpyChk; + +// String library call optimizations. +static StrCatOpt StrCat; +static StrNCatOpt StrNCat; +static StrChrOpt StrChr; +static StrRChrOpt StrRChr; +static StrCmpOpt StrCmp; +static StrNCmpOpt StrNCmp; +static StrCpyOpt StrCpy; +static StpCpyOpt StpCpy; +static StrNCpyOpt StrNCpy; +static StrLenOpt StrLen; +static StrPBrkOpt StrPBrk; +static StrToOpt StrTo; +static StrSpnOpt StrSpn; +static StrCSpnOpt StrCSpn; +static StrStrOpt StrStr; + +// Memory library call optimizations. +static MemCmpOpt MemCmp; +static MemCpyOpt MemCpy; +static MemMoveOpt MemMove; +static MemSetOpt MemSet; + +// Math library call optimizations. +static UnaryDoubleFPOpt UnaryDoubleFP(false); +static UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true); + + // Integer library call optimizations. +static FFSOpt FFS; +static AbsOpt Abs; +static IsDigitOpt IsDigit; +static IsAsciiOpt IsAscii; +static ToAsciiOpt ToAscii; + +// Formatting and IO library call optimizations. +static PrintFOpt PrintF; +static SPrintFOpt SPrintF; +static FPrintFOpt FPrintF; +static FWriteOpt FWrite; +static FPutsOpt FPuts; +static PutsOpt Puts; + +LibCallOptimization *LibCallSimplifierImpl::lookupOptimization(CallInst *CI) { + LibFunc::Func Func; + Function *Callee = CI->getCalledFunction(); + StringRef FuncName = Callee->getName(); + + // Next check for intrinsics. + if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) { + switch (II->getIntrinsicID()) { + case Intrinsic::pow: + return &Pow; + case Intrinsic::exp2: + return &Exp2; + default: + return 0; + } } - addOpt(LibFunc::cosf, &Cos); - addOpt(LibFunc::cos, &Cos); - addOpt(LibFunc::cosl, &Cos); - addOpt(LibFunc::powf, &Pow); - addOpt(LibFunc::pow, &Pow); - addOpt(LibFunc::powl, &Pow); - Optimizations["llvm.pow.f32"] = &Pow; - Optimizations["llvm.pow.f64"] = &Pow; - Optimizations["llvm.pow.f80"] = &Pow; - Optimizations["llvm.pow.f128"] = &Pow; - Optimizations["llvm.pow.ppcf128"] = &Pow; - addOpt(LibFunc::exp2l, &Exp2); - addOpt(LibFunc::exp2, &Exp2); - addOpt(LibFunc::exp2f, &Exp2); - Optimizations["llvm.exp2.ppcf128"] = &Exp2; - Optimizations["llvm.exp2.f128"] = &Exp2; - Optimizations["llvm.exp2.f80"] = &Exp2; - Optimizations["llvm.exp2.f64"] = &Exp2; - Optimizations["llvm.exp2.f32"] = &Exp2; + // Then check for known library functions. 
+ if (TLI->getLibFunc(FuncName, Func) && TLI->has(Func)) { + switch (Func) { + case LibFunc::strcat: + return &StrCat; + case LibFunc::strncat: + return &StrNCat; + case LibFunc::strchr: + return &StrChr; + case LibFunc::strrchr: + return &StrRChr; + case LibFunc::strcmp: + return &StrCmp; + case LibFunc::strncmp: + return &StrNCmp; + case LibFunc::strcpy: + return &StrCpy; + case LibFunc::stpcpy: + return &StpCpy; + case LibFunc::strncpy: + return &StrNCpy; + case LibFunc::strlen: + return &StrLen; + case LibFunc::strpbrk: + return &StrPBrk; + case LibFunc::strtol: + case LibFunc::strtod: + case LibFunc::strtof: + case LibFunc::strtoul: + case LibFunc::strtoll: + case LibFunc::strtold: + case LibFunc::strtoull: + return &StrTo; + case LibFunc::strspn: + return &StrSpn; + case LibFunc::strcspn: + return &StrCSpn; + case LibFunc::strstr: + return &StrStr; + case LibFunc::memcmp: + return &MemCmp; + case LibFunc::memcpy: + return &MemCpy; + case LibFunc::memmove: + return &MemMove; + case LibFunc::memset: + return &MemSet; + case LibFunc::cosf: + case LibFunc::cos: + case LibFunc::cosl: + return &Cos; + case LibFunc::powf: + case LibFunc::pow: + case LibFunc::powl: + return &Pow; + case LibFunc::exp2l: + case LibFunc::exp2: + case LibFunc::exp2f: + return &Exp2; + case LibFunc::ffs: + case LibFunc::ffsl: + case LibFunc::ffsll: + return &FFS; + case LibFunc::abs: + case LibFunc::labs: + case LibFunc::llabs: + return &Abs; + case LibFunc::isdigit: + return &IsDigit; + case LibFunc::isascii: + return &IsAscii; + case LibFunc::toascii: + return &ToAscii; + case LibFunc::printf: + return &PrintF; + case LibFunc::sprintf: + return &SPrintF; + case LibFunc::fprintf: + return &FPrintF; + case LibFunc::fwrite: + return &FWrite; + case LibFunc::fputs: + return &FPuts; + case LibFunc::puts: + return &Puts; + case LibFunc::ceil: + case LibFunc::fabs: + case LibFunc::floor: + case LibFunc::rint: + case LibFunc::round: + case LibFunc::nearbyint: + case LibFunc::trunc: + if (hasFloatVersion(FuncName)) + return &UnaryDoubleFP; + return 0; + case LibFunc::acos: + case LibFunc::acosh: + case LibFunc::asin: + case LibFunc::asinh: + case LibFunc::atan: + case LibFunc::atanh: + case LibFunc::cbrt: + case LibFunc::cosh: + case LibFunc::exp: + case LibFunc::exp10: + case LibFunc::expm1: + case LibFunc::log: + case LibFunc::log10: + case LibFunc::log1p: + case LibFunc::log2: + case LibFunc::logb: + case LibFunc::sin: + case LibFunc::sinh: + case LibFunc::sqrt: + case LibFunc::tan: + case LibFunc::tanh: + if (UnsafeFPShrink && hasFloatVersion(FuncName)) + return &UnsafeUnaryDoubleFP; + return 0; + case LibFunc::memcpy_chk: + return &MemCpyChk; + default: + return 0; + } + } + + // Finally check for fortified library calls. + if (FuncName.endswith("_chk")) { + if (FuncName == "__memmove_chk") + return &MemMoveChk; + else if (FuncName == "__memset_chk") + return &MemSetChk; + else if (FuncName == "__strcpy_chk") + return &StrCpyChk; + else if (FuncName == "__stpcpy_chk") + return &StpCpyChk; + else if (FuncName == "__strncpy_chk") + return &StrNCpyChk; + else if (FuncName == "__stpncpy_chk") + return &StrNCpyChk; + } + + return 0; - // Integer library call optimizations. - addOpt(LibFunc::ffs, &FFS); - addOpt(LibFunc::ffsl, &FFS); - addOpt(LibFunc::ffsll, &FFS); - addOpt(LibFunc::abs, &Abs); - addOpt(LibFunc::labs, &Abs); - addOpt(LibFunc::llabs, &Abs); - addOpt(LibFunc::isdigit, &IsDigit); - addOpt(LibFunc::isascii, &IsAscii); - addOpt(LibFunc::toascii, &ToAscii); - - // Formatting and IO library call optimizations. 
- addOpt(LibFunc::printf, &PrintF); - addOpt(LibFunc::sprintf, &SPrintF); - addOpt(LibFunc::fprintf, &FPrintF); - addOpt(LibFunc::fwrite, &FWrite); - addOpt(LibFunc::fputs, &FPuts); - addOpt(LibFunc::puts, &Puts); } Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { - if (Optimizations.empty()) - initOptimizations(); + LibCallOptimization *LCO = lookupOptimization(CI); // @LOCALMOD-BEGIN Function *Caller = CI->getParent()->getParent(); @@ -1866,8 +1931,6 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { if (TLI->getLibFunc(Caller->getName(), F)) return 0; // @LOCALMOD-END - Function *Callee = CI->getCalledFunction(); - LibCallOptimization *LCO = Optimizations.lookup(Callee->getName()); if (LCO) { IRBuilder<> Builder(CI); return LCO->optimizeCall(CI, TD, TLI, LCS, Builder); @@ -1875,16 +1938,6 @@ Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) { return 0; } -void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) { - if (TLI->has(F)) - Optimizations[TLI->getName(F)] = Opt; -} - -void LibCallSimplifierImpl::addOpt(LibFunc::Func F1, LibFunc::Func F2, - LibCallOptimization* Opt) { - if (TLI->has(F1) && TLI->has(F2)) - Optimizations[TLI->getName(F1)] = Opt; -} LibCallSimplifier::LibCallSimplifier(const DataLayout *TD, const TargetLibraryInfo *TLI, diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp index 5812d4607d..c3df215c29 100644 --- a/lib/Transforms/Utils/Utils.cpp +++ b/lib/Transforms/Utils/Utils.cpp @@ -13,6 +13,7 @@ //===----------------------------------------------------------------------===// #include "llvm/InitializePasses.h" +#include "llvm/PassRegistry.h" #include "llvm-c/Initialization.h" using namespace llvm; diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index b5941bdf24..544c5eed7e 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -57,7 +57,7 @@ Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags, return VM[V] = const_cast<Value*>(V); // Create a dummy node in case we have a metadata cycle. - MDNode *Dummy = MDNode::getTemporary(V->getContext(), ArrayRef<Value*>()); + MDNode *Dummy = MDNode::getTemporary(V->getContext(), None); VM[V] = Dummy; // Check all operands to see if any need to be remapped. 
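(Aside, not part of the diff: the SimplifyLibCalls hunk above replaces the lazily built name-to-optimizer map with a direct lookup keyed on the LibFunc enum. A minimal sketch of that dispatch pattern, with hypothetical optimizer objects StrLenOpt and MemCpyOpt standing in for the real ones, could look like this.)

    struct LibCallOpt { /* optimizer state */ };
    static LibCallOpt StrLenOpt, MemCpyOpt;   // illustrative stand-ins, not LLVM's names

    // Return the optimizer for F, or 0 if the target does not provide F.
    static LibCallOpt *lookupOpt(LibFunc::Func F, const TargetLibraryInfo *TLI) {
      if (!TLI->has(F))
        return 0;
      switch (F) {
      case LibFunc::strlen: return &StrLenOpt;
      case LibFunc::memcpy: return &MemCpyOpt;
      default:              return 0;          // nothing to pre-populate or keep in sync
      }
    }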
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp index 76365417aa..17900dabbe 100644 --- a/lib/Transforms/Vectorize/BBVectorize.cpp +++ b/lib/Transforms/Vectorize/BBVectorize.cpp @@ -1771,7 +1771,7 @@ namespace { size_t MaxDepth = DAG.lookup(IJ); DEBUG(if (DebugPairSelection) dbgs() << "BBV: found DAG for pair {" - << IJ.first << " <-> " << IJ.second << "} of depth " << + << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << DAG.size() << "\n"); // At this point the DAG has been constructed, but, may contain @@ -2086,7 +2086,7 @@ namespace { DEBUG(if (DebugPairSelection) dbgs() << "BBV: found pruned DAG for pair {" - << IJ.first << " <-> " << IJ.second << "} of depth " << + << *IJ.first << " <-> " << *IJ.second << "} of depth " << MaxDepth << " and size " << PrunedDAG.size() << " (effective size: " << EffSize << ")\n"); if (((TTI && !UseChainDepthWithTI) || diff --git a/lib/Transforms/Vectorize/CMakeLists.txt b/lib/Transforms/Vectorize/CMakeLists.txt index e64034ab26..7ae082f55e 100644 --- a/lib/Transforms/Vectorize/CMakeLists.txt +++ b/lib/Transforms/Vectorize/CMakeLists.txt @@ -2,6 +2,8 @@ add_llvm_library(LLVMVectorize BBVectorize.cpp Vectorize.cpp LoopVectorize.cpp + SLPVectorizer.cpp + VecUtils.cpp ) add_dependencies(LLVMVectorize intrinsics_gen) diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index a696a2ffba..11ee99ddf1 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8,9 +8,9 @@ //===----------------------------------------------------------------------===// // // This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops -// and generates target-independent LLVM-IR. Legalization of the IR is done -// in the codegen. However, the vectorizer uses (will use) the codegen -// interfaces to generate IR that is likely to result in an optimal binary. +// and generates target-independent LLVM-IR. +// The vectorizer uses the TargetTransformInfo analysis to estimate the costs +// of instructions in order to estimate the profitability of vectorization. // // The loop vectorizer combines consecutive loop iterations into a single // 'wide' iteration. After this transformation the index is incremented @@ -78,7 +78,9 @@ #include "llvm/Pass.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/PatternMatch.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Support/ValueHandle.h" #include "llvm/Target/TargetLibraryInfo.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" @@ -87,6 +89,7 @@ #include <map> using namespace llvm; +using namespace llvm::PatternMatch; static cl::opt<unsigned> VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden, @@ -112,9 +115,15 @@ TinyTripCountVectorThreshold("vectorizer-min-trip-count", cl::init(16), /// We don't unroll loops with a known constant trip count below this number. static const unsigned TinyTripCountUnrollThreshold = 128; -/// When performing a runtime memory check, do not check more than this -/// number of pointers. Notice that the check is quadratic! -static const unsigned RuntimeMemoryCheckThreshold = 4; +/// When performing memory disambiguation checks at runtime do not make more +/// than this number of comparisons. 
+static const unsigned RuntimeMemoryCheckThreshold = 8; + +/// We use a metadata with this name to indicate that a scalar loop was +/// vectorized and that we don't need to re-vectorize it if we run into it +/// again. +static const char* +AlreadyVectorizedMDName = "llvm.vectorizer.already_vectorized"; namespace { @@ -208,7 +217,7 @@ private: /// This function adds 0, 1, 2 ... to each vector element, starting at zero. /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...). /// The sequence starts at StartIndex. - Value *getConsecutiveVector(Value* Val, unsigned StartIdx, bool Negate); + Value *getConsecutiveVector(Value* Val, int StartIdx, bool Negate); /// When we go over instructions in the basic block we rely on previous /// values within the current basic block or on loop invariant values. @@ -327,7 +336,7 @@ public: DominatorTree *DT, TargetTransformInfo* TTI, AliasAnalysis *AA, TargetLibraryInfo *TLI) : TheLoop(L), SE(SE), DL(DL), DT(DT), TTI(TTI), AA(AA), TLI(TLI), - Induction(0) {} + Induction(0), HasFunNoNaNAttr(false) {} /// This enum represents the kinds of reductions that we support. enum ReductionKind { @@ -337,8 +346,10 @@ public: RK_IntegerOr, ///< Bitwise or logical OR of numbers. RK_IntegerAnd, ///< Bitwise or logical AND of numbers. RK_IntegerXor, ///< Bitwise or logical XOR of numbers. + RK_IntegerMinMax, ///< Min/max implemented in terms of select(cmp()). RK_FloatAdd, ///< Sum of floats. - RK_FloatMult ///< Product of floats. + RK_FloatMult, ///< Product of floats. + RK_FloatMinMax ///< Min/max implemented in terms of select(cmp()). }; /// This enum represents the kinds of inductions that we support. @@ -350,21 +361,52 @@ public: IK_ReversePtrInduction ///< Reverse ptr indvar. Step = - sizeof(elem). }; + // This enum represents the kind of minmax reduction. + enum MinMaxReductionKind { + MRK_Invalid, + MRK_UIntMin, + MRK_UIntMax, + MRK_SIntMin, + MRK_SIntMax, + MRK_FloatMin, + MRK_FloatMax + }; + /// This POD struct holds information about reduction variables. struct ReductionDescriptor { ReductionDescriptor() : StartValue(0), LoopExitInstr(0), - Kind(RK_NoReduction) {} + Kind(RK_NoReduction), MinMaxKind(MRK_Invalid) {} - ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K) - : StartValue(Start), LoopExitInstr(Exit), Kind(K) {} + ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K, + MinMaxReductionKind MK) + : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK) {} // The starting value of the reduction. // It does not have to be zero! - Value *StartValue; + TrackingVH<Value> StartValue; // The instruction who's value is used outside the loop. Instruction *LoopExitInstr; // The kind of the reduction. ReductionKind Kind; + // If this a min/max reduction the kind of reduction. + MinMaxReductionKind MinMaxKind; + }; + + /// This POD struct holds information about a potential reduction operation. + struct ReductionInstDesc { + ReductionInstDesc(bool IsRedux, Instruction *I) : + IsReduction(IsRedux), PatternLastInst(I), MinMaxKind(MRK_Invalid) {} + + ReductionInstDesc(Instruction *I, MinMaxReductionKind K) : + IsReduction(true), PatternLastInst(I), MinMaxKind(K) {} + + // Is this instruction a reduction candidate. + bool IsReduction; + // The last instruction in a min/max pattern (select of the select(icmp()) + // pattern), or the current reduction instruction otherwise. + Instruction *PatternLastInst; + // If this is a min/max pattern the comparison predicate. 
+ MinMaxReductionKind MinMaxKind; }; // This POD struct holds information about the memory runtime legality @@ -381,16 +423,18 @@ public: } /// Insert a pointer and calculate the start and end SCEVs. - void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr); + void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr, bool WritePtr); /// This flag indicates if we need to add the runtime check. bool Need; /// Holds the pointers that we need to check. - SmallVector<Value*, 2> Pointers; + SmallVector<TrackingVH<Value>, 2> Pointers; /// Holds the pointer value at the beginning of the loop. SmallVector<const SCEV*, 2> Starts; /// Holds the pointer value at the end of the loop. SmallVector<const SCEV*, 2> Ends; + /// Holds the information if this pointer is used for writing to memory. + SmallVector<bool, 2> IsWritePtr; }; /// A POD for saving information about induction variables. @@ -398,7 +442,7 @@ public: InductionInfo(Value *Start, InductionKind K) : StartValue(Start), IK(K) {} InductionInfo() : StartValue(0), IK(IK_NoInduction) {} /// Start value. - Value *StartValue; + TrackingVH<Value> StartValue; /// Induction kind. InductionKind IK; }; @@ -413,7 +457,7 @@ public: /// Alias(Multi)Map stores the values (GEPs or underlying objects and their /// respective Store/Load instruction(s) to calculate aliasing. - typedef DenseMap<Value*, Instruction* > AliasMap; + typedef MapVector<Value*, Instruction* > AliasMap; typedef DenseMap<Value*, std::vector<Instruction*> > AliasMultiMap; /// Returns true if it is legal to vectorize this loop. @@ -455,6 +499,10 @@ public: /// Returns the information that we collected about runtime memory check. RuntimePointerCheck *getRuntimePointerCheck() { return &PtrRtCheck; } + + /// This function returns the identity element (or neutral element) for + /// the operation K. + static Constant *getReductionIdentity(ReductionKind K, Type *Tp); private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -481,9 +529,17 @@ private: /// Returns True, if 'Phi' is the kind of reduction variable for type /// 'Kind'. If this is a reduction variable, it adds it to ReductionList. bool AddReductionVar(PHINode *Phi, ReductionKind Kind); - /// Returns true if the instruction I can be a reduction variable of type - /// 'Kind'. - bool isReductionInstr(Instruction *I, ReductionKind Kind); + /// Returns a struct describing if the instruction 'I' can be a reduction + /// variable of type 'Kind'. If the reduction is a min/max pattern of + /// select(icmp()) this function advances the instruction pointer 'I' from the + /// compare instruction to the select instruction and stores this pointer in + /// 'PatternLastInst' member of the returned struct. + ReductionInstDesc isReductionInstr(Instruction *I, ReductionKind Kind, + ReductionInstDesc &Desc); + /// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction + /// pattern corresponding to a min(X, Y) or max(X, Y). + static ReductionInstDesc isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev); /// Returns the induction kind of Phi. This function may return NoInduction /// if the PHI is not an induction variable. InductionKind isInductionVariable(PHINode *Phi); @@ -534,6 +590,8 @@ private: /// We need to check that all of the pointers in this list are disjoint /// at runtime. RuntimePointerCheck PtrRtCheck; + /// Can we assume the absence of NaNs. 
+ bool HasFunNoNaNAttr; }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -656,6 +714,11 @@ struct LoopVectorize : public LoopPass { AA = getAnalysisIfAvailable<AliasAnalysis>(); TLI = getAnalysisIfAvailable<TargetLibraryInfo>(); + if (DL == NULL) { + DEBUG(dbgs() << "LV: Not vectorizing because of missing data layout"); + return false; + } + DEBUG(dbgs() << "LV: Checking a loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); @@ -731,7 +794,8 @@ struct LoopVectorize : public LoopPass { void LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, - Loop *Lp, Value *Ptr) { + Loop *Lp, Value *Ptr, + bool WritePtr) { const SCEV *Sc = SE->getSCEV(Ptr); const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc); assert(AR && "Invalid addrec expression"); @@ -740,6 +804,7 @@ LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE, Pointers.push_back(Ptr); Starts.push_back(AR->getStart()); Ends.push_back(ScEnd); + IsWritePtr.push_back(WritePtr); } Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { @@ -765,7 +830,7 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) { return Shuf; } -Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, +Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, int StartIdx, bool Negate) { assert(Val->getType()->isVectorTy() && "Must be a vector"); assert(Val->getType()->getScalarType()->isIntegerTy() && @@ -778,8 +843,8 @@ Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, unsigned StartIdx, // Create a vector of consecutive numbers from zero to VF. for (int i = 0; i < VLen; ++i) { - int Idx = Negate ? (-i): i; - Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx)); + int64_t Idx = Negate ? (-i) : i; + Indices.push_back(ConstantInt::get(ITy, StartIdx + Idx, Negate)); } // Add the consecutive indices to the vector value. @@ -899,13 +964,19 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, Type *DataTy = VectorType::get(ScalarDataTy, VF); Value *Ptr = LI ? LI->getPointerOperand() : SI->getPointerOperand(); unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment(); + unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace(); + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ScalarDataTy); + unsigned VectorElementSize = DL->getTypeStoreSize(DataTy)/VF; + + if (ScalarAllocatedSize != VectorElementSize) + return scalarizeInstruction(Instr); // If the pointer is loop invariant or if it is non consecutive, // scalarize the load. 
- int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; bool UniformLoad = LI && Legal->isUniform(Ptr); - if (Stride == 0 || UniformLoad) + if (!ConsecutiveStride || UniformLoad) return scalarizeInstruction(Instr); Constant *Zero = Builder.getInt32(0); @@ -968,7 +1039,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); } - Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); Builder.CreateStore(StoredVal[Part], VecPtr)->setAlignment(Alignment); } } @@ -984,7 +1055,7 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr, PartPtr = Builder.CreateGEP(PartPtr, Builder.getInt32(1 - VF)); } - Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo()); + Value *VecPtr = Builder.CreateBitCast(PartPtr, DataTy->getPointerTo(AddressSpace)); Value *LI = Builder.CreateLoad(VecPtr, "wide.load"); cast<LoadInst>(LI)->setAlignment(Alignment); Entry[Part] = Reverse ? reverseVector(LI) : LI; @@ -1034,10 +1105,10 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) { // Create a new entry in the WidenMap and initialize it to Undef or Null. VectorParts &VecResults = WidenMap.splat(Instr, UndefVec); - // For each scalar that we create: - for (unsigned Width = 0; Width < VF; ++Width) { - // For each vector unroll 'part': - for (unsigned Part = 0; Part < UF; ++Part) { + // For each vector unroll 'part': + for (unsigned Part = 0; Part < UF; ++Part) { + // For each scalar that we create: + for (unsigned Width = 0; Width < VF; ++Width) { Instruction *Cloned = Instr->clone(); if (!IsVoidRetTy) Cloned->setName(Instr->getName() + ".cloned"); @@ -1104,6 +1175,10 @@ InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal, for (unsigned i = 0; i < NumPointers; ++i) { for (unsigned j = i+1; j < NumPointers; ++j) { + // No need to check if two readonly pointers intersect. + if (!PtrRtCheck->IsWritePtr[i] && !PtrRtCheck->IsWritePtr[j]) + continue; + Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy, "bc"); Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy, "bc"); Value *End0 = ChkBuilder.CreateBitCast(Ends[i], PtrArithTy, "bc"); @@ -1159,6 +1234,11 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { BasicBlock *ExitBlock = OrigLoop->getExitBlock(); assert(ExitBlock && "Must have an exit block"); + // Mark the old scalar loop with metadata that tells us not to vectorize this + // loop again if we run into it. + MDNode *MD = MDNode::get(OldBasicBlock->getContext(), None); + OldBasicBlock->getTerminator()->setMetadata(AlreadyVectorizedMDName, MD); + // Some loops have a single integer induction variable, while other loops // don't. One example is c++ iterators that often have multiple pointer // induction variables. In the code below we also support a case where we @@ -1425,24 +1505,24 @@ InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) { /// This function returns the identity element (or neutral element) for /// the operation K. 
-static Constant* -getReductionIdentity(LoopVectorizationLegality::ReductionKind K, Type *Tp) { +Constant* +LoopVectorizationLegality::getReductionIdentity(ReductionKind K, Type *Tp) { switch (K) { - case LoopVectorizationLegality:: RK_IntegerXor: - case LoopVectorizationLegality:: RK_IntegerAdd: - case LoopVectorizationLegality:: RK_IntegerOr: + case RK_IntegerXor: + case RK_IntegerAdd: + case RK_IntegerOr: // Adding, Xoring, Oring zero to a number does not change it. return ConstantInt::get(Tp, 0); - case LoopVectorizationLegality:: RK_IntegerMult: + case RK_IntegerMult: // Multiplying a number by 1 does not change it. return ConstantInt::get(Tp, 1); - case LoopVectorizationLegality:: RK_IntegerAnd: + case RK_IntegerAnd: // AND-ing a number with an all-1 value does not change it. return ConstantInt::get(Tp, -1, true); - case LoopVectorizationLegality:: RK_FloatMult: + case RK_FloatMult: // Multiplying a number by 1 does not change it. return ConstantFP::get(Tp, 1.0L); - case LoopVectorizationLegality:: RK_FloatAdd: + case RK_FloatAdd: // Adding zero to a number does not change it. return ConstantFP::get(Tp, 0.0L); default: @@ -1555,7 +1635,7 @@ getIntrinsicIDForCall(CallInst *CI, const TargetLibraryInfo *TLI) { } /// This function translates the reduction kind to an LLVM binary operator. -static Instruction::BinaryOps +static unsigned getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { switch (Kind) { case LoopVectorizationLegality::RK_IntegerAdd: @@ -1572,11 +1652,53 @@ getReductionBinOp(LoopVectorizationLegality::ReductionKind Kind) { return Instruction::FMul; case LoopVectorizationLegality::RK_FloatAdd: return Instruction::FAdd; + case LoopVectorizationLegality::RK_IntegerMinMax: + return Instruction::ICmp; + case LoopVectorizationLegality::RK_FloatMinMax: + return Instruction::FCmp; default: llvm_unreachable("Unknown reduction operation"); } } +Value *createMinMaxOp(IRBuilder<> &Builder, + LoopVectorizationLegality::MinMaxReductionKind RK, + Value *Left, + Value *Right) { + CmpInst::Predicate P = CmpInst::ICMP_NE; + switch (RK) { + default: + llvm_unreachable("Unknown min/max reduction kind"); + case LoopVectorizationLegality::MRK_UIntMin: + P = CmpInst::ICMP_ULT; + break; + case LoopVectorizationLegality::MRK_UIntMax: + P = CmpInst::ICMP_UGT; + break; + case LoopVectorizationLegality::MRK_SIntMin: + P = CmpInst::ICMP_SLT; + break; + case LoopVectorizationLegality::MRK_SIntMax: + P = CmpInst::ICMP_SGT; + break; + case LoopVectorizationLegality::MRK_FloatMin: + P = CmpInst::FCMP_OLT; + break; + case LoopVectorizationLegality::MRK_FloatMax: + P = CmpInst::FCMP_OGT; + break; + } + + Value *Cmp; + if (RK == LoopVectorizationLegality::MRK_FloatMin || RK == LoopVectorizationLegality::MRK_FloatMax) + Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp"); + else + Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp"); + + Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); + return Select; +} + void InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { //===------------------------------------------------===// @@ -1632,7 +1754,7 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // To do so, we need to generate the 'identity' vector and overide // one of the elements with the incoming scalar reduction. We need // to do it in the vector-loop preheader. 
- Builder.SetInsertPoint(LoopBypassBlocks.back()->getTerminator()); + Builder.SetInsertPoint(LoopBypassBlocks.front()->getTerminator()); // This is the vector-clone of the value that leaves the loop. VectorParts &VectorExit = getVectorValue(RdxDesc.LoopExitInstr); @@ -1640,13 +1762,24 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Find the reduction identity variable. Zero for addition, or, xor, // one for multiplication, -1 for And. - Constant *Iden = getReductionIdentity(RdxDesc.Kind, VecTy->getScalarType()); - Constant *Identity = ConstantVector::getSplat(VF, Iden); - - // This vector is the Identity vector where the first element is the - // incoming scalar reduction. - Value *VectorStart = Builder.CreateInsertElement(Identity, - RdxDesc.StartValue, Zero); + Value *Identity; + Value *VectorStart; + if (RdxDesc.Kind == LoopVectorizationLegality::RK_IntegerMinMax || + RdxDesc.Kind == LoopVectorizationLegality::RK_FloatMinMax) { + // MinMax reduction have the start value as their identify. + VectorStart = Identity = Builder.CreateVectorSplat(VF, RdxDesc.StartValue, + "minmax.ident"); + } else { + Constant *Iden = + LoopVectorizationLegality::getReductionIdentity(RdxDesc.Kind, + VecTy->getScalarType()); + Identity = ConstantVector::getSplat(VF, Iden); + + // This vector is the Identity vector where the first element is the + // incoming scalar reduction. + VectorStart = Builder.CreateInsertElement(Identity, + RdxDesc.StartValue, Zero); + } // Fix the vector-loop phi. // We created the induction variable so we know that the @@ -1688,10 +1821,15 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { // Reduce all of the unrolled parts into a single vector. Value *ReducedPartRdx = RdxParts[0]; + unsigned Op = getReductionBinOp(RdxDesc.Kind); for (unsigned part = 1; part < UF; ++part) { - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - ReducedPartRdx = Builder.CreateBinOp(Op, RdxParts[part], ReducedPartRdx, - "bin.rdx"); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + ReducedPartRdx = Builder.CreateBinOp((Instruction::BinaryOps)Op, + RdxParts[part], ReducedPartRdx, + "bin.rdx"); + else + ReducedPartRdx = createMinMaxOp(Builder, RdxDesc.MinMaxKind, + ReducedPartRdx, RdxParts[part]); } // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles @@ -1716,8 +1854,11 @@ InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) { ConstantVector::get(ShuffleMask), "rdx.shuf"); - Instruction::BinaryOps Op = getReductionBinOp(RdxDesc.Kind); - TmpVec = Builder.CreateBinOp(Op, TmpVec, Shuf, "bin.rdx"); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) + TmpVec = Builder.CreateBinOp((Instruction::BinaryOps)Op, TmpVec, Shuf, + "bin.rdx"); + else + TmpVec = createMinMaxOp(Builder, RdxDesc.MinMaxKind, TmpVec, Shuf); } // The result is in the first element of the vector. @@ -1850,18 +1991,33 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // We know that all PHIs in non header blocks are converted into // selects, so we don't have to worry about the insertion order and we // can just use the builder. - // At this point we generate the predication tree. There may be // duplications since this is a simple recursive scan, but future // optimizations will clean it up. 
- VectorParts Cond = createEdgeMask(P->getIncomingBlock(0), - P->getParent()); - for (unsigned part = 0; part < UF; ++part) { - VectorParts &In0 = getVectorValue(P->getIncomingValue(0)); - VectorParts &In1 = getVectorValue(P->getIncomingValue(1)); - Entry[part] = Builder.CreateSelect(Cond[part], In0[part], In1[part], - "predphi"); + unsigned NumIncoming = P->getNumIncomingValues(); + assert(NumIncoming > 1 && "Invalid PHI"); + + // Generate a sequence of selects of the form: + // SELECT(Mask3, In3, + // SELECT(Mask2, In2, + // ( ...))) + for (unsigned In = 0; In < NumIncoming; In++) { + VectorParts Cond = createEdgeMask(P->getIncomingBlock(In), + P->getParent()); + VectorParts &In0 = getVectorValue(P->getIncomingValue(In)); + + for (unsigned part = 0; part < UF; ++part) { + // We don't need to 'select' the first PHI operand because it is + // the default value if all of the other masks don't match. + if (In == 0) + Entry[part] = In0[part]; + else + // Select between the current value and the previous incoming edge + // based on the incoming mask. + Entry[part] = Builder.CreateSelect(Cond[part], In0[part], + Entry[part], "predphi"); + } } continue; } @@ -1917,7 +2073,8 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, // After broadcasting the induction variable we need to make the // vector consecutive by adding ... -3, -2, -1, 0. for (unsigned part = 0; part < UF; ++part) - Entry[part] = getConsecutiveVector(Broadcasted, -VF * part, true); + Entry[part] = getConsecutiveVector(Broadcasted, -(int)VF * part, + true); continue; } @@ -2077,6 +2234,10 @@ InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal, } case Instruction::Call: { + // Ignore dbg intrinsics. + if (isa<DbgInfoIntrinsic>(it)) + break; + Module *M = BB->getParent()->getParent(); CallInst *CI = cast<CallInst>(it); Intrinsic::ID ID = getIntrinsicIDForCall(CI, TLI); @@ -2137,12 +2298,6 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { if (!isa<BranchInst>(BB->getTerminator())) return false; - // We must have at most two predecessors because we need to convert - // all PHIs to selects. - unsigned Preds = std::distance(pred_begin(BB), pred_end(BB)); - if (Preds > 2) - return false; - // We must be able to predicate all blocks that need to be predicated. if (blockNeedsPredication(BB) && !blockCanBePredicated(BB)) return false; @@ -2153,7 +2308,10 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() { } bool LoopVectorizationLegality::canVectorize() { - assert(TheLoop->getLoopPreheader() && "No preheader!!"); + // We must have a loop in canonical form. Loops with indirectbr in them cannot + // be canonicalized. + if (!TheLoop->getLoopPreheader()) + return false; // We can only vectorize innermost loops. if (TheLoop->getSubLoopsVector().size()) @@ -2220,10 +2378,44 @@ bool LoopVectorizationLegality::canVectorize() { return true; } +/// \brief Check that the instruction has outside loop users and is not an +/// identified reduction variable. +static bool hasOutsideLoopUser(const Loop *TheLoop, Instruction *Inst, + SmallPtrSet<Value *, 4> &Reductions) { + // Reduction instructions are allowed to have exit users. All other + // instructions must not have external users. + if (!Reductions.count(Inst)) + //Check that all of the users of the loop are inside the BB. + for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end(); + I != E; ++I) { + Instruction *U = cast<Instruction>(*I); + // This user may be a reduction exit value. 
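(Aside, not part of the diff: the new loop above blends an if-converted PHI with any number of incoming values into a chain of selects. For a hypothetical three-input PHI with vectorized incoming values In0, In1, In2 and edge masks Mask1, Mask2, the sequence emitted per unroll part is roughly the following; all names are illustrative.)

    Value *Blend = In0;                                         // default value, no select needed
    Blend = Builder.CreateSelect(Mask1, In1, Blend, "predphi"); // take In1 where edge 1 is taken
    Blend = Builder.CreateSelect(Mask2, In2, Blend, "predphi"); // take In2 where edge 2 is taken
    // i.e. select(Mask2, In2, select(Mask1, In1, In0))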
+ if (!TheLoop->contains(U)) { + DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); + return true; + } + } + return false; +} + bool LoopVectorizationLegality::canVectorizeInstrs() { BasicBlock *PreHeader = TheLoop->getLoopPreheader(); BasicBlock *Header = TheLoop->getHeader(); + // If we marked the scalar loop as "already vectorized" then no need + // to vectorize it again. + if (Header->getTerminator()->getMetadata(AlreadyVectorizedMDName)) { + DEBUG(dbgs() << "LV: This loop was vectorized before\n"); + return false; + } + + // Look for the attribute signaling the absence of NaNs. + Function &F = *Header->getParent(); + if (F.hasFnAttribute("no-nans-fp-math")) + HasFunNoNaNAttr = F.getAttributes().getAttribute( + AttributeSet::FunctionIndex, + "no-nans-fp-math").getValueAsString() == "true"; + // For each block in the loop. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { @@ -2233,12 +2425,6 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { ++it) { if (PHINode *Phi = dyn_cast<PHINode>(it)) { - // This should not happen because the loop should be normalized. - if (Phi->getNumIncomingValues() != 2) { - DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); - return false; - } - // Check that this PHI type is allowed. if (!Phi->getType()->isIntegerTy() && !Phi->getType()->isFloatingPointTy() && @@ -2250,8 +2436,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // If this PHINode is not in the header block, then we know that we // can convert it to select during if-conversion. No need to check if // the PHIs in this block are induction or reduction variables. - if (*bb != Header) - continue; + if (*bb != Header) { + // Check that this instruction has no outside users or is an + // identified reduction value with an outside user. + if(!hasOutsideLoopUser(TheLoop, it, AllowedExit)) + continue; + return false; + } + + // We only allow if-converted PHIs with more than two incoming values. + if (Phi->getNumIncomingValues() != 2) { + DEBUG(dbgs() << "LV: Found an invalid PHI.\n"); + return false; + } // This is the value coming from the preheader. Value *StartValue = Phi->getIncomingValueForBlock(PreHeader); @@ -2293,6 +2490,10 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_IntegerMinMax)) { + DEBUG(dbgs() << "LV: Found a MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } if (AddReductionVar(Phi, RK_FloatMult)) { DEBUG(dbgs() << "LV: Found an FMult reduction PHI."<< *Phi <<"\n"); continue; @@ -2301,14 +2502,19 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { DEBUG(dbgs() << "LV: Found an FAdd reduction PHI."<< *Phi <<"\n"); continue; } + if (AddReductionVar(Phi, RK_FloatMinMax)) { + DEBUG(dbgs() << "LV: Found an float MINMAX reduction PHI."<< *Phi <<"\n"); + continue; + } DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n"); return false; }// end of PHI handling - // We still don't handle functions. + // We still don't handle functions. However, we can ignore dbg intrinsic + // calls and we do handle certain intrinsic and libm functions. 
CallInst *CI = dyn_cast<CallInst>(it); - if (CI && !getIntrinsicIDForCall(CI, TLI)) { + if (CI && !getIntrinsicIDForCall(CI, TLI) && !isa<DbgInfoIntrinsic>(CI)) { DEBUG(dbgs() << "LV: Found a call site.\n"); return false; } @@ -2329,17 +2535,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() { // Reduction instructions are allowed to have exit users. // All other instructions must not have external users. - if (!AllowedExit.count(it)) - //Check that all of the users of the loop are inside the BB. - for (Value::use_iterator I = it->use_begin(), E = it->use_end(); - I != E; ++I) { - Instruction *U = cast<Instruction>(*I); - // This user may be a reduction exit value. - if (!TheLoop->contains(U)) { - DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n"); - return false; - } - } + if (hasOutsideLoopUser(TheLoop, it, AllowedExit)) + return false; + } // next instr. } @@ -2419,13 +2617,6 @@ LoopVectorizationLegality::hasPossibleGlobalWriteReorder( bool LoopVectorizationLegality::canVectorizeMemory() { - if (TheLoop->isAnnotatedParallel()) { - DEBUG(dbgs() - << "LV: A loop annotated parallel, ignore memory dependency " - << "checks.\n"); - return true; - } - typedef SmallVector<Value*, 16> ValueVector; typedef SmallPtrSet<Value*, 16> ValueSet; // Holds the Load and Store *instructions*. @@ -2434,6 +2625,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { PtrRtCheck.Pointers.clear(); PtrRtCheck.Need = false; + const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel(); + // For each block. for (Loop::block_iterator bb = TheLoop->block_begin(), be = TheLoop->block_end(); bb != be; ++bb) { @@ -2448,7 +2641,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (it->mayReadFromMemory()) { LoadInst *Ld = dyn_cast<LoadInst>(it); if (!Ld) return false; - if (!Ld->isSimple()) { + if (!Ld->isSimple() && !IsAnnotatedParallel) { DEBUG(dbgs() << "LV: Found a non-simple load.\n"); return false; } @@ -2460,7 +2653,7 @@ bool LoopVectorizationLegality::canVectorizeMemory() { if (it->mayWriteToMemory()) { StoreInst *St = dyn_cast<StoreInst>(it); if (!St) return false; - if (!St->isSimple()) { + if (!St->isSimple() && !IsAnnotatedParallel) { DEBUG(dbgs() << "LV: Found a non-simple store.\n"); return false; } @@ -2507,6 +2700,13 @@ bool LoopVectorizationLegality::canVectorizeMemory() { ReadWrites.insert(std::make_pair(Ptr, ST)); } + if (IsAnnotatedParallel) { + DEBUG(dbgs() + << "LV: A loop annotated parallel, ignore memory dependency " + << "checks.\n"); + return true; + } + for (I = Loads.begin(), IE = Loads.end(); I != IE; ++I) { LoadInst *LD = cast<LoadInst>(*I); Value* Ptr = LD->getPointerOperand(); @@ -2529,6 +2729,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { return true; } + unsigned NumReadPtrs = 0; + unsigned NumWritePtrs = 0; + // Find pointers with computable bounds. We are going to use this information // to place a runtime bound check. 
bool CanDoRT = true; @@ -2536,7 +2739,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { for (MI = ReadWrites.begin(), ME = ReadWrites.end(); MI != ME; ++MI) { Value *V = (*MI).first; if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V); + PtrRtCheck.insert(SE, TheLoop, V, true); + NumWritePtrs++; DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; @@ -2546,7 +2750,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { for (MI = Reads.begin(), ME = Reads.end(); MI != ME; ++MI) { Value *V = (*MI).first; if (hasComputableBounds(V)) { - PtrRtCheck.insert(SE, TheLoop, V); + PtrRtCheck.insert(SE, TheLoop, V, false); + NumReadPtrs++; DEBUG(dbgs() << "LV: Found a runtime check ptr:" << *V <<"\n"); } else { CanDoRT = false; @@ -2556,7 +2761,9 @@ bool LoopVectorizationLegality::canVectorizeMemory() { // Check that we did not collect too many pointers or found a // unsizeable pointer. - if (!CanDoRT || PtrRtCheck.Pointers.size() > RuntimeMemoryCheckThreshold) { + unsigned NumComparisons = (NumWritePtrs * (NumReadPtrs + NumWritePtrs - 1)); + DEBUG(dbgs() << "LV: We need to compare " << NumComparisons << " ptrs.\n"); + if (!CanDoRT || NumComparisons > RuntimeMemoryCheckThreshold) { PtrRtCheck.reset(); CanDoRT = false; } @@ -2619,8 +2826,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible write-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible write-write reorder:" << **UI + << "\n"); return false; } @@ -2663,8 +2870,8 @@ bool LoopVectorizationLegality::canVectorizeMemory() { Inst, WriteObjects, MaxByteWidth)) { - DEBUG(dbgs() << "LV: Found a possible read-write reorder:" - << *UI <<"\n"); + DEBUG(dbgs() << "LV: Found a possible read-write reorder:" << **UI + << "\n"); return false; } } @@ -2710,7 +2917,18 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, // used as reduction variables (such as ADD). We may have a single // out-of-block user. The cycle must end with the original PHI. Instruction *Iter = Phi; - while (true) { + + // To recognize min/max patterns formed by a icmp select sequence, we store + // the number of instruction we saw from the recognized min/max pattern, + // such that we don't stop when we see the phi has two uses (one by the select + // and one by the icmp) and to make sure we only see exactly the two + // instructions. + unsigned NumCmpSelectPatternInst = 0; + ReductionInstDesc ReduxDesc(false, 0); + + // Avoid cycles in the chain. + SmallPtrSet<Instruction *, 8> VisitedInsts; + while (VisitedInsts.insert(Iter)) { // If the instruction has no users then this is a broken // chain and can't be a reduction variable. if (Iter->use_empty()) @@ -2749,26 +2967,40 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, if (isa<PHINode>(Iter) && isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() && TheLoop->contains(U) && - Iter->getNumUses() > 1) + Iter->hasNUsesOrMore(2)) continue; - // We can't have multiple inside users. - if (FoundInBlockUser) + // We can't have multiple inside users except for a combination of + // icmp/select both using the phi. + if (FoundInBlockUser && !NumCmpSelectPatternInst) return false; FoundInBlockUser = true; // Any reduction instr must be of one of the allowed kinds. 
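(Aside, not part of the diff: a worked example of the comparison estimate computed above, with illustrative counts. For 2 write pointers and 3 read pointers, NumComparisons = 2 * (3 + 2 - 1) = 8, which sits exactly at the new RuntimeMemoryCheckThreshold; pairs of read-only pointers are skipped when the checks are emitted, so only pairs involving at least one write are compared at runtime.)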
- if (!isReductionInstr(U, Kind)) + ReduxDesc = isReductionInstr(U, Kind, ReduxDesc); + if (!ReduxDesc.IsReduction) return false; + if (Kind == RK_IntegerMinMax && (isa<ICmpInst>(U) || isa<SelectInst>(U))) + ++NumCmpSelectPatternInst; + if (Kind == RK_FloatMinMax && (isa<FCmpInst>(U) || isa<SelectInst>(U))) + ++NumCmpSelectPatternInst; + // Reductions of instructions such as Div, and Sub is only // possible if the LHS is the reduction variable. - if (!U->isCommutative() && !isa<PHINode>(U) && U->getOperand(0) != Iter) + if (!U->isCommutative() && !isa<PHINode>(U) && !isa<SelectInst>(U) && + !isa<ICmpInst>(U) && !isa<FCmpInst>(U) && U->getOperand(0) != Iter) return false; - Iter = U; + Iter = ReduxDesc.PatternLastInst; } + // This means we have seen one but not the other instruction of the + // pattern or more than just a select and cmp. + if ((Kind == RK_IntegerMinMax || Kind == RK_FloatMinMax) && + NumCmpSelectPatternInst != 2) + return false; + // We found a reduction var if we have reached the original // phi node and we only have a single instruction with out-of-loop // users. @@ -2777,47 +3009,107 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi, AllowedExit.insert(ExitInstruction); // Save the description of this reduction variable. - ReductionDescriptor RD(RdxStart, ExitInstruction, Kind); + ReductionDescriptor RD(RdxStart, ExitInstruction, Kind, + ReduxDesc.MinMaxKind); Reductions[Phi] = RD; // We've ended the cycle. This is a reduction variable if we have an // outside user and it has a binary op. return FoundBinOp && ExitInstruction; } } + + return false; } -bool +/// Returns true if the instruction is a Select(ICmp(X, Y), X, Y) instruction +/// pattern corresponding to a min(X, Y) or max(X, Y). +LoopVectorizationLegality::ReductionInstDesc +LoopVectorizationLegality::isMinMaxSelectCmpPattern(Instruction *I, + ReductionInstDesc &Prev) { + + assert((isa<ICmpInst>(I) || isa<FCmpInst>(I) || isa<SelectInst>(I)) && + "Expect a select instruction"); + Instruction *Cmp = 0; + SelectInst *Select = 0; + + // We must handle the select(cmp()) as a single instruction. Advance to the + // select. + if ((Cmp = dyn_cast<ICmpInst>(I)) || (Cmp = dyn_cast<FCmpInst>(I))) { + if (!Cmp->hasOneUse() || !(Select = dyn_cast<SelectInst>(*I->use_begin()))) + return ReductionInstDesc(false, I); + return ReductionInstDesc(Select, Prev.MinMaxKind); + } + + // Only handle single use cases for now. + if (!(Select = dyn_cast<SelectInst>(I))) + return ReductionInstDesc(false, I); + if (!(Cmp = dyn_cast<ICmpInst>(I->getOperand(0))) && + !(Cmp = dyn_cast<FCmpInst>(I->getOperand(0)))) + return ReductionInstDesc(false, I); + if (!Cmp->hasOneUse()) + return ReductionInstDesc(false, I); + + Value *CmpLeft; + Value *CmpRight; + + // Look for a min/max pattern. 
+ if (m_UMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMin); + else if (m_UMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_UIntMax); + else if (m_SMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMax); + else if (m_SMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_SIntMin); + else if (m_OrdFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_OrdFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + else if (m_UnordFMin(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMin); + else if (m_UnordFMax(m_Value(CmpLeft), m_Value(CmpRight)).match(Select)) + return ReductionInstDesc(Select, MRK_FloatMax); + + return ReductionInstDesc(false, I); +} + +LoopVectorizationLegality::ReductionInstDesc LoopVectorizationLegality::isReductionInstr(Instruction *I, - ReductionKind Kind) { + ReductionKind Kind, + ReductionInstDesc &Prev) { bool FP = I->getType()->isFloatingPointTy(); bool FastMath = (FP && I->isCommutative() && I->isAssociative()); - switch (I->getOpcode()) { default: - return false; + return ReductionInstDesc(false, I); case Instruction::PHI: - if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd)) - return false; - // possibly. - return true; + if (FP && (Kind != RK_FloatMult && Kind != RK_FloatAdd && + Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return ReductionInstDesc(I, Prev.MinMaxKind); case Instruction::Sub: case Instruction::Add: - return Kind == RK_IntegerAdd; - case Instruction::SDiv: - case Instruction::UDiv: + return ReductionInstDesc(Kind == RK_IntegerAdd, I); case Instruction::Mul: - return Kind == RK_IntegerMult; + return ReductionInstDesc(Kind == RK_IntegerMult, I); case Instruction::And: - return Kind == RK_IntegerAnd; + return ReductionInstDesc(Kind == RK_IntegerAnd, I); case Instruction::Or: - return Kind == RK_IntegerOr; + return ReductionInstDesc(Kind == RK_IntegerOr, I); case Instruction::Xor: - return Kind == RK_IntegerXor; + return ReductionInstDesc(Kind == RK_IntegerXor, I); case Instruction::FMul: - return Kind == RK_FloatMult && FastMath; + return ReductionInstDesc(Kind == RK_FloatMult && FastMath, I); case Instruction::FAdd: - return Kind == RK_FloatAdd && FastMath; - } + return ReductionInstDesc(Kind == RK_FloatAdd && FastMath, I); + case Instruction::FCmp: + case Instruction::ICmp: + case Instruction::Select: + if (Kind != RK_IntegerMinMax && + (!HasFunNoNaNAttr || Kind != RK_FloatMinMax)) + return ReductionInstDesc(false, I); + return isMinMaxSelectCmpPattern(I, Prev); + } } LoopVectorizationLegality::InductionKind @@ -3238,6 +3530,10 @@ unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) { // For each instruction in the old loop. for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + // Skip dbg intrinsics. 
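(Aside, not part of the diff: the select(cmp()) patterns matched above correspond to ordinary scalar min/max reduction loops. A hedged example of source whose compare/select pair would be classified as a signed-max reduction, MRK_SIntMax; the function itself is illustrative only.)

    int maxOf(const int *A, int N) {
      int Max = A[0];                     // reduction start value
      for (int i = 1; i < N; ++i)
        Max = (A[i] > Max) ? A[i] : Max;  // icmp sgt + select -> signed max reduction
      return Max;
    }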
+ if (isa<DbgInfoIntrinsic>(it)) + continue; + unsigned C = getInstructionCost(it, VF); Cost += C; DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " << @@ -3297,14 +3593,25 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { case Instruction::AShr: case Instruction::And: case Instruction::Or: - case Instruction::Xor: - return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy); + case Instruction::Xor: { + // Certain instructions can be cheaper to vectorize if they have a constant + // second vector operand. One example of this are shifts on x86. + TargetTransformInfo::OperandValueKind Op1VK = + TargetTransformInfo::OK_AnyValue; + TargetTransformInfo::OperandValueKind Op2VK = + TargetTransformInfo::OK_AnyValue; + + if (isa<ConstantInt>(I->getOperand(1))) + Op2VK = TargetTransformInfo::OK_UniformConstantValue; + + return TTI.getArithmeticInstrCost(I->getOpcode(), VectorTy, Op1VK, Op2VK); + } case Instruction::Select: { SelectInst *SI = cast<SelectInst>(I); const SCEV *CondSCEV = SE->getSCEV(SI->getCondition()); bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop)); Type *CondTy = SI->getCondition()->getType(); - if (ScalarCond) + if (!ScalarCond) CondTy = VectorType::get(CondTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy); @@ -3335,9 +3642,11 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) { TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS); // Scalarized loads/stores. - int Stride = Legal->isConsecutivePtr(Ptr); - bool Reverse = Stride < 0; - if (0 == Stride) { + int ConsecutiveStride = Legal->isConsecutivePtr(Ptr); + bool Reverse = ConsecutiveStride < 0; + unsigned ScalarAllocatedSize = DL->getTypeAllocSize(ValTy); + unsigned VectorElementSize = DL->getTypeStoreSize(VectorTy)/VF; + if (!ConsecutiveStride || ScalarAllocatedSize != VectorElementSize) { unsigned Cost = 0; // The cost of extracting from the value vector and pointer vector. Type *PtrTy = ToVectorTy(Ptr->getType(), VF); diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp new file mode 100644 index 0000000000..cc30cc9278 --- /dev/null +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -0,0 +1,348 @@ +//===- SLPVectorizer.cpp - A bottom up SLP Vectorizer ---------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// This pass implements the Bottom Up SLP vectorizer. It detects consecutive +// stores that can be put together into vector-stores. Next, it attempts to +// construct vectorizable tree using the use-def chains. If a profitable tree +// was found, the SLP vectorizer performs vectorization on the tree. +// +// The pass is inspired by the work described in the paper: +// "Loop-Aware SLP in GCC" by Ira Rosen, Dorit Nuzman, Ayal Zaks. 
+// +//===----------------------------------------------------------------------===// +#define SV_NAME "slp-vectorizer" +#define DEBUG_TYPE SV_NAME + +#include "VecUtils.h" +#include "llvm/Transforms/Vectorize.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include <map> + +using namespace llvm; + +static cl::opt<int> +SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden, + cl::desc("Only vectorize trees if the gain is above this " + "number. (gain = -cost of vectorization)")); +namespace { + +/// The SLPVectorizer Pass. +struct SLPVectorizer : public FunctionPass { + typedef std::map<Value*, BoUpSLP::StoreList> StoreListMap; + + /// Pass identification, replacement for typeid + static char ID; + + explicit SLPVectorizer() : FunctionPass(ID) { + initializeSLPVectorizerPass(*PassRegistry::getPassRegistry()); + } + + ScalarEvolution *SE; + DataLayout *DL; + TargetTransformInfo *TTI; + AliasAnalysis *AA; + LoopInfo *LI; + + virtual bool runOnFunction(Function &F) { + SE = &getAnalysis<ScalarEvolution>(); + DL = getAnalysisIfAvailable<DataLayout>(); + TTI = &getAnalysis<TargetTransformInfo>(); + AA = &getAnalysis<AliasAnalysis>(); + LI = &getAnalysis<LoopInfo>(); + + StoreRefs.clear(); + bool Changed = false; + + // Must have DataLayout. We can't require it because some tests run w/o + // triple. + if (!DL) + return false; + + for (Function::iterator it = F.begin(), e = F.end(); it != e; ++it) { + BasicBlock *BB = it; + bool BBChanged = false; + + // Use the bollom up slp vectorizer to construct chains that start with + // he store instructions. + BoUpSLP R(BB, SE, DL, TTI, AA, LI->getLoopFor(BB)); + + // Vectorize trees that end at reductions. + BBChanged |= vectorizeReductions(BB, R); + + // Vectorize trees that end at stores. + if (unsigned count = collectStores(BB, R)) { + (void)count; + DEBUG(dbgs()<<"SLP: Found " << count << " stores to vectorize.\n"); + BBChanged |= vectorizeStoreChains(R); + } + + // Try to hoist some of the scalarization code to the preheader. + if (BBChanged) hoistGatherSequence(LI, BB, R); + + Changed |= BBChanged; + } + + if (Changed) { + DEBUG(dbgs()<<"SLP: vectorized \""<<F.getName()<<"\"\n"); + DEBUG(verifyFunction(F)); + } + return Changed; + } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + FunctionPass::getAnalysisUsage(AU); + AU.addRequired<ScalarEvolution>(); + AU.addRequired<AliasAnalysis>(); + AU.addRequired<TargetTransformInfo>(); + AU.addRequired<LoopInfo>(); + } + +private: + + /// \brief Collect memory references and sort them according to their base + /// object. We sort the stores to their base objects to reduce the cost of the + /// quadratic search on the stores. TODO: We can further reduce this cost + /// if we flush the chain creation every time we run into a memory barrier. + unsigned collectStores(BasicBlock *BB, BoUpSLP &R); + + /// \brief Try to vectorize a chain that starts at two arithmetic instrs. + bool tryToVectorizePair(Value *A, Value *B, BoUpSLP &R); + + /// \brief Try to vectorize a list of operands. 
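(Aside, not part of the diff: the kind of code the new SLP pass targets. Four consecutive stores with isomorphic operand trees, as in the illustrative function below, can be merged into one wide load/add/store sequence when the tree cost beats the slp-threshold.)

    void addFour(int *A, const int *B, const int *C) {  // hypothetical example
      A[0] = B[0] + C[0];
      A[1] = B[1] + C[1];
      A[2] = B[2] + C[2];
      A[3] = B[3] + C[3];                                // -> one <4 x i32> store
    }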
+ bool tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R); + + /// \brief Try to vectorize a chain that may start at the operands of \V; + bool tryToVectorize(BinaryOperator *V, BoUpSLP &R); + + /// \brief Vectorize the stores that were collected in StoreRefs. + bool vectorizeStoreChains(BoUpSLP &R); + + /// \brief Try to hoist gather sequences outside of the loop in cases where + /// all of the sources are loop invariant. + void hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, BoUpSLP &R); + + /// \brief Scan the basic block and look for reductions that may start a + /// vectorization chain. + bool vectorizeReductions(BasicBlock *BB, BoUpSLP &R); + +private: + StoreListMap StoreRefs; +}; + +unsigned SLPVectorizer::collectStores(BasicBlock *BB, BoUpSLP &R) { + unsigned count = 0; + StoreRefs.clear(); + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + StoreInst *SI = dyn_cast<StoreInst>(it); + if (!SI) + continue; + + // Check that the pointer points to scalars. + Type *Ty = SI->getValueOperand()->getType(); + if (Ty->isAggregateType() || Ty->isVectorTy()) + return 0; + + // Find the base of the GEP. + Value *Ptr = SI->getPointerOperand(); + if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr)) + Ptr = GEP->getPointerOperand(); + + // Save the store locations. + StoreRefs[Ptr].push_back(SI); + count++; + } + return count; +} + +bool SLPVectorizer::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) { + if (!A || !B) return false; + Value *VL[] = { A, B }; + return tryToVectorizeList(VL, R); +} + +bool SLPVectorizer::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R) { + DEBUG(dbgs()<<"SLP: Vectorizing a list of length = " << VL.size() << ".\n"); + + // Check that all of the parts are scalar. + for (int i = 0, e = VL.size(); i < e; ++i) { + Type *Ty = VL[i]->getType(); + if (Ty->isAggregateType() || Ty->isVectorTy()) + return 0; + } + + int Cost = R.getTreeCost(VL); + int ExtrCost = R.getScalarizationCost(VL); + DEBUG(dbgs()<<"SLP: Cost of pair:" << Cost << + " Cost of extract:" << ExtrCost << ".\n"); + if ((Cost+ExtrCost) >= -SLPCostThreshold) return false; + DEBUG(dbgs()<<"SLP: Vectorizing pair.\n"); + R.vectorizeArith(VL); + return true; +} + +bool SLPVectorizer::tryToVectorize(BinaryOperator *V, BoUpSLP &R) { + if (!V) return false; + // Try to vectorize V. + if (tryToVectorizePair(V->getOperand(0), V->getOperand(1), R)) + return true; + + BinaryOperator *A = dyn_cast<BinaryOperator>(V->getOperand(0)); + BinaryOperator *B = dyn_cast<BinaryOperator>(V->getOperand(1)); + // Try to skip B. + if (B && B->hasOneUse()) { + BinaryOperator *B0 = dyn_cast<BinaryOperator>(B->getOperand(0)); + BinaryOperator *B1 = dyn_cast<BinaryOperator>(B->getOperand(1)); + if (tryToVectorizePair(A, B0, R)) { + B->moveBefore(V); + return true; + } + if (tryToVectorizePair(A, B1, R)) { + B->moveBefore(V); + return true; + } + } + + // Try to skip A. + if (A && A->hasOneUse()) { + BinaryOperator *A0 = dyn_cast<BinaryOperator>(A->getOperand(0)); + BinaryOperator *A1 = dyn_cast<BinaryOperator>(A->getOperand(1)); + if (tryToVectorizePair(A0, B, R)) { + A->moveBefore(V); + return true; + } + if (tryToVectorizePair(A1, B, R)) { + A->moveBefore(V); + return true; + } + } + return 0; +} + +bool SLPVectorizer::vectorizeReductions(BasicBlock *BB, BoUpSLP &R) { + bool Changed = false; + for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) { + if (isa<DbgInfoIntrinsic>(it)) continue; + + // Try to vectorize reductions that use PHINodes. 
+ if (PHINode *P = dyn_cast<PHINode>(it)) { + // Check that the PHI is a reduction PHI. + if (P->getNumIncomingValues() != 2) return Changed; + Value *Rdx = (P->getIncomingBlock(0) == BB ? P->getIncomingValue(0) : + (P->getIncomingBlock(1) == BB ? P->getIncomingValue(1) : + 0)); + // Check if this is a Binary Operator. + BinaryOperator *BI = dyn_cast_or_null<BinaryOperator>(Rdx); + if (!BI) + continue; + + Value *Inst = BI->getOperand(0); + if (Inst == P) Inst = BI->getOperand(1); + Changed |= tryToVectorize(dyn_cast<BinaryOperator>(Inst), R); + continue; + } + + // Try to vectorize trees that start at compare instructions. + if (CmpInst *CI = dyn_cast<CmpInst>(it)) { + if (tryToVectorizePair(CI->getOperand(0), CI->getOperand(1), R)) { + Changed |= true; + continue; + } + for (int i = 0; i < 2; ++i) + if (BinaryOperator *BI = dyn_cast<BinaryOperator>(CI->getOperand(i))) + Changed |= tryToVectorizePair(BI->getOperand(0), BI->getOperand(1), R); + continue; + } + } + + return Changed; +} + +bool SLPVectorizer::vectorizeStoreChains(BoUpSLP &R) { + bool Changed = false; + // Attempt to sort and vectorize each of the store-groups. + for (StoreListMap::iterator it = StoreRefs.begin(), e = StoreRefs.end(); + it != e; ++it) { + if (it->second.size() < 2) + continue; + + DEBUG(dbgs()<<"SLP: Analyzing a store chain of length " << + it->second.size() << ".\n"); + + Changed |= R.vectorizeStores(it->second, -SLPCostThreshold); + } + return Changed; +} + +void SLPVectorizer::hoistGatherSequence(LoopInfo *LI, BasicBlock *BB, + BoUpSLP &R) { + // Check if this block is inside a loop. + Loop *L = LI->getLoopFor(BB); + if (!L) + return; + + // Check if it has a preheader. + BasicBlock *PreHeader = L->getLoopPreheader(); + if (!PreHeader) + return; + + // Mark the insertion point for the block. + Instruction *Location = PreHeader->getTerminator(); + + BoUpSLP::ValueList &Gathers = R.getGatherSeqInstructions(); + for (BoUpSLP::ValueList::iterator it = Gathers.begin(), e = Gathers.end(); + it != e; ++it) { + InsertElementInst *Insert = dyn_cast<InsertElementInst>(*it); + + // The InsertElement sequence can be simplified into a constant. + if (!Insert) + continue; + + // If the vector or the element that we insert into it are + // instructions that are defined in this basic block then we can't + // hoist this instruction. + Instruction *CurrVec = dyn_cast<Instruction>(Insert->getOperand(0)); + Instruction *NewElem = dyn_cast<Instruction>(Insert->getOperand(1)); + if (CurrVec && L->contains(CurrVec)) continue; + if (NewElem && L->contains(NewElem)) continue; + + // We can hoist this instruction. Move it to the pre-header. 
+ Insert->moveBefore(Location); + } +} + +} // end anonymous namespace + +char SLPVectorizer::ID = 0; +static const char lv_name[] = "SLP Vectorizer"; +INITIALIZE_PASS_BEGIN(SLPVectorizer, SV_NAME, lv_name, false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolution) +INITIALIZE_PASS_DEPENDENCY(LoopSimplify) +INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) + +namespace llvm { + Pass *createSLPVectorizerPass() { + return new SLPVectorizer(); + } +} + diff --git a/lib/Transforms/Vectorize/VecUtils.cpp b/lib/Transforms/Vectorize/VecUtils.cpp new file mode 100644 index 0000000000..9b9436683b --- /dev/null +++ b/lib/Transforms/Vectorize/VecUtils.cpp @@ -0,0 +1,730 @@ +//===- VecUtils.cpp --- Vectorization Utilities ---------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +#define DEBUG_TYPE "SLP" + +#include "VecUtils.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/Verifier.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Target/TargetLibraryInfo.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/Local.h" +#include <algorithm> +#include <map> + +using namespace llvm; + +static const unsigned MinVecRegSize = 128; + +static const unsigned RecursionMaxDepth = 6; + +namespace llvm { + +BoUpSLP::BoUpSLP(BasicBlock *Bb, ScalarEvolution *S, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp) : + BB(Bb), SE(S), DL(Dl), TTI(Tti), AA(Aa), L(Lp) { + numberInstructions(); +} + +void BoUpSLP::numberInstructions() { + int Loc = 0; + InstrIdx.clear(); + InstrVec.clear(); + // Number the instructions in the block. + for (BasicBlock::iterator it=BB->begin(), e=BB->end(); it != e; ++it) { + InstrIdx[it] = Loc++; + InstrVec.push_back(it); + assert(InstrVec[InstrIdx[it]] == it && "Invalid allocation"); + } +} + +Value *BoUpSLP::getPointerOperand(Value *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return LI->getPointerOperand(); + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return SI->getPointerOperand(); + return 0; +} + +unsigned BoUpSLP::getAddressSpaceOperand(Value *I) { + if (LoadInst *L=dyn_cast<LoadInst>(I)) return L->getPointerAddressSpace(); + if (StoreInst *S=dyn_cast<StoreInst>(I)) return S->getPointerAddressSpace(); + return -1; +} + +bool BoUpSLP::isConsecutiveAccess(Value *A, Value *B) { + Value *PtrA = getPointerOperand(A); + Value *PtrB = getPointerOperand(B); + unsigned ASA = getAddressSpaceOperand(A); + unsigned ASB = getAddressSpaceOperand(B); + + // Check that the address spaces match and that the pointers are valid. 
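+  // (getPointerOperand returns null and getAddressSpaceOperand returns -1 for
+  // anything that is not a load or a store, so such values fail this check.)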
+  if (!PtrA || !PtrB || (ASA != ASB)) return false;
+
+  // Check that A and B are of the same type.
+  if (PtrA->getType() != PtrB->getType()) return false;
+
+  // Calculate the distance.
+  const SCEV *PtrSCEVA = SE->getSCEV(PtrA);
+  const SCEV *PtrSCEVB = SE->getSCEV(PtrB);
+  const SCEV *OffsetSCEV = SE->getMinusSCEV(PtrSCEVA, PtrSCEVB);
+  const SCEVConstant *ConstOffSCEV = dyn_cast<SCEVConstant>(OffsetSCEV);
+
+  // Non-constant distance.
+  if (!ConstOffSCEV) return false;
+
+  int64_t Offset = ConstOffSCEV->getValue()->getSExtValue();
+  Type *Ty = cast<PointerType>(PtrA->getType())->getElementType();
+  // The instructions are consecutive if the size of the first load/store is
+  // the same as the offset.
+  int64_t Sz = DL->getTypeStoreSize(Ty);
+  return ((-Offset) == Sz);
+}
+
+bool BoUpSLP::vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold) {
+  Type *StoreTy = cast<StoreInst>(Chain[0])->getValueOperand()->getType();
+  unsigned Sz = DL->getTypeSizeInBits(StoreTy);
+  unsigned VF = MinVecRegSize / Sz;
+
+  if (!isPowerOf2_32(Sz) || VF < 2) return false;
+
+  bool Changed = false;
+  // Look for profitable vectorizable trees at all offsets, starting at zero.
+  for (unsigned i = 0, e = Chain.size(); i < e; ++i) {
+    if (i + VF > e) return Changed;
+    DEBUG(dbgs()<<"SLP: Analyzing " << VF << " stores at offset "<< i << "\n");
+    ArrayRef<Value *> Operands = Chain.slice(i, VF);
+
+    int Cost = getTreeCost(Operands);
+    DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n");
+    if (Cost < CostThreshold) {
+      DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n");
+      vectorizeTree(Operands, VF);
+      i += VF - 1;
+      Changed = true;
+    }
+  }
+
+  return Changed;
+}
+
+bool BoUpSLP::vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold) {
+  ValueSet Heads, Tails;
+  SmallDenseMap<Value*, Value*> ConsecutiveChain;
+
+  // We may run into multiple chains that merge into a single chain. We mark the
+  // stores that we vectorized so that we don't visit the same store twice.
+  ValueSet VectorizedStores;
+  bool Changed = false;
+
+  // Do a quadratic search on all of the given stores and find
+  // all of the pairs of stores that follow each other.
+  for (unsigned i = 0, e = Stores.size(); i < e; ++i)
+    for (unsigned j = 0; j < e; ++j) {
+      if (i == j) continue;
+      if (isConsecutiveAccess(Stores[i], Stores[j])) {
+        Tails.insert(Stores[j]);
+        Heads.insert(Stores[i]);
+        ConsecutiveChain[Stores[i]] = Stores[j];
+      }
+    }
+
+  // For stores that start but don't end a link in the chain:
+  for (ValueSet::iterator it = Heads.begin(), e = Heads.end(); it != e; ++it) {
+    if (Tails.count(*it)) continue;
+
+    // We found a store instruction that starts a chain. Now follow the chain
+    // and try to vectorize it.
+    ValueList Operands;
+    Value *I = *it;
+    // Collect the chain into a list.
+    while (Tails.count(I) || Heads.count(I)) {
+      if (VectorizedStores.count(I)) break;
+      Operands.push_back(I);
+      // Move to the next value in the chain.
+      I = ConsecutiveChain[I];
+    }
+
+    bool Vectorized = vectorizeStoreChain(Operands, costThreshold);
+
+    // Mark the vectorized stores so that we don't vectorize them again.
+    if (Vectorized)
+      VectorizedStores.insert(Operands.begin(), Operands.end());
+    Changed |= Vectorized;
+  }
+
+  return Changed;
+}
+
+int BoUpSLP::getScalarizationCost(ArrayRef<Value *> VL) {
+  // Find the type of the operands in VL.
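+  // For stores the relevant type is the type of the stored value, not the
+  // type of the pointer operand.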
+ Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + // Find the cost of inserting/extracting values from the vector. + return getScalarizationCost(VecTy); +} + +int BoUpSLP::getScalarizationCost(Type *Ty) { + int Cost = 0; + for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i) + Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i); + return Cost; +} + +AliasAnalysis::Location BoUpSLP::getLocation(Instruction *I) { + if (StoreInst *SI = dyn_cast<StoreInst>(I)) return AA->getLocation(SI); + if (LoadInst *LI = dyn_cast<LoadInst>(I)) return AA->getLocation(LI); + return AliasAnalysis::Location(); +} + +Value *BoUpSLP::isUnsafeToSink(Instruction *Src, Instruction *Dst) { + assert(Src->getParent() == Dst->getParent() && "Not the same BB"); + BasicBlock::iterator I = Src, E = Dst; + /// Scan all of the instruction from SRC to DST and check if + /// the source may alias. + for (++I; I != E; ++I) { + // Ignore store instructions that are marked as 'ignore'. + if (MemBarrierIgnoreList.count(I)) continue; + if (Src->mayWriteToMemory()) /* Write */ { + if (!I->mayReadOrWriteMemory()) continue; + } else /* Read */ { + if (!I->mayWriteToMemory()) continue; + } + AliasAnalysis::Location A = getLocation(&*I); + AliasAnalysis::Location B = getLocation(Src); + + if (!A.Ptr || !B.Ptr || AA->alias(A, B)) + return I; + } + return 0; +} + +void BoUpSLP::vectorizeArith(ArrayRef<Value *> Operands) { + Value *Vec = vectorizeTree(Operands, Operands.size()); + BasicBlock::iterator Loc = cast<Instruction>(Vec); + IRBuilder<> Builder(++Loc); + // After vectorizing the operands we need to generate extractelement + // instructions and replace all of the uses of the scalar values with + // the values that we extracted from the vectorized tree. + for (unsigned i = 0, e = Operands.size(); i != e; ++i) { + Value *S = Builder.CreateExtractElement(Vec, Builder.getInt32(i)); + Operands[i]->replaceAllUsesWith(S); + } +} + +int BoUpSLP::getTreeCost(ArrayRef<Value *> VL) { + // Get rid of the list of stores that were removed, and from the + // lists of instructions with multiple users. + MemBarrierIgnoreList.clear(); + LaneMap.clear(); + MultiUserVals.clear(); + MustScalarize.clear(); + + // Scan the tree and find which value is used by which lane, and which values + // must be scalarized. + getTreeUses_rec(VL, 0); + + // Check that instructions with multiple users can be vectorized. Mark unsafe + // instructions. + for (ValueSet::iterator it = MultiUserVals.begin(), + e = MultiUserVals.end(); it != e; ++it) { + // Check that all of the users of this instr are within the tree + // and that they are all from the same lane. + int Lane = -1; + for (Value::use_iterator I = (*it)->use_begin(), E = (*it)->use_end(); + I != E; ++I) { + if (LaneMap.find(*I) == LaneMap.end()) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"SLP: Adding " << **it << + " to MustScalarize because of an out of tree usage.\n"); + break; + } + if (Lane == -1) Lane = LaneMap[*I]; + if (Lane != LaneMap[*I]) { + MustScalarize.insert(*it); + DEBUG(dbgs()<<"Adding " << **it << + " to MustScalarize because multiple lane use it: " + << Lane << " and " << LaneMap[*I] << ".\n"); + break; + } + } + } + + // Now calculate the cost of vectorizing the tree. 
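+  // A negative result means that vectorizing the tree is expected to be
+  // profitable compared to the scalar code.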
+ return getTreeCost_rec(VL, 0); +} + +void BoUpSLP::getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth) { + if (Depth == RecursionMaxDepth) return; + + // Don't handle vectors. + if (VL[0]->getType()->isVectorTy()) return; + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + if (SI->getValueOperand()->getType()->isVectorTy()) return; + + // Check if all of the operands are constants. + bool AllConst = true; + bool AllSameScalar = true; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + AllConst &= isa<Constant>(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If one of the instructions is out of this BB, we need to scalarize all. + if (I && I->getParent() != BB) return; + } + + // If all of the operands are identical or constant we have a simple solution. + if (AllConst || AllSameScalar) return; + + // Scalarize unknown structures. + Instruction *VL0 = dyn_cast<Instruction>(VL[0]); + if (!VL0) return; + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return; + } + + // Mark instructions with multiple users. + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // Remember to check if all of the users of this instr are vectorized + // within our tree. + if (I && I->getNumUses() > 1) MultiUserVals.insert(I); + } + + for (int i = 0, e = VL.size(); i < e; ++i) { + // Check that the instruction is only used within + // one lane. + if (LaneMap.count(VL[i]) && LaneMap[VL[i]] != i) return; + // Make this instruction as 'seen' and remember the lane. + LaneMap[VL[i]] = i; + } + + switch (Opcode) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + getTreeUses_rec(Operands, Depth+1); + } + return; + } + case Instruction::Store: { + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + getTreeUses_rec(Operands, Depth+1); + return; + } + default: + return; + } +} + +int BoUpSLP::getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth) { + Type *ScalarTy = VL[0]->getType(); + + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + + /// Don't mess with vectors. 
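+  // Vector-typed scalars would require building vectors of vectors, so such
+  // trees are rejected with max_cost.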
+ if (ScalarTy->isVectorTy()) return max_cost; + VectorType *VecTy = VectorType::get(ScalarTy, VL.size()); + + if (Depth == RecursionMaxDepth) return getScalarizationCost(VecTy); + + // Check if all of the operands are constants. + bool AllConst = true; + bool AllSameScalar = true; + bool MustScalarizeFlag = false; + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + AllConst &= isa<Constant>(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + // Must have a single use. + Instruction *I = dyn_cast<Instruction>(VL[i]); + MustScalarizeFlag |= MustScalarize.count(VL[i]); + // This instruction is outside the basic block. + if (I && I->getParent() != BB) + return getScalarizationCost(VecTy); + } + + // Is this a simple vector constant. + if (AllConst) return 0; + + // If all of the operands are identical we can broadcast them. + Instruction *VL0 = dyn_cast<Instruction>(VL[0]); + if (AllSameScalar) { + // If we are in a loop, and this is not an instruction (e.g. constant or + // argument) or the instruction is defined outside the loop then assume + // that the cost is zero. + if (L && (!VL0 || !L->contains(VL0))) + return 0; + + // We need to broadcast the scalar. + return TTI->getShuffleCost(TargetTransformInfo::SK_Broadcast, VecTy, 0); + } + + // If this is not a constant, or a scalar from outside the loop then we + // need to scalarize it. + if (MustScalarizeFlag) + return getScalarizationCost(VecTy); + + if (!VL0) return getScalarizationCost(VecTy); + assert(VL0->getParent() == BB && "Wrong BB"); + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VL.size(); i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return getScalarizationCost(VecTy); + } + + // Check if it is safe to sink the loads or the stores. + if (Opcode == Instruction::Load || Opcode == Instruction::Store) { + int MaxIdx = InstrIdx[VL0]; + for (unsigned i = 1, e = VL.size(); i < e; ++i ) + MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); + + Instruction *Last = InstrVec[MaxIdx]; + for (unsigned i = 0, e = VL.size(); i < e; ++i ) { + if (VL[i] == Last) continue; + Value *Barrier = isUnsafeToSink(cast<Instruction>(VL[i]), Last); + if (Barrier) { + DEBUG(dbgs() << "SLP: Can't sink " << *VL[i] << "\n down to " << + *Last << "\n because of " << *Barrier << "\n"); + return max_cost; + } + } + } + + switch (Opcode) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + int Cost = 0; + ValueList Operands; + Type *SrcTy = VL0->getOperand(0)->getType(); + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + // Check that the casted type is the same for all users. + if (cast<Instruction>(VL[j])->getOperand(0)->getType() != SrcTy) + return getScalarizationCost(VecTy); + } + + Cost += getTreeCost_rec(Operands, Depth+1); + if (Cost >= max_cost) return max_cost; + + // Calculate the cost of this instruction. 
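+    // The cost is reported as the difference between one vector cast and
+    // VL.size() scalar casts, added on top of the cost of the operands.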
+ int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(), + VL0->getType(), SrcTy); + + VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size()); + int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy); + Cost += (VecCost - ScalarCost); + return Cost; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + int Cost = 0; + // Calculate the cost of all of the operands. + for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) { + ValueList Operands; + // Prepare the operand vector. + for (unsigned j = 0; j < VL.size(); ++j) + Operands.push_back(cast<Instruction>(VL[j])->getOperand(i)); + + Cost += getTreeCost_rec(Operands, Depth+1); + if (Cost >= max_cost) return max_cost; + } + + // Calculate the cost of this instruction. + int ScalarCost = VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy); + + int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy); + Cost += (VecCost - ScalarCost); + return Cost; + } + case Instruction::Load: { + // If we are scalarize the loads, add the cost of forming the vector. + for (unsigned i = 0, e = VL.size()-1; i < e; ++i) + if (!isConsecutiveAccess(VL[i], VL[i+1])) + return getScalarizationCost(VecTy); + + // Cost of wide load - cost of scalar loads. + int ScalarLdCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + int VecLdCost = TTI->getMemoryOpCost(Instruction::Load, ScalarTy, 1, 0); + return VecLdCost - ScalarLdCost; + } + case Instruction::Store: { + // We know that we can merge the stores. Calculate the cost. + int ScalarStCost = VecTy->getNumElements() * + TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1, 0); + int VecStCost = TTI->getMemoryOpCost(Instruction::Store, ScalarTy, 1,0); + int StoreCost = VecStCost - ScalarStCost; + + ValueList Operands; + for (unsigned j = 0; j < VL.size(); ++j) { + Operands.push_back(cast<Instruction>(VL[j])->getOperand(0)); + MemBarrierIgnoreList.insert(VL[j]); + } + + int TotalCost = StoreCost + getTreeCost_rec(Operands, Depth + 1); + return TotalCost; + } + default: + // Unable to vectorize unknown instructions. + return getScalarizationCost(VecTy); + } +} + +Instruction *BoUpSLP::GetLastInstr(ArrayRef<Value *> VL, unsigned VF) { + int MaxIdx = InstrIdx[BB->getFirstNonPHI()]; + for (unsigned i = 0; i < VF; ++i ) + MaxIdx = std::max(MaxIdx, InstrIdx[VL[i]]); + return InstrVec[MaxIdx + 1]; +} + +Value *BoUpSLP::Scalarize(ArrayRef<Value *> VL, VectorType *Ty) { + IRBuilder<> Builder(GetLastInstr(VL, Ty->getNumElements())); + Value *Vec = UndefValue::get(Ty); + for (unsigned i=0; i < Ty->getNumElements(); ++i) { + // Generate the 'InsertElement' instruction. + Vec = Builder.CreateInsertElement(Vec, VL[i], Builder.getInt32(i)); + // Remember that this instruction is used as part of a 'gather' sequence. + // The caller of the bottom-up slp vectorizer can try to hoist the sequence + // if the users are outside of the basic block. 
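+    // (SLPVectorizer::hoistGatherSequence is the caller that consumes this
+    // list.)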
+ GatherInstructions.push_back(Vec); + } + + return Vec; +} + +Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, int VF) { + Value *V = vectorizeTree_rec(VL, VF); + // We moved some instructions around. We have to number them again + // before we can do any analysis. + numberInstructions(); + MustScalarize.clear(); + return V; +} + +Value *BoUpSLP::vectorizeTree_rec(ArrayRef<Value *> VL, int VF) { + Type *ScalarTy = VL[0]->getType(); + if (StoreInst *SI = dyn_cast<StoreInst>(VL[0])) + ScalarTy = SI->getValueOperand()->getType(); + VectorType *VecTy = VectorType::get(ScalarTy, VF); + + // Check if all of the operands are constants or identical. + bool AllConst = true; + bool AllSameScalar = true; + for (unsigned i = 0, e = VF; i < e; ++i) { + AllConst &= isa<Constant>(VL[i]); + AllSameScalar &= (VL[0] == VL[i]); + // The instruction must be in the same BB, and it must be vectorizable. + Instruction *I = dyn_cast<Instruction>(VL[i]); + if (MustScalarize.count(VL[i]) || (I && I->getParent() != BB)) + return Scalarize(VL, VecTy); + } + + // Check that this is a simple vector constant. + if (AllConst || AllSameScalar) return Scalarize(VL, VecTy); + + // Scalarize unknown structures. + Instruction *VL0 = dyn_cast<Instruction>(VL[0]); + if (!VL0) return Scalarize(VL, VecTy); + + if (VectorizedValues.count(VL0)) return VectorizedValues[VL0]; + + unsigned Opcode = VL0->getOpcode(); + for (unsigned i = 0, e = VF; i < e; ++i) { + Instruction *I = dyn_cast<Instruction>(VL[i]); + // If not all of the instructions are identical then we have to scalarize. + if (!I || Opcode != I->getOpcode()) return Scalarize(VL, VecTy); + } + + switch (Opcode) { + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::FPExt: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::SIToFP: + case Instruction::UIToFP: + case Instruction::Trunc: + case Instruction::FPTrunc: + case Instruction::BitCast: { + ValueList INVL; + for (int i = 0; i < VF; ++i) + INVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); + Value *InVec = vectorizeTree_rec(INVL, VF); + IRBuilder<> Builder(GetLastInstr(VL, VF)); + CastInst *CI = dyn_cast<CastInst>(VL0); + Value *V = Builder.CreateCast(CI->getOpcode(), InVec, VecTy); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Add: + case Instruction::FAdd: + case Instruction::Sub: + case Instruction::FSub: + case Instruction::Mul: + case Instruction::FMul: + case Instruction::UDiv: + case Instruction::SDiv: + case Instruction::FDiv: + case Instruction::URem: + case Instruction::SRem: + case Instruction::FRem: + case Instruction::Shl: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::And: + case Instruction::Or: + case Instruction::Xor: { + ValueList LHSVL, RHSVL; + for (int i = 0; i < VF; ++i) { + RHSVL.push_back(cast<Instruction>(VL[i])->getOperand(0)); + LHSVL.push_back(cast<Instruction>(VL[i])->getOperand(1)); + } + + Value *RHS = vectorizeTree_rec(RHSVL, VF); + Value *LHS = vectorizeTree_rec(LHSVL, VF); + IRBuilder<> Builder(GetLastInstr(VL, VF)); + BinaryOperator *BinOp = cast<BinaryOperator>(VL0); + Value *V = Builder.CreateBinOp(BinOp->getOpcode(), RHS,LHS); + VectorizedValues[VL0] = V; + return V; + } + case Instruction::Load: { + LoadInst *LI = cast<LoadInst>(VL0); + unsigned Alignment = LI->getAlignment(); + + // Check if all of the loads are consecutive. 
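+    // For example, loads of A[0], A[1], A[2] and A[3] can be replaced below
+    // by one wide load through a pointer bitcast to a vector pointer type.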
+ for (unsigned i = 1, e = VF; i < e; ++i) + if (!isConsecutiveAccess(VL[i-1], VL[i])) + return Scalarize(VL, VecTy); + + IRBuilder<> Builder(GetLastInstr(VL, VF)); + Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(), + VecTy->getPointerTo()); + LI = Builder.CreateLoad(VecPtr); + LI->setAlignment(Alignment); + VectorizedValues[VL0] = LI; + return LI; + } + case Instruction::Store: { + StoreInst *SI = cast<StoreInst>(VL0); + unsigned Alignment = SI->getAlignment(); + + ValueList ValueOp; + for (int i = 0; i < VF; ++i) + ValueOp.push_back(cast<StoreInst>(VL[i])->getValueOperand()); + + Value *VecValue = vectorizeTree_rec(ValueOp, VF); + + IRBuilder<> Builder(GetLastInstr(VL, VF)); + Value *VecPtr = Builder.CreateBitCast(SI->getPointerOperand(), + VecTy->getPointerTo()); + Builder.CreateStore(VecValue, VecPtr)->setAlignment(Alignment); + + for (int i = 0; i < VF; ++i) + cast<Instruction>(VL[i])->eraseFromParent(); + return 0; + } + default: + Value *S = Scalarize(VL, VecTy); + VectorizedValues[VL0] = S; + return S; + } +} + +} // end of namespace diff --git a/lib/Transforms/Vectorize/VecUtils.h b/lib/Transforms/Vectorize/VecUtils.h new file mode 100644 index 0000000000..5456c6c779 --- /dev/null +++ b/lib/Transforms/Vectorize/VecUtils.h @@ -0,0 +1,164 @@ +//===- VecUtils.h - Vectorization Utilities -------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This family of classes and functions manipulate vectors and chains of +// vectors. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H +#define LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/AliasAnalysis.h" +#include <vector> + +namespace llvm { + +class BasicBlock; class Instruction; class Type; +class VectorType; class StoreInst; class Value; +class ScalarEvolution; class DataLayout; +class TargetTransformInfo; class AliasAnalysis; +class Loop; + +/// Bottom Up SLP vectorization utility class. +struct BoUpSLP { + typedef SmallVector<Value*, 8> ValueList; + typedef SmallPtrSet<Value*, 16> ValueSet; + typedef SmallVector<StoreInst*, 8> StoreList; + static const int max_cost = 1<<20; + + // \brief C'tor. + BoUpSLP(BasicBlock *Bb, ScalarEvolution *Se, DataLayout *Dl, + TargetTransformInfo *Tti, AliasAnalysis *Aa, Loop *Lp); + + /// \brief Take the pointer operand from the Load/Store instruction. + /// \returns NULL if this is not a valid Load/Store instruction. + static Value *getPointerOperand(Value *I); + + /// \brief Take the address space operand from the Load/Store instruction. + /// \returns -1 if this is not a valid Load/Store instruction. + static unsigned getAddressSpaceOperand(Value *I); + + /// \returns true if the memory operations A and B are consecutive. + bool isConsecutiveAccess(Value *A, Value *B); + + /// \brief Vectorize the tree that starts with the elements in \p VL. + /// \returns the vectorized value. + Value *vectorizeTree(ArrayRef<Value *> VL, int VF); + + /// \returns the vectorization cost of the subtree that starts at \p VL. + /// A negative number means that this is profitable. 
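+  /// A result of max_cost or above is treated as 'do not vectorize'.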
+  int getTreeCost(ArrayRef<Value *> VL);
+
+  /// \returns the scalarization cost for this list of values. Assuming that
+  /// this subtree gets vectorized, we may need to extract the values from the
+  /// roots. This method calculates the cost of extracting the values.
+  int getScalarizationCost(ArrayRef<Value *> VL);
+
+  /// \brief Attempts to order and vectorize a sequence of stores. This
+  /// function does a quadratic scan of the given stores.
+  /// \returns true if the basic block was modified.
+  bool vectorizeStores(ArrayRef<StoreInst *> Stores, int costThreshold);
+
+  /// \brief Vectorize a group of scalars into a vector tree.
+  void vectorizeArith(ArrayRef<Value *> Operands);
+
+  /// \returns the list of new instructions that were added in order to collect
+  /// scalars into vectors. This list can be used to further optimize the
+  /// gather sequences.
+  ValueList &getGatherSeqInstructions() { return GatherInstructions; }
+
+private:
+  /// \brief This method contains the recursive part of getTreeCost.
+  int getTreeCost_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This recursive method looks for vectorization hazards such as
+  /// values that are used by multiple users and checks that values are used
+  /// by only one vector lane. It updates the variables LaneMap, MultiUserVals.
+  void getTreeUses_rec(ArrayRef<Value *> VL, unsigned Depth);
+
+  /// \brief This method contains the recursive part of vectorizeTree.
+  Value *vectorizeTree_rec(ArrayRef<Value *> VL, int VF);
+
+  /// \brief Number all of the instructions in the block.
+  void numberInstructions();
+
+  /// \brief Vectorize a sorted sequence of stores.
+  bool vectorizeStoreChain(ArrayRef<Value *> Chain, int CostThreshold);
+
+  /// \returns the scalarization cost for this type. Scalarization in this
+  /// context means the creation of vectors from a group of scalars.
+  int getScalarizationCost(Type *Ty);
+
+  /// \returns the AA location that is being accessed by the instruction.
+  AliasAnalysis::Location getLocation(Instruction *I);
+
+  /// \brief Checks if it is possible to sink an instruction from
+  /// \p Src to \p Dst.
+  /// \returns the pointer to the barrier instruction if we can't sink.
+  Value *isUnsafeToSink(Instruction *Src, Instruction *Dst);
+
+  /// \returns the instruction that appears last in the BB from \p VL.
+  /// Only consider the first \p VF elements.
+  Instruction *GetLastInstr(ArrayRef<Value *> VL, unsigned VF);
+
+  /// \returns a vector from a collection of scalars in \p VL.
+  Value *Scalarize(ArrayRef<Value *> VL, VectorType *Ty);
+
+private:
+  /// Maps instructions to numbers and back.
+  SmallDenseMap<Value*, int> InstrIdx;
+  /// Maps integers to Instructions.
+  std::vector<Instruction*> InstrVec;
+
+  // -- containers that are used during getTreeCost -- //
+
+  /// Contains values that must be scalarized because they are used
+  /// by multiple lanes, or by users outside the tree.
+  /// NOTICE: The vectorization methods also use this set.
+  ValueSet MustScalarize;
+
+  /// Contains instructions with multiple users; getTreeCost checks that all of
+  /// their users are inside the tree and in the same lane. This set must be
+  /// reset between runs.
+  ValueSet MultiUserVals;
+  /// Maps values in the tree to the vector lanes that use them. This map must
+  /// be reset between runs of getCost.
+  std::map<Value*, int> LaneMap;
+  /// A list of instructions to ignore while sinking
+  /// memory instructions. This set must be reset between runs of getCost.
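+  /// (Stores that belong to the tree currently being costed are added here so
+  /// that isUnsafeToSink does not treat them as aliasing barriers.)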
+  SmallPtrSet<Value *, 8> MemBarrierIgnoreList;
+
+  // -- Containers that are used during vectorizeTree -- //
+
+  /// Maps the first scalar of a vectorized bundle to the generated vector.
+  /// This map must be reset between runs.
+  DenseMap<Value*, Value*> VectorizedValues;
+
+  // -- Containers that are used after vectorization by the caller -- //
+
+  /// A list of instructions that are used when gathering scalars into vectors.
+  /// In many cases these instructions can be hoisted outside of the BB.
+  /// Iterating over this list is faster than calling LICM.
+  ValueList GatherInstructions;
+
+  // Analysis and block reference.
+  BasicBlock *BB;
+  ScalarEvolution *SE;
+  DataLayout *DL;
+  TargetTransformInfo *TTI;
+  AliasAnalysis *AA;
+  Loop *L;
+};
+
+} // end of namespace
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VECUTILS_H
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index 19eefd2f87..a927fe1451 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -1,4 +1,4 @@
- //===-- Vectorize.cpp -----------------------------------------------------===//
+//===-- Vectorize.cpp -----------------------------------------------------===//
 //
 // The LLVM Compiler Infrastructure
 //
@@ -28,6 +28,7 @@ using namespace llvm;
 void llvm::initializeVectorization(PassRegistry &Registry) {
   initializeBBVectorizePass(Registry);
   initializeLoopVectorizePass(Registry);
+  initializeSLPVectorizerPass(Registry);
 }
 
 void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
@@ -41,3 +42,7 @@ void LLVMAddBBVectorizePass(LLVMPassManagerRef PM) {
 void LLVMAddLoopVectorizePass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createLoopVectorizePass());
 }
+
+void LLVMAddSLPVectorizePass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createSLPVectorizerPass());
+}
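For context, a minimal, illustrative sketch of how a client could schedule the newly exposed pass through the C API binding defined above. The pass-manager calls are standard llvm-c/Core.h entry points; the declaration of LLVMAddSLPVectorizePass in llvm-c/Transforms/Vectorize.h is assumed here and is not shown in this diff:

#include "llvm-c/Core.h"
#include "llvm-c/Transforms/Vectorize.h"

/* Run the SLP vectorizer over a module (illustrative helper, not part of this commit). */
static void runSLPVectorizer(LLVMModuleRef M) {
  LLVMPassManagerRef PM = LLVMCreatePassManager();
  LLVMAddSLPVectorizePass(PM);   /* schedule the new pass */
  LLVMRunPassManager(PM, M);     /* run it over the whole module */
  LLVMDisposePassManager(PM);
}

From the opt tool the pass is selected by the SV_NAME string used in the INITIALIZE_PASS macros above (conventionally "slp-vectorizer"; the macro's value is defined earlier in SLPVectorizer.cpp and is not visible in this hunk).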