111 files changed, 5257 insertions, 2478 deletions
diff --git a/lib/Transforms/Hello/Hello.cpp b/lib/Transforms/Hello/Hello.cpp
index b0e22de8d7..d0b146b4e9 100644
--- a/lib/Transforms/Hello/Hello.cpp
+++ b/lib/Transforms/Hello/Hello.cpp
@@ -13,10 +13,10 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "hello"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Function.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(HelloCounter, "Counts number of functions greeted");
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index be48b2063f..2132e0a5fe 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -31,21 +31,21 @@
 
 #define DEBUG_TYPE "argpromotion"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/CallGraphSCCPass.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/CallGraphSCCPass.h"
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Support/CallSite.h"
+#include "llvm/Module.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
 #include <set>
 using namespace llvm;
 
@@ -515,12 +515,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // that we are *not* promoting. For the ones that we do promote, the parameter
   // attributes are lost
   SmallVector<AttributeWithIndex, 8> AttributesVec;
-  const AttrListPtr &PAL = F->getAttributes();
+  const AttributeSet &PAL = F->getAttributes();
 
   // Add any return attributes.
   Attributes attrs = PAL.getRetAttributes();
   if (attrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
                                                     attrs));
 
   // First, determine the new argument list
@@ -593,7 +593,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   // Add any function attributes.
   attrs = PAL.getFnAttributes();
   if (attrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                                     attrs));
 
   Type *RetTy = FTy->getReturnType();
@@ -611,7 +611,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
   
   // Recompute the parameter attributes list based on the new arguments for
   // the function.
-  NF->setAttributes(AttrListPtr::get(F->getContext(), AttributesVec));
+  NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
   AttributesVec.clear();
 
   F->getParent()->getFunctionList().insert(F, NF);
@@ -636,12 +636,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     CallSite CS(F->use_back());
     assert(CS.getCalledFunction() == F);
     Instruction *Call = CS.getInstruction();
-    const AttrListPtr &CallPAL = CS.getAttributes();
+    const AttributeSet &CallPAL = CS.getAttributes();
 
     // Add any return attributes.
     Attributes attrs = CallPAL.getRetAttributes();
     if (attrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
                                                       attrs));
 
     // Loop over the operands, inserting GEP and loads in the caller as
@@ -723,7 +723,7 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
     // Add any function attributes.
     attrs = CallPAL.getFnAttributes();
     if (attrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                                       attrs));
 
     Instruction *New;
@@ -731,12 +731,12 @@ CallGraphNode *ArgPromotion::DoPromotion(Function *F,
       New = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
                                Args, "", Call);
       cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
-      cast<InvokeInst>(New)->setAttributes(AttrListPtr::get(II->getContext(),
+      cast<InvokeInst>(New)->setAttributes(AttributeSet::get(II->getContext(),
                                                             AttributesVec));
     } else {
       New = CallInst::Create(NF, Args, "", Call);
       cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
-      cast<CallInst>(New)->setAttributes(AttrListPtr::get(New->getContext(),
+      cast<CallInst>(New)->setAttributes(AttributeSet::get(New->getContext(),
                                                           AttributesVec));
       if (cast<CallInst>(Call)->isTailCall())
         cast<CallInst>(New)->setTailCall();
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index e2f012657f..d30eeaf7d3 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -19,15 +19,15 @@
 
 #define DEBUG_TYPE "constmerge"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 using namespace llvm;
 
 STATISTIC(NumMerged, "Number of global constants merged");
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 4cfd0b235a..6236a04fc2 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -19,11 +19,15 @@
 
 #define DEBUG_TYPE "deadargelim"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constant.h"
+#include "llvm/DIBuilder.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/DIBuilder.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/LLVMContext.h"
@@ -32,10 +36,6 @@
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
 #include <map>
 #include <set>
 using namespace llvm;
@@ -271,16 +271,16 @@ bool DAE::DeleteDeadVarargs(Function &Fn) {
     Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
 
     // Drop any attributes that were on the vararg arguments.
-    AttrListPtr PAL = CS.getAttributes();
+    AttributeSet PAL = CS.getAttributes();
     if (!PAL.isEmpty() && PAL.getSlot(PAL.getNumSlots() - 1).Index > NumArgs) {
       SmallVector<AttributeWithIndex, 8> AttributesVec;
       for (unsigned i = 0; PAL.getSlot(i).Index <= NumArgs; ++i)
         AttributesVec.push_back(PAL.getSlot(i));
       Attributes FnAttrs = PAL.getFnAttributes();
       if (FnAttrs.hasAttributes())
-        AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+        AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                                         FnAttrs));
-      PAL = AttrListPtr::get(Fn.getContext(), AttributesVec);
+      PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
     }
 
     Instruction *New;
@@ -698,7 +698,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
 
   // Set up to build a new list of parameter attributes.
   SmallVector<AttributeWithIndex, 8> AttributesVec;
-  const AttrListPtr &PAL = F->getAttributes();
+  const AttributeSet &PAL = F->getAttributes();
 
   // The existing function return attributes.
   Attributes RAttrs = PAL.getRetAttributes();
@@ -773,7 +773,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
            "Return attributes no longer compatible?");
 
   if (RAttrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
                                                     RAttrs));
 
   // Remember which arguments are still alive.
@@ -802,11 +802,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
   }
 
   if (FnAttrs.hasAttributes())
-    AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+    AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                                     FnAttrs));
 
   // Reconstruct the AttributesList based on the vector we constructed.
-  AttrListPtr NewPAL = AttrListPtr::get(F->getContext(), AttributesVec);
+  AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
 
   // Create the new function type based on the recomputed parameters.
   FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -833,7 +833,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
     Instruction *Call = CS.getInstruction();
 
     AttributesVec.clear();
-    const AttrListPtr &CallPAL = CS.getAttributes();
+    const AttributeSet &CallPAL = CS.getAttributes();
 
     // The call return attributes.
     Attributes RAttrs = CallPAL.getRetAttributes();
@@ -843,7 +843,7 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
       Attributes::get(NF->getContext(), AttrBuilder(RAttrs).
            removeAttributes(Attributes::typeIncompatible(NF->getReturnType())));
     if (RAttrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
                                                       RAttrs));
 
     // Declare these outside of the loops, so we can reuse them for the second
@@ -870,11 +870,11 @@ bool DAE::RemoveDeadStuffFromFunction(Function *F) {
     }
 
     if (FnAttrs.hasAttributes())
-      AttributesVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+      AttributesVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                                       FnAttrs));
 
     // Reconstruct the AttributesList based on the vector we constructed.
-    AttrListPtr NewCallPAL = AttrListPtr::get(F->getContext(), AttributesVec);
+    AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
 
     Instruction *New;
     if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
diff --git a/lib/Transforms/IPO/ExtractGV.cpp b/lib/Transforms/IPO/ExtractGV.cpp
index 05aefeff9f..1dc79d113e 100644
--- a/lib/Transforms/IPO/ExtractGV.cpp
+++ b/lib/Transforms/IPO/ExtractGV.cpp
@@ -11,13 +11,13 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Constants.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/SetVector.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 18409f77b3..685833da1a 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -20,17 +20,17 @@
 
 #define DEBUG_TYPE "functionattrs"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/CallGraphSCCPass.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/CallGraphSCCPass.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
 #include "llvm/Support/InstIterator.h"
 using namespace llvm;
 
@@ -215,13 +215,13 @@ bool FunctionAttrs::AddReadAttrs(const CallGraphSCC &SCC) {
     AttrBuilder B;
     B.addAttribute(Attributes::ReadOnly)
       .addAttribute(Attributes::ReadNone);
-    F->removeAttribute(AttrListPtr::FunctionIndex,
+    F->removeAttribute(AttributeSet::FunctionIndex,
                        Attributes::get(F->getContext(), B));
 
     // Add in the new attribute.
     B.clear();
     B.addAttribute(ReadsMemory ? Attributes::ReadOnly : Attributes::ReadNone);
-    F->addAttribute(AttrListPtr::FunctionIndex,
+    F->addAttribute(AttributeSet::FunctionIndex,
                     Attributes::get(F->getContext(), B));
 
     if (ReadsMemory)
diff --git a/lib/Transforms/IPO/GlobalDCE.cpp b/lib/Transforms/IPO/GlobalDCE.cpp
index 18c1c7b000..b2c819de2b 100644
--- a/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/lib/Transforms/IPO/GlobalDCE.cpp
@@ -17,11 +17,11 @@
 
 #define DEBUG_TYPE "globaldce"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Constants.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumAliases  , "Number of global aliases removed");
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 591278fa62..20f9de5a83 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -15,29 +15,29 @@
 
 #define DEBUG_TYPE "globalopt"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/CallingConv.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
 #include "llvm/Operator.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -148,17 +148,13 @@ struct GlobalStatus {
   /// an instruction (e.g. a constant expr or GV initializer).
   bool HasNonInstructionUser;
 
-  /// HasPHIUser - Set to true if this global has a user that is a PHI node.
-  bool HasPHIUser;
-
   /// AtomicOrdering - Set to the strongest atomic ordering requirement.
   AtomicOrdering Ordering;
 
   GlobalStatus() : isCompared(false), isLoaded(false), StoredType(NotStored),
                    StoredOnceValue(0), AccessingFunction(0),
                    HasMultipleAccessingFunctions(false),
-                   HasNonInstructionUser(false), HasPHIUser(false),
-                   Ordering(NotAtomic) {}
+                   HasNonInstructionUser(false), Ordering(NotAtomic) {}
 };
 
 }
@@ -200,11 +196,11 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
     const User *U = *UI;
     if (const ConstantExpr *CE = dyn_cast<ConstantExpr>(U)) {
       GS.HasNonInstructionUser = true;
-      
+
       // If the result of the constantexpr isn't pointer type, then we won't
       // know to expect it in various places.  Just reject early.
       if (!isa<PointerType>(CE->getType())) return true;
-      
+
       if (AnalyzeGlobal(CE, GS, PHIUsers)) return true;
     } else if (const Instruction *I = dyn_cast<Instruction>(U)) {
       if (!GS.HasMultipleAccessingFunctions) {
@@ -274,7 +270,6 @@ static bool AnalyzeGlobal(const Value *V, GlobalStatus &GS,
         // have to be careful about infinite recursion.
         if (PHIUsers.insert(PN))  // Not already visited.
           if (AnalyzeGlobal(I, GS, PHIUsers)) return true;
-        GS.HasPHIUser = true;
       } else if (isa<CmpInst>(I)) {
         GS.isCompared = true;
       } else if (const MemTransferInst *MTI = dyn_cast<MemTransferInst>(I)) {
@@ -2070,7 +2065,7 @@ static void ChangeCalleesToFastCall(Function *F) {
   }
 }
 
-static AttrListPtr StripNest(LLVMContext &C, const AttrListPtr &Attrs) {
+static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
   for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
     if (!Attrs.getSlot(i).Attrs.hasAttribute(Attributes::Nest))
       continue;
@@ -2157,7 +2152,7 @@ bool GlobalOpt::OptimizeGlobalVars(Module &M) {
 GlobalVariable *GlobalOpt::FindGlobalCtors(Module &M) {
   GlobalVariable *GV = M.getGlobalVariable("llvm.global_ctors");
   if (GV == 0) return 0;
-  
+
   // Verify that the initializer is simple enough for us to handle. We are
   // only allowed to optimize the initializer if it is unique.
   if (!GV->hasUniqueInitializer()) return 0;
@@ -2263,7 +2258,7 @@ static GlobalVariable *InstallGlobalCtors(GlobalVariable *GCL,
 }
 
 
-static inline bool 
+static inline bool
 isSimpleEnoughValueToCommit(Constant *C,
                             SmallPtrSet<Constant*, 8> &SimpleConstants,
                             const DataLayout *TD);
@@ -2285,7 +2280,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
   if (C->getNumOperands() == 0 || isa<BlockAddress>(C) ||
       isa<GlobalValue>(C))
     return true;
-  
+
   // Aggregate values are safe if all their elements are.
   if (isa<ConstantArray>(C) || isa<ConstantStruct>(C) ||
       isa<ConstantVector>(C)) {
@@ -2296,7 +2291,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
     }
     return true;
   }
-  
+
   // We don't know exactly what relocations are allowed in constant expressions,
   // so we allow &global+constantoffset, which is safe and uniformly supported
   // across targets.
@@ -2314,14 +2309,14 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
                TD->getTypeSizeInBits(CE->getOperand(0)->getType()))
       return false;
     return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
-      
+
   // GEP is fine if it is simple + constant offset.
   case Instruction::GetElementPtr:
     for (unsigned i = 1, e = CE->getNumOperands(); i != e; ++i)
       if (!isa<ConstantInt>(CE->getOperand(i)))
         return false;
     return isSimpleEnoughValueToCommit(CE->getOperand(0), SimpleConstants, TD);
-      
+
   case Instruction::Add:
     // We allow simple+cst.
     if (!isa<ConstantInt>(CE->getOperand(1)))
@@ -2331,7 +2326,7 @@ static bool isSimpleEnoughValueToCommitHelper(Constant *C,
   return false;
 }
 
-static inline bool 
+static inline bool
 isSimpleEnoughValueToCommit(Constant *C,
                             SmallPtrSet<Constant*, 8> &SimpleConstants,
                             const DataLayout *TD) {
@@ -2379,7 +2374,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
         return false;
 
       return ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE);
-    
+
     // A constantexpr bitcast from a pointer to another pointer is a no-op,
     // and we know how to evaluate it by moving the bitcast from the pointer
     // operand to the value operand.
@@ -2390,7 +2385,7 @@ static bool isSimpleEnoughPointerToCommit(Constant *C) {
       return cast<GlobalVariable>(CE->getOperand(0))->hasUniqueInitializer();
     }
   }
-  
+
   return false;
 }
 
@@ -2420,7 +2415,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
     // Return the modified struct.
     return ConstantStruct::get(STy, Elts);
   }
-  
+
   ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
   SequentialType *InitTy = cast<SequentialType>(Init->getType());
 
@@ -2597,23 +2592,23 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
       if (!isSimpleEnoughPointerToCommit(Ptr))
         // If this is too complex for us to commit, reject it.
         return false;
-      
+
       Constant *Val = getVal(SI->getOperand(0));
 
       // If this might be too difficult for the backend to handle (e.g. the addr
       // of one global variable divided by another) then we can't commit it.
       if (!isSimpleEnoughValueToCommit(Val, SimpleConstants, TD))
         return false;
-        
+
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
         if (CE->getOpcode() == Instruction::BitCast) {
           // If we're evaluating a store through a bitcast, then we need
           // to pull the bitcast off the pointer type and push it onto the
           // stored value.
           Ptr = CE->getOperand(0);
-          
+
           Type *NewTy = cast<PointerType>(Ptr->getType())->getElementType();
-          
+
           // In order to push the bitcast onto the stored value, a bitcast
           // from NewTy to Val's type must be legal.  If it's not, we can try
           // introspecting NewTy to find a legal conversion.
@@ -2638,12 +2633,12 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
               return false;
             }
           }
-          
+
           // If we found compatible types, go ahead and push the bitcast
           // onto the stored value.
           Val = ConstantExpr::getBitCast(Val, NewTy);
         }
-          
+
       MutatedMemory[Ptr] = Val;
     } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(CurInst)) {
       InstResult = ConstantExpr::get(BO->getOpcode(),
@@ -2805,7 +2800,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
     if (!CurInst->use_empty()) {
       if (ConstantExpr *CE = dyn_cast<ConstantExpr>(InstResult))
         InstResult = ConstantFoldConstantExpression(CE, TD, TLI);
-      
+
       setVal(CurInst, InstResult);
     }
 
@@ -2891,7 +2886,7 @@ static bool EvaluateStaticConstructor(Function *F, const DataLayout *TD,
   Constant *RetValDummy;
   bool EvalSuccess = Eval.EvaluateFunction(F, RetValDummy,
                                            SmallVector<Constant*, 0>());
-  
+
   if (EvalSuccess) {
     // We succeeded at evaluation: commit the result.
     DEBUG(dbgs() << "FULLY EVALUATED GLOBAL CTOR FUNCTION '"
@@ -3011,13 +3006,13 @@ static Function *FindCXAAtExit(Module &M, TargetLibraryInfo *TLI) {
     return 0;
 
   Function *Fn = M.getFunction(TLI->getName(LibFunc::cxa_atexit));
-  
+
   if (!Fn)
     return 0;
 
   FunctionType *FTy = Fn->getFunctionType();
-  
-  // Checking that the function has the right return type, the right number of 
+
+  // Checking that the function has the right return type, the right number of
   // parameters and that they all have pointer types should be enough.
   if (!FTy->getReturnType()->isIntegerTy() ||
       FTy->getNumParams() != 3 ||
@@ -3092,7 +3087,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
   // and remove them.
   bool Changed = false;
 
-  for (Function::use_iterator I = CXAAtExitFn->use_begin(), 
+  for (Function::use_iterator I = CXAAtExitFn->use_begin(),
        E = CXAAtExitFn->use_end(); I != E;) {
     // We're only interested in calls. Theoretically, we could handle invoke
     // instructions as well, but neither llvm-gcc nor clang generate invokes
@@ -3101,7 +3096,7 @@ bool GlobalOpt::OptimizeEmptyGlobalCXXDtors(Function *CXAAtExitFn) {
     if (!CI)
       continue;
 
-    Function *DtorFn = 
+    Function *DtorFn =
       dyn_cast<Function>(CI->getArgOperand(0)->stripPointerCasts());
     if (!DtorFn)
       continue;
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
index d757e1fdb1..252b5b0584 100644
--- a/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -17,14 +17,14 @@
 
 #define DEBUG_TYPE "ipconstprop"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Constants.h"
 #include "llvm/Instructions.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/CallSite.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 
 STATISTIC(NumArgumentsProped, "Number of args turned into constants");
diff --git a/lib/Transforms/IPO/InlineAlways.cpp b/lib/Transforms/IPO/InlineAlways.cpp
index 6f4b810acc..5b8832e5d7 100644
--- a/lib/Transforms/IPO/InlineAlways.cpp
+++ b/lib/Transforms/IPO/InlineAlways.cpp
@@ -13,18 +13,18 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "inline"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/CallingConv.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
-#include "llvm/Type.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Support/CallSite.h"
-#include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Type.h"
 
 using namespace llvm;
 
@@ -44,6 +44,10 @@ namespace {
     }
     static char ID; // Pass identification, replacement for typeid
     virtual InlineCost getInlineCost(CallSite CS);
+
+    using llvm::Pass::doInitialization;
+    using llvm::Pass::doFinalization;
+
     virtual bool doFinalization(CallGraph &CG) {
       return removeDeadFunctions(CG, /*AlwaysInlineOnly=*/true);
     }
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index bf0b1f91a2..9c5feba08b 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -12,17 +12,17 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "inline"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
 #include "llvm/CallingConv.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
-#include "llvm/Type.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/InlineCost.h"
 #include "llvm/Support/CallSite.h"
-#include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/DataLayout.h"
+#include "llvm/Type.h"
 
 using namespace llvm;
 
@@ -42,6 +42,7 @@ namespace {
     InlineCost getInlineCost(CallSite CS) {
       return CA.getInlineCost(CS, getInlineThreshold(CS));
     }
+    using llvm::Pass::doInitialization;
     virtual bool doInitialization(CallGraph &CG);
   };
 }
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index abcb25fd45..bd8fa66d52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -14,22 +14,22 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "inline"
-#include "llvm/Module.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/IPO/InlinerPass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/IPO/InlinerPass.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 STATISTIC(NumInlined, "Number of functions inlined");
diff --git a/lib/Transforms/IPO/Internalize.cpp b/lib/Transforms/IPO/Internalize.cpp
index aa629cc0c6..b2cd3a765a 100644
--- a/lib/Transforms/IPO/Internalize.cpp
+++ b/lib/Transforms/IPO/Internalize.cpp
@@ -14,14 +14,14 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "internalize"
-#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
 #include <fstream>
 #include <set>
 using namespace llvm;
diff --git a/lib/Transforms/IPO/LoopExtractor.cpp b/lib/Transforms/IPO/LoopExtractor.cpp
index 97d7cdced0..af04d054ed 100644
--- a/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/lib/Transforms/IPO/LoopExtractor.cpp
@@ -16,16 +16,16 @@
 
 #define DEBUG_TYPE "loop-extract"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Instructions.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/ADT/Statistic.h"
 #include <fstream>
 #include <set>
 using namespace llvm;
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index 44283ddce7..70345b8334 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -45,7 +45,13 @@
 
 #define DEBUG_TYPE "mergefunc"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/IRBuilder.h"
 #include "llvm/InlineAsm.h"
 #include "llvm/Instructions.h"
@@ -53,17 +59,11 @@
 #include "llvm/Module.h"
 #include "llvm/Operator.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include <vector>
 using namespace llvm;
 
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 9c9910bd5c..6bd9c8372e 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -14,14 +14,14 @@
 
 #define DEBUG_TYPE "partialinlining"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Instructions.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/Dominators.h"
+#include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Support/CFG.h"
 using namespace llvm;
 
 STATISTIC(NumPartialInlined, "Number of functions partially inlined");
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 48e20ec339..a9a9f2eece 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -14,21 +14,19 @@
 
 
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
-
 #include "llvm-c/Transforms/PassManagerBuilder.h"
-
-#include "llvm/PassManager.h"
-#include "llvm/DefaultPasses.h"
-#include "llvm/PassManager.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/Verifier.h"
+#include "llvm/DefaultPasses.h"
+#include "llvm/PassManager.h"
+#include "llvm/PassManager.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ManagedStatic.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Vectorize.h"
-#include "llvm/Transforms/IPO.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/ManagedStatic.h"
 
 using namespace llvm;
 
@@ -190,10 +188,8 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) {
   MPM.add(createLoopIdiomPass());             // Recognize idioms like memset.
   MPM.add(createLoopDeletionPass());          // Delete dead loops
 
-  if (LoopVectorize) {
+  if (LoopVectorize && OptLevel > 1)
     MPM.add(createLoopVectorizePass());
-    MPM.add(createLICMPass());
-  }
 
   if (!DisableUnrollLoops)
     MPM.add(createLoopUnrollPass());          // Unroll small loops
diff --git a/lib/Transforms/IPO/PruneEH.cpp b/lib/Transforms/IPO/PruneEH.cpp
index fb4ecbfe7b..19f34837c7 100644
--- a/lib/Transforms/IPO/PruneEH.cpp
+++ b/lib/Transforms/IPO/PruneEH.cpp
@@ -16,16 +16,16 @@
 
 #define DEBUG_TYPE "prune-eh"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
 #include "llvm/CallGraphSCCPass.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/LLVMContext.h"
 #include "llvm/Support/CFG.h"
 #include <algorithm>
 using namespace llvm;
@@ -145,8 +145,8 @@ bool PruneEH::runOnSCC(CallGraphSCC &SCC) {
         NewAttributes.addAttribute(Attributes::NoReturn);
 
       Function *F = (*I)->getFunction();
-      const AttrListPtr &PAL = F->getAttributes();
-      const AttrListPtr &NPAL = PAL.addAttr(F->getContext(), ~0,
+      const AttributeSet &PAL = F->getAttributes();
+      const AttributeSet &NPAL = PAL.addAttr(F->getContext(), ~0,
                                             Attributes::get(F->getContext(),
                                                             NewAttributes));
       if (PAL != NPAL) {
diff --git a/lib/Transforms/IPO/StripDeadPrototypes.cpp b/lib/Transforms/IPO/StripDeadPrototypes.cpp
index b5f09eccca..80cb869f02 100644
--- a/lib/Transforms/IPO/StripDeadPrototypes.cpp
+++ b/lib/Transforms/IPO/StripDeadPrototypes.cpp
@@ -16,9 +16,9 @@
 
 #define DEBUG_TYPE "strip-dead-prototypes"
 #include "llvm/Transforms/IPO.h"
-#include "llvm/Pass.h"
-#include "llvm/Module.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 using namespace llvm;
 
 STATISTIC(NumDeadPrototypes, "Number of dead prototypes removed");
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 80bfc1cdb2..ad915d716f 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -21,17 +21,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Constants.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Instructions.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/TypeFinder.h"
 #include "llvm/ValueSymbolTable.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
 using namespace llvm;
 
 namespace {
diff --git a/lib/Transforms/InstCombine/InstCombine.h b/lib/Transforms/InstCombine/InstCombine.h
index 7467eca7ab..0570104205 100644
--- a/lib/Transforms/InstCombine/InstCombine.h
+++ b/lib/Transforms/InstCombine/InstCombine.h
@@ -11,12 +11,12 @@
 #define INSTCOMBINE_INSTCOMBINE_H
 
 #include "InstCombineWorklist.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IRBuilder.h"
+#include "llvm/InstVisitor.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Operator.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Support/InstVisitor.h"
 #include "llvm/Support/TargetFolder.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 
@@ -327,6 +327,11 @@ private:
   bool SimplifyDemandedBits(Use &U, APInt DemandedMask, 
                             APInt& KnownZero, APInt& KnownOne,
                             unsigned Depth=0);
+  /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
+  /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
+  Value *SimplifyShrShlDemandedBits(Instruction *Lsr, Instruction *Sftl,
+                                    APInt DemandedMask, APInt &KnownZero,
+                                    APInt &KnownOne);
       
   /// SimplifyDemandedInstructionBits - Inst is an integer instruction that
   /// SimplifyDemandedBits knows about.  See if the instruction has any
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 12faedb0ff..f5c42a7983 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -12,11 +12,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/Intrinsics.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
+#include "llvm/Intrinsics.h"
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/PatternMatch.h"
+#include "llvm/Transforms/Utils/CmpInstAnalysis.h"
 using namespace llvm;
 using namespace PatternMatch;
 
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index aa9512cf03..41fe0873b3 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -12,13 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/Support/CallSite.h"
-#include "llvm/DataLayout.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Support/CallSite.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
+STATISTIC(NumSimplified, "Number of library calls simplified");
+
 /// getPromotedType - Return the specified type promoted as it would be to pass
 /// though a va_arg area.
 static Type *getPromotedType(Type *Ty) {
@@ -785,8 +788,10 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS,
 Instruction *InstCombiner::tryOptimizeCall(CallInst *CI, const DataLayout *TD) {
   if (CI->getCalledFunction() == 0) return 0;
 
-  if (Value *With = Simplifier->optimizeCall(CI))
-    return ReplaceInstUsesWith(*CI, With);
+  if (Value *With = Simplifier->optimizeCall(CI)) {
+    ++NumSimplified;
+    return CI->use_empty() ? CI : ReplaceInstUsesWith(*CI, With);
+  }
 
   return 0;
 }
@@ -977,7 +982,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   if (Callee == 0)
     return false;
   Instruction *Caller = CS.getInstruction();
-  const AttrListPtr &CallerPAL = CS.getAttributes();
+  const AttributeSet &CallerPAL = CS.getAttributes();
 
   // Okay, this is a cast from a function to a different type.  Unless doing so
   // would cause a type conversion of one of our arguments, change this call to
@@ -1118,7 +1123,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   // Add the new return attributes.
   if (RAttrs.hasAttributes())
     attrVec.push_back(
-      AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+      AttributeWithIndex::get(AttributeSet::ReturnIndex,
                               Attributes::get(FT->getContext(), RAttrs)));
 
   AI = CS.arg_begin();
@@ -1173,13 +1178,13 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
   Attributes FnAttrs = CallerPAL.getFnAttributes();
   if (FnAttrs.hasAttributes())
-    attrVec.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+    attrVec.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                               FnAttrs));
 
   if (NewRetTy->isVoidTy())
     Caller->setName("");   // Void type should not have a name.
 
-  const AttrListPtr &NewCallerPAL = AttrListPtr::get(Callee->getContext(),
+  const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
                                                      attrVec);
 
   Instruction *NC;
@@ -1240,7 +1245,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
   Value *Callee = CS.getCalledValue();
   PointerType *PTy = cast<PointerType>(Callee->getType());
   FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
-  const AttrListPtr &Attrs = CS.getAttributes();
+  const AttributeSet &Attrs = CS.getAttributes();
 
   // If the call already has the 'nest' attribute somewhere then give up -
   // otherwise 'nest' would occur twice after splicing in the chain.
@@ -1255,7 +1260,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
   PointerType *NestFPTy = cast<PointerType>(NestF->getType());
   FunctionType *NestFTy = cast<FunctionType>(NestFPTy->getElementType());
 
-  const AttrListPtr &NestAttrs = NestF->getAttributes();
+  const AttributeSet &NestAttrs = NestF->getAttributes();
   if (!NestAttrs.isEmpty()) {
     unsigned NestIdx = 1;
     Type *NestTy = 0;
@@ -1285,7 +1290,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
       // Add any result attributes.
       Attributes Attr = Attrs.getRetAttributes();
       if (Attr.hasAttributes())
-        NewAttrs.push_back(AttributeWithIndex::get(AttrListPtr::ReturnIndex,
+        NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::ReturnIndex,
                                                    Attr));
 
       {
@@ -1318,7 +1323,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
       // Add any function attributes.
       Attr = Attrs.getFnAttributes();
       if (Attr.hasAttributes())
-        NewAttrs.push_back(AttributeWithIndex::get(AttrListPtr::FunctionIndex,
+        NewAttrs.push_back(AttributeWithIndex::get(AttributeSet::FunctionIndex,
                                                    Attr));
 
       // The trampoline may have been bitcast to a bogus type (FTy).
@@ -1358,7 +1363,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
         NestF->getType() == PointerType::getUnqual(NewFTy) ?
         NestF : ConstantExpr::getBitCast(NestF,
                                          PointerType::getUnqual(NewFTy));
-      const AttrListPtr &NewPAL = AttrListPtr::get(FTy->getContext(), NewAttrs);
+      const AttributeSet &NewPAL = AttributeSet::get(FTy->getContext(), NewAttrs);
 
       Instruction *NewCaller;
       if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index bb59db8e7b..19de62c81f 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -14,8 +14,8 @@
 #include "InstCombine.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 using namespace llvm;
 using namespace PatternMatch;
 
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 7c3f8fe15d..1b96c3cca4 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -12,15 +12,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/PatternMatch.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 using namespace llvm;
 using namespace PatternMatch;
 
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index fd684200fc..5726d3a91d 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,12 +12,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/DataLayout.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumDeadStore,    "Number of dead stores eliminated");
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index cefe45ec86..5cd611c420 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/PatternMatch.h"
 using namespace llvm;
 using namespace PatternMatch;
@@ -252,6 +252,46 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   return Changed ? &I : 0;
 }
 
+//
+// Detect pattern:
+//
+// log2(Y*0.5)
+//
+// And check for corresponding fast math flags
+//
+
+static void detectLog2OfHalf(Value *&Op, Value *&Y, IntrinsicInst *&Log2) {
+
+   if (!Op->hasOneUse())
+     return;
+
+   IntrinsicInst *II = dyn_cast<IntrinsicInst>(Op);
+   if (!II)
+     return;
+   if (II->getIntrinsicID() != Intrinsic::log2 || !II->hasUnsafeAlgebra())
+     return;
+   Log2 = II;
+
+   Value *OpLog2Of = II->getArgOperand(0);
+   if (!OpLog2Of->hasOneUse())
+     return;
+
+   Instruction *I = dyn_cast<Instruction>(OpLog2Of);
+   if (!I)
+     return;
+   if (I->getOpcode() != Instruction::FMul || !I->hasUnsafeAlgebra())
+     return;
+              
+   ConstantFP *CFP = dyn_cast<ConstantFP>(I->getOperand(0));
+   if (CFP && CFP->isExactlyValue(0.5)) {
+     Y = I->getOperand(1);
+     return;
+   }
+   CFP = dyn_cast<ConstantFP>(I->getOperand(1));
+   if (CFP && CFP->isExactlyValue(0.5))
+     Y = I->getOperand(0);
+} 
+
 Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
   bool Changed = SimplifyAssociativeOrCommutative(I);
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
@@ -284,6 +324,33 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
     if (Value *Op1v = dyn_castFNegVal(Op1))
       return BinaryOperator::CreateFMul(Op0v, Op1v);
 
+  // Under unsafe algebra do:
+  // X * log2(0.5*Y) = X*log2(Y) - X
+  if (I.hasUnsafeAlgebra()) {
+    Value *OpX = NULL;
+    Value *OpY = NULL;
+    IntrinsicInst *Log2;
+    detectLog2OfHalf(Op0, OpY, Log2);
+    if (OpY) {
+      OpX = Op1;
+    } else {
+      detectLog2OfHalf(Op1, OpY, Log2);
+      if (OpY) {
+        OpX = Op0;
+      }
+    }
+    // if pattern detected emit alternate sequence
+    if (OpX && OpY) {
+      Log2->setArgOperand(0, OpY);
+      Value *FMulVal = Builder->CreateFMul(OpX, Log2);
+      Instruction *FMul = cast<Instruction>(FMulVal);
+      FMul->copyFastMathFlags(Log2);
+      Instruction *FSub = BinaryOperator::CreateFSub(FMulVal, OpX);
+      FSub->copyFastMathFlags(Log2);
+      return FSub;
+    }
+  }
+
   return Changed ? &I : 0;
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index de9c77e600..ea127e9f53 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/STLExtras.h"
 using namespace llvm;
 
 /// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(a,c)]
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index a2d4c888f2..a262d711d3 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -12,9 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/Support/PatternMatch.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Support/PatternMatch.h"
 using namespace llvm;
 using namespace PatternMatch;
 
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 57021f1bef..8a28d8eaa2 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -12,9 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/PatternMatch.h"
 using namespace llvm;
 using namespace PatternMatch;
@@ -49,7 +49,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
     I.setOperand(1, Rem);
     return &I;
   }
-  
+
   return 0;
 }
 
@@ -70,10 +70,10 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
   // We can always evaluate constants shifted.
   if (isa<Constant>(V))
     return true;
-  
+
   Instruction *I = dyn_cast<Instruction>(V);
   if (!I) return false;
-  
+
   // If this is the opposite shift, we can directly reuse the input of the shift
   // if the needed bits are already zero in the input.  This allows us to reuse
   // the value which means that we don't care if the shift has multiple uses.
@@ -95,14 +95,14 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
         return CanEvaluateTruncated(I->getOperand(0), Ty);
       }
 #endif
-      
+
     }
   }
-  
+
   // We can't mutate something that has multiple uses: doing so would
   // require duplicating the instruction in general, which isn't profitable.
   if (!I->hasOneUse()) return false;
-  
+
   switch (I->getOpcode()) {
   default: return false;
   case Instruction::And:
@@ -111,7 +111,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
     // Bitwise operators can all arbitrarily be arbitrarily evaluated shifted.
     return CanEvaluateShifted(I->getOperand(0), NumBits, isLeftShift, IC) &&
            CanEvaluateShifted(I->getOperand(1), NumBits, isLeftShift, IC);
-      
+
   case Instruction::Shl: {
     // We can often fold the shift into shifts-by-a-constant.
     CI = dyn_cast<ConstantInt>(I->getOperand(1));
@@ -119,10 +119,10 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
 
     // We can always fold shl(c1)+shl(c2) -> shl(c1+c2).
     if (isLeftShift) return true;
-    
+
     // We can always turn shl(c)+shr(c) -> and(c2).
     if (CI->getValue() == NumBits) return true;
-      
+
     unsigned TypeWidth = I->getType()->getScalarSizeInBits();
 
     // We can turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but it isn't
@@ -133,20 +133,20 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
                        APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
         return true;
     }
-      
+
     return false;
   }
   case Instruction::LShr: {
     // We can often fold the shift into shifts-by-a-constant.
     CI = dyn_cast<ConstantInt>(I->getOperand(1));
     if (CI == 0) return false;
-    
+
     // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
     if (!isLeftShift) return true;
-    
+
     // We can always turn lshr(c)+shl(c) -> and(c2).
     if (CI->getValue() == NumBits) return true;
-      
+
     unsigned TypeWidth = I->getType()->getScalarSizeInBits();
 
     // We can always turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but it isn't
@@ -157,7 +157,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
                           APInt::getLowBitsSet(TypeWidth, NumBits) << LowBits))
         return true;
     }
-      
+
     return false;
   }
   case Instruction::Select: {
@@ -175,7 +175,7 @@ static bool CanEvaluateShifted(Value *V, unsigned NumBits, bool isLeftShift,
         return false;
     return true;
   }
-  }      
+  }
 }
 
 /// GetShiftedValue - When CanEvaluateShifted returned true for an expression,
@@ -194,7 +194,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
                                          IC.getTargetLibraryInfo());
     return V;
   }
-  
+
   Instruction *I = cast<Instruction>(V);
   IC.Worklist.Add(I);
 
@@ -207,7 +207,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
     I->setOperand(0, GetShiftedValue(I->getOperand(0), NumBits,isLeftShift,IC));
     I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
     return I;
-    
+
   case Instruction::Shl: {
     BinaryOperator *BO = cast<BinaryOperator>(I);
     unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
@@ -227,7 +227,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
       BO->setHasNoSignedWrap(false);
       return I;
     }
-    
+
     // We turn shl(c)+lshr(c) -> and(c2) if the input doesn't already have
     // zeros.
     if (CI->getValue() == NumBits) {
@@ -240,7 +240,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
       }
       return V;
     }
-    
+
     // We turn shl(c1)+shr(c2) -> shl(c3)+and(c4), but only when we know that
     // the and won't be needed.
     assert(CI->getZExtValue() > NumBits);
@@ -255,19 +255,19 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
     unsigned TypeWidth = BO->getType()->getScalarSizeInBits();
     // We only accept shifts-by-a-constant in CanEvaluateShifted.
     ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
-    
+
     // We can always fold lshr(c1)+lshr(c2) -> lshr(c1+c2).
     if (!isLeftShift) {
       // If this is oversized composite shift, then unsigned shifts get 0.
       unsigned NewShAmt = NumBits+CI->getZExtValue();
       if (NewShAmt >= TypeWidth)
         return Constant::getNullValue(BO->getType());
-      
+
       BO->setOperand(1, ConstantInt::get(BO->getType(), NewShAmt));
       BO->setIsExact(false);
       return I;
     }
-    
+
     // We turn lshr(c)+shl(c) -> and(c2) if the input doesn't already have
     // zeros.
     if (CI->getValue() == NumBits) {
@@ -280,7 +280,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
       }
       return V;
     }
-    
+
     // We turn lshr(c1)+shl(c2) -> lshr(c3)+and(c4), but only when we know that
     // the and won't be needed.
     assert(CI->getZExtValue() > NumBits);
@@ -289,7 +289,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
     BO->setIsExact(false);
     return BO;
   }
-    
+
   case Instruction::Select:
     I->setOperand(1, GetShiftedValue(I->getOperand(1), NumBits,isLeftShift,IC));
     I->setOperand(2, GetShiftedValue(I->getOperand(2), NumBits,isLeftShift,IC));
@@ -304,7 +304,7 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
                                               NumBits, isLeftShift, IC));
     return PN;
   }
-  }      
+  }
 }
 
 
@@ -312,24 +312,24 @@ static Value *GetShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
 Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
                                                BinaryOperator &I) {
   bool isLeftShift = I.getOpcode() == Instruction::Shl;
-  
-  
+
+
   // See if we can propagate this shift into the input, this covers the trivial
   // cast of lshr(shl(x,c1),c2) as well as other more complex cases.
   if (I.getOpcode() != Instruction::AShr &&
       CanEvaluateShifted(Op0, Op1->getZExtValue(), isLeftShift, *this)) {
     DEBUG(dbgs() << "ICE: GetShiftedValue propagating shift through expression"
               " to eliminate shift:\n  IN: " << *Op0 << "\n  SH: " << I <<"\n");
-    
-    return ReplaceInstUsesWith(I, 
+
+    return ReplaceInstUsesWith(I,
                  GetShiftedValue(Op0, Op1->getZExtValue(), isLeftShift, *this));
   }
-  
-  
-  // See if we can simplify any instructions used by the instruction whose sole 
+
+
+  // See if we can simplify any instructions used by the instruction whose sole
   // purpose is to compute bits we don't care about.
   uint32_t TypeBits = Op0->getType()->getScalarSizeInBits();
-  
+
   // shl i32 X, 32 = 0 and srl i8 Y, 9 = 0, ... just don't eliminate
   // a signed shift.
   //
@@ -340,14 +340,14 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
     I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1));
     return &I;
   }
-  
+
   // ((X*C1) << C2) == (X * (C1 << C2))
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
     if (BO->getOpcode() == Instruction::Mul && isLeftShift)
       if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
         return BinaryOperator::CreateMul(BO->getOperand(0),
                                         ConstantExpr::getShl(BOOp, Op1));
-  
+
   // Try to fold constant and into select arguments.
   if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
     if (Instruction *R = FoldOpIntoSelect(I, SI))
@@ -355,7 +355,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
   if (isa<PHINode>(Op0))
     if (Instruction *NV = FoldOpIntoPhi(I))
       return NV;
-  
+
   // Fold shift2(trunc(shift1(x,c1)), c2) -> trunc(shift2(shift1(x,c1),c2))
   if (TruncInst *TI = dyn_cast<TruncInst>(Op0)) {
     Instruction *TrOp = dyn_cast<Instruction>(TI->getOperand(0));
@@ -364,7 +364,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
     // require that the input operand is a shift-by-constant so that we have
     // confidence that the shifts will get folded together.  We could do this
     // xform in more cases, but it is unlikely to be profitable.
-    if (TrOp && I.isLogicalShift() && TrOp->isShift() && 
+    if (TrOp && I.isLogicalShift() && TrOp->isShift() &&
         isa<ConstantInt>(TrOp->getOperand(1))) {
       // Okay, we'll do this xform.  Make the shift of shift.
       Constant *ShAmt = ConstantExpr::getZExt(Op1, TrOp->getType());
@@ -378,7 +378,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
       unsigned SrcSize = TrOp->getType()->getScalarSizeInBits();
       unsigned DstSize = TI->getType()->getScalarSizeInBits();
       APInt MaskV(APInt::getLowBitsSet(SrcSize, DstSize));
-      
+
       // The mask we constructed says what the trunc would do if occurring
       // between the shifts.  We want to know the effect *after* the second
       // shift.  We know that it is a logical shift by a constant, so adjust the
@@ -399,7 +399,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
       return new TruncInst(And, I.getType());
     }
   }
-  
+
   if (Op0->hasOneUse()) {
     if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
       // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
@@ -425,14 +425,13 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),
                      APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
         }
-        
+
         // Turn (Y + ((X >> C) & CC)) << C  ->  ((X & (CC << C)) + (Y << C))
         Value *Op0BOOp1 = Op0BO->getOperand(1);
         if (isLeftShift && Op0BOOp1->hasOneUse() &&
-            match(Op0BOOp1, 
-                  m_And(m_Shr(m_Value(V1), m_Specific(Op1)),
-                        m_ConstantInt(CC))) &&
-            cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse()) {
+            match(Op0BOOp1,
+                  m_And(m_OneUse(m_Shr(m_Value(V1), m_Specific(Op1))),
+                        m_ConstantInt(CC)))) {
           Value *YS =   // (Y << C)
             Builder->CreateShl(Op0BO->getOperand(0), Op1,
                                          Op0BO->getName());
@@ -442,7 +441,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           return BinaryOperator::Create(Op0BO->getOpcode(), YS, XM);
         }
       }
-        
+
       // FALL THROUGH.
       case Instruction::Sub: {
         // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
@@ -458,34 +457,32 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           return BinaryOperator::CreateAnd(X, ConstantInt::get(I.getContext(),
                      APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
         }
-        
+
         // Turn (((X >> C)&CC) + Y) << C  ->  (X + (Y << C)) & (CC << C)
         if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
             match(Op0BO->getOperand(0),
-                  m_And(m_Shr(m_Value(V1), m_Value(V2)),
-                        m_ConstantInt(CC))) && V2 == Op1 &&
-            cast<BinaryOperator>(Op0BO->getOperand(0))
-                ->getOperand(0)->hasOneUse()) {
+                  m_And(m_OneUse(m_Shr(m_Value(V1), m_Value(V2))),
+                        m_ConstantInt(CC))) && V2 == Op1) {
           Value *YS = // (Y << C)
             Builder->CreateShl(Op0BO->getOperand(1), Op1, Op0BO->getName());
           // X & (CC << C)
           Value *XM = Builder->CreateAnd(V1, ConstantExpr::getShl(CC, Op1),
                                          V1->getName()+".mask");
-          
+
           return BinaryOperator::Create(Op0BO->getOpcode(), XM, YS);
         }
-        
+
         break;
       }
       }
-      
-      
+
+
       // If the operand is an bitwise operator with a constant RHS, and the
       // shift is the only use, we can pull it out of the shift.
       if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
         bool isValid = true;     // Valid only for And, Or, Xor
         bool highBitSet = false; // Transform if high bit of constant set?
-        
+
         switch (Op0BO->getOpcode()) {
         default: isValid = false; break;   // Do not perform transform!
         case Instruction::Add:
@@ -499,7 +496,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           highBitSet = true;
           break;
         }
-        
+
         // If this is a signed shift right, and the high bit is modified
         // by the logical operation, do not perform the transformation.
         // The highBitSet boolean indicates the value of the high bit of
@@ -508,26 +505,26 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
         //
         if (isValid && I.getOpcode() == Instruction::AShr)
           isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
-        
+
         if (isValid) {
           Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
-          
+
           Value *NewShift =
             Builder->CreateBinOp(I.getOpcode(), Op0BO->getOperand(0), Op1);
           NewShift->takeName(Op0BO);
-          
+
           return BinaryOperator::Create(Op0BO->getOpcode(), NewShift,
                                         NewRHS);
         }
       }
     }
   }
-  
+
   // Find out if this is a shift of a shift by a constant.
   BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
   if (ShiftOp && !ShiftOp->isShift())
     ShiftOp = 0;
-  
+
   if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
 
     // This is a constant shift of a constant shift. Be careful about hiding
@@ -548,9 +545,9 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
     assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
     if (ShiftAmt1 == 0) return 0;  // Will be simplified in the future.
     Value *X = ShiftOp->getOperand(0);
-    
+
     IntegerType *Ty = cast<IntegerType>(I.getType());
-    
+
     // Check for (X << c1) << c2  and  (X >> c1) >> c2
     if (I.getOpcode() == ShiftOp->getOpcode()) {
       uint32_t AmtSum = ShiftAmt1+ShiftAmt2;   // Fold into one big shift.
@@ -561,11 +558,11 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
         AmtSum = TypeBits-1;  // Saturate to 31 for i32 ashr.
       }
-      
+
       return BinaryOperator::Create(I.getOpcode(), X,
                                     ConstantInt::get(Ty, AmtSum));
     }
-    
+
     if (ShiftAmt1 == ShiftAmt2) {
       // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
       if (I.getOpcode() == Instruction::LShr &&
@@ -605,7 +602,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           return NewLShr;
         }
         Value *Shift = Builder->CreateLShr(X, ShiftDiffCst);
-        
+
         APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
         return BinaryOperator::CreateAnd(Shift,
                                          ConstantInt::get(I.getContext(),Mask));
@@ -653,12 +650,12 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
           return NewShl;
         }
         Value *Shift = Builder->CreateShl(X, ShiftDiffCst);
-        
+
         APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
         return BinaryOperator::CreateAnd(Shift,
                                          ConstantInt::get(I.getContext(),Mask));
       }
-      
+
       // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in. However,
       // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
       if (I.getOpcode() == Instruction::AShr &&
@@ -682,21 +679,21 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
                                  I.hasNoSignedWrap(), I.hasNoUnsignedWrap(),
                                  TD))
     return ReplaceInstUsesWith(I, V);
-  
+
   if (Instruction *V = commonShiftTransforms(I))
     return V;
-  
+
   if (ConstantInt *Op1C = dyn_cast<ConstantInt>(I.getOperand(1))) {
     unsigned ShAmt = Op1C->getZExtValue();
-    
+
     // If the shifted-out value is known-zero, then this is a NUW shift.
-    if (!I.hasNoUnsignedWrap() && 
+    if (!I.hasNoUnsignedWrap() &&
         MaskedValueIsZero(I.getOperand(0),
                           APInt::getHighBitsSet(Op1C->getBitWidth(), ShAmt))) {
           I.setHasNoUnsignedWrap();
           return &I;
         }
-    
+
     // If the shifted out value is all signbits, this is a NSW shift.
     if (!I.hasNoSignedWrap() &&
         ComputeNumSignBits(I.getOperand(0)) > ShAmt) {
@@ -712,7 +709,7 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
       match(I.getOperand(1), m_Constant(C2)))
     return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
 
-  return 0;    
+  return 0;
 }
 
 Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
@@ -722,9 +719,9 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
 
   if (Instruction *R = commonShiftTransforms(I))
     return R;
-  
+
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
-  
+
   if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
     unsigned ShAmt = Op1C->getZExtValue();
 
@@ -743,15 +740,15 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
         return new ZExtInst(Cmp, II->getType());
       }
     }
-  
+
     // If the shifted-out value is known-zero, then this is an exact shift.
-    if (!I.isExact() && 
+    if (!I.isExact() &&
         MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
       I.setIsExact();
       return &I;
-    }    
+    }
   }
-  
+
   return 0;
 }
 
@@ -762,12 +759,12 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
 
   if (Instruction *R = commonShiftTransforms(I))
     return R;
-  
+
   Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
 
   if (ConstantInt *Op1C = dyn_cast<ConstantInt>(Op1)) {
     unsigned ShAmt = Op1C->getZExtValue();
-    
+
     // If the input is a SHL by the same constant (ashr (shl X, C), C), then we
     // have a sign-extend idiom.
     Value *X;
@@ -791,23 +788,23 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
     }
 
     // If the shifted-out value is known-zero, then this is an exact shift.
-    if (!I.isExact() && 
+    if (!I.isExact() &&
         MaskedValueIsZero(Op0,APInt::getLowBitsSet(Op1C->getBitWidth(),ShAmt))){
       I.setIsExact();
       return &I;
     }
-  }            
-  
+  }
+
   // See if we can turn a signed shr into an unsigned shr.
   if (MaskedValueIsZero(Op0,
                         APInt::getSignBit(I.getType()->getScalarSizeInBits())))
     return BinaryOperator::CreateLShr(Op0, Op1);
-  
+
   // Arithmetic shifting an all-sign-bit value is a no-op.
   unsigned NumSignBits = ComputeNumSignBits(Op0);
   if (NumSignBits == Op0->getType()->getScalarSizeInBits())
     return ReplaceInstUsesWith(I, Op0);
-  
+
   return 0;
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 602b203371..08aedb3200 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -16,9 +16,10 @@
 #include "InstCombine.h"
 #include "llvm/DataLayout.h"
 #include "llvm/IntrinsicInst.h"
+#include "llvm/Support/PatternMatch.h"
 
 using namespace llvm;
-
+using namespace llvm::PatternMatch;
 
 /// ShrinkDemandedConstant - Check to see if the specified operand of the 
 /// specified instruction is a constant integer.  If so, check to see if there
@@ -199,8 +200,21 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == 
           (DemandedMask & (~LHSKnownZero)))
         return I->getOperand(1);
+    } else if (I->getOpcode() == Instruction::Xor) {
+      // We can simplify (X^Y) -> X or Y in the user's context if we know that
+      // only bits from X or Y are demanded.
+      
+      ComputeMaskedBits(I->getOperand(1), RHSKnownZero, RHSKnownOne, Depth+1);
+      ComputeMaskedBits(I->getOperand(0), LHSKnownZero, LHSKnownOne, Depth+1);
+      
+      // If all of the demanded bits are known zero on one side, return the
+      // other. 
+      if ((DemandedMask & RHSKnownZero) == DemandedMask)
+        return I->getOperand(0);
+      if ((DemandedMask & LHSKnownZero) == DemandedMask)
+        return I->getOperand(1);
     }
-    
+
     // Compute the KnownZero/KnownOne bits to simplify things downstream.
     ComputeMaskedBits(I, KnownZero, KnownOne, Depth);
     return 0;
@@ -580,6 +594,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     break;
   case Instruction::Shl:
     if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      {
+        Value *VarX; ConstantInt *C1;
+        if (match(I->getOperand(0), m_Shr(m_Value(VarX), m_ConstantInt(C1)))) {
+          Instruction *Shr = cast<Instruction>(I->getOperand(0));
+          Value *R = SimplifyShrShlDemandedBits(Shr, I, DemandedMask,
+                                                KnownZero, KnownOne);
+          if (R)
+            return R;
+        }
+      }
+
       uint64_t ShiftAmt = SA->getLimitedValue(BitWidth-1);
       APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
       
@@ -800,6 +825,79 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
   return 0;
 }
 
+/// Helper routine of SimplifyDemandedUseBits. It tries to simplify
+/// "E1 = (X lsr C1) << C2", where the C1 and C2 are constant, into
+/// "E2 = X << (C2 - C1)" or "E2 = X >> (C1 - C2)", depending on the sign
+/// of "C2-C1".
+///
+/// Suppose E1 and E2 are generally different in bits S={bm, bm+1,
+/// ..., bn}, without considering the specific value X is holding.
+/// This transformation is legal iff one of following conditions is hold:
+///  1) All the bit in S are 0, in this case E1 == E2.
+///  2) We don't care those bits in S, per the input DemandedMask.
+///  3) Combination of 1) and 2). Some bits in S are 0, and we don't care the
+///     rest bits.
+///
+/// Currently we only test condition 2).
+///
+/// As with SimplifyDemandedUseBits, it returns NULL if the simplification was
+/// not successful.
+Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
+  Instruction *Shl, APInt DemandedMask, APInt &KnownZero, APInt &KnownOne) {
+
+  unsigned ShlAmt = cast<ConstantInt>(Shl->getOperand(1))->getZExtValue();
+  unsigned ShrAmt = cast<ConstantInt>(Shr->getOperand(1))->getZExtValue();
+
+  KnownOne.clearAllBits();
+  KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
+  KnownZero &= DemandedMask;
+
+  if (ShlAmt == 0 || ShrAmt == 0)
+    return 0;
+
+  Value *VarX = Shr->getOperand(0);
+  Type *Ty = VarX->getType();
+
+  APInt BitMask1(Ty->getIntegerBitWidth(), (uint64_t)-1);
+  APInt BitMask2(Ty->getIntegerBitWidth(), (uint64_t)-1);
+
+  bool isLshr = (Shr->getOpcode() == Instruction::LShr);
+  BitMask1 = isLshr ? (BitMask1.lshr(ShrAmt) << ShlAmt) :
+                      (BitMask1.ashr(ShrAmt) << ShlAmt);
+
+  if (ShrAmt <= ShlAmt) {
+    BitMask2 <<= (ShlAmt - ShrAmt);
+  } else {
+    BitMask2 = isLshr ? BitMask2.lshr(ShrAmt - ShlAmt):
+                        BitMask2.ashr(ShrAmt - ShlAmt);
+  }
+
+  // Check if condition-2 (see the comment to this function) is satified.
+  if ((BitMask1 & DemandedMask) == (BitMask2 & DemandedMask)) {
+    if (ShrAmt == ShlAmt)
+      return VarX;
+
+    if (!Shr->hasOneUse())
+      return 0;
+
+    BinaryOperator *New;
+    if (ShrAmt < ShlAmt) {
+      Constant *Amt = ConstantInt::get(VarX->getType(), ShlAmt - ShrAmt);
+      New = BinaryOperator::CreateShl(VarX, Amt);
+      BinaryOperator *Orig = cast<BinaryOperator>(Shl);
+      New->setHasNoSignedWrap(Orig->hasNoSignedWrap());
+      New->setHasNoUnsignedWrap(Orig->hasNoUnsignedWrap());
+    } else {
+      Constant *Amt = ConstantInt::get(VarX->getType(), ShrAmt - ShlAmt);
+      New = isLshr ? BinaryOperator::CreateLShr(VarX, Amt) :
+                     BinaryOperator::CreateAShr(VarX, Amt);
+    }
+
+    return InsertNewInstWith(New, *Shl);
+  }
+
+  return 0;
+}
 
 /// SimplifyDemandedVectorElts - The specified value produces a vector with
 /// any number of elements. DemandedElts contains the set of elements that are
diff --git a/lib/Transforms/InstCombine/InstCombineWorklist.h b/lib/Transforms/InstCombine/InstCombineWorklist.h
index ea654ae9ed..b1a4966920 100644
--- a/lib/Transforms/InstCombine/InstCombineWorklist.h
+++ b/lib/Transforms/InstCombine/InstCombineWorklist.h
@@ -11,11 +11,11 @@
 #define INSTCOMBINE_WORKLIST_H
 
 #define DEBUG_TYPE "instcombine"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Instruction.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/Compiler.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index feef2ccee4..9da58d0e71 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -36,23 +36,23 @@
 #define DEBUG_TYPE "instcombine"
 #include "llvm/Transforms/Scalar.h"
 #include "InstCombine.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm-c/Initialization.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringSwitch.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/ValueHandle.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm-c/Initialization.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <climits>
 using namespace llvm;
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 4e05c3200c..f095cff33c 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -15,14 +15,8 @@
 
 #define DEBUG_TYPE "asan"
 
+#include "llvm/Transforms/Instrumentation.h"
 #include "BlackList.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallSet.h"
@@ -30,19 +24,24 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
-
-#include <string>
+#include "llvm/Type.h"
 #include <algorithm>
+#include <string>
 
 using namespace llvm;
 
@@ -70,6 +69,9 @@ static const char *kAsanMappingScaleName = "__asan_mapping_scale";
 static const char *kAsanStackMallocName = "__asan_stack_malloc";
 static const char *kAsanStackFreeName = "__asan_stack_free";
 static const char *kAsanGenPrefix = "__asan_gen_";
+static const char *kAsanPoisonStackMemoryName = "__asan_poison_stack_memory";
+static const char *kAsanUnpoisonStackMemoryName =
+    "__asan_unpoison_stack_memory";
 
 static const int kAsanStackLeftRedzoneMagic = 0xf1;
 static const int kAsanStackMidRedzoneMagic = 0xf2;
@@ -113,9 +115,10 @@ static cl::opt<bool> ClInitializers("asan-initialization-order",
        cl::desc("Handle C++ initializer order"), cl::Hidden, cl::init(false));
 static cl::opt<bool> ClMemIntrin("asan-memintrin",
        cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true));
-// This flag may need to be replaced with -fasan-blacklist.
-static cl::opt<std::string>  ClBlackListFile("asan-blacklist",
-       cl::desc("File containing the list of functions to ignore "
+static cl::opt<bool> ClRealignStack("asan-realign-stack",
+       cl::desc("Realign stack to 32"), cl::Hidden, cl::init(true));
+static cl::opt<std::string> ClBlacklistFile("asan-blacklist",
+       cl::desc("File containing the list of objects to ignore "
                 "during instrumentation"), cl::Hidden);
 
 // These flags allow to change the shadow mapping.
@@ -136,6 +139,10 @@ static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp",
 static cl::opt<bool> ClOptGlobals("asan-opt-globals",
        cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true));
 
+static cl::opt<bool> ClCheckLifetime("asan-check-lifetime",
+       cl::desc("Use llvm.lifetime intrinsics to insert extra checks"),
+       cl::Hidden, cl::init(false));
+
 // Debug flags.
 static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden,
                             cl::init(0));
@@ -186,8 +193,19 @@ static size_t RedzoneSize() {
 
 /// AddressSanitizer: instrument the code in module to find memory bugs.
 struct AddressSanitizer : public FunctionPass {
-  AddressSanitizer();
-  virtual const char *getPassName() const;
+  AddressSanitizer(bool CheckInitOrder = false,
+                   bool CheckUseAfterReturn = false,
+                   bool CheckLifetime = false,
+                   StringRef BlacklistFile = StringRef())
+      : FunctionPass(ID),
+        CheckInitOrder(CheckInitOrder || ClInitializers),
+        CheckUseAfterReturn(CheckUseAfterReturn || ClUseAfterReturn),
+        CheckLifetime(CheckLifetime || ClCheckLifetime),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+                                            : BlacklistFile) {}
+  virtual const char *getPassName() const {
+    return "AddressSanitizerFunctionPass";
+  }
   void instrumentMop(Instruction *I);
   void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB,
                          Value *Addr, uint32_t TypeSize, bool IsWrite);
@@ -206,10 +224,10 @@ struct AddressSanitizer : public FunctionPass {
   bool maybeInsertAsanInitAtFunctionEntry(Function &F);
   bool poisonStackInFunction(Function &F);
   virtual bool doInitialization(Module &M);
-  virtual bool doFinalization(Module &M);
   static char ID;  // Pass identification, replacement for typeid
 
  private:
+  void initializeCallbacks(Module &M);
   uint64_t getAllocaSizeInBytes(AllocaInst *AI) {
     Type *Ty = AI->getAllocatedType();
     uint64_t SizeInBytes = TD->getTypeAllocSize(Ty);
@@ -229,7 +247,18 @@ struct AddressSanitizer : public FunctionPass {
                    Value *ShadowBase, bool DoPoison);
   bool LooksLikeCodeInBug11395(Instruction *I);
   void FindDynamicInitializers(Module &M);
-
+  /// Analyze lifetime intrinsics for given alloca. Use Value* instead of
+  /// AllocaInst* here, as we call this method after we merge all allocas into a
+  /// single one. Returns true if ASan added some instrumentation.
+  bool handleAllocaLifetime(Value *Alloca);
+  /// Analyze lifetime intrinsics for a specific value, casted from alloca.
+  /// Returns true if if ASan added some instrumentation.
+  bool handleValueLifetime(Value *V);
+  void poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB, bool DoPoison);
+
+  bool CheckInitOrder;
+  bool CheckUseAfterReturn;
+  bool CheckLifetime;
   LLVMContext *C;
   DataLayout *TD;
   uint64_t MappingOffset;
@@ -239,7 +268,9 @@ struct AddressSanitizer : public FunctionPass {
   Function *AsanCtorFunction;
   Function *AsanInitFunction;
   Function *AsanStackMallocFunc, *AsanStackFreeFunc;
+  Function *AsanPoisonStackMemoryFunc, *AsanUnpoisonStackMemoryFunc;
   Function *AsanHandleNoReturnFunc;
+  SmallString<64> BlacklistFile;
   OwningPtr<BlackList> BL;
   // This array is indexed by AccessIsWrite and log2(AccessSize).
   Function *AsanErrorCallback[2][kNumberOfAccessSizes];
@@ -247,19 +278,32 @@ struct AddressSanitizer : public FunctionPass {
   SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
 };
 
-// FIXME: inherit this from ModulePass and actually use it as a ModulePass.
-class AddressSanitizerCreateGlobalRedzonesPass {
+class AddressSanitizerModule : public ModulePass {
  public:
-  bool runOnModule(Module &M, DataLayout *TD);
+  AddressSanitizerModule(bool CheckInitOrder = false,
+                         StringRef BlacklistFile = StringRef())
+      : ModulePass(ID),
+        CheckInitOrder(CheckInitOrder || ClInitializers),
+        BlacklistFile(BlacklistFile.empty() ? ClBlacklistFile
+                                            : BlacklistFile) {}
+  bool runOnModule(Module &M);
+  static char ID;  // Pass identification, replacement for typeid
+  virtual const char *getPassName() const {
+    return "AddressSanitizerModule";
+  }
+
  private:
   bool ShouldInstrumentGlobal(GlobalVariable *G);
   void createInitializerPoisonCalls(Module &M, Value *FirstAddr,
                                     Value *LastAddr);
 
+  bool CheckInitOrder;
+  SmallString<64> BlacklistFile;
   OwningPtr<BlackList> BL;
   SetOfDynamicallyInitializedGlobals DynamicallyInitializedGlobals;
   Type *IntptrTy;
   LLVMContext *C;
+  DataLayout *TD;
 };
 
 }  // namespace
@@ -268,13 +312,20 @@ char AddressSanitizer::ID = 0;
 INITIALIZE_PASS(AddressSanitizer, "asan",
     "AddressSanitizer: detects use-after-free and out-of-bounds bugs.",
     false, false)
-AddressSanitizer::AddressSanitizer() : FunctionPass(ID) { }
-FunctionPass *llvm::createAddressSanitizerPass() {
-  return new AddressSanitizer();
+FunctionPass *llvm::createAddressSanitizerFunctionPass(
+    bool CheckInitOrder, bool CheckUseAfterReturn, bool CheckLifetime,
+    StringRef BlacklistFile) {
+  return new AddressSanitizer(CheckInitOrder, CheckUseAfterReturn,
+                              CheckLifetime, BlacklistFile);
 }
 
-const char *AddressSanitizer::getPassName() const {
-  return "AddressSanitizer";
+char AddressSanitizerModule::ID = 0;
+INITIALIZE_PASS(AddressSanitizerModule, "asan-module",
+    "AddressSanitizer: detects use-after-free and out-of-bounds bugs."
+    "ModulePass", false, false)
+ModulePass *llvm::createAddressSanitizerModulePass(
+    bool CheckInitOrder, StringRef BlacklistFile) {
+  return new AddressSanitizerModule(CheckInitOrder, BlacklistFile);
 }
 
 static size_t TypeSizeToSizeIndex(uint32_t TypeSize) {
@@ -385,7 +436,7 @@ void AddressSanitizer::instrumentMop(Instruction *I) {
     if (GlobalVariable *G = dyn_cast<GlobalVariable>(Addr)) {
       // If initialization order checking is disabled, a simple access to a
       // dynamically initialized global is always valid.
-      if (!ClInitializers)
+      if (!CheckInitOrder)
         return;
       // If a global variable does not have dynamic initialization we don't
       // have to instrument it.  However, if a global does not have initailizer
@@ -492,7 +543,7 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns,
   Crash->setDebugLoc(OrigIns->getDebugLoc());
 }
 
-void AddressSanitizerCreateGlobalRedzonesPass::createInitializerPoisonCalls(
+void AddressSanitizerModule::createInitializerPoisonCalls(
     Module &M, Value *FirstAddr, Value *LastAddr) {
   // We do all of our poisoning and unpoisoning within _GLOBAL__I_a.
   Function *GlobalInit = M.getFunction("_GLOBAL__I_a");
@@ -524,8 +575,7 @@ void AddressSanitizerCreateGlobalRedzonesPass::createInitializerPoisonCalls(
   }
 }
 
-bool AddressSanitizerCreateGlobalRedzonesPass::ShouldInstrumentGlobal(
-    GlobalVariable *G) {
+bool AddressSanitizerModule::ShouldInstrumentGlobal(GlobalVariable *G) {
   Type *Ty = cast<PointerType>(G->getType())->getElementType();
   DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
 
@@ -587,9 +637,13 @@ bool AddressSanitizerCreateGlobalRedzonesPass::ShouldInstrumentGlobal(
 // This function replaces all global variables with new variables that have
 // trailing redzones. It also creates a function that poisons
 // redzones and inserts this function into llvm.global_ctors.
-bool AddressSanitizerCreateGlobalRedzonesPass::runOnModule(Module &M,
-                                                           DataLayout *TD) {
-  BL.reset(new BlackList(ClBlackListFile));
+bool AddressSanitizerModule::runOnModule(Module &M) {
+  if (!ClGlobals) return false;
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(BlacklistFile));
+  if (BL->isIn(M)) return false;
   DynamicallyInitializedGlobals.Init(M);
   C = &(M.getContext());
   IntptrTy = Type::getIntNTy(*C, TD->getPointerSizeInBits());
@@ -677,7 +731,7 @@ bool AddressSanitizerCreateGlobalRedzonesPass::runOnModule(Module &M,
         NULL);
 
     // Populate the first and last globals declared in this TU.
-    if (ClInitializers && GlobalHasDynamicInitializer) {
+    if (CheckInitOrder && GlobalHasDynamicInitializer) {
       LastDynamic = ConstantExpr::getPointerCast(NewGlobal, IntptrTy);
       if (FirstDynamic == 0)
         FirstDynamic = LastDynamic;
@@ -692,7 +746,7 @@ bool AddressSanitizerCreateGlobalRedzonesPass::runOnModule(Module &M,
       ConstantArray::get(ArrayOfGlobalStructTy, Initializers), "");
 
   // Create calls for poisoning before initializers run and unpoisoning after.
-  if (ClInitializers && FirstDynamic && LastDynamic)
+  if (CheckInitOrder && FirstDynamic && LastDynamic)
     createInitializerPoisonCalls(M, FirstDynamic, LastDynamic);
 
   Function *AsanRegisterGlobals = checkInterfaceFunction(M.getOrInsertFunction(
@@ -726,32 +780,8 @@ bool AddressSanitizerCreateGlobalRedzonesPass::runOnModule(Module &M,
   return true;
 }
 
-// virtual
-bool AddressSanitizer::doInitialization(Module &M) {
-  // Initialize the private fields. No one has accessed them before.
-  TD = getAnalysisIfAvailable<DataLayout>();
-
-  if (!TD)
-    return false;
-  BL.reset(new BlackList(ClBlackListFile));
-  DynamicallyInitializedGlobals.Init(M);
-
-  C = &(M.getContext());
-  LongSize = TD->getPointerSizeInBits();
-  IntptrTy = Type::getIntNTy(*C, LongSize);
-  IntptrPtrTy = PointerType::get(IntptrTy, 0);
-
-  AsanCtorFunction = Function::Create(
-      FunctionType::get(Type::getVoidTy(*C), false),
-      GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
-  BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction);
-  // call __asan_init in the module ctor.
-  IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB));
-  AsanInitFunction = checkInterfaceFunction(
-      M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
-  AsanInitFunction->setLinkage(Function::ExternalLinkage);
-  IRB.CreateCall(AsanInitFunction);
-
+void AddressSanitizer::initializeCallbacks(Module &M) {
+  IRBuilder<> IRB(*C);
   // Create __asan_report* callbacks.
   for (size_t AccessIsWrite = 0; AccessIsWrite <= 1; AccessIsWrite++) {
     for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
@@ -773,11 +803,42 @@ bool AddressSanitizer::doInitialization(Module &M) {
       IntptrTy, IntptrTy, IntptrTy, NULL));
   AsanHandleNoReturnFunc = checkInterfaceFunction(M.getOrInsertFunction(
       kAsanHandleNoReturnName, IRB.getVoidTy(), NULL));
+  AsanPoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanPoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
+  AsanUnpoisonStackMemoryFunc = checkInterfaceFunction(M.getOrInsertFunction(
+      kAsanUnpoisonStackMemoryName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL));
 
   // We insert an empty inline asm after __asan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
                             /*hasSideEffects=*/true);
+}
+
+// virtual
+bool AddressSanitizer::doInitialization(Module &M) {
+  // Initialize the private fields. No one has accessed them before.
+  TD = getAnalysisIfAvailable<DataLayout>();
+
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(BlacklistFile));
+  DynamicallyInitializedGlobals.Init(M);
+
+  C = &(M.getContext());
+  LongSize = TD->getPointerSizeInBits();
+  IntptrTy = Type::getIntNTy(*C, LongSize);
+  IntptrPtrTy = PointerType::get(IntptrTy, 0);
+
+  AsanCtorFunction = Function::Create(
+      FunctionType::get(Type::getVoidTy(*C), false),
+      GlobalValue::InternalLinkage, kAsanModuleCtorName, &M);
+  BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction);
+  // call __asan_init in the module ctor.
+  IRBuilder<> IRB(ReturnInst::Create(*C, AsanCtorBB));
+  AsanInitFunction = checkInterfaceFunction(
+      M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL));
+  AsanInitFunction->setLinkage(Function::ExternalLinkage);
+  IRB.CreateCall(AsanInitFunction);
 
   llvm::Triple targetTriple(M.getTargetTriple());
   bool isAndroid = targetTriple.getEnvironment() == llvm::Triple::Android;
@@ -817,18 +878,6 @@ bool AddressSanitizer::doInitialization(Module &M) {
   return true;
 }
 
-bool AddressSanitizer::doFinalization(Module &M) {
-  // We transform the globals at the very end so that the optimization analysis
-  // works on the original globals.
-  if (ClGlobals) {
-    // FIXME: instead of doFinalization, run this as a true ModulePass.
-    AddressSanitizerCreateGlobalRedzonesPass Pass;
-    return Pass.runOnModule(M, TD);
-  }
-  return false;
-}
-
-
 bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   // For each NSObject descendant having a +load method, this method is invoked
   // by the ObjC runtime before any of the static constructors is called.
@@ -849,6 +898,7 @@ bool AddressSanitizer::runOnFunction(Function &F) {
   if (BL->isIn(F)) return false;
   if (&F == AsanCtorFunction) return false;
   DEBUG(dbgs() << "ASAN instrumenting:\n" << F << "\n");
+  initializeCallbacks(*F.getParent());
 
   // If needed, insert __asan_init before checking for AddressSafety attr.
   maybeInsertAsanInitAtFunctionEntry(F);
@@ -1021,6 +1071,74 @@ bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) {
   return true;
 }
 
+// Handling llvm.lifetime intrinsics for a given %alloca:
+// (1) collect all llvm.lifetime.xxx(%size, %value) describing the alloca.
+// (2) if %size is constant, poison memory for llvm.lifetime.end (to detect
+//     invalid accesses) and unpoison it for llvm.lifetime.start (the memory
+//     could be poisoned by previous llvm.lifetime.end instruction, as the
+//     variable may go in and out of scope several times, e.g. in loops).
+// (3) if we poisoned at least one %alloca in a function,
+//     unpoison the whole stack frame at function exit.
+bool AddressSanitizer::handleAllocaLifetime(Value *Alloca) {
+  assert(CheckLifetime);
+  Type *AllocaType = Alloca->getType();
+  Type *Int8PtrTy = Type::getInt8PtrTy(AllocaType->getContext());
+
+  bool Res = false;
+  // Typical code looks like this:
+  // %alloca = alloca <type>, <alignment>
+  // ... some code ...
+  // %val1 = bitcast <type>* %alloca to i8*
+  // call void @llvm.lifetime.start(i64 <size>, i8* %val1)
+  // ... more code ...
+  // %val2 = bitcast <type>* %alloca to i8*
+  // call void @llvm.lifetime.start(i64 <size>, i8* %val2)
+  // That is, to handle %alloca we must find all its casts to
+  // i8* values, and find lifetime instructions for these values.
+  if (AllocaType == Int8PtrTy)
+    Res |= handleValueLifetime(Alloca);
+  for (Value::use_iterator UI = Alloca->use_begin(), UE = Alloca->use_end();
+       UI != UE; ++UI) {
+    if (UI->getType() != Int8PtrTy) continue;
+    if (UI->stripPointerCasts() != Alloca) continue;
+    Res |= handleValueLifetime(*UI);
+  }
+  return Res;
+}
+
+bool AddressSanitizer::handleValueLifetime(Value *V) {
+  assert(CheckLifetime);
+  bool Res = false;
+  for (Value::use_iterator UI = V->use_begin(), UE = V->use_end(); UI != UE;
+       ++UI) {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(*UI);
+    if (!II) continue;
+    Intrinsic::ID ID = II->getIntrinsicID();
+    if (ID != Intrinsic::lifetime_start &&
+        ID != Intrinsic::lifetime_end)
+      continue;
+    if (V != II->getArgOperand(1))
+      continue;
+    // Found lifetime intrinsic, add ASan instrumentation if necessary.
+    ConstantInt *Size = dyn_cast<ConstantInt>(II->getArgOperand(0));
+    // If size argument is undefined, don't do anything.
+    if (Size->isMinusOne())
+      continue;
+    // Check that size doesn't saturate uint64_t and can
+    // be stored in IntptrTy.
+    const uint64_t SizeValue = Size->getValue().getLimitedValue();
+    if (SizeValue == ~0ULL ||
+        !ConstantInt::isValueValidForType(IntptrTy, SizeValue)) {
+      continue;
+    }
+    IRBuilder<> IRB(II);
+    bool DoPoison = (ID == Intrinsic::lifetime_end);
+    poisonAlloca(V, SizeValue, IRB, DoPoison);
+    Res = true;
+  }
+  return Res;
+}
+
 // Find all static Alloca instructions and put
 // poisoned red zones around all of them.
 // Then unpoison everything back before the function returns.
@@ -1039,9 +1157,11 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
   SmallVector<AllocaInst*, 16> AllocaVec;
   SmallVector<Instruction*, 8> RetVec;
   uint64_t TotalSize = 0;
+  bool HavePoisonedAllocas = false;
 
   // Filter out Alloca instructions we want (and can) handle.
   // Collect Ret instructions.
+  unsigned ResultAlignment = 1 << MappingScale();
   for (Function::iterator FI = F.begin(), FE = F.end();
        FI != FE; ++FI) {
     BasicBlock &BB = *FI;
@@ -1057,7 +1177,7 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
       if (AI->isArrayAllocation()) continue;
       if (!AI->isStaticAlloca()) continue;
       if (!AI->getAllocatedType()->isSized()) continue;
-      if (AI->getAlignment() > RedzoneSize()) continue;
+      ResultAlignment = std::max(ResultAlignment, AI->getAlignment());
       AllocaVec.push_back(AI);
       uint64_t AlignedSize =  getAlignedAllocaSize(AI);
       TotalSize += AlignedSize;
@@ -1068,7 +1188,7 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
 
   uint64_t LocalStackSize = TotalSize + (AllocaVec.size() + 1) * RedzoneSize();
 
-  bool DoStackMalloc = ClUseAfterReturn
+  bool DoStackMalloc = CheckUseAfterReturn
       && LocalStackSize <= kMaxStackMallocSize;
 
   Instruction *InsBefore = AllocaVec[0];
@@ -1078,7 +1198,9 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
   Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize);
   AllocaInst *MyAlloca =
       new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore);
-  MyAlloca->setAlignment(RedzoneSize());
+  if (ClRealignStack && ResultAlignment < RedzoneSize())
+    ResultAlignment = RedzoneSize();
+  MyAlloca->setAlignment(ResultAlignment);
   assert(MyAlloca->isStaticAlloca());
   Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy);
   Value *LocalStackBase = OrigStackBase;
@@ -1103,10 +1225,13 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
                      << Name.size() << " " << Name << " ";
     uint64_t AlignedSize = getAlignedAllocaSize(AI);
     assert((AlignedSize % RedzoneSize()) == 0);
-    AI->replaceAllUsesWith(
-        IRB.CreateIntToPtr(
+    Value *NewAllocaPtr = IRB.CreateIntToPtr(
             IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)),
-            AI->getType()));
+            AI->getType());
+    AI->replaceAllUsesWith(NewAllocaPtr);
+    // Analyze lifetime intrinsics only for static allocas we handle.
+    if (CheckLifetime)
+      HavePoisonedAllocas |= handleAllocaLifetime(NewAllocaPtr);
     Pos += AlignedSize + RedzoneSize();
   }
   assert(Pos == LocalStackSize);
@@ -1139,9 +1264,15 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
     PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRBRet, ShadowBase, false);
 
     if (DoStackMalloc) {
+      // In use-after-return mode, mark the whole stack frame unaddressable.
       IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase,
                          ConstantInt::get(IntptrTy, LocalStackSize),
                          OrigStackBase);
+    } else if (HavePoisonedAllocas) {
+      // If we poisoned some allocas in llvm.lifetime analysis,
+      // unpoison whole stack frame now.
+      assert(LocalStackBase == OrigStackBase);
+      poisonAlloca(LocalStackBase, LocalStackSize, IRBRet, false);
     }
   }
 
@@ -1155,3 +1286,13 @@ bool AddressSanitizer::poisonStackInFunction(Function &F) {
 
   return true;
 }
+
+void AddressSanitizer::poisonAlloca(Value *V, uint64_t Size, IRBuilder<> IRB,
+                                    bool DoPoison) {
+  // For now just insert the call to ASan runtime.
+  Value *AddrArg = IRB.CreatePointerCast(V, IntptrTy);
+  Value *SizeArg = ConstantInt::get(IntptrTy, Size);
+  IRB.CreateCall2(DoPoison ? AsanPoisonStackMemoryFunc
+                           : AsanUnpoisonStackMemoryFunc,
+                  AddrArg, SizeArg);
+}
diff --git a/lib/Transforms/Instrumentation/BlackList.cpp b/lib/Transforms/Instrumentation/BlackList.cpp
index e02c631f7f..0bfb186562 100644
--- a/lib/Transforms/Instrumentation/BlackList.cpp
+++ b/lib/Transforms/Instrumentation/BlackList.cpp
@@ -13,9 +13,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <utility>
-#include <string>
-
 #include "BlackList.h"
 #include "llvm/ADT/OwningPtr.h"
 #include "llvm/ADT/SmallVector.h"
@@ -28,12 +25,14 @@
 #include "llvm/Support/Regex.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/system_error.h"
+#include <string>
+#include <utility>
 
 namespace llvm {
 
 BlackList::BlackList(const StringRef Path) {
   // Validate and open blacklist file.
-  if (!Path.size()) return;
+  if (Path.empty()) return;
   OwningPtr<MemoryBuffer> File;
   if (error_code EC = MemoryBuffer::getFile(Path, File)) {
     report_fatal_error("Can't open blacklist file: " + Path + ": " +
@@ -53,6 +52,10 @@ BlackList::BlackList(const StringRef Path) {
     std::pair<StringRef, StringRef> SplitLine = I->split(":");
     StringRef Prefix = SplitLine.first;
     std::string Regexp = SplitLine.second;
+    if (Regexp.empty()) {
+      // Missing ':' in the line.
+      report_fatal_error("malformed blacklist line: " + SplitLine.first);
+    }
 
     // Replace * with .*
     for (size_t pos = 0; (pos = Regexp.find("*", pos)) != std::string::npos;
@@ -69,7 +72,7 @@ BlackList::BlackList(const StringRef Path) {
     }
 
     // Add this regexp into the proper group by its prefix.
-    if (Regexps[Prefix].size())
+    if (!Regexps[Prefix].empty())
       Regexps[Prefix] += "|";
     Regexps[Prefix] += Regexp;
   }
@@ -110,10 +113,12 @@ bool BlackList::isInInit(const GlobalVariable &G) {
           inSection("global-init-type", GetGVTypeString(G)));
 }
 
-bool BlackList::inSection(const StringRef Section,
-                          const StringRef Query) {
-  Regex *FunctionRegex = Entries[Section];
-  return FunctionRegex ? FunctionRegex->match(Query) : false;
+bool BlackList::inSection(const StringRef Section, const StringRef Query) {
+  StringMap<Regex*>::iterator I = Entries.find(Section);
+  if (I == Entries.end()) return false;
+
+  Regex *FunctionRegex = I->getValue();
+  return FunctionRegex->match(Query);
 }
 
 }  // namespace llvm
diff --git a/lib/Transforms/Instrumentation/BoundsChecking.cpp b/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 2d5d5603c0..303e04ac16 100644
--- a/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -13,19 +13,19 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "bounds-checking"
+#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/DataLayout.h"
 #include "llvm/IRBuilder.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/Support/TargetFolder.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Instrumentation.h"
 using namespace llvm;
 
 static cl::opt<bool> SingleTrapBB("bounds-checking-single-trap",
@@ -108,6 +108,7 @@ void BoundsChecking::emitBranchToTrap(Value *Cmp) {
     else
       Cmp = 0; // unconditional branch
   }
+  ++ChecksAdded;
 
   Instruction *Inst = Builder->GetInsertPoint();
   BasicBlock *OldBB = Inst->getParent();
@@ -162,7 +163,6 @@ bool BoundsChecking::instrument(Value *Ptr, Value *InstVal) {
   }
   emitBranchToTrap(Or);
 
-  ++ChecksAdded;
   return true;
 }
 
diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt
index 058f68c7ce..1c9e053679 100644
--- a/lib/Transforms/Instrumentation/CMakeLists.txt
+++ b/lib/Transforms/Instrumentation/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMInstrumentation
   BoundsChecking.cpp
   EdgeProfiling.cpp
   GCOVProfiling.cpp
+  MemorySanitizer.cpp
   Instrumentation.cpp
   OptimalEdgeProfiling.cpp
   PathProfiling.cpp
diff --git a/lib/Transforms/Instrumentation/EdgeProfiling.cpp b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
index e8ef2654d2..41e42aff49 100644
--- a/lib/Transforms/Instrumentation/EdgeProfiling.cpp
+++ b/lib/Transforms/Instrumentation/EdgeProfiling.cpp
@@ -18,13 +18,13 @@
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "insert-edge-profiling"
 
+#include "llvm/Transforms/Instrumentation.h"
 #include "ProfilingUtils.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/ADT/Statistic.h"
 #include <set>
 using namespace llvm;
 
diff --git a/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index a8adaa62d7..5e064cd70d 100644
--- a/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -16,19 +16,19 @@
 
 #define DEBUG_TYPE "insert-gcov-profiling"
 
-#include "ProfilingUtils.h"
 #include "llvm/Transforms/Instrumentation.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "ProfilingUtils.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/UniqueVector.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/DebugLoc.h"
 #include "llvm/Support/InstIterator.h"
@@ -45,11 +45,11 @@ namespace {
     static char ID;
     GCOVProfiler()
         : ModulePass(ID), EmitNotes(true), EmitData(true), Use402Format(false),
-          UseExtraChecksum(false) {
+          UseExtraChecksum(false), NoRedZone(false) {
       initializeGCOVProfilerPass(*PassRegistry::getPassRegistry());
     }
     GCOVProfiler(bool EmitNotes, bool EmitData, bool use402Format = false,
-                 bool useExtraChecksum = false)
+                 bool useExtraChecksum = false, bool NoRedZone = false)
         : ModulePass(ID), EmitNotes(EmitNotes), EmitData(EmitData),
           Use402Format(use402Format), UseExtraChecksum(useExtraChecksum) {
       assert((EmitNotes || EmitData) && "GCOVProfiler asked to do nothing?");
@@ -98,6 +98,7 @@ namespace {
     bool EmitData;
     bool Use402Format;
     bool UseExtraChecksum;
+    bool NoRedZone;
 
     Module *M;
     LLVMContext *Ctx;
@@ -110,8 +111,10 @@ INITIALIZE_PASS(GCOVProfiler, "insert-gcov-profiling",
 
 ModulePass *llvm::createGCOVProfilerPass(bool EmitNotes, bool EmitData,
                                          bool Use402Format,
-                                         bool UseExtraChecksum) {
-  return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum);
+                                         bool UseExtraChecksum,
+                                         bool NoRedZone) {
+  return new GCOVProfiler(EmitNotes, EmitData, Use402Format, UseExtraChecksum,
+                          NoRedZone);
 }
 
 namespace {
@@ -638,6 +641,9 @@ void GCOVProfiler::insertCounterWriteout(
     WriteoutF = Function::Create(WriteoutFTy, GlobalValue::InternalLinkage,
                                  "__llvm_gcov_writeout", M);
   WriteoutF->setUnnamedAddr(true);
+  WriteoutF->addFnAttr(Attributes::NoInline);
+  if (NoRedZone)
+    WriteoutF->addFnAttr(Attributes::NoRedZone);
 
   BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", WriteoutF);
   IRBuilder<> Builder(BB);
@@ -683,6 +689,8 @@ void GCOVProfiler::insertCounterWriteout(
   F->setUnnamedAddr(true);
   F->setLinkage(GlobalValue::InternalLinkage);
   F->addFnAttr(Attributes::NoInline);
+  if (NoRedZone)
+    F->addFnAttr(Attributes::NoRedZone);
 
   BB = BasicBlock::Create(*Ctx, "entry", F);
   Builder.SetInsertPoint(BB);
@@ -702,6 +710,8 @@ void GCOVProfiler::insertIndirectCounterIncrement() {
   Fn->setUnnamedAddr(true);
   Fn->setLinkage(GlobalValue::InternalLinkage);
   Fn->addFnAttr(Attributes::NoInline);
+  if (NoRedZone)
+    Fn->addFnAttr(Attributes::NoRedZone);
 
   Type *Int32Ty = Type::getInt32Ty(*Ctx);
   Type *Int64Ty = Type::getInt64Ty(*Ctx);
@@ -758,6 +768,9 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
   else
     FlushF->setLinkage(GlobalValue::InternalLinkage);
   FlushF->setUnnamedAddr(true);
+  FlushF->addFnAttr(Attributes::NoInline);
+  if (NoRedZone)
+    FlushF->addFnAttr(Attributes::NoRedZone);
 
   BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", FlushF);
 
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 1e0b4a348a..8ba102559b 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -21,11 +21,13 @@ using namespace llvm;
 /// library.
 void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializeAddressSanitizerPass(Registry);
+  initializeAddressSanitizerModulePass(Registry);
   initializeBoundsCheckingPass(Registry);
   initializeEdgeProfilerPass(Registry);
   initializeGCOVProfilerPass(Registry);
   initializeOptimalEdgeProfilerPass(Registry);
   initializePathProfilerPass(Registry);
+  initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
 }
 
diff --git a/lib/Transforms/Instrumentation/MaximumSpanningTree.h b/lib/Transforms/Instrumentation/MaximumSpanningTree.h
index a4bb5a66af..50226db8c2 100644
--- a/lib/Transforms/Instrumentation/MaximumSpanningTree.h
+++ b/lib/Transforms/Instrumentation/MaximumSpanningTree.h
@@ -15,10 +15,10 @@
 #ifndef LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
 #define LLVM_ANALYSIS_MAXIMUMSPANNINGTREE_H
 
-#include "llvm/BasicBlock.h"
 #include "llvm/ADT/EquivalenceClasses.h"
-#include <vector>
+#include "llvm/BasicBlock.h"
 #include <algorithm>
+#include <vector>
 
 namespace llvm {
 
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
new file mode 100644
index 0000000000..947a2e3b12
--- /dev/null
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -0,0 +1,1579 @@
+//===-- MemorySanitizer.cpp - detector of uninitialized reads -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file is a part of MemorySanitizer, a detector of uninitialized
+/// reads.
+///
+/// Status: early prototype.
+///
+/// The algorithm of the tool is similar to Memcheck
+/// (http://goo.gl/QKbem). We associate a few shadow bits with every
+/// byte of the application memory, poison the shadow of the malloc-ed
+/// or alloca-ed memory, load the shadow bits on every memory read,
+/// propagate the shadow bits through some of the arithmetic
+/// instruction (including MOV), store the shadow bits on every memory
+/// write, report a bug on some other instructions (e.g. JMP) if the
+/// associated shadow is poisoned.
+///
+/// But there are differences too. The first and the major one:
+/// compiler instrumentation instead of binary instrumentation. This
+/// gives us much better register allocation, possible compiler
+/// optimizations and a fast start-up. But this brings the major issue
+/// as well: msan needs to see all program events, including system
+/// calls and reads/writes in system libraries, so we either need to
+/// compile *everything* with msan or use a binary translation
+/// component (e.g. DynamoRIO) to instrument pre-built libraries.
+/// Another difference from Memcheck is that we use 8 shadow bits per
+/// byte of application memory and use a direct shadow mapping. This
+/// greatly simplifies the instrumentation code and avoids races on
+/// shadow updates (Memcheck is single-threaded so races are not a
+/// concern there. Memcheck uses 2 shadow bits per byte with a slow
+/// path storage that uses 8 bits per byte).
+///
+/// The default value of shadow is 0, which means "clean" (not poisoned).
+///
+/// Every module initializer should call __msan_init to ensure that the
+/// shadow memory is ready. On error, __msan_warning is called. Since
+/// parameters and return values may be passed via registers, we have a
+/// specialized thread-local shadow for return values
+/// (__msan_retval_tls) and parameters (__msan_param_tls).
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "msan"
+
+#include "llvm/Transforms/Instrumentation.h"
+#include "BlackList.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ValueMap.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/InstVisitor.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/MDBuilder.h"
+#include "llvm/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Type.h"
+
+using namespace llvm;
+
+static const uint64_t kShadowMask32 = 1ULL << 31;
+static const uint64_t kShadowMask64 = 1ULL << 46;
+static const uint64_t kOriginOffset32 = 1ULL << 30;
+static const uint64_t kOriginOffset64 = 1ULL << 45;
+
+// This is an important flag that makes the reports much more
+// informative at the cost of greater slowdown. Not fully implemented
+// yet.
+// FIXME: this should be a top-level clang flag, e.g.
+// -fmemory-sanitizer-full.
+static cl::opt<bool> ClTrackOrigins("msan-track-origins",
+       cl::desc("Track origins (allocation sites) of poisoned memory"),
+       cl::Hidden, cl::init(false));
+static cl::opt<bool> ClKeepGoing("msan-keep-going",
+       cl::desc("keep going after reporting a UMR"),
+       cl::Hidden, cl::init(false));
+static cl::opt<bool> ClPoisonStack("msan-poison-stack",
+       cl::desc("poison uninitialized stack variables"),
+       cl::Hidden, cl::init(true));
+static cl::opt<bool> ClPoisonStackWithCall("msan-poison-stack-with-call",
+       cl::desc("poison uninitialized stack variables with a call"),
+       cl::Hidden, cl::init(false));
+static cl::opt<int> ClPoisonStackPattern("msan-poison-stack-pattern",
+       cl::desc("poison uninitialized stack variables with the given patter"),
+       cl::Hidden, cl::init(0xff));
+
+static cl::opt<bool> ClHandleICmp("msan-handle-icmp",
+       cl::desc("propagate shadow through ICmpEQ and ICmpNE"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClStoreCleanOrigin("msan-store-clean-origin",
+       cl::desc("store origin for clean (fully initialized) values"),
+       cl::Hidden, cl::init(false));
+
+// This flag controls whether we check the shadow of the address
+// operand of load or store. Such bugs are very rare, since load from
+// a garbage address typically results in SEGV, but still happen
+// (e.g. only lower bits of address are garbage, or the access happens
+// early at program startup where malloc-ed memory is more likely to
+// be zeroed. As of 2012-08-28 this flag adds 20% slowdown.
+static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
+       cl::desc("report accesses through a pointer which has poisoned shadow"),
+       cl::Hidden, cl::init(true));
+
+static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
+       cl::desc("print out instructions with default strict semantics"),
+       cl::Hidden, cl::init(false));
+
+static cl::opt<std::string>  ClBlackListFile("msan-blacklist",
+       cl::desc("File containing the list of functions where MemorySanitizer "
+                "should not report bugs"), cl::Hidden);
+
+namespace {
+
+/// \brief An instrumentation pass implementing detection of uninitialized
+/// reads.
+///
+/// MemorySanitizer: instrument the code in module to find
+/// uninitialized reads.
+class MemorySanitizer : public FunctionPass {
+public:
+  MemorySanitizer() : FunctionPass(ID), TD(0), WarningFn(0) { }
+  const char *getPassName() const { return "MemorySanitizer"; }
+  bool runOnFunction(Function &F);
+  bool doInitialization(Module &M);
+  static char ID;  // Pass identification, replacement for typeid.
+
+private:
+  void initializeCallbacks(Module &M);
+
+  DataLayout *TD;
+  LLVMContext *C;
+  Type *IntptrTy;
+  Type *OriginTy;
+  /// \brief Thread-local shadow storage for function parameters.
+  GlobalVariable *ParamTLS;
+  /// \brief Thread-local origin storage for function parameters.
+  GlobalVariable *ParamOriginTLS;
+  /// \brief Thread-local shadow storage for function return value.
+  GlobalVariable *RetvalTLS;
+  /// \brief Thread-local origin storage for function return value.
+  GlobalVariable *RetvalOriginTLS;
+  /// \brief Thread-local shadow storage for in-register va_arg function
+  /// parameters (x86_64-specific).
+  GlobalVariable *VAArgTLS;
+  /// \brief Thread-local shadow storage for va_arg overflow area
+  /// (x86_64-specific).
+  GlobalVariable *VAArgOverflowSizeTLS;
+  /// \brief Thread-local space used to pass origin value to the UMR reporting
+  /// function.
+  GlobalVariable *OriginTLS;
+
+  /// \brief The run-time callback to print a warning.
+  Value *WarningFn;
+  /// \brief Run-time helper that copies origin info for a memory range.
+  Value *MsanCopyOriginFn;
+  /// \brief Run-time helper that generates a new origin value for a stack
+  /// allocation.
+  Value *MsanSetAllocaOriginFn;
+  /// \brief Run-time helper that poisons stack on function entry.
+  Value *MsanPoisonStackFn;
+  /// \brief MSan runtime replacements for memmove, memcpy and memset.
+  Value *MemmoveFn, *MemcpyFn, *MemsetFn;
+
+  /// \brief Address mask used in application-to-shadow address calculation.
+  /// ShadowAddr is computed as ApplicationAddr & ~ShadowMask.
+  uint64_t ShadowMask;
+  /// \brief Offset of the origin shadow from the "normal" shadow.
+  /// OriginAddr is computed as (ShadowAddr + OriginOffset) & ~3ULL
+  uint64_t OriginOffset;
+  /// \brief Branch weights for error reporting.
+  MDNode *ColdCallWeights;
+  /// \brief Branch weights for origin store.
+  MDNode *OriginStoreWeights;
+  /// \brief The blacklist.
+  OwningPtr<BlackList> BL;
+  /// \brief An empty volatile inline asm that prevents callback merge.
+  InlineAsm *EmptyAsm;
+
+  friend struct MemorySanitizerVisitor;
+  friend struct VarArgAMD64Helper;
+};
+}  // namespace
+
+char MemorySanitizer::ID = 0;
+INITIALIZE_PASS(MemorySanitizer, "msan",
+                "MemorySanitizer: detects uninitialized reads.",
+                false, false)
+
+FunctionPass *llvm::createMemorySanitizerPass() {
+  return new MemorySanitizer();
+}
+
+/// \brief Create a non-const global initialized with the given string.
+///
+/// Creates a writable global for Str so that we can pass it to the
+/// run-time lib. Runtime uses first 4 bytes of the string to store the
+/// frame ID, so the string needs to be mutable.
+static GlobalVariable *createPrivateNonConstGlobalForString(Module &M,
+                                                            StringRef Str) {
+  Constant *StrConst = ConstantDataArray::getString(M.getContext(), Str);
+  return new GlobalVariable(M, StrConst->getType(), /*isConstant=*/false,
+                            GlobalValue::PrivateLinkage, StrConst, "");
+}
+
+
+/// \brief Insert extern declaration of runtime-provided functions and globals.
+void MemorySanitizer::initializeCallbacks(Module &M) {
+  // Only do this once.
+  if (WarningFn)
+    return;
+
+  IRBuilder<> IRB(*C);
+  // Create the callback.
+  // FIXME: this function should have "Cold" calling conv,
+  // which is not yet implemented.
+  StringRef WarningFnName = ClKeepGoing ? "__msan_warning"
+                                        : "__msan_warning_noreturn";
+  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), NULL);
+
+  MsanCopyOriginFn = M.getOrInsertFunction(
+    "__msan_copy_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(),
+    IRB.getInt8PtrTy(), IntptrTy, NULL);
+  MsanSetAllocaOriginFn = M.getOrInsertFunction(
+    "__msan_set_alloca_origin", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
+    IRB.getInt8PtrTy(), NULL);
+  MsanPoisonStackFn = M.getOrInsertFunction(
+    "__msan_poison_stack", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy, NULL);
+  MemmoveFn = M.getOrInsertFunction(
+    "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IntptrTy, NULL);
+  MemcpyFn = M.getOrInsertFunction(
+    "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
+    IntptrTy, NULL);
+  MemsetFn = M.getOrInsertFunction(
+    "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
+    IntptrTy, NULL);
+
+  // Create globals.
+  RetvalTLS = new GlobalVariable(
+    M, ArrayType::get(IRB.getInt64Ty(), 8), false,
+    GlobalVariable::ExternalLinkage, 0, "__msan_retval_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  RetvalOriginTLS = new GlobalVariable(
+    M, OriginTy, false, GlobalVariable::ExternalLinkage, 0,
+    "__msan_retval_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+  ParamTLS = new GlobalVariable(
+    M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+    GlobalVariable::ExternalLinkage, 0, "__msan_param_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  ParamOriginTLS = new GlobalVariable(
+    M, ArrayType::get(OriginTy, 1000), false, GlobalVariable::ExternalLinkage,
+    0, "__msan_param_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+  VAArgTLS = new GlobalVariable(
+    M, ArrayType::get(IRB.getInt64Ty(), 1000), false,
+    GlobalVariable::ExternalLinkage, 0, "__msan_va_arg_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  VAArgOverflowSizeTLS = new GlobalVariable(
+    M, IRB.getInt64Ty(), false, GlobalVariable::ExternalLinkage, 0,
+    "__msan_va_arg_overflow_size_tls", 0,
+    GlobalVariable::GeneralDynamicTLSModel);
+  OriginTLS = new GlobalVariable(
+    M, IRB.getInt32Ty(), false, GlobalVariable::ExternalLinkage, 0,
+    "__msan_origin_tls", 0, GlobalVariable::GeneralDynamicTLSModel);
+
+  // We insert an empty inline asm after __msan_report* to avoid callback merge.
+  EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
+                            StringRef(""), StringRef(""),
+                            /*hasSideEffects=*/true);
+}
+
+/// \brief Module-level initialization.
+///
+/// inserts a call to __msan_init to the module's constructor list.
+bool MemorySanitizer::doInitialization(Module &M) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(ClBlackListFile));
+  C = &(M.getContext());
+  unsigned PtrSize = TD->getPointerSizeInBits(/* AddressSpace */0);
+  switch (PtrSize) {
+    case 64:
+      ShadowMask = kShadowMask64;
+      OriginOffset = kOriginOffset64;
+      break;
+    case 32:
+      ShadowMask = kShadowMask32;
+      OriginOffset = kOriginOffset32;
+      break;
+    default:
+      report_fatal_error("unsupported pointer size");
+      break;
+  }
+
+  IRBuilder<> IRB(*C);
+  IntptrTy = IRB.getIntPtrTy(TD);
+  OriginTy = IRB.getInt32Ty();
+
+  ColdCallWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+  OriginStoreWeights = MDBuilder(*C).createBranchWeights(1, 1000);
+
+  // Insert a call to __msan_init/__msan_track_origins into the module's CTORs.
+  appendToGlobalCtors(M, cast<Function>(M.getOrInsertFunction(
+                      "__msan_init", IRB.getVoidTy(), NULL)), 0);
+
+  new GlobalVariable(M, IRB.getInt32Ty(), true, GlobalValue::WeakODRLinkage,
+                     IRB.getInt32(ClTrackOrigins), "__msan_track_origins");
+
+  return true;
+}
+
+namespace {
+
+/// \brief A helper class that handles instrumentation of VarArg
+/// functions on a particular platform.
+///
+/// Implementations are expected to insert the instrumentation
+/// necessary to propagate argument shadow through VarArg function
+/// calls. Visit* methods are called during an InstVisitor pass over
+/// the function, and should avoid creating new basic blocks. A new
+/// instance of this class is created for each instrumented function.
+struct VarArgHelper {
+  /// \brief Visit a CallSite.
+  virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
+
+  /// \brief Visit a va_start call.
+  virtual void visitVAStartInst(VAStartInst &I) = 0;
+
+  /// \brief Visit a va_copy call.
+  virtual void visitVACopyInst(VACopyInst &I) = 0;
+
+  /// \brief Finalize function instrumentation.
+  ///
+  /// This method is called after visiting all interesting (see above)
+  /// instructions in a function.
+  virtual void finalizeInstrumentation() = 0;
+
+  virtual ~VarArgHelper() {}
+};
+
+struct MemorySanitizerVisitor;
+
+VarArgHelper*
+CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+                   MemorySanitizerVisitor &Visitor);
+
+/// This class does all the work for a given function. Store and Load
+/// instructions store and load corresponding shadow and origin
+/// values. Most instructions propagate shadow from arguments to their
+/// return values. Certain instructions (most importantly, BranchInst)
+/// test their argument shadow and print reports (with a runtime call) if it's
+/// non-zero.
+struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
+  Function &F;
+  MemorySanitizer &MS;
+  SmallVector<PHINode *, 16> ShadowPHINodes, OriginPHINodes;
+  ValueMap<Value*, Value*> ShadowMap, OriginMap;
+  bool InsertChecks;
+  OwningPtr<VarArgHelper> VAHelper;
+
+  // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+  // See a comment in visitCallSite for more details.
+  static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
+  static const unsigned AMD64FpEndOffset = 176;
+
+  struct ShadowOriginAndInsertPoint {
+    Instruction *Shadow;
+    Instruction *Origin;
+    Instruction *OrigIns;
+    ShadowOriginAndInsertPoint(Instruction *S, Instruction *O, Instruction *I)
+      : Shadow(S), Origin(O), OrigIns(I) { }
+    ShadowOriginAndInsertPoint() : Shadow(0), Origin(0), OrigIns(0) { }
+  };
+  SmallVector<ShadowOriginAndInsertPoint, 16> InstrumentationList;
+  SmallVector<Instruction*, 16> StoreList;
+
+  MemorySanitizerVisitor(Function &F, MemorySanitizer &MS)
+    : F(F), MS(MS), VAHelper(CreateVarArgHelper(F, MS, *this)) {
+    InsertChecks = !MS.BL->isIn(F);
+    DEBUG(if (!InsertChecks)
+            dbgs() << "MemorySanitizer is not inserting checks into '"
+                   << F.getName() << "'\n");
+  }
+
+  void materializeStores() {
+    for (size_t i = 0, n = StoreList.size(); i < n; i++) {
+      StoreInst& I = *dyn_cast<StoreInst>(StoreList[i]);
+
+      IRBuilder<> IRB(&I);
+      Value *Val = I.getValueOperand();
+      Value *Addr = I.getPointerOperand();
+      Value *Shadow = getShadow(Val);
+      Value *ShadowPtr = getShadowPtr(Addr, Shadow->getType(), IRB);
+
+      StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, I.getAlignment());
+      DEBUG(dbgs() << "  STORE: " << *NewSI << "\n");
+      (void)NewSI;
+      // If the store is volatile, add a check.
+      if (I.isVolatile())
+        insertCheck(Val, &I);
+      if (ClCheckAccessAddress)
+        insertCheck(Addr, &I);
+
+      if (ClTrackOrigins) {
+        if (ClStoreCleanOrigin || isa<StructType>(Shadow->getType())) {
+          IRB.CreateAlignedStore(getOrigin(Val), getOriginPtr(Addr, IRB), I.getAlignment());
+        } else {
+          Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+
+          Constant *Cst = dyn_cast_or_null<Constant>(ConvertedShadow);
+          // TODO(eugenis): handle non-zero constant shadow by inserting an
+          // unconditional check (can not simply fail compilation as this could
+          // be in the dead code).
+          if (Cst)
+            continue;
+
+          Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+              getCleanShadow(ConvertedShadow), "_mscmp");
+          Instruction *CheckTerm =
+            SplitBlockAndInsertIfThen(cast<Instruction>(Cmp), false, MS.OriginStoreWeights);
+          IRBuilder<> IRBNewBlock(CheckTerm);
+          IRBNewBlock.CreateAlignedStore(getOrigin(Val),
+              getOriginPtr(Addr, IRBNewBlock), I.getAlignment());
+        }
+      }
+    }
+  }
+
+  void materializeChecks() {
+    for (size_t i = 0, n = InstrumentationList.size(); i < n; i++) {
+      Instruction *Shadow = InstrumentationList[i].Shadow;
+      Instruction *OrigIns = InstrumentationList[i].OrigIns;
+      IRBuilder<> IRB(OrigIns);
+      DEBUG(dbgs() << "  SHAD0 : " << *Shadow << "\n");
+      Value *ConvertedShadow = convertToShadowTyNoVec(Shadow, IRB);
+      DEBUG(dbgs() << "  SHAD1 : " << *ConvertedShadow << "\n");
+      Value *Cmp = IRB.CreateICmpNE(ConvertedShadow,
+                                    getCleanShadow(ConvertedShadow), "_mscmp");
+      Instruction *CheckTerm =
+        SplitBlockAndInsertIfThen(cast<Instruction>(Cmp),
+                                  /* Unreachable */ !ClKeepGoing,
+                                  MS.ColdCallWeights);
+
+      IRB.SetInsertPoint(CheckTerm);
+      if (ClTrackOrigins) {
+        Instruction *Origin = InstrumentationList[i].Origin;
+        IRB.CreateStore(Origin ? (Value*)Origin : (Value*)IRB.getInt32(0),
+                        MS.OriginTLS);
+      }
+      CallInst *Call = IRB.CreateCall(MS.WarningFn);
+      Call->setDebugLoc(OrigIns->getDebugLoc());
+      IRB.CreateCall(MS.EmptyAsm);
+      DEBUG(dbgs() << "  CHECK: " << *Cmp << "\n");
+    }
+    DEBUG(dbgs() << "DONE:\n" << F);
+  }
+
+  /// \brief Add MemorySanitizer instrumentation to a function.
+  bool runOnFunction() {
+    MS.initializeCallbacks(*F.getParent());
+    if (!MS.TD) return false;
+    // Iterate all BBs in depth-first order and create shadow instructions
+    // for all instructions (where applicable).
+    // For PHI nodes we create dummy shadow PHIs which will be finalized later.
+    for (df_iterator<BasicBlock*> DI = df_begin(&F.getEntryBlock()),
+         DE = df_end(&F.getEntryBlock()); DI != DE; ++DI) {
+      BasicBlock *BB = *DI;
+      visit(*BB);
+    }
+
+    // Finalize PHI nodes.
+    for (size_t i = 0, n = ShadowPHINodes.size(); i < n; i++) {
+      PHINode *PN = ShadowPHINodes[i];
+      PHINode *PNS = cast<PHINode>(getShadow(PN));
+      PHINode *PNO = ClTrackOrigins ? cast<PHINode>(getOrigin(PN)) : 0;
+      size_t NumValues = PN->getNumIncomingValues();
+      for (size_t v = 0; v < NumValues; v++) {
+        PNS->addIncoming(getShadow(PN, v), PN->getIncomingBlock(v));
+        if (PNO)
+          PNO->addIncoming(getOrigin(PN, v), PN->getIncomingBlock(v));
+      }
+    }
+
+    VAHelper->finalizeInstrumentation();
+
+    // Delayed instrumentation of StoreInst.
+    // This may add new checks to be inserted later.
+    materializeStores();
+
+    // Insert shadow value checks.
+    materializeChecks();
+
+    return true;
+  }
+
+  /// \brief Compute the shadow type that corresponds to a given Value.
+  Type *getShadowTy(Value *V) {
+    return getShadowTy(V->getType());
+  }
+
+  /// \brief Compute the shadow type that corresponds to a given Type.
+  Type *getShadowTy(Type *OrigTy) {
+    if (!OrigTy->isSized()) {
+      return 0;
+    }
+    // For integer type, shadow is the same as the original type.
+    // This may return weird-sized types like i1.
+    if (IntegerType *IT = dyn_cast<IntegerType>(OrigTy))
+      return IT;
+    if (VectorType *VT = dyn_cast<VectorType>(OrigTy))
+      return VectorType::getInteger(VT);
+    if (StructType *ST = dyn_cast<StructType>(OrigTy)) {
+      SmallVector<Type*, 4> Elements;
+      for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+        Elements.push_back(getShadowTy(ST->getElementType(i)));
+      StructType *Res = StructType::get(*MS.C, Elements, ST->isPacked());
+      DEBUG(dbgs() << "getShadowTy: " << *ST << " ===> " << *Res << "\n");
+      return Res;
+    }
+    uint32_t TypeSize = MS.TD->getTypeStoreSizeInBits(OrigTy);
+    return IntegerType::get(*MS.C, TypeSize);
+  }
+
+  /// \brief Flatten a vector type.
+  Type *getShadowTyNoVec(Type *ty) {
+    if (VectorType *vt = dyn_cast<VectorType>(ty))
+      return IntegerType::get(*MS.C, vt->getBitWidth());
+    return ty;
+  }
+
+  /// \brief Convert a shadow value to it's flattened variant.
+  Value *convertToShadowTyNoVec(Value *V, IRBuilder<> &IRB) {
+    Type *Ty = V->getType();
+    Type *NoVecTy = getShadowTyNoVec(Ty);
+    if (Ty == NoVecTy) return V;
+    return IRB.CreateBitCast(V, NoVecTy);
+  }
+
+  /// \brief Compute the shadow address that corresponds to a given application
+  /// address.
+  ///
+  /// Shadow = Addr & ~ShadowMask.
+  Value *getShadowPtr(Value *Addr, Type *ShadowTy,
+                      IRBuilder<> &IRB) {
+    Value *ShadowLong =
+      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+                    ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+    return IRB.CreateIntToPtr(ShadowLong, PointerType::get(ShadowTy, 0));
+  }
+
+  /// \brief Compute the origin address that corresponds to a given application
+  /// address.
+  ///
+  /// OriginAddr = (ShadowAddr + OriginOffset) & ~3ULL
+  Value *getOriginPtr(Value *Addr, IRBuilder<> &IRB) {
+    Value *ShadowLong =
+      IRB.CreateAnd(IRB.CreatePointerCast(Addr, MS.IntptrTy),
+                    ConstantInt::get(MS.IntptrTy, ~MS.ShadowMask));
+    Value *Add =
+      IRB.CreateAdd(ShadowLong,
+                    ConstantInt::get(MS.IntptrTy, MS.OriginOffset));
+    Value *SecondAnd =
+      IRB.CreateAnd(Add, ConstantInt::get(MS.IntptrTy, ~3ULL));
+    return IRB.CreateIntToPtr(SecondAnd, PointerType::get(IRB.getInt32Ty(), 0));
+  }
+
+  /// \brief Compute the shadow address for a given function argument.
+  ///
+  /// Shadow = ParamTLS+ArgOffset.
+  Value *getShadowPtrForArgument(Value *A, IRBuilder<> &IRB,
+                                 int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.ParamTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+                              "_msarg");
+  }
+
+  /// \brief Compute the origin address for a given function argument.
+  Value *getOriginPtrForArgument(Value *A, IRBuilder<> &IRB,
+                                 int ArgOffset) {
+    if (!ClTrackOrigins) return 0;
+    Value *Base = IRB.CreatePointerCast(MS.ParamOriginTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+                              "_msarg_o");
+  }
+
+  /// \brief Compute the shadow address for a retval.
+  Value *getShadowPtrForRetval(Value *A, IRBuilder<> &IRB) {
+    Value *Base = IRB.CreatePointerCast(MS.RetvalTLS, MS.IntptrTy);
+    return IRB.CreateIntToPtr(Base, PointerType::get(getShadowTy(A), 0),
+                              "_msret");
+  }
+
+  /// \brief Compute the origin address for a retval.
+  Value *getOriginPtrForRetval(IRBuilder<> &IRB) {
+    // We keep a single origin for the entire retval. Might be too optimistic.
+    return MS.RetvalOriginTLS;
+  }
+
+  /// \brief Set SV to be the shadow value for V.
+  void setShadow(Value *V, Value *SV) {
+    assert(!ShadowMap.count(V) && "Values may only have one shadow");
+    ShadowMap[V] = SV;
+  }
+
+  /// \brief Set Origin to be the origin value for V.
+  void setOrigin(Value *V, Value *Origin) {
+    if (!ClTrackOrigins) return;
+    assert(!OriginMap.count(V) && "Values may only have one origin");
+    DEBUG(dbgs() << "ORIGIN: " << *V << "  ==> " << *Origin << "\n");
+    OriginMap[V] = Origin;
+  }
+
+  /// \brief Create a clean shadow value for a given value.
+  ///
+  /// Clean shadow (all zeroes) means all bits of the value are defined
+  /// (initialized).
+  Value *getCleanShadow(Value *V) {
+    Type *ShadowTy = getShadowTy(V);
+    if (!ShadowTy)
+      return 0;
+    return Constant::getNullValue(ShadowTy);
+  }
+
+  /// \brief Create a dirty shadow of a given shadow type.
+  Constant *getPoisonedShadow(Type *ShadowTy) {
+    assert(ShadowTy);
+    if (isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy))
+      return Constant::getAllOnesValue(ShadowTy);
+    StructType *ST = cast<StructType>(ShadowTy);
+    SmallVector<Constant *, 4> Vals;
+    for (unsigned i = 0, n = ST->getNumElements(); i < n; i++)
+      Vals.push_back(getPoisonedShadow(ST->getElementType(i)));
+    return ConstantStruct::get(ST, Vals);
+  }
+
+  /// \brief Create a clean (zero) origin.
+  Value *getCleanOrigin() {
+    return Constant::getNullValue(MS.OriginTy);
+  }
+
+  /// \brief Get the shadow value for a given Value.
+  ///
+  /// This function either returns the value set earlier with setShadow,
+  /// or extracts if from ParamTLS (for function arguments).
+  Value *getShadow(Value *V) {
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      // For instructions the shadow is already stored in the map.
+      Value *Shadow = ShadowMap[V];
+      if (!Shadow) {
+        DEBUG(dbgs() << "No shadow: " << *V << "\n" << *(I->getParent()));
+        (void)I;
+        assert(Shadow && "No shadow for a value");
+      }
+      return Shadow;
+    }
+    if (UndefValue *U = dyn_cast<UndefValue>(V)) {
+      Value *AllOnes = getPoisonedShadow(getShadowTy(V));
+      DEBUG(dbgs() << "Undef: " << *U << " ==> " << *AllOnes << "\n");
+      (void)U;
+      return AllOnes;
+    }
+    if (Argument *A = dyn_cast<Argument>(V)) {
+      // For arguments we compute the shadow on demand and store it in the map.
+      Value **ShadowPtr = &ShadowMap[V];
+      if (*ShadowPtr)
+        return *ShadowPtr;
+      Function *F = A->getParent();
+      IRBuilder<> EntryIRB(F->getEntryBlock().getFirstNonPHI());
+      unsigned ArgOffset = 0;
+      for (Function::arg_iterator AI = F->arg_begin(), AE = F->arg_end();
+           AI != AE; ++AI) {
+        if (!AI->getType()->isSized()) {
+          DEBUG(dbgs() << "Arg is not sized\n");
+          continue;
+        }
+        unsigned Size = AI->hasByValAttr()
+          ? MS.TD->getTypeAllocSize(AI->getType()->getPointerElementType())
+          : MS.TD->getTypeAllocSize(AI->getType());
+        if (A == AI) {
+          Value *Base = getShadowPtrForArgument(AI, EntryIRB, ArgOffset);
+          if (AI->hasByValAttr()) {
+            // ByVal pointer itself has clean shadow. We copy the actual
+            // argument shadow to the underlying memory.
+            Value *Cpy = EntryIRB.CreateMemCpy(
+              getShadowPtr(V, EntryIRB.getInt8Ty(), EntryIRB),
+              Base, Size, AI->getParamAlignment());
+            DEBUG(dbgs() << "  ByValCpy: " << *Cpy << "\n");
+            (void)Cpy;
+            *ShadowPtr = getCleanShadow(V);
+          } else {
+            *ShadowPtr = EntryIRB.CreateLoad(Base);
+          }
+          DEBUG(dbgs() << "  ARG:    "  << *AI << " ==> " <<
+                **ShadowPtr << "\n");
+          if (ClTrackOrigins) {
+            Value* OriginPtr = getOriginPtrForArgument(AI, EntryIRB, ArgOffset);
+            setOrigin(A, EntryIRB.CreateLoad(OriginPtr));
+          }
+        }
+        ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+      }
+      assert(*ShadowPtr && "Could not find shadow for an argument");
+      return *ShadowPtr;
+    }
+    // For everything else the shadow is zero.
+    return getCleanShadow(V);
+  }
+
+  /// \brief Get the shadow for i-th argument of the instruction I.
+  Value *getShadow(Instruction *I, int i) {
+    return getShadow(I->getOperand(i));
+  }
+
+  /// \brief Get the origin for a value.
+  Value *getOrigin(Value *V) {
+    if (!ClTrackOrigins) return 0;
+    if (isa<Instruction>(V) || isa<Argument>(V)) {
+      Value *Origin = OriginMap[V];
+      if (!Origin) {
+        DEBUG(dbgs() << "NO ORIGIN: " << *V << "\n");
+        Origin = getCleanOrigin();
+      }
+      return Origin;
+    }
+    return getCleanOrigin();
+  }
+
+  /// \brief Get the origin for i-th argument of the instruction I.
+  Value *getOrigin(Instruction *I, int i) {
+    return getOrigin(I->getOperand(i));
+  }
+
+  /// \brief Remember the place where a shadow check should be inserted.
+  ///
+  /// This location will be later instrumented with a check that will print a
+  /// UMR warning in runtime if the value is not fully defined.
+  void insertCheck(Value *Val, Instruction *OrigIns) {
+    assert(Val);
+    if (!InsertChecks) return;
+    Instruction *Shadow = dyn_cast_or_null<Instruction>(getShadow(Val));
+    if (!Shadow) return;
+#ifndef NDEBUG
+    Type *ShadowTy = Shadow->getType();
+    assert((isa<IntegerType>(ShadowTy) || isa<VectorType>(ShadowTy)) &&
+           "Can only insert checks for integer and vector shadow types");
+#endif
+    Instruction *Origin = dyn_cast_or_null<Instruction>(getOrigin(Val));
+    InstrumentationList.push_back(
+      ShadowOriginAndInsertPoint(Shadow, Origin, OrigIns));
+  }
+
+  //------------------- Visitors.
+
+  /// \brief Instrument LoadInst
+  ///
+  /// Loads the corresponding shadow and (optionally) origin.
+  /// Optionally, checks that the load address is fully defined.
+  void visitLoadInst(LoadInst &I) {
+    assert(I.getType()->isSized() && "Load type must have size");
+    IRBuilder<> IRB(&I);
+    Type *ShadowTy = getShadowTy(&I);
+    Value *Addr = I.getPointerOperand();
+    Value *ShadowPtr = getShadowPtr(Addr, ShadowTy, IRB);
+    setShadow(&I, IRB.CreateAlignedLoad(ShadowPtr, I.getAlignment(), "_msld"));
+
+    if (ClCheckAccessAddress)
+      insertCheck(I.getPointerOperand(), &I);
+
+    if (ClTrackOrigins)
+      setOrigin(&I, IRB.CreateAlignedLoad(getOriginPtr(Addr, IRB), I.getAlignment()));
+  }
+
+  /// \brief Instrument StoreInst
+  ///
+  /// Stores the corresponding shadow and (optionally) origin.
+  /// Optionally, checks that the store address is fully defined.
+  /// Volatile stores check that the value being stored is fully defined.
+  void visitStoreInst(StoreInst &I) {
+    StoreList.push_back(&I);
+  }
+
+  // Vector manipulation.
+  void visitExtractElementInst(ExtractElementInst &I) {
+    insertCheck(I.getOperand(1), &I);
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateExtractElement(getShadow(&I, 0), I.getOperand(1),
+              "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitInsertElementInst(InsertElementInst &I) {
+    insertCheck(I.getOperand(2), &I);
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateInsertElement(getShadow(&I, 0), getShadow(&I, 1),
+              I.getOperand(2), "_msprop"));
+    setOriginForNaryOp(I);
+  }
+
+  void visitShuffleVectorInst(ShuffleVectorInst &I) {
+    insertCheck(I.getOperand(2), &I);
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
+              I.getOperand(2), "_msprop"));
+    setOriginForNaryOp(I);
+  }
+
+  // Casts.
+  void visitSExtInst(SExtInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateSExt(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitZExtInst(ZExtInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateZExt(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitTruncInst(TruncInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateTrunc(getShadow(&I, 0), I.getType(), "_msprop"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitBitCastInst(BitCastInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateBitCast(getShadow(&I, 0), getShadowTy(&I)));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitPtrToIntInst(PtrToIntInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+             "_msprop_ptrtoint"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitIntToPtrInst(IntToPtrInst &I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I, IRB.CreateIntCast(getShadow(&I, 0), getShadowTy(&I), false,
+             "_msprop_inttoptr"));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitFPToSIInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPToUIInst(CastInst& I) { handleShadowOr(I); }
+  void visitSIToFPInst(CastInst& I) { handleShadowOr(I); }
+  void visitUIToFPInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPExtInst(CastInst& I) { handleShadowOr(I); }
+  void visitFPTruncInst(CastInst& I) { handleShadowOr(I); }
+
+  /// \brief Propagate shadow for bitwise AND.
+  ///
+  /// This code is exact, i.e. if, for example, a bit in the left argument
+  /// is defined and 0, then neither the value not definedness of the
+  /// corresponding bit in B don't affect the resulting shadow.
+  void visitAnd(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    //  "And" of 0 and a poisoned value results in unpoisoned value.
+    //  1&1 => 1;     0&1 => 0;     p&1 => p;
+    //  1&0 => 0;     0&0 => 0;     p&0 => 0;
+    //  1&p => p;     0&p => 0;     p&p => p;
+    //  S = (S1 & S2) | (V1 & S2) | (S1 & V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = I.getOperand(0);
+    Value *V2 = I.getOperand(1);
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+    setOriginForNaryOp(I);
+  }
+
+  void visitOr(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    //  "Or" of 1 and a poisoned value results in unpoisoned value.
+    //  1|1 => 1;     0|1 => 1;     p|1 => 1;
+    //  1|0 => 1;     0|0 => 0;     p|0 => p;
+    //  1|p => 1;     0|p => p;     p|p => p;
+    //  S = (S1 & S2) | (~V1 & S2) | (S1 & ~V2)
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *V1 = IRB.CreateNot(I.getOperand(0));
+    Value *V2 = IRB.CreateNot(I.getOperand(1));
+    if (V1->getType() != S1->getType()) {
+      V1 = IRB.CreateIntCast(V1, S1->getType(), false);
+      V2 = IRB.CreateIntCast(V2, S2->getType(), false);
+    }
+    Value *S1S2 = IRB.CreateAnd(S1, S2);
+    Value *V1S2 = IRB.CreateAnd(V1, S2);
+    Value *S1V2 = IRB.CreateAnd(S1, V2);
+    setShadow(&I, IRB.CreateOr(S1S2, IRB.CreateOr(V1S2, S1V2)));
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Propagate origin for an instruction.
+  ///
+  /// This is a general case of origin propagation. For an Nary operation,
+  /// is set to the origin of an argument that is not entirely initialized.
+  /// If there is more than one such arguments, the rightmost of them is picked.
+  /// It does not matter which one is picked if all arguments are initialized.
+  void setOriginForNaryOp(Instruction &I) {
+    if (!ClTrackOrigins) return;
+    IRBuilder<> IRB(&I);
+    Value *Origin = getOrigin(&I, 0);
+    for (unsigned Op = 1, n = I.getNumOperands(); Op < n; ++Op) {
+      Value *S = convertToShadowTyNoVec(getShadow(&I, Op), IRB);
+      Origin = IRB.CreateSelect(IRB.CreateICmpNE(S, getCleanShadow(S)),
+                                getOrigin(&I, Op), Origin);
+    }
+    setOrigin(&I, Origin);
+  }
+
+  /// \brief Propagate shadow for a binary operation.
+  ///
+  /// Shadow = Shadow0 | Shadow1, all 3 must have the same type.
+  /// Bitwise OR is selected as an operation that will never lose even a bit of
+  /// poison.
+  void handleShadowOrBinary(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    Value *Shadow0 = getShadow(&I, 0);
+    Value *Shadow1 = getShadow(&I, 1);
+    setShadow(&I, IRB.CreateOr(Shadow0, Shadow1, "_msprop"));
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Propagate shadow for arbitrary operation.
+  ///
+  /// This is a general case of shadow propagation, used in all cases where we
+  /// don't know and/or care about what the operation actually does.
+  /// It converts all input shadow values to a common type (extending or
+  /// truncating as necessary), and bitwise OR's them.
+  ///
+  /// This is much cheaper than inserting checks (i.e. requiring inputs to be
+  /// fully initialized), and less prone to false positives.
+  // FIXME: is the casting actually correct?
+  // FIXME: merge this with handleShadowOrBinary.
+  void handleShadowOr(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    Value *Shadow = getShadow(&I, 0);
+    for (unsigned Op = 1, n = I.getNumOperands(); Op < n; ++Op)
+      Shadow = IRB.CreateOr(
+        Shadow, IRB.CreateIntCast(getShadow(&I, Op), Shadow->getType(), false),
+        "_msprop");
+    Shadow = IRB.CreateIntCast(Shadow, getShadowTy(&I), false);
+    setShadow(&I, Shadow);
+    setOriginForNaryOp(I);
+  }
+
+  void visitFAdd(BinaryOperator &I) { handleShadowOrBinary(I); }
+  void visitFSub(BinaryOperator &I) { handleShadowOrBinary(I); }
+  void visitFMul(BinaryOperator &I) { handleShadowOrBinary(I); }
+  void visitAdd(BinaryOperator &I) { handleShadowOrBinary(I); }
+  void visitSub(BinaryOperator &I) { handleShadowOrBinary(I); }
+  void visitXor(BinaryOperator &I) { handleShadowOrBinary(I); }
+  void visitMul(BinaryOperator &I) { handleShadowOrBinary(I); }
+
+  void handleDiv(Instruction &I) {
+    IRBuilder<> IRB(&I);
+    // Strict on the second argument.
+    insertCheck(I.getOperand(1), &I);
+    setShadow(&I, getShadow(&I, 0));
+    setOrigin(&I, getOrigin(&I, 0));
+  }
+
+  void visitUDiv(BinaryOperator &I) { handleDiv(I); }
+  void visitSDiv(BinaryOperator &I) { handleDiv(I); }
+  void visitFDiv(BinaryOperator &I) { handleDiv(I); }
+  void visitURem(BinaryOperator &I) { handleDiv(I); }
+  void visitSRem(BinaryOperator &I) { handleDiv(I); }
+  void visitFRem(BinaryOperator &I) { handleDiv(I); }
+
+  /// \brief Instrument == and != comparisons.
+  ///
+  /// Sometimes the comparison result is known even if some of the bits of the
+  /// arguments are not.
+  void handleEqualityComparison(ICmpInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *A = I.getOperand(0);
+    Value *B = I.getOperand(1);
+    Value *Sa = getShadow(A);
+    Value *Sb = getShadow(B);
+    if (A->getType()->isPointerTy())
+      A = IRB.CreatePointerCast(A, MS.IntptrTy);
+    if (B->getType()->isPointerTy())
+      B = IRB.CreatePointerCast(B, MS.IntptrTy);
+    // A == B  <==>  (C = A^B) == 0
+    // A != B  <==>  (C = A^B) != 0
+    // Sc = Sa | Sb
+    Value *C = IRB.CreateXor(A, B);
+    Value *Sc = IRB.CreateOr(Sa, Sb);
+    // Now dealing with i = (C == 0) comparison (or C != 0, does not matter now)
+    // Result is defined if one of the following is true
+    // * there is a defined 1 bit in C
+    // * C is fully defined
+    // Si = !(C & ~Sc) && Sc
+    Value *Zero = Constant::getNullValue(Sc->getType());
+    Value *MinusOne = Constant::getAllOnesValue(Sc->getType());
+    Value *Si =
+      IRB.CreateAnd(IRB.CreateICmpNE(Sc, Zero),
+                    IRB.CreateICmpEQ(
+                      IRB.CreateAnd(IRB.CreateXor(Sc, MinusOne), C), Zero));
+    Si->setName("_msprop_icmp");
+    setShadow(&I, Si);
+    setOriginForNaryOp(I);
+  }
+
+  /// \brief Instrument signed relational comparisons.
+  ///
+  /// Handle (x<0) and (x>=0) comparisons (essentially, sign bit tests) by
+  /// propagating the highest bit of the shadow. Everything else is delegated
+  /// to handleShadowOr().
+  void handleSignedRelationalComparison(ICmpInst &I) {
+    Constant *constOp0 = dyn_cast<Constant>(I.getOperand(0));
+    Constant *constOp1 = dyn_cast<Constant>(I.getOperand(1));
+    Value* op = NULL;
+    CmpInst::Predicate pre = I.getPredicate();
+    if (constOp0 && constOp0->isNullValue() &&
+        (pre == CmpInst::ICMP_SGT || pre == CmpInst::ICMP_SLE)) {
+      op = I.getOperand(1);
+    } else if (constOp1 && constOp1->isNullValue() &&
+               (pre == CmpInst::ICMP_SLT || pre == CmpInst::ICMP_SGE)) {
+      op = I.getOperand(0);
+    }
+    if (op) {
+      IRBuilder<> IRB(&I);
+      Value* Shadow =
+        IRB.CreateICmpSLT(getShadow(op), getCleanShadow(op), "_msprop_icmpslt");
+      setShadow(&I, Shadow);
+      setOrigin(&I, getOrigin(op));
+    } else {
+      handleShadowOr(I);
+    }
+  }
+
+  void visitICmpInst(ICmpInst &I) {
+    if (ClHandleICmp && I.isEquality())
+      handleEqualityComparison(I);
+    else if (ClHandleICmp && I.isSigned() && I.isRelational())
+      handleSignedRelationalComparison(I);
+    else
+      handleShadowOr(I);
+  }
+
+  void visitFCmpInst(FCmpInst &I) {
+    handleShadowOr(I);
+  }
+
+  void handleShift(BinaryOperator &I) {
+    IRBuilder<> IRB(&I);
+    // If any of the S2 bits are poisoned, the whole thing is poisoned.
+    // Otherwise perform the same shift on S1.
+    Value *S1 = getShadow(&I, 0);
+    Value *S2 = getShadow(&I, 1);
+    Value *S2Conv = IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)),
+                                   S2->getType());
+    Value *V2 = I.getOperand(1);
+    Value *Shift = IRB.CreateBinOp(I.getOpcode(), S1, V2);
+    setShadow(&I, IRB.CreateOr(Shift, S2Conv));
+    setOriginForNaryOp(I);
+  }
+
+  void visitShl(BinaryOperator &I) { handleShift(I); }
+  void visitAShr(BinaryOperator &I) { handleShift(I); }
+  void visitLShr(BinaryOperator &I) { handleShift(I); }
+
+  /// \brief Instrument llvm.memmove
+  ///
+  /// At this point we don't know if llvm.memmove will be inlined or not.
+  /// If we don't instrument it and it gets inlined,
+  /// our interceptor will not kick in and we will lose the memmove.
+  /// If we instrument the call here, but it does not get inlined,
+  /// we will memove the shadow twice: which is bad in case
+  /// of overlapping regions. So, we simply lower the intrinsic to a call.
+  ///
+  /// Similar situation exists for memcpy and memset.
+  void visitMemMoveInst(MemMoveInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall3(
+      MS.MemmoveFn,
+      IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+    I.eraseFromParent();
+  }
+
+  // Similar to memmove: avoid copying shadow twice.
+  // This is somewhat unfortunate as it may slowdown small constant memcpys.
+  // FIXME: consider doing manual inline for small constant sizes and proper
+  // alignment.
+  void visitMemCpyInst(MemCpyInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall3(
+      MS.MemcpyFn,
+      IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreatePointerCast(I.getArgOperand(1), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+    I.eraseFromParent();
+  }
+
+  // Same as memcpy.
+  void visitMemSetInst(MemSetInst &I) {
+    IRBuilder<> IRB(&I);
+    IRB.CreateCall3(
+      MS.MemsetFn,
+      IRB.CreatePointerCast(I.getArgOperand(0), IRB.getInt8PtrTy()),
+      IRB.CreateIntCast(I.getArgOperand(1), IRB.getInt32Ty(), false),
+      IRB.CreateIntCast(I.getArgOperand(2), MS.IntptrTy, false));
+    I.eraseFromParent();
+  }
+
+  void visitVAStartInst(VAStartInst &I) {
+    VAHelper->visitVAStartInst(I);
+  }
+
+  void visitVACopyInst(VACopyInst &I) {
+    VAHelper->visitVACopyInst(I);
+  }
+
+  void handleBswap(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Op = I.getArgOperand(0);
+    Type *OpType = Op->getType();
+    Function *BswapFunc = Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::bswap, ArrayRef<Type*>(&OpType, 1));
+    setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+    setOrigin(&I, getOrigin(Op));
+  }
+
+  void visitIntrinsicInst(IntrinsicInst &I) {
+    switch (I.getIntrinsicID()) {
+    case llvm::Intrinsic::bswap:
+      handleBswap(I); break;
+    default:
+      visitInstruction(I); break;
+    }
+  }
+
+  void visitCallSite(CallSite CS) {
+    Instruction &I = *CS.getInstruction();
+    assert((CS.isCall() || CS.isInvoke()) && "Unknown type of CallSite");
+    if (CS.isCall()) {
+      CallInst *Call = cast<CallInst>(&I);
+
+      // For inline asm, do the usual thing: check argument shadow and mark all
+      // outputs as clean. Note that any side effects of the inline asm that are
+      // not immediately visible in its constraints are not handled.
+      if (Call->isInlineAsm()) {
+        visitInstruction(I);
+        return;
+      }
+
+      // Allow only tail calls with the same types, otherwise
+      // we may have a false positive: shadow for a non-void RetVal
+      // will get propagated to a void RetVal.
+      if (Call->isTailCall() && Call->getType() != Call->getParent()->getType())
+        Call->setTailCall(false);
+
+      assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");
+
+      // We are going to insert code that relies on the fact that the callee
+      // will become a non-readonly function after it is instrumented by us. To
+      // prevent this code from being optimized out, mark that function
+      // non-readonly in advance.
+      if (Function *Func = Call->getCalledFunction()) {
+        // Clear out readonly/readnone attributes.
+        AttrBuilder B;
+        B.addAttribute(Attributes::ReadOnly)
+          .addAttribute(Attributes::ReadNone);
+        Func->removeAttribute(AttributeSet::FunctionIndex,
+                              Attributes::get(Func->getContext(), B));
+      }
+    }
+    IRBuilder<> IRB(&I);
+    unsigned ArgOffset = 0;
+    DEBUG(dbgs() << "  CallSite: " << I << "\n");
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      unsigned i = ArgIt - CS.arg_begin();
+      if (!A->getType()->isSized()) {
+        DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+        continue;
+      }
+      unsigned Size = 0;
+      Value *Store = 0;
+      // Compute the Shadow for arg even if it is ByVal, because
+      // in that case getShadow() will copy the actual arg shadow to
+      // __msan_param_tls.
+      Value *ArgShadow = getShadow(A);
+      Value *ArgShadowBase = getShadowPtrForArgument(A, IRB, ArgOffset);
+      DEBUG(dbgs() << "  Arg#" << i << ": " << *A <<
+            " Shadow: " << *ArgShadow << "\n");
+      if (CS.paramHasAttr(i + 1, Attributes::ByVal)) {
+        assert(A->getType()->isPointerTy() &&
+               "ByVal argument is not a pointer!");
+        Size = MS.TD->getTypeAllocSize(A->getType()->getPointerElementType());
+        unsigned Alignment = CS.getParamAlignment(i + 1);
+        Store = IRB.CreateMemCpy(ArgShadowBase,
+                                 getShadowPtr(A, Type::getInt8Ty(*MS.C), IRB),
+                                 Size, Alignment);
+      } else {
+        Size = MS.TD->getTypeAllocSize(A->getType());
+        Store = IRB.CreateStore(ArgShadow, ArgShadowBase);
+      }
+      if (ClTrackOrigins)
+        IRB.CreateStore(getOrigin(A),
+                        getOriginPtrForArgument(A, IRB, ArgOffset));
+      assert(Size != 0 && Store != 0);
+      DEBUG(dbgs() << "  Param:" << *Store << "\n");
+      ArgOffset += DataLayout::RoundUpAlignment(Size, 8);
+    }
+    DEBUG(dbgs() << "  done with call args\n");
+
+    FunctionType *FT =
+      cast<FunctionType>(CS.getCalledValue()->getType()-> getContainedType(0));
+    if (FT->isVarArg()) {
+      VAHelper->visitCallSite(CS, IRB);
+    }
+
+    // Now, get the shadow for the RetVal.
+    if (!I.getType()->isSized()) return;
+    IRBuilder<> IRBBefore(&I);
+    // Untill we have full dynamic coverage, make sure the retval shadow is 0.
+    Value *Base = getShadowPtrForRetval(&I, IRBBefore);
+    IRBBefore.CreateStore(getCleanShadow(&I), Base);
+    Instruction *NextInsn = 0;
+    if (CS.isCall()) {
+      NextInsn = I.getNextNode();
+    } else {
+      BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest();
+      if (!NormalDest->getSinglePredecessor()) {
+        // FIXME: this case is tricky, so we are just conservative here.
+        // Perhaps we need to split the edge between this BB and NormalDest,
+        // but a naive attempt to use SplitEdge leads to a crash.
+        setShadow(&I, getCleanShadow(&I));
+        setOrigin(&I, getCleanOrigin());
+        return;
+      }
+      NextInsn = NormalDest->getFirstInsertionPt();
+      assert(NextInsn &&
+             "Could not find insertion point for retval shadow load");
+    }
+    IRBuilder<> IRBAfter(NextInsn);
+    setShadow(&I, IRBAfter.CreateLoad(getShadowPtrForRetval(&I, IRBAfter),
+                                      "_msret"));
+    if (ClTrackOrigins)
+      setOrigin(&I, IRBAfter.CreateLoad(getOriginPtrForRetval(IRBAfter)));
+  }
+
+  void visitReturnInst(ReturnInst &I) {
+    IRBuilder<> IRB(&I);
+    if (Value *RetVal = I.getReturnValue()) {
+      // Set the shadow for the RetVal.
+      Value *Shadow = getShadow(RetVal);
+      Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
+      DEBUG(dbgs() << "Return: " << *Shadow << "\n" << *ShadowPtr << "\n");
+      IRB.CreateStore(Shadow, ShadowPtr);
+      if (ClTrackOrigins)
+        IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
+    }
+  }
+
+  void visitPHINode(PHINode &I) {
+    IRBuilder<> IRB(&I);
+    ShadowPHINodes.push_back(&I);
+    setShadow(&I, IRB.CreatePHI(getShadowTy(&I), I.getNumIncomingValues(),
+                                "_msphi_s"));
+    if (ClTrackOrigins)
+      setOrigin(&I, IRB.CreatePHI(MS.OriginTy, I.getNumIncomingValues(),
+                                  "_msphi_o"));
+  }
+
+  void visitAllocaInst(AllocaInst &I) {
+    setShadow(&I, getCleanShadow(&I));
+    if (!ClPoisonStack) return;
+    IRBuilder<> IRB(I.getNextNode());
+    uint64_t Size = MS.TD->getTypeAllocSize(I.getAllocatedType());
+    if (ClPoisonStackWithCall) {
+      IRB.CreateCall2(MS.MsanPoisonStackFn,
+                      IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+                      ConstantInt::get(MS.IntptrTy, Size));
+    } else {
+      Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
+      IRB.CreateMemSet(ShadowBase, IRB.getInt8(ClPoisonStackPattern),
+                       Size, I.getAlignment());
+    }
+
+    if (ClTrackOrigins) {
+      setOrigin(&I, getCleanOrigin());
+      SmallString<2048> StackDescriptionStorage;
+      raw_svector_ostream StackDescription(StackDescriptionStorage);
+      // We create a string with a description of the stack allocation and
+      // pass it into __msan_set_alloca_origin.
+      // It will be printed by the run-time if stack-originated UMR is found.
+      // The first 4 bytes of the string are set to '----' and will be replaced
+      // by __msan_va_arg_overflow_size_tls at the first call.
+      StackDescription << "----" << I.getName() << "@" << F.getName();
+      Value *Descr =
+          createPrivateNonConstGlobalForString(*F.getParent(),
+                                               StackDescription.str());
+      IRB.CreateCall3(MS.MsanSetAllocaOriginFn,
+                      IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
+                      ConstantInt::get(MS.IntptrTy, Size),
+                      IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()));
+    }
+  }
+
+  void visitSelectInst(SelectInst& I) {
+    IRBuilder<> IRB(&I);
+    setShadow(&I,  IRB.CreateSelect(I.getCondition(),
+              getShadow(I.getTrueValue()), getShadow(I.getFalseValue()),
+              "_msprop"));
+    if (ClTrackOrigins)
+      setOrigin(&I, IRB.CreateSelect(I.getCondition(),
+                getOrigin(I.getTrueValue()), getOrigin(I.getFalseValue())));
+  }
+
+  void visitLandingPadInst(LandingPadInst &I) {
+    // Do nothing.
+    // See http://code.google.com/p/memory-sanitizer/issues/detail?id=1
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitGetElementPtrInst(GetElementPtrInst &I) {
+    handleShadowOr(I);
+  }
+
+  void visitExtractValueInst(ExtractValueInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Agg = I.getAggregateOperand();
+    DEBUG(dbgs() << "ExtractValue:  " << I << "\n");
+    Value *AggShadow = getShadow(Agg);
+    DEBUG(dbgs() << "   AggShadow:  " << *AggShadow << "\n");
+    Value *ResShadow = IRB.CreateExtractValue(AggShadow, I.getIndices());
+    DEBUG(dbgs() << "   ResShadow:  " << *ResShadow << "\n");
+    setShadow(&I, ResShadow);
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void visitInsertValueInst(InsertValueInst &I) {
+    IRBuilder<> IRB(&I);
+    DEBUG(dbgs() << "InsertValue:  " << I << "\n");
+    Value *AggShadow = getShadow(I.getAggregateOperand());
+    Value *InsShadow = getShadow(I.getInsertedValueOperand());
+    DEBUG(dbgs() << "   AggShadow:  " << *AggShadow << "\n");
+    DEBUG(dbgs() << "   InsShadow:  " << *InsShadow << "\n");
+    Value *Res = IRB.CreateInsertValue(AggShadow, InsShadow, I.getIndices());
+    DEBUG(dbgs() << "   Res:        " << *Res << "\n");
+    setShadow(&I, Res);
+    setOrigin(&I, getCleanOrigin());
+  }
+
+  void dumpInst(Instruction &I) {
+    if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+      errs() << "ZZZ call " << CI->getCalledFunction()->getName() << "\n";
+    } else {
+      errs() << "ZZZ " << I.getOpcodeName() << "\n";
+    }
+    errs() << "QQQ " << I << "\n";
+  }
+
+  void visitResumeInst(ResumeInst &I) {
+    DEBUG(dbgs() << "Resume: " << I << "\n");
+    // Nothing to do here.
+  }
+
+  void visitInstruction(Instruction &I) {
+    // Everything else: stop propagating and check for poisoned shadow.
+    if (ClDumpStrictInstructions)
+      dumpInst(I);
+    DEBUG(dbgs() << "DEFAULT: " << I << "\n");
+    for (size_t i = 0, n = I.getNumOperands(); i < n; i++)
+      insertCheck(I.getOperand(i), &I);
+    setShadow(&I, getCleanShadow(&I));
+    setOrigin(&I, getCleanOrigin());
+  }
+};
+
+/// \brief AMD64-specific implementation of VarArgHelper.
+struct VarArgAMD64Helper : public VarArgHelper {
+  // An unfortunate workaround for asymmetric lowering of va_arg stuff.
+  // See a comment in visitCallSite for more details.
+  static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
+  static const unsigned AMD64FpEndOffset = 176;
+
+  Function &F;
+  MemorySanitizer &MS;
+  MemorySanitizerVisitor &MSV;
+  Value *VAArgTLSCopy;
+  Value *VAArgOverflowSize;
+
+  SmallVector<CallInst*, 16> VAStartInstrumentationList;
+
+  VarArgAMD64Helper(Function &F, MemorySanitizer &MS,
+                    MemorySanitizerVisitor &MSV)
+    : F(F), MS(MS), MSV(MSV), VAArgTLSCopy(0), VAArgOverflowSize(0) { }
+
+  enum ArgKind { AK_GeneralPurpose, AK_FloatingPoint, AK_Memory };
+
+  ArgKind classifyArgument(Value* arg) {
+    // A very rough approximation of X86_64 argument classification rules.
+    Type *T = arg->getType();
+    if (T->isFPOrFPVectorTy() || T->isX86_MMXTy())
+      return AK_FloatingPoint;
+    if (T->isIntegerTy() && T->getPrimitiveSizeInBits() <= 64)
+      return AK_GeneralPurpose;
+    if (T->isPointerTy())
+      return AK_GeneralPurpose;
+    return AK_Memory;
+  }
+
+  // For VarArg functions, store the argument shadow in an ABI-specific format
+  // that corresponds to va_list layout.
+  // We do this because Clang lowers va_arg in the frontend, and this pass
+  // only sees the low level code that deals with va_list internals.
+  // A much easier alternative (provided that Clang emits va_arg instructions)
+  // would have been to associate each live instance of va_list with a copy of
+  // MSanParamTLS, and extract shadow on va_arg() call in the argument list
+  // order.
+  void visitCallSite(CallSite &CS, IRBuilder<> &IRB) {
+    unsigned GpOffset = 0;
+    unsigned FpOffset = AMD64GpEndOffset;
+    unsigned OverflowOffset = AMD64FpEndOffset;
+    for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
+         ArgIt != End; ++ArgIt) {
+      Value *A = *ArgIt;
+      ArgKind AK = classifyArgument(A);
+      if (AK == AK_GeneralPurpose && GpOffset >= AMD64GpEndOffset)
+        AK = AK_Memory;
+      if (AK == AK_FloatingPoint && FpOffset >= AMD64FpEndOffset)
+        AK = AK_Memory;
+      Value *Base;
+      switch (AK) {
+      case AK_GeneralPurpose:
+        Base = getShadowPtrForVAArgument(A, IRB, GpOffset);
+        GpOffset += 8;
+        break;
+      case AK_FloatingPoint:
+        Base = getShadowPtrForVAArgument(A, IRB, FpOffset);
+        FpOffset += 16;
+        break;
+      case AK_Memory:
+        uint64_t ArgSize = MS.TD->getTypeAllocSize(A->getType());
+        Base = getShadowPtrForVAArgument(A, IRB, OverflowOffset);
+        OverflowOffset += DataLayout::RoundUpAlignment(ArgSize, 8);
+      }
+      IRB.CreateStore(MSV.getShadow(A), Base);
+    }
+    Constant *OverflowSize =
+      ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AMD64FpEndOffset);
+    IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+  }
+
+  /// \brief Compute the shadow address for a given va_arg.
+  Value *getShadowPtrForVAArgument(Value *A, IRBuilder<> &IRB,
+                                   int ArgOffset) {
+    Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+    Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+    return IRB.CreateIntToPtr(Base, PointerType::get(MSV.getShadowTy(A), 0),
+                              "_msarg");
+  }
+
+  void visitVAStartInst(VAStartInst &I) {
+    IRBuilder<> IRB(&I);
+    VAStartInstrumentationList.push_back(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */24, /* alignment */16, false);
+  }
+
+  void visitVACopyInst(VACopyInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *VAListTag = I.getArgOperand(0);
+    Value *ShadowPtr = MSV.getShadowPtr(VAListTag, IRB.getInt8Ty(), IRB);
+
+    // Unpoison the whole __va_list_tag.
+    // FIXME: magic ABI constants.
+    IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+                     /* size */ 24, /* alignment */ 16, false);
+  }
+
+  void finalizeInstrumentation() {
+    assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+           "finalizeInstrumentation called twice");
+    if (!VAStartInstrumentationList.empty()) {
+      // If there is a va_start in this function, make a backup copy of
+      // va_arg_tls somewhere in the function entry block.
+      IRBuilder<> IRB(F.getEntryBlock().getFirstNonPHI());
+      VAArgOverflowSize = IRB.CreateLoad(MS.VAArgOverflowSizeTLS);
+      Value *CopySize =
+        IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, AMD64FpEndOffset),
+                      VAArgOverflowSize);
+      VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+      IRB.CreateMemCpy(VAArgTLSCopy, MS.VAArgTLS, CopySize, 8);
+    }
+
+    // Instrument va_start.
+    // Copy va_list shadow from the backup copy of the TLS contents.
+    for (size_t i = 0, n = VAStartInstrumentationList.size(); i < n; i++) {
+      CallInst *OrigInst = VAStartInstrumentationList[i];
+      IRBuilder<> IRB(OrigInst->getNextNode());
+      Value *VAListTag = OrigInst->getArgOperand(0);
+
+      Value *RegSaveAreaPtrPtr =
+        IRB.CreateIntToPtr(
+          IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        ConstantInt::get(MS.IntptrTy, 16)),
+          Type::getInt64PtrTy(*MS.C));
+      Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrPtr);
+      Value *RegSaveAreaShadowPtr =
+        MSV.getShadowPtr(RegSaveAreaPtr, IRB.getInt8Ty(), IRB);
+      IRB.CreateMemCpy(RegSaveAreaShadowPtr, VAArgTLSCopy,
+                       AMD64FpEndOffset, 16);
+
+      Value *OverflowArgAreaPtrPtr =
+        IRB.CreateIntToPtr(
+          IRB.CreateAdd(IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+                        ConstantInt::get(MS.IntptrTy, 8)),
+          Type::getInt64PtrTy(*MS.C));
+      Value *OverflowArgAreaPtr = IRB.CreateLoad(OverflowArgAreaPtrPtr);
+      Value *OverflowArgAreaShadowPtr =
+        MSV.getShadowPtr(OverflowArgAreaPtr, IRB.getInt8Ty(), IRB);
+      Value *SrcPtr =
+        getShadowPtrForVAArgument(VAArgTLSCopy, IRB, AMD64FpEndOffset);
+      IRB.CreateMemCpy(OverflowArgAreaShadowPtr, SrcPtr, VAArgOverflowSize, 16);
+    }
+  }
+};
+
+VarArgHelper* CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
+                                 MemorySanitizerVisitor &Visitor) {
+  return new VarArgAMD64Helper(Func, Msan, Visitor);
+}
+
+}  // namespace
+
+bool MemorySanitizer::runOnFunction(Function &F) {
+  MemorySanitizerVisitor Visitor(F, *this);
+
+  // Clear out readonly/readnone attributes.
+  AttrBuilder B;
+  B.addAttribute(Attributes::ReadOnly)
+    .addAttribute(Attributes::ReadNone);
+  F.removeAttribute(AttributeSet::FunctionIndex,
+                    Attributes::get(F.getContext(), B));
+
+  return Visitor.runOnFunction();
+}
diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
index 1fe12545d2..8f8d027dca 100644
--- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
+++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp
@@ -13,20 +13,20 @@
 //
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "insert-optimal-edge-profiling"
+#include "llvm/Transforms/Instrumentation.h"
+#include "MaximumSpanningTree.h"
 #include "ProfilingUtils.h"
-#include "llvm/Constants.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Analysis/ProfileInfoLoader.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Constants.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Instrumentation.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "MaximumSpanningTree.h"
 using namespace llvm;
 
 STATISTIC(NumEdgesInserted, "The # of edges inserted.");
diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp
index cc27146ebc..8aefe5901c 100644
--- a/lib/Transforms/Instrumentation/PathProfiling.cpp
+++ b/lib/Transforms/Instrumentation/PathProfiling.cpp
@@ -45,24 +45,24 @@
 //===----------------------------------------------------------------------===//
 #define DEBUG_TYPE "insert-path-profiling"
 
-#include "llvm/DerivedTypes.h"
+#include "llvm/Transforms/Instrumentation.h"
 #include "ProfilingUtils.h"
 #include "llvm/Analysis/PathNumbering.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/DerivedTypes.h"
 #include "llvm/InstrTypes.h"
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/TypeBuilder.h"
-#include "llvm/Support/Compiler.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/TypeBuilder.h"
 #include <vector>
 
 #define HASH_THRESHHOLD 100000
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index d054b5e22f..f14a5d8a1e 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -21,27 +21,27 @@
 
 #define DEBUG_TYPE "tsan"
 
+#include "llvm/Transforms/Instrumentation.h"
 #include "BlackList.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Function.h"
 #include "llvm/IRBuilder.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Metadata.h"
 #include "llvm/Module.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Type.h"
 
 using namespace llvm;
 
@@ -78,6 +78,7 @@ struct ThreadSanitizer : public FunctionPass {
   static char ID;  // Pass identification, replacement for typeid.
 
  private:
+  void initializeCallbacks(Module &M);
   bool instrumentLoadOrStore(Instruction *I);
   bool instrumentAtomic(Instruction *I);
   void chooseInstructionsToInstrument(SmallVectorImpl<Instruction*> &Local,
@@ -130,18 +131,8 @@ static Function *checkInterfaceFunction(Constant *FuncOrBitcast) {
   report_fatal_error("ThreadSanitizer interface function redefined");
 }
 
-bool ThreadSanitizer::doInitialization(Module &M) {
-  TD = getAnalysisIfAvailable<DataLayout>();
-  if (!TD)
-    return false;
-  BL.reset(new BlackList(ClBlackListFile));
-
-  // Always insert a call to __tsan_init into the module's CTORs.
+void ThreadSanitizer::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(M.getContext());
-  Value *TsanInit = M.getOrInsertFunction("__tsan_init",
-                                          IRB.getVoidTy(), NULL);
-  appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
-
   // Initialize the callbacks.
   TsanFuncEntry = checkInterfaceFunction(M.getOrInsertFunction(
       "__tsan_func_entry", IRB.getVoidTy(), IRB.getInt8PtrTy(), NULL));
@@ -188,6 +179,8 @@ bool ThreadSanitizer::doInitialization(Module &M) {
         NamePart = "_fetch_or";
       else if (op == AtomicRMWInst::Xor)
         NamePart = "_fetch_xor";
+      else if (op == AtomicRMWInst::Nand)
+        NamePart = "_fetch_nand";
       else
         continue;
       SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
@@ -207,6 +200,20 @@ bool ThreadSanitizer::doInitialization(Module &M) {
       "__tsan_atomic_thread_fence", IRB.getVoidTy(), OrdTy, NULL));
   TsanAtomicSignalFence = checkInterfaceFunction(M.getOrInsertFunction(
       "__tsan_atomic_signal_fence", IRB.getVoidTy(), OrdTy, NULL));
+}
+
+bool ThreadSanitizer::doInitialization(Module &M) {
+  TD = getAnalysisIfAvailable<DataLayout>();
+  if (!TD)
+    return false;
+  BL.reset(new BlackList(ClBlackListFile));
+
+  // Always insert a call to __tsan_init into the module's CTORs.
+  IRBuilder<> IRB(M.getContext());
+  Value *TsanInit = M.getOrInsertFunction("__tsan_init",
+                                          IRB.getVoidTy(), NULL);
+  appendToGlobalCtors(M, cast<Function>(TsanInit), 0);
+
   return true;
 }
 
@@ -297,6 +304,7 @@ static bool isAtomic(Instruction *I) {
 bool ThreadSanitizer::runOnFunction(Function &F) {
   if (!TD) return false;
   if (BL->isIn(F)) return false;
+  initializeCallbacks(*F.getParent());
   SmallVector<Instruction*, 8> RetVec;
   SmallVector<Instruction*, 8> AllLoadsAndStores;
   SmallVector<Instruction*, 8> LocalLoadsAndStores;
diff --git a/lib/Transforms/NaCl/ExpandTls.cpp b/lib/Transforms/NaCl/ExpandTls.cpp
index 8ce439c018..54299edf4b 100644
--- a/lib/Transforms/NaCl/ExpandTls.cpp
+++ b/lib/Transforms/NaCl/ExpandTls.cpp
@@ -239,8 +239,8 @@ static void rewriteTlsVars(Module &M, std::vector<VarInfo> *TlsVars,
   AttrBuilder B;
   B.addAttribute(Attributes::ReadOnly);
   B.addAttribute(Attributes::NoUnwind);
-  AttrListPtr ReadTpAttrs = AttrListPtr().addAttr(
-      M.getContext(), AttrListPtr::FunctionIndex,
+  AttributeSet ReadTpAttrs = AttributeSet().addAttr(
+      M.getContext(), AttributeSet::FunctionIndex,
       Attributes::get(M.getContext(), B));
   Constant *ReadTpFunc = M.getOrInsertTargetIntrinsic("llvm.nacl.read.tp",
                                                       ReadTpType,
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index b344952cc5..f43baf5a76 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -16,16 +16,16 @@
 
 #define DEBUG_TYPE "adce"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/InstIterator.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumRemoved, "Number of instructions removed");
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
index cee5502656..6214e3b703 100644
--- a/lib/Transforms/Scalar/BasicBlockPlacement.cpp
+++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
@@ -27,12 +27,12 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "block-placement"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Function.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CFG.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Transforms/Scalar.h"
 #include <set>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
index b608a5535e..e6abfdf581 100644
--- a/lib/Transforms/Scalar/CodeGenPrepare.cpp
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -15,7 +15,16 @@
 
 #define DEBUG_TYPE "codegenprepare"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DominatorInternals.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Assembly/Writer.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
 #include "llvm/IRBuilder.h"
@@ -23,14 +32,6 @@
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallSet.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/DominatorInternals.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Assembly/Writer.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -38,7 +39,6 @@
 #include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Transforms/Utils/AddrModeMatcher.h"
@@ -194,9 +194,20 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
           WorkList.insert(*II);
     }
 
-    for (SmallPtrSet<BasicBlock*, 8>::iterator
-           I = WorkList.begin(), E = WorkList.end(); I != E; ++I)
-      DeleteDeadBlock(*I);
+    // Delete the dead blocks and any of their dead successors.
+    MadeChange |= !WorkList.empty();
+    while (!WorkList.empty()) {
+      BasicBlock *BB = *WorkList.begin();
+      WorkList.erase(BB);
+      SmallVector<BasicBlock*, 2> Successors(succ_begin(BB), succ_end(BB));
+
+      DeleteDeadBlock(BB);
+      
+      for (SmallVectorImpl<BasicBlock*>::iterator
+             II = Successors.begin(), IE = Successors.end(); II != IE; ++II)
+        if (pred_begin(*II) == pred_end(*II))
+          WorkList.insert(*II);
+    }
 
     // Merge pairs of basic blocks with unconditional branches, connected by
     // a single edge.
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
index 369720b3dc..27efde53cd 100644
--- a/lib/Transforms/Scalar/ConstantProp.cpp
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -20,14 +20,14 @@
 
 #define DEBUG_TYPE "constprop"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Constant.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Instruction.h"
 #include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Support/InstIterator.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include <set>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3ec6f3dcc3..b5a2a25ba0 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -13,15 +13,15 @@
 
 #define DEBUG_TYPE "correlated-value-propagation"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumPhis,      "Number of phis propagated");
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
index a2e074fae8..f260331c6d 100644
--- a/lib/Transforms/Scalar/DCE.cpp
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -18,12 +18,12 @@
 
 #define DEBUG_TYPE "dce"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Instruction.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/InstIterator.h"
 #include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 736cc05e04..124892887c 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -17,25 +17,25 @@
 
 #define DEBUG_TYPE "dse"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Constants.h"
 #include "llvm/DataLayout.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
 using namespace llvm;
 
 STATISTIC(NumFastStores, "Number of stores deleted");
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 101009dd64..6b622c73f0 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -14,18 +14,18 @@
 
 #define DEBUG_TYPE "early-cse"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Instructions.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/ScopedHashTable.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/RecyclingAllocator.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/ScopedHashTable.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <deque>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index f003e06699..1c540b240c 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -17,11 +17,6 @@
 
 #define DEBUG_TYPE "gvn"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/Hashing.h"
@@ -37,11 +32,16 @@
 #include "llvm/Analysis/PHITransAddr.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Assembly/Writer.h"
+#include "llvm/DataLayout.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Metadata.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/PatternMatch.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
diff --git a/lib/Transforms/Scalar/GlobalMerge.cpp b/lib/Transforms/Scalar/GlobalMerge.cpp
index 6301aad610..486a349c55 100644
--- a/lib/Transforms/Scalar/GlobalMerge.cpp
+++ b/lib/Transforms/Scalar/GlobalMerge.cpp
@@ -53,8 +53,10 @@
 
 #define DEBUG_TYPE "global-merge"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Attributes.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
 #include "llvm/GlobalVariable.h"
@@ -62,10 +64,8 @@
 #include "llvm/Intrinsics.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumMerged      , "Number of globals merged");
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 310fd6147a..29f5a10e09 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -26,28 +26,28 @@
 
 #define DEBUG_TYPE "indvars"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/LLVMContext.h"
-#include "llvm/Type.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Type.h"
 using namespace llvm;
 
 STATISTIC(NumWidened     , "Number of indvars widened");
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index e7ffa09f17..4a4cd705e2 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -13,28 +13,28 @@
 
 #define DEBUG_TYPE "jump-threading"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/DataLayout.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
 
 STATISTIC(NumThreads, "Number of jumps threaded");
@@ -216,19 +216,24 @@ bool JumpThreading::runOnFunction(Function &F) {
 }
 
 /// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB) {
+/// thread across it. Stop scanning the block when passing the threshold.
+static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+                                             unsigned Threshold) {
   /// Ignore PHI nodes, these will be flattened when duplication happens.
   BasicBlock::const_iterator I = BB->getFirstNonPHI();
 
   // FIXME: THREADING will delete values that are just used to compute the
   // branch, so they shouldn't count against the duplication cost.
 
-
   // Sum up the cost of each instruction until we get to the terminator.  Don't
   // include the terminator because the copy won't include it.
   unsigned Size = 0;
   for (; !isa<TerminatorInst>(I); ++I) {
+
+    // Stop scanning the block if we've reached the threshold.
+    if (Size > Threshold)
+      return Size;
+
     // Debugger intrinsics don't incur code size.
     if (isa<DbgInfoIntrinsic>(I)) continue;
 
@@ -1337,7 +1342,7 @@ bool JumpThreading::ThreadEdge(BasicBlock *BB,
     return false;
   }
 
-  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB);
+  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, Threshold);
   if (JumpThreadCost > Threshold) {
     DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
           << "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1481,7 +1486,7 @@ bool JumpThreading::DuplicateCondBranchOnPHIIntoPred(BasicBlock *BB,
     return false;
   }
 
-  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB);
+  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, Threshold);
   if (DuplicationCost > Threshold) {
     DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
           << "' - Cost is too high: " << DuplicationCost << "\n");
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 4818437c24..7ef1d34d3f 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -32,27 +32,27 @@
 
 #define DEBUG_TYPE "licm"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Constants.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -90,6 +90,8 @@ namespace {
       AU.addRequired<TargetLibraryInfo>();
     }
 
+    using llvm::Pass::doFinalization;
+
     bool doFinalization() {
       assert(LoopToAliasSetMap.empty() && "Didn't free loop alias sets");
       return false;
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 3771f5aa97..9c67e327e2 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -16,11 +16,11 @@
 
 #define DEBUG_TYPE "loop-delete"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallVector.h"
 using namespace llvm;
 
 STATISTIC(NumDeleted, "Number of loops deleted");
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index a44e798f12..7807e9bb4f 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -43,19 +43,20 @@
 
 #define DEBUG_TYPE "loop-idiom"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Module.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DataLayout.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -63,16 +64,83 @@ STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
 STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
 
 namespace {
+
+  class LoopIdiomRecognize;
+
+  /// This class defines some utility functions for loop idiom recognization.
+  class LIRUtil {
+  public:
+    /// Return true iff the block contains nothing but an uncondition branch
+    /// (aka goto instruction).
+    static bool isAlmostEmpty(BasicBlock *);
+
+    static BranchInst *getBranch(BasicBlock *BB) {
+      return dyn_cast<BranchInst>(BB->getTerminator());
+    }
+
+    /// Return the condition of the branch terminating the given basic block.
+    static Value *getBrCondtion(BasicBlock *);
+
+    /// Derive the precondition block (i.e the block that guards the loop 
+    /// preheader) from the given preheader.
+    static BasicBlock *getPrecondBb(BasicBlock *PreHead);
+  };
+
+  /// This class is to recoginize idioms of population-count conducted in
+  /// a noncountable loop. Currently it only recognizes this pattern:
+  /// \code
+  ///   while(x) {cnt++; ...; x &= x - 1; ...}
+  /// \endcode
+  class NclPopcountRecognize {
+    LoopIdiomRecognize &LIR;
+    Loop *CurLoop;
+    BasicBlock *PreCondBB;
+
+    typedef IRBuilder<> IRBuilderTy;
+
+  public:
+    explicit NclPopcountRecognize(LoopIdiomRecognize &TheLIR);
+    bool recognize();
+
+  private:
+    /// Take a glimpse of the loop to see if we need to go ahead recoginizing
+    /// the idiom.
+    bool preliminaryScreen();
+
+    /// Check if the given conditional branch is based on the comparison
+    /// beween a variable and zero, and if the variable is non-zero, the
+    /// control yeilds to the loop entry. If the branch matches the behavior,
+    /// the variable involved in the comparion is returned. This function will
+    /// be called to see if the precondition and postcondition of the loop 
+    /// are in desirable form.
+    Value *matchCondition (BranchInst *Br, BasicBlock *NonZeroTarget) const;
+
+    /// Return true iff the idiom is detected in the loop. and 1) \p CntInst
+    /// is set to the instruction counting the pupulation bit. 2) \p CntPhi
+    /// is set to the corresponding phi node. 3) \p Var is set to the value
+    /// whose population bits are being counted.
+    bool detectIdiom
+      (Instruction *&CntInst, PHINode *&CntPhi, Value *&Var) const;
+
+    /// Insert ctpop intrinsic function and some obviously dead instructions.
+    void transform (Instruction *CntInst, PHINode *CntPhi, Value *Var);
+
+    /// Create llvm.ctpop.* intrinsic function.
+    CallInst *createPopcntIntrinsic(IRBuilderTy &IRB, Value *Val, DebugLoc DL);
+  };
+
   class LoopIdiomRecognize : public LoopPass {
     Loop *CurLoop;
     const DataLayout *TD;
     DominatorTree *DT;
     ScalarEvolution *SE;
     TargetLibraryInfo *TLI;
+    const ScalarTargetTransformInfo *STTI;
   public:
     static char ID;
     explicit LoopIdiomRecognize() : LoopPass(ID) {
       initializeLoopIdiomRecognizePass(*PassRegistry::getPassRegistry());
+      TD = 0; DT = 0; SE = 0; TLI = 0; STTI = 0;
     }
 
     bool runOnLoop(Loop *L, LPPassManager &LPM);
@@ -110,6 +178,36 @@ namespace {
       AU.addRequired<DominatorTree>();
       AU.addRequired<TargetLibraryInfo>();
     }
+
+    const DataLayout *getDataLayout() {
+      return TD ? TD : TD=getAnalysisIfAvailable<DataLayout>();
+    }
+
+    DominatorTree *getDominatorTree() {
+      return DT ? DT : (DT=&getAnalysis<DominatorTree>());
+    }
+
+    ScalarEvolution *getScalarEvolution() {
+      return SE ? SE : (SE = &getAnalysis<ScalarEvolution>());
+    }
+
+    TargetLibraryInfo *getTargetLibraryInfo() {
+      return TLI ? TLI : (TLI = &getAnalysis<TargetLibraryInfo>());
+    }
+
+    const ScalarTargetTransformInfo *getScalarTargetTransformInfo() {
+      if (!STTI) {
+        TargetTransformInfo *TTI = getAnalysisIfAvailable<TargetTransformInfo>();
+        if (TTI) STTI = TTI->getScalarTargetTransformInfo();
+      }
+      return STTI;
+    }
+
+    Loop *getLoop() const { return CurLoop; }
+
+  private:
+    bool runOnNoncountableLoop();
+    bool runOnCountableLoop();
   };
 }
 
@@ -172,24 +270,393 @@ static void deleteIfDeadInstruction(Value *V, ScalarEvolution &SE,
       deleteDeadInstruction(I, SE, TLI);
 }
 
-bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
-  CurLoop = L;
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of LIRUtil
+//
+//===----------------------------------------------------------------------===//
 
-  // If the loop could not be converted to canonical form, it must have an
-  // indirectbr in it, just give up.
-  if (!L->getLoopPreheader())
+// This fucntion will return true iff the given block contains nothing but goto. 
+// A typical usage of this function is to check if the preheader fucntion is 
+// "almost" empty such that generated intrinsic function can be moved across 
+// preheader and to be placed at the end of the preconditiona block without 
+// concerning of breaking data dependence.
+bool LIRUtil::isAlmostEmpty(BasicBlock *BB) {
+  if (BranchInst *Br = getBranch(BB)) {
+    return Br->isUnconditional() && BB->size() == 1;
+  }
+  return false;
+}
+
+Value *LIRUtil::getBrCondtion(BasicBlock *BB) {
+  BranchInst *Br = getBranch(BB);
+  return Br ? Br->getCondition() : 0;
+}
+
+BasicBlock *LIRUtil::getPrecondBb(BasicBlock *PreHead) {
+  if (BasicBlock *BB = PreHead->getSinglePredecessor()) {
+    BranchInst *Br = getBranch(BB);
+    return Br && Br->isConditional() ? BB : 0;
+  }
+  return 0;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of NclPopcountRecognize
+//
+//===----------------------------------------------------------------------===//
+
+NclPopcountRecognize::NclPopcountRecognize(LoopIdiomRecognize &TheLIR):
+  LIR(TheLIR), CurLoop(TheLIR.getLoop()), PreCondBB(0) {
+}
+
+bool NclPopcountRecognize::preliminaryScreen() {
+  const ScalarTargetTransformInfo *STTI = LIR.getScalarTargetTransformInfo();
+  if (STTI->getPopcntHwSupport(32) != ScalarTargetTransformInfo::Fast)
     return false;
 
-  // Disable loop idiom recognition if the function's name is a common idiom.
-  StringRef Name = L->getHeader()->getParent()->getName();
-  if (Name == "memset" || Name == "memcpy")
+  // Counting population are usually conducted by few arithmetic instrutions.
+  // Such instructions can be easilly "absorbed" by vacant slots in a
+  // non-compact loop. Therefore, recognizing popcount idiom only makes sense
+  // in a compact loop.
+
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
     return false;
 
-  // The trip count of the loop must be analyzable.
-  SE = &getAnalysis<ScalarEvolution>();
-  if (!SE->hasLoopInvariantBackedgeTakenCount(L))
+  BasicBlock *LoopBody = *(CurLoop->block_begin());
+  if (LoopBody->size() >= 20) {
+    // The loop is too big, bail out.
+    return false;
+  }
+
+  // It should have a preheader containing nothing but a goto instruction.
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  if (!PreHead || !LIRUtil::isAlmostEmpty(PreHead))
+    return false;
+
+  // It should have a precondition block where the generated popcount instrinsic
+  // function will be inserted.
+  PreCondBB = LIRUtil::getPrecondBb(PreHead);
+  if (!PreCondBB)
+    return false;
+ 
+  return true;
+}
+
+Value *NclPopcountRecognize::matchCondition (BranchInst *Br,
+                                             BasicBlock *LoopEntry) const {
+  if (!Br || !Br->isConditional())
+    return 0;
+
+  ICmpInst *Cond = dyn_cast<ICmpInst>(Br->getCondition());
+  if (!Cond)
+    return 0;
+
+  ConstantInt *CmpZero = dyn_cast<ConstantInt>(Cond->getOperand(1));
+  if (!CmpZero || !CmpZero->isZero())
+    return 0;
+
+  ICmpInst::Predicate Pred = Cond->getPredicate();
+  if ((Pred == ICmpInst::ICMP_NE && Br->getSuccessor(0) == LoopEntry) ||
+      (Pred == ICmpInst::ICMP_EQ && Br->getSuccessor(1) == LoopEntry))
+    return Cond->getOperand(0);
+
+  return 0;
+}
+
+bool NclPopcountRecognize::detectIdiom(Instruction *&CntInst,
+                                       PHINode *&CntPhi,
+                                       Value *&Var) const {
+  // Following code tries to detect this idiom:
+  //
+  //    if (x0 != 0)
+  //      goto loop-exit // the precondition of the loop
+  //    cnt0 = init-val;
+  //    do {
+  //       x1 = phi (x0, x2);
+  //       cnt1 = phi(cnt0, cnt2);
+  //
+  //       cnt2 = cnt1 + 1;
+  //        ...
+  //       x2 = x1 & (x1 - 1);
+  //        ...
+  //    } while(x != 0);
+  //
+  // loop-exit:
+  //
+
+  // step 1: Check to see if the look-back branch match this pattern:
+  //    "if (a!=0) goto loop-entry".
+  BasicBlock *LoopEntry;
+  Instruction *DefX2, *CountInst;
+  Value *VarX1, *VarX0;
+  PHINode *PhiX, *CountPhi;
+
+  DefX2 = CountInst = 0;
+  VarX1 = VarX0 = 0;
+  PhiX = CountPhi = 0;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in desirable form.
+  {
+    if (Value *T = matchCondition (LIRUtil::getBranch(LoopEntry), LoopEntry))
+      DefX2 = dyn_cast<Instruction>(T);
+    else
+      return false;
+  }
+
+  // step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
+  {
+    if (DefX2->getOpcode() != Instruction::And)
+      return false;
+
+    BinaryOperator *SubOneOp;
+
+    if ((SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(0))))
+      VarX1 = DefX2->getOperand(1);
+    else {
+      VarX1 = DefX2->getOperand(0);
+      SubOneOp = dyn_cast<BinaryOperator>(DefX2->getOperand(1));
+    }
+    if (!SubOneOp)
+      return false;
+
+    Instruction *SubInst = cast<Instruction>(SubOneOp);
+    ConstantInt *Dec = dyn_cast<ConstantInt>(SubInst->getOperand(1));
+    if (!Dec ||
+        !((SubInst->getOpcode() == Instruction::Sub && Dec->isOne()) ||
+          (SubInst->getOpcode() == Instruction::Add && Dec->isAllOnesValue()))) {
+      return false;
+    }
+  }
+
+  // step 3: Check the recurrence of variable X
+  {
+    PhiX = dyn_cast<PHINode>(VarX1);
+    if (!PhiX ||
+        (PhiX->getOperand(0) != DefX2 && PhiX->getOperand(1) != DefX2)) {
+      return false;
+    }
+  }
+
+  // step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
+  {
+    CountInst = NULL;
+    for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI(),
+           IterE = LoopEntry->end(); Iter != IterE; Iter++) {
+      Instruction *Inst = Iter;
+      if (Inst->getOpcode() != Instruction::Add)
+        continue;
+
+      ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+      if (!Inc || !Inc->isOne())
+        continue;
+
+      PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+      if (!Phi || Phi->getParent() != LoopEntry)
+        continue;
+
+      // Check if the result of the instruction is live of the loop.
+      bool LiveOutLoop = false;
+      for (Value::use_iterator I = Inst->use_begin(), E = Inst->use_end();
+             I != E;  I++) {
+        if ((cast<Instruction>(*I))->getParent() != LoopEntry) {
+          LiveOutLoop = true; break;
+        }
+      }
+
+      if (LiveOutLoop) {
+        CountInst = Inst;
+        CountPhi = Phi;
+        break;
+      }
+    }
+
+    if (!CountInst)
+      return false;
+  }
+
+  // step 5: check if the precondition is in this form:
+  //   "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
+  {
+    BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+    Value *T = matchCondition (PreCondBr, CurLoop->getLoopPreheader());
+    if (T != PhiX->getOperand(0) && T != PhiX->getOperand(1))
+      return false;
+
+    CntInst = CountInst;
+    CntPhi = CountPhi;
+    Var = T;
+  }
+
+  return true;
+}
+
+void NclPopcountRecognize::transform(Instruction *CntInst,
+                                     PHINode *CntPhi, Value *Var) {
+
+  ScalarEvolution *SE = LIR.getScalarEvolution();
+  TargetLibraryInfo *TLI = LIR.getTargetLibraryInfo();
+  BasicBlock *PreHead = CurLoop->getLoopPreheader();
+  BranchInst *PreCondBr = LIRUtil::getBranch(PreCondBB);
+  const DebugLoc DL = CntInst->getDebugLoc();
+
+  // Assuming before transformation, the loop is following:
+  //  if (x) // the precondition
+  //     do { cnt++; x &= x - 1; } while(x);
+ 
+  // Step 1: Insert the ctpop instruction at the end of the precondition block
+  IRBuilderTy Builder(PreCondBr);
+  Value *PopCnt, *PopCntZext, *NewCount, *TripCnt;
+  {
+    PopCnt = createPopcntIntrinsic(Builder, Var, DL);
+    NewCount = PopCntZext =
+      Builder.CreateZExtOrTrunc(PopCnt, cast<IntegerType>(CntPhi->getType()));
+
+    if (NewCount != PopCnt)
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+
+    // TripCnt is exactly the number of iterations the loop has
+    TripCnt = NewCount;
+
+    // If the popoulation counter's initial value is not zero, insert Add Inst.
+    Value *CntInitVal = CntPhi->getIncomingValueForBlock(PreHead);
+    ConstantInt *InitConst = dyn_cast<ConstantInt>(CntInitVal);
+    if (!InitConst || !InitConst->isZero()) {
+      NewCount = Builder.CreateAdd(NewCount, CntInitVal);
+      (cast<Instruction>(NewCount))->setDebugLoc(DL);
+    }
+  }
+
+  // Step 2: Replace the precondition from "if(x == 0) goto loop-exit" to
+  //   "if(NewCount == 0) loop-exit". Withtout this change, the intrinsic
+  //   function would be partial dead code, and downstream passes will drag
+  //   it back from the precondition block to the preheader.
+  {
+    ICmpInst *PreCond = cast<ICmpInst>(PreCondBr->getCondition());
+
+    Value *Opnd0 = PopCntZext;
+    Value *Opnd1 = ConstantInt::get(PopCntZext->getType(), 0);
+    if (PreCond->getOperand(0) != Var)
+      std::swap(Opnd0, Opnd1);
+
+    ICmpInst *NewPreCond =
+      cast<ICmpInst>(Builder.CreateICmp(PreCond->getPredicate(), Opnd0, Opnd1));
+    PreCond->replaceAllUsesWith(NewPreCond);
+
+    deleteDeadInstruction(PreCond, *SE, TLI);
+  }
+
+  // Step 3: Note that the population count is exactly the trip count of the
+  // loop in question, which enble us to to convert the loop from noncountable
+  // loop into a countable one. The benefit is twofold:
+  //
+  //  - If the loop only counts population, the entire loop become dead after
+  //    the transformation. It is lots easier to prove a countable loop dead
+  //    than to prove a noncountable one. (In some C dialects, a infite loop
+  //    isn't dead even if it computes nothing useful. In general, DCE needs
+  //    to prove a noncountable loop finite before safely delete it.)
+  //
+  //  - If the loop also performs something else, it remains alive.
+  //    Since it is transformed to countable form, it can be aggressively
+  //    optimized by some optimizations which are in general not applicable
+  //    to a noncountable loop.
+  //
+  // After this step, this loop (conceptually) would look like following:
+  //   newcnt = __builtin_ctpop(x);
+  //   t = newcnt;
+  //   if (x)
+  //     do { cnt++; x &= x-1; t--) } while (t > 0);
+  BasicBlock *Body = *(CurLoop->block_begin());
+  {
+    BranchInst *LbBr = LIRUtil::getBranch(Body);
+    ICmpInst *LbCond = cast<ICmpInst>(LbBr->getCondition());
+    Type *Ty = TripCnt->getType();
+
+    PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", Body->begin());
+
+    Builder.SetInsertPoint(LbCond);
+    Value *Opnd1 = cast<Value>(TcPhi);
+    Value *Opnd2 = cast<Value>(ConstantInt::get(Ty, 1));
+    Instruction *TcDec =
+      cast<Instruction>(Builder.CreateSub(Opnd1, Opnd2, "tcdec", false, true));
+
+    TcPhi->addIncoming(TripCnt, PreHead);
+    TcPhi->addIncoming(TcDec, Body);
+
+    CmpInst::Predicate Pred = (LbBr->getSuccessor(0) == Body) ?
+      CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
+    LbCond->setPredicate(Pred);
+    LbCond->setOperand(0, TcDec);
+    LbCond->setOperand(1, cast<Value>(ConstantInt::get(Ty, 0)));
+  }
+
+  // Step 4: All the references to the original population counter outside
+  //  the loop are replaced with the NewCount -- the value returned from
+  //  __builtin_ctpop().
+  {
+    SmallVector<Value *, 4> CntUses;
+    for (Value::use_iterator I = CntInst->use_begin(), E = CntInst->use_end();
+         I != E; I++) {
+      if (cast<Instruction>(*I)->getParent() != Body)
+        CntUses.push_back(*I);
+    }
+    for (unsigned Idx = 0; Idx < CntUses.size(); Idx++) {
+      (cast<Instruction>(CntUses[Idx]))->replaceUsesOfWith(CntInst, NewCount);
+    }
+  }
+
+  // step 5: Forget the "non-computable" trip-count SCEV associated with the
+  //   loop. The loop would otherwise not be deleted even if it becomes empty.
+  SE->forgetLoop(CurLoop);
+}
+
+CallInst *NclPopcountRecognize::createPopcntIntrinsic(IRBuilderTy &IRBuilder, 
+                                                      Value *Val, DebugLoc DL) {
+  Value *Ops[] = { Val };
+  Type *Tys[] = { Val->getType() };
+
+  Module *M = (*(CurLoop->block_begin()))->getParent()->getParent();
+  Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctpop, Tys);
+  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CI->setDebugLoc(DL);
+
+  return CI;
+}
+
+/// recognize - detect population count idiom in a non-countable loop. If
+///   detected, transform the relevant code to popcount intrinsic function
+///   call, and return true; otherwise, return false.
+bool NclPopcountRecognize::recognize() {
+
+  if (!LIR.getScalarTargetTransformInfo())
     return false;
-  const SCEV *BECount = SE->getBackedgeTakenCount(L);
+
+  LIR.getScalarEvolution();
+
+  if (!preliminaryScreen())
+    return false;
+
+  Instruction *CntInst;
+  PHINode *CntPhi;
+  Value *Val;
+  if (!detectIdiom(CntInst, CntPhi, Val))
+    return false;
+
+  transform(CntInst, CntPhi, Val);
+  return true;
+}
+
+//===----------------------------------------------------------------------===//
+//
+//          Implementation of LoopIdiomRecognize
+//
+//===----------------------------------------------------------------------===//
+
+bool LoopIdiomRecognize::runOnCountableLoop() {
+  const SCEV *BECount = SE->getBackedgeTakenCount(CurLoop);
   if (isa<SCEVCouldNotCompute>(BECount)) return false;
 
   // If this loop executes exactly one time, then it should be peeled, not
@@ -199,24 +666,27 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
       return false;
 
   // We require target data for now.
-  TD = getAnalysisIfAvailable<DataLayout>();
-  if (TD == 0) return false;
+  if (!getDataLayout())
+    return false;
+
+  getDominatorTree();
 
-  DT = &getAnalysis<DominatorTree>();
   LoopInfo &LI = getAnalysis<LoopInfo>();
   TLI = &getAnalysis<TargetLibraryInfo>();
 
+  getTargetLibraryInfo();
+
   SmallVector<BasicBlock*, 8> ExitBlocks;
   CurLoop->getUniqueExitBlocks(ExitBlocks);
 
   DEBUG(dbgs() << "loop-idiom Scanning: F["
-               << L->getHeader()->getParent()->getName()
-               << "] Loop %" << L->getHeader()->getName() << "\n");
+               << CurLoop->getHeader()->getParent()->getName()
+               << "] Loop %" << CurLoop->getHeader()->getName() << "\n");
 
   bool MadeChange = false;
   // Scan all the blocks in the loop that are not in subloops.
-  for (Loop::block_iterator BI = L->block_begin(), E = L->block_end(); BI != E;
-       ++BI) {
+  for (Loop::block_iterator BI = CurLoop->block_begin(),
+         E = CurLoop->block_end(); BI != E; ++BI) {
     // Ignore blocks in subloops.
     if (LI.getLoopFor(*BI) != CurLoop)
       continue;
@@ -226,6 +696,33 @@ bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
   return MadeChange;
 }
 
+bool LoopIdiomRecognize::runOnNoncountableLoop() {
+  NclPopcountRecognize Popcount(*this);
+  if (Popcount.recognize())
+    return true;
+
+  return false;
+}
+
+bool LoopIdiomRecognize::runOnLoop(Loop *L, LPPassManager &LPM) {
+  CurLoop = L;
+
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (!L->getLoopPreheader())
+    return false;
+
+  // Disable loop idiom recognition if the function's name is a common idiom.
+  StringRef Name = L->getHeader()->getParent()->getName();
+  if (Name == "memset" || Name == "memcpy")
+    return false;
+
+  SE = &getAnalysis<ScalarEvolution>();
+  if (SE->hasLoopInvariantBackedgeTakenCount(L))
+    return runOnCountableLoop();
+  return runOnNoncountableLoop();
+}
+
 /// runOnLoopBlock - Process the specified block, which lives in a counted loop
 /// with the specified backedge count.  This block is known to be in the current
 /// loop and not in any subloops.
diff --git a/lib/Transforms/Scalar/LoopInstSimplify.cpp b/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 558f62e6b4..10ba22434a 100644
--- a/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -12,17 +12,17 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "loop-instsimplify"
-#include "llvm/Instructions.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumSimplified, "Number of redundant instructions simplified");
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index abe07aa9d3..249baf5164 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -13,20 +13,20 @@
 
 #define DEBUG_TYPE "loop-rotate"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Function.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Function.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Support/CFG.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 #define MAX_HEADER_SIZE 16
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 958348d9fa..d571ba3fe0 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -54,27 +54,27 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "loop-reduce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallBitVector.h"
 #include "llvm/AddressingMode.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Assembly/Writer.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/SmallBitVector.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 0d781ac977..2b15528411 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -13,16 +13,16 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "loop-unroll"
-#include "llvm/IntrinsicInst.h"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/DataLayout.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/DataLayout.h"
 #include <climits>
 
 using namespace llvm;
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index 047b43eb84..d41da4a9a9 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -28,25 +28,25 @@
 
 #define DEBUG_TYPE "loop-unswitch"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <map>
 #include <set>
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index 517657cf52..26b6269f42 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -14,20 +14,20 @@
 
 #define DEBUG_TYPE "memcpyopt"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DataLayout.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include <list>
diff --git a/lib/Transforms/Scalar/NaClCcRewrite.cpp b/lib/Transforms/Scalar/NaClCcRewrite.cpp
index 4ac361835d..9109e22592 100644
--- a/lib/Transforms/Scalar/NaClCcRewrite.cpp
+++ b/lib/Transforms/Scalar/NaClCcRewrite.cpp
@@ -596,7 +596,7 @@ void UpdateFunctionSignature(Function &F,
   Attributes fattr = F.getAttributes().getFnAttributes();
   if (fattr.hasAttributes())
     new_attributes_vec.push_back(AttributeWithIndex::get(~0, fattr));
-  F.setAttributes(AttrListPtr::get(F.getContext(), new_attributes_vec));
+  F.setAttributes(AttributeSet::get(F.getContext(), new_attributes_vec));
 }
 
 
@@ -805,7 +805,7 @@ void ExtractOperandsAndAttributesFromCallInst(
   std::vector<Value*>& operands,
   std::vector<Attributes>& attributes) {
 
-  AttrListPtr PAL = call->getAttributes();
+  AttributeSet PAL = call->getAttributes();
   // last operand is: function
   for (size_t i = 0; i <  call->getNumOperands() - 1; ++i) {
     operands.push_back(call->getArgOperand(i));
@@ -819,7 +819,7 @@ void ExtractOperandsAndAttributesFromeInvokeInst(
   InvokeInst* call,
   std::vector<Value*>& operands,
   std::vector<Attributes>& attributes) {
-  AttrListPtr PAL = call->getAttributes();
+  AttributeSet PAL = call->getAttributes();
   // last three operands are: function, bb-normal, bb-exception
   for (size_t i = 0; i <  call->getNumOperands() - 3; ++i) {
     operands.push_back(call->getArgOperand(i));
diff --git a/lib/Transforms/Scalar/ObjCARC.cpp b/lib/Transforms/Scalar/ObjCARC.cpp
index dfdf50549d..ce397658bf 100644
--- a/lib/Transforms/Scalar/ObjCARC.cpp
+++ b/lib/Transforms/Scalar/ObjCARC.cpp
@@ -29,9 +29,9 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "objc-arc"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/CommandLine.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
 // A handy option to enable/disable all optimizations in this file.
@@ -132,12 +132,12 @@ namespace {
 // ARC Utilities.
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/Module.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Support/CallSite.h"
-#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Transforms/Utils/Local.h"
 
 namespace {
   /// InstructionClass - A simple classification for instructions.
@@ -660,9 +660,9 @@ static bool DoesObjCBlockEscape(const Value *BlockPtr) {
 // ARC AliasAnalysis.
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Pass.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Passes.h"
+#include "llvm/Pass.h"
 
 namespace {
   /// ObjCARCAliasAnalysis - This is a simple alias analysis
@@ -912,8 +912,8 @@ bool ObjCARCExpand::runOnFunction(Function &F) {
 // ARC autorelease pool elimination.
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Constants.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Constants.h"
 
 namespace {
   /// ObjCARCAPElim - Autorelease pool elimination.
@@ -1093,10 +1093,10 @@ bool ObjCARCAPElim::runOnModule(Module &M) {
 
 // TODO: Delete release+retain pairs (rare).
 
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Support/CFG.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/SmallPtrSet.h"
 
 STATISTIC(NumNoops,       "Number of no-op objc calls eliminated");
 STATISTIC(NumPartialNoops, "Number of partially no-op objc calls eliminated");
@@ -1788,8 +1788,8 @@ Constant *ObjCARCOpt::getRetainRVCallee(Module *M) {
     Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
     Type *Params[] = { I8X };
     FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     RetainRVCallee =
       M->getOrInsertFunction("objc_retainAutoreleasedReturnValue", FTy,
@@ -1804,8 +1804,8 @@ Constant *ObjCARCOpt::getAutoreleaseRVCallee(Module *M) {
     Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
     Type *Params[] = { I8X };
     FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     AutoreleaseRVCallee =
       M->getOrInsertFunction("objc_autoreleaseReturnValue", FTy,
@@ -1818,8 +1818,8 @@ Constant *ObjCARCOpt::getReleaseCallee(Module *M) {
   if (!ReleaseCallee) {
     LLVMContext &C = M->getContext();
     Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     ReleaseCallee =
       M->getOrInsertFunction(
@@ -1834,8 +1834,8 @@ Constant *ObjCARCOpt::getRetainCallee(Module *M) {
   if (!RetainCallee) {
     LLVMContext &C = M->getContext();
     Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     RetainCallee =
       M->getOrInsertFunction(
@@ -1856,7 +1856,7 @@ Constant *ObjCARCOpt::getRetainBlockCallee(Module *M) {
       M->getOrInsertFunction(
         "objc_retainBlock",
         FunctionType::get(Params[0], Params, /*isVarArg=*/false),
-        AttrListPtr());
+        AttributeSet());
   }
   return RetainBlockCallee;
 }
@@ -1865,8 +1865,8 @@ Constant *ObjCARCOpt::getAutoreleaseCallee(Module *M) {
   if (!AutoreleaseCallee) {
     LLVMContext &C = M->getContext();
     Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     AutoreleaseCallee =
       M->getOrInsertFunction(
@@ -3756,9 +3756,9 @@ void ObjCARCOpt::releaseMemory() {
 // TODO: ObjCARCContract could insert PHI nodes when uses aren't
 // dominated by single calls.
 
-#include "llvm/Operator.h"
-#include "llvm/InlineAsm.h"
 #include "llvm/Analysis/Dominators.h"
+#include "llvm/InlineAsm.h"
+#include "llvm/Operator.h"
 
 STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
 
@@ -3840,8 +3840,8 @@ Constant *ObjCARCContract::getStoreStrongCallee(Module *M) {
     Type *I8XX = PointerType::getUnqual(I8X);
     Type *Params[] = { I8XX, I8X };
 
-    AttrListPtr Attributes = AttrListPtr()
-      .addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes = AttributeSet()
+      .addAttr(M->getContext(), AttributeSet::FunctionIndex,
                Attributes::get(C, Attributes::NoUnwind))
       .addAttr(M->getContext(), 1, Attributes::get(C, Attributes::NoCapture));
 
@@ -3860,8 +3860,8 @@ Constant *ObjCARCContract::getRetainAutoreleaseCallee(Module *M) {
     Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
     Type *Params[] = { I8X };
     FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     RetainAutoreleaseCallee =
       M->getOrInsertFunction("objc_retainAutorelease", FTy, Attributes);
@@ -3875,8 +3875,8 @@ Constant *ObjCARCContract::getRetainAutoreleaseRVCallee(Module *M) {
     Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
     Type *Params[] = { I8X };
     FunctionType *FTy = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttrListPtr Attributes =
-      AttrListPtr().addAttr(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeSet Attributes =
+      AttributeSet().addAttr(M->getContext(), AttributeSet::FunctionIndex,
                             Attributes::get(C, Attributes::NoUnwind));
     RetainAutoreleaseRVCallee =
       M->getOrInsertFunction("objc_retainAutoreleaseReturnValue", FTy,
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index 377c07020d..569439aaf4 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -22,7 +22,12 @@
 
 #define DEBUG_TYPE "reassociate"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Assembly/Writer.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -30,16 +35,11 @@
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Assembly/Writer.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
index ea1de63de7..5524e01230 100644
--- a/lib/Transforms/Scalar/Reg2Mem.cpp
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -18,15 +18,15 @@
 
 #define DEBUG_TYPE "reg2mem"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/BasicBlock.h"
 #include "llvm/Function.h"
+#include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Instructions.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <list>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 686520e724..28aaddc50e 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -19,26 +19,26 @@
 
 #define DEBUG_TYPE "sccp"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/PointerIntPair.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/InstVisitor.h"
 #include "llvm/Instructions.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/DataLayout.h"
-#include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/InstVisitor.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DenseSet.h"
-#include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 8284e144b2..1c220ca0f6 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -25,33 +25,34 @@
 
 #define DEBUG_TYPE "sroa"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/PtrUseVisitor.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Constants.h"
 #include "llvm/DIBuilder.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
 #include "llvm/IRBuilder.h"
+#include "llvm/InstVisitor.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Operator.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/InstVisitor.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -404,106 +405,17 @@ private:
 };
 }
 
-template <typename DerivedT, typename RetT>
-class AllocaPartitioning::BuilderBase
-    : public InstVisitor<DerivedT, RetT> {
-public:
-  BuilderBase(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
-      : TD(TD),
-        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
-        P(P) {
-    enqueueUsers(AI, 0);
-  }
-
-protected:
-  const DataLayout &TD;
-  const uint64_t AllocSize;
-  AllocaPartitioning &P;
-
-  SmallPtrSet<Use *, 8> VisitedUses;
-
-  struct OffsetUse {
-    Use *U;
-    int64_t Offset;
-  };
-  SmallVector<OffsetUse, 8> Queue;
-
-  // The active offset and use while visiting.
-  Use *U;
-  int64_t Offset;
-
-  void enqueueUsers(Instruction &I, int64_t UserOffset) {
-    for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
-         UI != UE; ++UI) {
-      if (VisitedUses.insert(&UI.getUse())) {
-        OffsetUse OU = { &UI.getUse(), UserOffset };
-        Queue.push_back(OU);
-      }
-    }
-  }
-
-  bool computeConstantGEPOffset(GetElementPtrInst &GEPI, int64_t &GEPOffset) {
-    GEPOffset = Offset;
-    for (gep_type_iterator GTI = gep_type_begin(GEPI), GTE = gep_type_end(GEPI);
-         GTI != GTE; ++GTI) {
-      ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
-      if (!OpC)
-        return false;
-      if (OpC->isZero())
-        continue;
-
-      // Handle a struct index, which adds its field offset to the pointer.
-      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
-        unsigned ElementIdx = OpC->getZExtValue();
-        const StructLayout *SL = TD.getStructLayout(STy);
-        uint64_t ElementOffset = SL->getElementOffset(ElementIdx);
-        // Check that we can continue to model this GEP in a signed 64-bit offset.
-        if (ElementOffset > INT64_MAX ||
-            (GEPOffset >= 0 &&
-             ((uint64_t)GEPOffset + ElementOffset) > INT64_MAX)) {
-          DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
-                       << "what can be represented in an int64_t!\n"
-                       << "  alloca: " << P.AI << "\n");
-          return false;
-        }
-        if (GEPOffset < 0)
-          GEPOffset = ElementOffset + (uint64_t)-GEPOffset;
-        else
-          GEPOffset += ElementOffset;
-        continue;
-      }
-
-      APInt Index = OpC->getValue().sextOrTrunc(TD.getPointerSizeInBits());
-      Index *= APInt(Index.getBitWidth(),
-                     TD.getTypeAllocSize(GTI.getIndexedType()));
-      Index += APInt(Index.getBitWidth(), (uint64_t)GEPOffset,
-                     /*isSigned*/true);
-      // Check if the result can be stored in our int64_t offset.
-      if (!Index.isSignedIntN(sizeof(GEPOffset) * 8)) {
-        DEBUG(dbgs() << "WARNING: Encountered a cumulative offset exceeding "
-                     << "what can be represented in an int64_t!\n"
-                     << "  alloca: " << P.AI << "\n");
-        return false;
-      }
-
-      GEPOffset = Index.getSExtValue();
-    }
-    return true;
+static Value *foldSelectInst(SelectInst &SI) {
+  // If the condition being selected on is a constant or the same value is
+  // being selected between, fold the select. Yes this does (rarely) happen
+  // early on.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
+    return SI.getOperand(1+CI->isZero());
+  if (SI.getOperand(1) == SI.getOperand(2)) {
+    return SI.getOperand(1);
   }
-
-  Value *foldSelectInst(SelectInst &SI) {
-    // If the condition being selected on is a constant or the same value is
-    // being selected between, fold the select. Yes this does (rarely) happen
-    // early on.
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(SI.getCondition()))
-      return SI.getOperand(1+CI->isZero());
-    if (SI.getOperand(1) == SI.getOperand(2)) {
-      assert(*U == SI.getOperand(1));
-      return SI.getOperand(1);
-    }
-    return 0;
-  }
-};
+  return 0;
+}
 
 /// \brief Builder for the alloca partitioning.
 ///
@@ -511,60 +423,38 @@ protected:
 /// of an alloca and splitting the partitions for each load and store at each
 /// offset.
 class AllocaPartitioning::PartitionBuilder
-    : public BuilderBase<PartitionBuilder, bool> {
-  friend class InstVisitor<PartitionBuilder, bool>;
+    : public PtrUseVisitor<PartitionBuilder> {
+  friend class PtrUseVisitor<PartitionBuilder>;
+  friend class InstVisitor<PartitionBuilder>;
+  typedef PtrUseVisitor<PartitionBuilder> Base;
+
+  const uint64_t AllocSize;
+  AllocaPartitioning &P;
 
   SmallDenseMap<Instruction *, unsigned> MemTransferPartitionMap;
 
 public:
-  PartitionBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
-      : BuilderBase<PartitionBuilder, bool>(TD, AI, P) {}
-
-  /// \brief Run the builder over the allocation.
-  bool operator()() {
-    // Note that we have to re-evaluate size on each trip through the loop as
-    // the queue grows at the tail.
-    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
-      U = Queue[Idx].U;
-      Offset = Queue[Idx].Offset;
-      if (!visit(cast<Instruction>(U->getUser())))
-        return false;
-    }
-    return true;
-  }
+  PartitionBuilder(const DataLayout &DL, AllocaInst &AI, AllocaPartitioning &P)
+      : PtrUseVisitor<PartitionBuilder>(DL),
+        AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())),
+        P(P) {}
 
 private:
-  bool markAsEscaping(Instruction &I) {
-    P.PointerEscapingInstr = &I;
-    return false;
-  }
-
-  void insertUse(Instruction &I, int64_t Offset, uint64_t Size,
+  void insertUse(Instruction &I, const APInt &Offset, uint64_t Size,
                  bool IsSplittable = false) {
-    // Completely skip uses which have a zero size or don't overlap the
-    // allocation.
-    if (Size == 0 ||
-        (Offset >= 0 && (uint64_t)Offset >= AllocSize) ||
-        (Offset < 0 && (uint64_t)-Offset >= Size)) {
+    // Completely skip uses which have a zero size or start either before or
+    // past the end of the allocation.
+    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize)) {
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte use @" << Offset
-                   << " which starts past the end of the " << AllocSize
-                   << " byte alloca:\n"
+                   << " which has zero size or starts outside of the "
+                   << AllocSize << " byte alloca:\n"
                    << "    alloca: " << P.AI << "\n"
                    << "       use: " << I << "\n");
       return;
     }
 
-    // Clamp the start to the beginning of the allocation.
-    if (Offset < 0) {
-      DEBUG(dbgs() << "WARNING: Clamping a " << Size << " byte use @" << Offset
-                   << " to start at the beginning of the alloca:\n"
-                   << "    alloca: " << P.AI << "\n"
-                   << "       use: " << I << "\n");
-      Size -= (uint64_t)-Offset;
-      Offset = 0;
-    }
-
-    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+    uint64_t BeginOffset = Offset.getZExtValue();
+    uint64_t EndOffset = BeginOffset + Size;
 
     // Clamp the end offset to the end of the allocation. Note that this is
     // formulated to handle even the case where "BeginOffset + Size" overflows.
@@ -585,9 +475,9 @@ private:
     P.Partitions.push_back(New);
   }
 
-  bool handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset,
+  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset,
                          bool IsVolatile) {
-    uint64_t Size = TD.getTypeStoreSize(Ty);
+    uint64_t Size = DL.getTypeStoreSize(Ty);
 
     // If this memory access can be shown to *statically* extend outside the
     // bounds of of the allocation, it's behavior is undefined, so simply
@@ -596,15 +486,15 @@ private:
     // risk of overflow.
     // FIXME: We should instead consider the pointer to have escaped if this
     // function is being instrumented for addressing bugs or race conditions.
-    if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
-        Size > (AllocSize - (uint64_t)Offset)) {
+    if (Offset.isNegative() || Size > AllocSize ||
+        Offset.ugt(AllocSize - Size)) {
       DEBUG(dbgs() << "WARNING: Ignoring " << Size << " byte "
                    << (isa<LoadInst>(I) ? "load" : "store") << " @" << Offset
                    << " which extends past the end of the " << AllocSize
                    << " byte alloca:\n"
                    << "    alloca: " << P.AI << "\n"
                    << "       use: " << I << "\n");
-      return true;
+      return;
     }
 
     // We allow splitting of loads and stores where the type is an integer type
@@ -615,54 +505,61 @@ private:
       IsSplittable = !IsVolatile && ITy->getBitWidth() == AllocSize*8;
 
     insertUse(I, Offset, Size, IsSplittable);
-    return true;
-  }
-
-  bool visitBitCastInst(BitCastInst &BC) {
-    enqueueUsers(BC, Offset);
-    return true;
-  }
-
-  bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
-    int64_t GEPOffset;
-    if (!computeConstantGEPOffset(GEPI, GEPOffset))
-      return markAsEscaping(GEPI);
-
-    enqueueUsers(GEPI, GEPOffset);
-    return true;
   }
 
-  bool visitLoadInst(LoadInst &LI) {
+  void visitLoadInst(LoadInst &LI) {
     assert((!LI.isSimple() || LI.getType()->isSingleValueType()) &&
            "All simple FCA loads should have been pre-split");
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&LI);
+
     return handleLoadOrStore(LI.getType(), LI, Offset, LI.isVolatile());
   }
 
-  bool visitStoreInst(StoreInst &SI) {
+  void visitStoreInst(StoreInst &SI) {
     Value *ValOp = SI.getValueOperand();
     if (ValOp == *U)
-      return markAsEscaping(SI);
+      return PI.setEscapedAndAborted(&SI);
+    if (!IsOffsetKnown)
+      return PI.setAborted(&SI);
 
     assert((!SI.isSimple() || ValOp->getType()->isSingleValueType()) &&
            "All simple FCA stores should have been pre-split");
-    return handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile());
+    handleLoadOrStore(ValOp->getType(), SI, Offset, SI.isVolatile());
   }
 
 
-  bool visitMemSetInst(MemSetInst &II) {
+  void visitMemSetInst(MemSetInst &II) {
     assert(II.getRawDest() == *U && "Pointer use is not the destination?");
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
-    insertUse(II, Offset, Size, Length);
-    return true;
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      // Zero-length mem transfer intrinsics can be ignored entirely.
+      return;
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    insertUse(II, Offset,
+              Length ? Length->getLimitedValue()
+                     : AllocSize - Offset.getLimitedValue(),
+              (bool)Length);
   }
 
-  bool visitMemTransferInst(MemTransferInst &II) {
+  void visitMemTransferInst(MemTransferInst &II) {
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
-    if (!Size)
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
       // Zero-length mem transfer intrinsics can be ignored entirely.
-      return true;
+      return;
+
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
+    uint64_t RawOffset = Offset.getLimitedValue();
+    uint64_t Size = Length ? Length->getLimitedValue()
+                           : AllocSize - RawOffset;
 
     MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
 
@@ -670,12 +567,12 @@ private:
     Offsets.IsSplittable = Length;
 
     if (*U == II.getRawDest()) {
-      Offsets.DestBegin = Offset;
-      Offsets.DestEnd = Offset + Size;
+      Offsets.DestBegin = RawOffset;
+      Offsets.DestEnd = RawOffset + Size;
     }
     if (*U == II.getRawSource()) {
-      Offsets.SourceBegin = Offset;
-      Offsets.SourceEnd = Offset + Size;
+      Offsets.SourceBegin = RawOffset;
+      Offsets.SourceEnd = RawOffset + Size;
     }
 
     // If we have set up end offsets for both the source and the destination,
@@ -688,7 +585,7 @@ private:
       // In that case, we can completely elide the transfer.
       if (!II.isVolatile() && Offsets.SourceBegin == Offsets.DestBegin) {
         P.Partitions[PrevIdx].kill();
-        return true;
+        return;
       }
 
       // Otherwise we have an offset transfer within the same alloca. We can't
@@ -701,7 +598,7 @@ private:
 
       // For non-volatile transfers this is a no-op.
       if (!II.isVolatile())
-        return true;
+        return;
 
       // Otherwise just suppress splitting.
       Offsets.IsSplittable = false;
@@ -721,23 +618,25 @@ private:
              "Already have intrinsic in map but haven't seen both ends");
       (void)Inserted;
     }
-
-    return true;
   }
 
   // Disable SRoA for any intrinsics except for lifetime invariants.
   // FIXME: What about debug instrinsics? This matches old behavior, but
   // doesn't make sense.
-  bool visitIntrinsicInst(IntrinsicInst &II) {
+  void visitIntrinsicInst(IntrinsicInst &II) {
+    if (!IsOffsetKnown)
+      return PI.setAborted(&II);
+
     if (II.getIntrinsicID() == Intrinsic::lifetime_start ||
         II.getIntrinsicID() == Intrinsic::lifetime_end) {
       ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
-      uint64_t Size = std::min(AllocSize - Offset, Length->getLimitedValue());
+      uint64_t Size = std::min(AllocSize - Offset.getLimitedValue(),
+                               Length->getLimitedValue());
       insertUse(II, Offset, Size, true);
-      return true;
+      return;
     }
 
-    return markAsEscaping(II);
+    Base::visitIntrinsicInst(II);
   }
 
   Instruction *hasUnsafePHIOrSelectUse(Instruction *Root, uint64_t &Size) {
@@ -757,14 +656,14 @@ private:
       llvm::tie(UsedI, I) = Uses.pop_back_val();
 
       if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
-        Size = std::max(Size, TD.getTypeStoreSize(LI->getType()));
+        Size = std::max(Size, DL.getTypeStoreSize(LI->getType()));
         continue;
       }
       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
         Value *Op = SI->getOperand(0);
         if (Op == UsedI)
           return SI;
-        Size = std::max(Size, TD.getTypeStoreSize(Op->getType()));
+        Size = std::max(Size, DL.getTypeStoreSize(Op->getType()));
         continue;
       }
 
@@ -785,54 +684,62 @@ private:
     return 0;
   }
 
-  bool visitPHINode(PHINode &PN) {
+  void visitPHINode(PHINode &PN) {
+    if (PN.use_empty())
+      return;
+    if (!IsOffsetKnown)
+      return PI.setAborted(&PN);
+
     // See if we already have computed info on this node.
     std::pair<uint64_t, bool> &PHIInfo = P.PHIOrSelectSizes[&PN];
     if (PHIInfo.first) {
       PHIInfo.second = true;
       insertUse(PN, Offset, PHIInfo.first);
-      return true;
+      return;
     }
 
     // Check for an unsafe use of the PHI node.
-    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
-      return markAsEscaping(*EscapingI);
+    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&PN, PHIInfo.first))
+      return PI.setAborted(UnsafeI);
 
     insertUse(PN, Offset, PHIInfo.first);
-    return true;
   }
 
-  bool visitSelectInst(SelectInst &SI) {
+  void visitSelectInst(SelectInst &SI) {
+    if (SI.use_empty())
+      return;
     if (Value *Result = foldSelectInst(SI)) {
       if (Result == *U)
         // If the result of the constant fold will be the pointer, recurse
         // through the select as if we had RAUW'ed it.
-        enqueueUsers(SI, Offset);
+        enqueueUsers(SI);
 
-      return true;
+      return;
     }
+    if (!IsOffsetKnown)
+      return PI.setAborted(&SI);
 
     // See if we already have computed info on this node.
     std::pair<uint64_t, bool> &SelectInfo = P.PHIOrSelectSizes[&SI];
     if (SelectInfo.first) {
       SelectInfo.second = true;
       insertUse(SI, Offset, SelectInfo.first);
-      return true;
+      return;
     }
 
     // Check for an unsafe use of the PHI node.
-    if (Instruction *EscapingI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
-      return markAsEscaping(*EscapingI);
+    if (Instruction *UnsafeI = hasUnsafePHIOrSelectUse(&SI, SelectInfo.first))
+      return PI.setAborted(UnsafeI);
 
     insertUse(SI, Offset, SelectInfo.first);
-    return true;
   }
 
   /// \brief Disable SROA entirely if there are unhandled users of the alloca.
-  bool visitInstruction(Instruction &I) { return markAsEscaping(I); }
+  void visitInstruction(Instruction &I) {
+    PI.setAborted(&I);
+  }
 };
 
-
 /// \brief Use adder for the alloca partitioning.
 ///
 /// This class adds the uses of an alloca to all of the partitions which they
@@ -851,26 +758,22 @@ private:
 /// partition space is pre-sorted, and do a logarithmic search for the
 /// partition needed, making the total visit a classical ((N + M) * log(N))
 /// complexity operation.
-class AllocaPartitioning::UseBuilder : public BuilderBase<UseBuilder> {
+class AllocaPartitioning::UseBuilder : public PtrUseVisitor<UseBuilder> {
+  friend class PtrUseVisitor<UseBuilder>;
   friend class InstVisitor<UseBuilder>;
+  typedef PtrUseVisitor<UseBuilder> Base;
+
+  const uint64_t AllocSize;
+  AllocaPartitioning &P;
 
   /// \brief Set to de-duplicate dead instructions found in the use walk.
   SmallPtrSet<Instruction *, 4> VisitedDeadInsts;
 
 public:
   UseBuilder(const DataLayout &TD, AllocaInst &AI, AllocaPartitioning &P)
-      : BuilderBase<UseBuilder>(TD, AI, P) {}
-
-  /// \brief Run the builder over the allocation.
-  void operator()() {
-    // Note that we have to re-evaluate size on each trip through the loop as
-    // the queue grows at the tail.
-    for (unsigned Idx = 0; Idx < Queue.size(); ++Idx) {
-      U = Queue[Idx].U;
-      Offset = Queue[Idx].Offset;
-      this->visit(cast<Instruction>(U->getUser()));
-    }
-  }
+      : PtrUseVisitor<UseBuilder>(TD),
+        AllocSize(TD.getTypeAllocSize(AI.getAllocatedType())),
+        P(P) {}
 
 private:
   void markAsDead(Instruction &I) {
@@ -878,20 +781,14 @@ private:
       P.DeadUsers.push_back(&I);
   }
 
-  void insertUse(Instruction &User, int64_t Offset, uint64_t Size) {
+  void insertUse(Instruction &User, const APInt &Offset, uint64_t Size) {
     // If the use has a zero size or extends outside of the allocation, record
     // it as a dead use for elimination later.
-    if (Size == 0 || (uint64_t)Offset >= AllocSize ||
-        (Offset < 0 && (uint64_t)-Offset >= Size))
+    if (Size == 0 || Offset.isNegative() || Offset.uge(AllocSize))
       return markAsDead(User);
 
-    // Clamp the start to the beginning of the allocation.
-    if (Offset < 0) {
-      Size -= (uint64_t)-Offset;
-      Offset = 0;
-    }
-
-    uint64_t BeginOffset = Offset, EndOffset = BeginOffset + Size;
+    uint64_t BeginOffset = Offset.getZExtValue();
+    uint64_t EndOffset = BeginOffset + Size;
 
     // Clamp the end offset to the end of the allocation. Note that this is
     // formulated to handle even the case where "BeginOffset + Size" overflows.
@@ -914,15 +811,15 @@ private:
     }
   }
 
-  void handleLoadOrStore(Type *Ty, Instruction &I, int64_t Offset) {
-    uint64_t Size = TD.getTypeStoreSize(Ty);
+  void handleLoadOrStore(Type *Ty, Instruction &I, const APInt &Offset) {
+    uint64_t Size = DL.getTypeStoreSize(Ty);
 
     // If this memory access can be shown to *statically* extend outside the
     // bounds of of the allocation, it's behavior is undefined, so simply
     // ignore it. Note that this is more strict than the generic clamping
     // behavior of insertUse.
-    if (Offset < 0 || (uint64_t)Offset >= AllocSize ||
-        Size > (AllocSize - (uint64_t)Offset))
+    if (Offset.isNegative() || Size > AllocSize ||
+        Offset.ugt(AllocSize - Size))
       return markAsDead(I);
 
     insertUse(I, Offset, Size);
@@ -932,40 +829,47 @@ private:
     if (BC.use_empty())
       return markAsDead(BC);
 
-    enqueueUsers(BC, Offset);
+    return Base::visitBitCastInst(BC);
   }
 
   void visitGetElementPtrInst(GetElementPtrInst &GEPI) {
     if (GEPI.use_empty())
       return markAsDead(GEPI);
 
-    int64_t GEPOffset;
-    if (!computeConstantGEPOffset(GEPI, GEPOffset))
-      llvm_unreachable("Unable to compute constant offset for use");
-
-    enqueueUsers(GEPI, GEPOffset);
+    return Base::visitGetElementPtrInst(GEPI);
   }
 
   void visitLoadInst(LoadInst &LI) {
+    assert(IsOffsetKnown);
     handleLoadOrStore(LI.getType(), LI, Offset);
   }
 
   void visitStoreInst(StoreInst &SI) {
+    assert(IsOffsetKnown);
     handleLoadOrStore(SI.getOperand(0)->getType(), SI, Offset);
   }
 
   void visitMemSetInst(MemSetInst &II) {
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
-    insertUse(II, Offset, Size);
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
+      return markAsDead(II);
+
+    assert(IsOffsetKnown);
+    insertUse(II, Offset, Length ? Length->getLimitedValue()
+                                 : AllocSize - Offset.getLimitedValue());
   }
 
   void visitMemTransferInst(MemTransferInst &II) {
     ConstantInt *Length = dyn_cast<ConstantInt>(II.getLength());
-    uint64_t Size = Length ? Length->getZExtValue() : AllocSize - Offset;
-    if (!Size)
+    if ((Length && Length->getValue() == 0) ||
+        (IsOffsetKnown && !Offset.isNegative() && Offset.uge(AllocSize)))
       return markAsDead(II);
 
+    assert(IsOffsetKnown);
+    uint64_t Size = Length ? Length->getLimitedValue()
+                           : AllocSize - Offset.getLimitedValue();
+
     MemTransferOffsets &Offsets = P.MemTransferInstData[&II];
     if (!II.isVolatile() && Offsets.DestEnd && Offsets.SourceEnd &&
         Offsets.DestBegin == Offsets.SourceBegin)
@@ -975,34 +879,39 @@ private:
   }
 
   void visitIntrinsicInst(IntrinsicInst &II) {
+    assert(IsOffsetKnown);
     assert(II.getIntrinsicID() == Intrinsic::lifetime_start ||
            II.getIntrinsicID() == Intrinsic::lifetime_end);
 
     ConstantInt *Length = cast<ConstantInt>(II.getArgOperand(0));
-    insertUse(II, Offset,
-              std::min(AllocSize - Offset, Length->getLimitedValue()));
+    insertUse(II, Offset, std::min(Length->getLimitedValue(),
+                                   AllocSize - Offset.getLimitedValue()));
   }
 
-  void insertPHIOrSelect(Instruction &User, uint64_t Offset) {
+  void insertPHIOrSelect(Instruction &User, const APInt &Offset) {
     uint64_t Size = P.PHIOrSelectSizes.lookup(&User).first;
 
     // For PHI and select operands outside the alloca, we can't nuke the entire
     // phi or select -- the other side might still be relevant, so we special
     // case them here and use a separate structure to track the operands
     // themselves which should be replaced with undef.
-    if (Offset >= AllocSize) {
+    if ((Offset.isNegative() && Offset.uge(Size)) ||
+        (!Offset.isNegative() && Offset.uge(AllocSize))) {
       P.DeadOperands.push_back(U);
       return;
     }
 
     insertUse(User, Offset, Size);
   }
+
   void visitPHINode(PHINode &PN) {
     if (PN.use_empty())
       return markAsDead(PN);
 
+    assert(IsOffsetKnown);
     insertPHIOrSelect(PN, Offset);
   }
+
   void visitSelectInst(SelectInst &SI) {
     if (SI.use_empty())
       return markAsDead(SI);
@@ -1011,7 +920,7 @@ private:
       if (Result == *U)
         // If the result of the constant fold will be the pointer, recurse
         // through the select as if we had RAUW'ed it.
-        enqueueUsers(SI, Offset);
+        enqueueUsers(SI);
       else
         // Otherwise the operand to the select is dead, and we can replace it
         // with undef.
@@ -1020,6 +929,7 @@ private:
       return;
     }
 
+    assert(IsOffsetKnown);
     insertPHIOrSelect(SI, Offset);
   }
 
@@ -1131,8 +1041,15 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
 #endif
       PointerEscapingInstr(0) {
   PartitionBuilder PB(TD, AI, *this);
-  if (!PB())
+  PartitionBuilder::PtrInfo PtrI = PB.visitPtr(AI);
+  if (PtrI.isEscaped() || PtrI.isAborted()) {
+    // FIXME: We should sink the escape vs. abort info into the caller nicely,
+    // possibly by just storing the PtrInfo in the AllocaPartitioning.
+    PointerEscapingInstr = PtrI.getEscapingInst() ? PtrI.getEscapingInst()
+                                                  : PtrI.getAbortingInst();
+    assert(PointerEscapingInstr && "Did not track a bad instruction");
     return;
+  }
 
   // Sort the uses. This arranges for the offsets to be in ascending order,
   // and the sizes to be in descending order.
@@ -1166,7 +1083,9 @@ AllocaPartitioning::AllocaPartitioning(const DataLayout &TD, AllocaInst &AI)
   // re-walking the recursive users of the alloca.
   Uses.resize(Partitions.size());
   UseBuilder UB(TD, AI, *this);
-  UB();
+  PtrI = UB.visitPtr(AI);
+  assert(!PtrI.isEscaped() && "Previously analyzed pointer now escapes!");
+  assert(!PtrI.isAborted() && "Early aborted the visit of the pointer.");
 }
 
 Type *AllocaPartitioning::getCommonType(iterator I) const {
@@ -2164,6 +2083,9 @@ static bool isIntegerWideningViable(const DataLayout &TD,
                                     AllocaPartitioning::const_use_iterator I,
                                     AllocaPartitioning::const_use_iterator E) {
   uint64_t SizeInBits = TD.getTypeSizeInBits(AllocaTy);
+  // Don't create integer types larger than the maximum bitwidth.
+  if (SizeInBits > IntegerType::MAX_INT_BITS)
+    return false;
 
   // Don't try to handle allocas with bit-padding.
   if (SizeInBits != TD.getTypeStoreSizeInBits(AllocaTy))
@@ -2202,7 +2124,7 @@ static bool isIntegerWideningViable(const DataLayout &TD,
       if (RelBegin == 0 && RelEnd == Size)
         WholeAllocaOp = true;
       if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
-        if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
           return false;
         continue;
       }
@@ -2218,7 +2140,7 @@ static bool isIntegerWideningViable(const DataLayout &TD,
       if (RelBegin == 0 && RelEnd == Size)
         WholeAllocaOp = true;
       if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
-        if (ITy->getBitWidth() < TD.getTypeStoreSize(ITy))
+        if (ITy->getBitWidth() < TD.getTypeStoreSizeInBits(ITy))
           return false;
         continue;
       }
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 39630fd027..762bb15c59 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -13,14 +13,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-c/Transforms/Scalar.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm-c/Initialization.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm-c/Transforms/Scalar.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/Verifier.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
 
 using namespace llvm;
 
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
index a46d09c320..c8656fbd8e 100644
--- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -21,8 +21,15 @@
 
 #define DEBUG_TYPE "scalarrepl"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Constants.h"
 #include "llvm/DIBuilder.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -34,19 +41,12 @@
 #include "llvm/Module.h"
 #include "llvm/Operator.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 9f24bb635e..9160f04fe2 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -23,19 +23,19 @@
 
 #define DEBUG_TYPE "simplifycfg"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Attributes.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
-#include "llvm/Attributes.h"
-#include "llvm/Support/CFG.h"
 #include "llvm/Pass.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CFG.h"
 #include "llvm/TargetTransformInfo.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 STATISTIC(NumSimpl, "Number of blocks simplified");
diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
index 0788f19014..d4643b9d80 100644
--- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp
@@ -17,25 +17,24 @@
 
 #define DEBUG_TYPE "simplify-libcalls"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Config/config.h"            // FIXME: Shouldn't depend on host!
+#include "llvm/DataLayout.h"
+#include "llvm/IRBuilder.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Config/config.h"            // FIXME: Shouldn't depend on host!
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
 using namespace llvm;
 
-STATISTIC(NumSimplified, "Number of library calls simplified");
 STATISTIC(NumAnnotated, "Number of attributes added to library functions");
 
 //===----------------------------------------------------------------------===//
@@ -82,289 +81,6 @@ public:
 
 
 //===----------------------------------------------------------------------===//
-// Helper Functions
-//===----------------------------------------------------------------------===//
-
-static bool CallHasFloatingPointArgument(const CallInst *CI) {
-  for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
-       it != e; ++it) {
-    if ((*it)->getType()->isFloatingPointTy())
-      return true;
-  }
-  return false;
-}
-
-namespace {
-//===----------------------------------------------------------------------===//
-// Formatting and IO Optimizations
-//===----------------------------------------------------------------------===//
-
-//===---------------------------------------===//
-// 'sprintf' Optimizations
-
-struct SPrintFOpt : public LibCallOptimization {
-  Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
-                                   IRBuilder<> &B) {
-    // Check for a fixed format string.
-    StringRef FormatStr;
-    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
-      return 0;
-
-    // If we just have a format string (nothing else crazy) transform it.
-    if (CI->getNumArgOperands() == 2) {
-      // Make sure there's no % in the constant array.  We could try to handle
-      // %% -> % in the future if we cared.
-      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
-        if (FormatStr[i] == '%')
-          return 0; // we found a format specifier, bail out.
-
-      // These optimizations require DataLayout.
-      if (!TD) return 0;
-
-      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
-      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
-                     ConstantInt::get(TD->getIntPtrType(*Context), // Copy the
-                                      FormatStr.size() + 1), 1);   // nul byte.
-      return ConstantInt::get(CI->getType(), FormatStr.size());
-    }
-
-    // The remaining optimizations require the format string to be "%s" or "%c"
-    // and have an extra operand.
-    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
-        CI->getNumArgOperands() < 3)
-      return 0;
-
-    // Decode the second character of the format string.
-    if (FormatStr[1] == 'c') {
-      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
-      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
-      Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
-      Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
-      B.CreateStore(V, Ptr);
-      Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
-      B.CreateStore(B.getInt8(0), Ptr);
-
-      return ConstantInt::get(CI->getType(), 1);
-    }
-
-    if (FormatStr[1] == 's') {
-      // These optimizations require DataLayout.
-      if (!TD) return 0;
-
-      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
-      if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
-
-      Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
-      if (!Len)
-        return 0;
-      Value *IncLen = B.CreateAdd(Len,
-                                  ConstantInt::get(Len->getType(), 1),
-                                  "leninc");
-      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
-
-      // The sprintf result is the unincremented number of bytes in the string.
-      return B.CreateIntCast(Len, CI->getType(), false);
-    }
-    return 0;
-  }
-
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // Require two fixed pointer arguments and an integer result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return 0;
-
-    if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
-      return V;
-    }
-
-    // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
-    // point arguments.
-    if (TLI->has(LibFunc::siprintf) && !CallHasFloatingPointArgument(CI)) {
-      Module *M = B.GetInsertBlock()->getParent()->getParent();
-      Constant *SIPrintFFn =
-        M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
-      CallInst *New = cast<CallInst>(CI->clone());
-      New->setCalledFunction(SIPrintFFn);
-      B.Insert(New);
-      return New;
-    }
-    return 0;
-  }
-};
-
-//===---------------------------------------===//
-// 'fwrite' Optimizations
-
-struct FWriteOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // Require a pointer, an integer, an integer, a pointer, returning integer.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isIntegerTy() ||
-        !FT->getParamType(2)->isIntegerTy() ||
-        !FT->getParamType(3)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return 0;
-
-    // Get the element size and count.
-    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
-    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
-    if (!SizeC || !CountC) return 0;
-    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
-
-    // If this is writing zero records, remove the call (it's a noop).
-    if (Bytes == 0)
-      return ConstantInt::get(CI->getType(), 0);
-
-    // If this is writing one byte, turn it into fputc.
-    // This optimisation is only valid, if the return value is unused.
-    if (Bytes == 1 && CI->use_empty()) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
-      Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
-      Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
-      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
-    }
-
-    return 0;
-  }
-};
-
-//===---------------------------------------===//
-// 'fputs' Optimizations
-
-struct FPutsOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // These optimizations require DataLayout.
-    if (!TD) return 0;
-
-    // Require two pointers.  Also, we can't optimize if return value is used.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !CI->use_empty())
-      return 0;
-
-    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
-    uint64_t Len = GetStringLength(CI->getArgOperand(0));
-    if (!Len) return 0;
-    // Known to have no uses (see above).
-    return EmitFWrite(CI->getArgOperand(0),
-                      ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
-                      CI->getArgOperand(1), B, TD, TLI);
-  }
-};
-
-//===---------------------------------------===//
-// 'fprintf' Optimizations
-
-struct FPrintFOpt : public LibCallOptimization {
-  Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
-                                   IRBuilder<> &B) {
-    // All the optimizations depend on the format string.
-    StringRef FormatStr;
-    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
-      return 0;
-
-    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
-    if (CI->getNumArgOperands() == 2) {
-      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
-        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
-          return 0; // We found a format specifier.
-
-      // These optimizations require DataLayout.
-      if (!TD) return 0;
-
-      Value *NewCI = EmitFWrite(CI->getArgOperand(1),
-                                ConstantInt::get(TD->getIntPtrType(*Context),
-                                                 FormatStr.size()),
-                                CI->getArgOperand(0), B, TD, TLI);
-      return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
-    }
-
-    // The remaining optimizations require the format string to be "%s" or "%c"
-    // and have an extra operand.
-    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
-        CI->getNumArgOperands() < 3)
-      return 0;
-
-    // Decode the second character of the format string.
-    if (FormatStr[1] == 'c') {
-      // fprintf(F, "%c", chr) --> fputc(chr, F)
-      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
-      Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
-                               TD, TLI);
-      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
-    }
-
-    if (FormatStr[1] == 's') {
-      // fprintf(F, "%s", str) --> fputs(str, F)
-      if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
-        return 0;
-      return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
-    }
-    return 0;
-  }
-
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // Require two fixed paramters as pointers and integer result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
-        !FT->getParamType(1)->isPointerTy() ||
-        !FT->getReturnType()->isIntegerTy())
-      return 0;
-
-    if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
-      return V;
-    }
-
-    // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
-    // floating point arguments.
-    if (TLI->has(LibFunc::fiprintf) && !CallHasFloatingPointArgument(CI)) {
-      Module *M = B.GetInsertBlock()->getParent()->getParent();
-      Constant *FIPrintFFn =
-        M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
-      CallInst *New = cast<CallInst>(CI->clone());
-      New->setCalledFunction(FIPrintFFn);
-      B.Insert(New);
-      return New;
-    }
-    return 0;
-  }
-};
-
-//===---------------------------------------===//
-// 'puts' Optimizations
-
-struct PutsOpt : public LibCallOptimization {
-  virtual Value *CallOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
-    // Require one fixed pointer argument and an integer/void result.
-    FunctionType *FT = Callee->getFunctionType();
-    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
-        !(FT->getReturnType()->isIntegerTy() ||
-          FT->getReturnType()->isVoidTy()))
-      return 0;
-
-    // Check for a constant string.
-    StringRef Str;
-    if (!getConstantStringInfo(CI->getArgOperand(0), Str))
-      return 0;
-
-    if (Str.empty() && CI->use_empty()) {
-      // puts("") -> putchar('\n')
-      Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
-      if (CI->use_empty() || !Res) return Res;
-      return B.CreateIntCast(Res, CI->getType(), true);
-    }
-
-    return 0;
-  }
-};
-
-} // end anonymous namespace.
-
-//===----------------------------------------------------------------------===//
 // SimplifyLibCalls Pass Implementation
 //===----------------------------------------------------------------------===//
 
@@ -375,10 +91,6 @@ namespace {
     TargetLibraryInfo *TLI;
 
     StringMap<LibCallOptimization*> Optimizations;
-    // Formatting and IO Optimizations
-    SPrintFOpt SPrintF;
-    FWriteOpt FWrite; FPutsOpt FPuts; FPrintFOpt FPrintF;
-    PutsOpt Puts;
 
     bool Modified;  // This is only used by doInitialization.
   public:
@@ -433,12 +145,6 @@ void SimplifyLibCalls::AddOpt(LibFunc::Func F1, LibFunc::Func F2,
 /// Optimizations - Populate the Optimizations map with all the optimizations
 /// we know.
 void SimplifyLibCalls::InitOptimizations() {
-  // Formatting and IO Optimizations
-  Optimizations["sprintf"] = &SPrintF;
-  AddOpt(LibFunc::fwrite, &FWrite);
-  AddOpt(LibFunc::fputs, &FPuts);
-  Optimizations["fprintf"] = &FPrintF;
-  Optimizations["puts"] = &Puts;
 }
 
 
@@ -486,7 +192,6 @@ bool SimplifyLibCalls::runOnFunction(Function &F) {
 
       // Something changed!
       Changed = true;
-      ++NumSimplified;
 
       // Inspect the instruction after the call (which was potentially just
       // added) next.
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 34f1d6c622..cde9c178ad 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -14,13 +14,13 @@
 
 #define DEBUG_TYPE "sink"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Assembly/Writer.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 6557d630a9..e357378524 100644
--- a/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -52,8 +52,12 @@
 
 #define DEBUG_TYPE "tailcallelim"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/Loads.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
@@ -61,16 +65,12 @@
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/InlineCost.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Support/CallSite.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 STATISTIC(NumEliminated, "Number of tail calls removed");
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
index 6815e411b4..3a19b706ea 100644
--- a/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ b/lib/Transforms/Utils/AddrModeMatcher.cpp
@@ -12,16 +12,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/AddrModeMatcher.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalValue.h"
 #include "llvm/Instruction.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/DataLayout.h"
+#include "llvm/Support/CallSite.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/PatternMatch.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/CallSite.h"
 
 using namespace llvm;
 using namespace llvm::PatternMatch;
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 9fea11391a..e8833f2092 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -13,20 +13,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Constant.h"
-#include "llvm/Type.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Constant.h"
 #include "llvm/DataLayout.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Type.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 6b04e3d17b..385ceb13b2 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -17,17 +17,17 @@
 
 #define DEBUG_TYPE "break-crit-edges"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
-#include "llvm/Type.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Type.h"
 using namespace llvm;
 
 STATISTIC(NumBroken, "Number of blocks inserted");
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 74b2ee10e0..62b79bf2b3 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -12,7 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Function.h"
 #include "llvm/IRBuilder.h"
 #include "llvm/Intrinsics.h"
@@ -20,10 +22,8 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/SmallString.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Type.h"
 
 using namespace llvm;
 
@@ -43,12 +43,12 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
   AttributeWithIndex AWI[2];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
   Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
-  AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    ArrayRef<Attributes::AttrVal>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Constant *StrLen = M->getOrInsertFunction("strlen",
-                                            AttrListPtr::get(M->getContext(),
+                                            AttributeSet::get(M->getContext(),
                                                              AWI),
                                             TD->getIntPtrType(Context),
                                             B.getInt8PtrTy(),
@@ -72,12 +72,12 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
   AttributeWithIndex AWI[2];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
   Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
-  AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    ArrayRef<Attributes::AttrVal>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Constant *StrNLen = M->getOrInsertFunction("strnlen",
-                                             AttrListPtr::get(M->getContext(),
+                                             AttributeSet::get(M->getContext(),
                                                               AWI),
                                              TD->getIntPtrType(Context),
                                              B.getInt8PtrTy(),
@@ -101,13 +101,13 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
   AttributeWithIndex AWI =
-    AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+    AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                             ArrayRef<Attributes::AttrVal>(AVs, 2));
 
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
   Constant *StrChr = M->getOrInsertFunction("strchr",
-                                            AttrListPtr::get(M->getContext(),
+                                            AttributeSet::get(M->getContext(),
                                                              AWI),
                                             I8Ptr, I8Ptr, I32Ty, NULL);
   CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
@@ -129,12 +129,12 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
   AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
   Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
-  AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    ArrayRef<Attributes::AttrVal>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *StrNCmp = M->getOrInsertFunction("strncmp",
-                                          AttrListPtr::get(M->getContext(),
+                                          AttributeSet::get(M->getContext(),
                                                            AWI),
                                           B.getInt32Ty(),
                                           B.getInt8PtrTy(),
@@ -160,11 +160,11 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
-  AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    Attributes::NoUnwind);
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrCpy = M->getOrInsertFunction(Name,
-                                         AttrListPtr::get(M->getContext(), AWI),
+                                         AttributeSet::get(M->getContext(), AWI),
                                          I8Ptr, I8Ptr, I8Ptr, NULL);
   CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
                                Name);
@@ -184,11 +184,11 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
-  AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    Attributes::NoUnwind);
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrNCpy = M->getOrInsertFunction(Name,
-                                          AttrListPtr::get(M->getContext(),
+                                          AttributeSet::get(M->getContext(),
                                                            AWI),
                                           I8Ptr, I8Ptr, I8Ptr,
                                           Len->getType(), NULL);
@@ -210,11 +210,11 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI;
-  AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                 Attributes::NoUnwind);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
-                                         AttrListPtr::get(M->getContext(), AWI),
+                                         AttributeSet::get(M->getContext(), AWI),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
@@ -239,11 +239,11 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI;
   Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
-  AWI = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                 ArrayRef<Attributes::AttrVal>(AVs, 2));
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemChr = M->getOrInsertFunction("memchr",
-                                         AttrListPtr::get(M->getContext(), AWI),
+                                         AttributeSet::get(M->getContext(), AWI),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
                                          B.getInt32Ty(),
@@ -269,12 +269,12 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
   AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
   Attributes::AttrVal AVs[2] = { Attributes::ReadOnly, Attributes::NoUnwind };
-  AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    ArrayRef<Attributes::AttrVal>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemCmp = M->getOrInsertFunction("memcmp",
-                                         AttrListPtr::get(M->getContext(), AWI),
+                                         AttributeSet::get(M->getContext(), AWI),
                                          B.getInt32Ty(),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
@@ -293,7 +293,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
 /// returns one value with the same type.  If 'Op' is a long double, 'l' is
 /// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
 Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
-                                  const AttrListPtr &Attrs) {
+                                  const AttributeSet &Attrs) {
   SmallString<20> NameBuffer;
   if (!Op->getType()->isDoubleTy()) {
     // If we need to add a suffix, copy into NameBuffer.
@@ -348,11 +348,11 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
-  AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    Attributes::NoUnwind);
 
   Value *PutS = M->getOrInsertFunction("puts",
-                                       AttrListPtr::get(M->getContext(), AWI),
+                                       AttributeSet::get(M->getContext(), AWI),
                                        B.getInt32Ty(),
                                        B.getInt8PtrTy(),
                                        NULL);
@@ -372,12 +372,12 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
-  AWI[1] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    Attributes::NoUnwind);
   Constant *F;
   if (File->getType()->isPointerTy())
     F = M->getOrInsertFunction("fputc",
-                               AttrListPtr::get(M->getContext(), AWI),
+                               AttributeSet::get(M->getContext(), AWI),
                                B.getInt32Ty(),
                                B.getInt32Ty(), File->getType(),
                                NULL);
@@ -406,13 +406,13 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
   AttributeWithIndex AWI[3];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
   AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attributes::NoCapture);
-  AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    Attributes::NoUnwind);
   StringRef FPutsName = TLI->getName(LibFunc::fputs);
   Constant *F;
   if (File->getType()->isPointerTy())
     F = M->getOrInsertFunction(FPutsName,
-                               AttrListPtr::get(M->getContext(), AWI),
+                               AttributeSet::get(M->getContext(), AWI),
                                B.getInt32Ty(),
                                B.getInt8PtrTy(),
                                File->getType(), NULL);
@@ -439,14 +439,14 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
   AttributeWithIndex AWI[3];
   AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attributes::NoCapture);
   AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attributes::NoCapture);
-  AWI[2] = AttributeWithIndex::get(M->getContext(), AttrListPtr::FunctionIndex,
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
                                    Attributes::NoUnwind);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   StringRef FWriteName = TLI->getName(LibFunc::fwrite);
   Constant *F;
   if (File->getType()->isPointerTy())
     F = M->getOrInsertFunction(FWriteName,
-                               AttrListPtr::get(M->getContext(), AWI),
+                               AttributeSet::get(M->getContext(), AWI),
                                TD->getIntPtrType(Context),
                                B.getInt8PtrTy(),
                                TD->getIntPtrType(Context),
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index bee2f7bcb6..1699a3b648 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -16,11 +16,11 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "bypass-slow-division"
-#include "llvm/Instructions.h"
+#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Function.h"
 #include "llvm/IRBuilder.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Instructions.h"
 
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 7ba9f6d9d2..12f2e4b83e 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -14,22 +14,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Constants.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Function.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Metadata.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/ADT/SmallVector.h"
 #include <map>
 using namespace llvm;
 
@@ -99,12 +99,12 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
                        .getParamAttributes(I->getArgNo() + 1));
     NewFunc->setAttributes(NewFunc->getAttributes()
                            .addAttr(NewFunc->getContext(),
-                                    AttrListPtr::ReturnIndex,
+                                    AttributeSet::ReturnIndex,
                                     OldFunc->getAttributes()
                                      .getRetAttributes()));
     NewFunc->setAttributes(NewFunc->getAttributes()
                            .addAttr(NewFunc->getContext(),
-                                    AttrListPtr::FunctionIndex,
+                                    AttributeSet::FunctionIndex,
                                     OldFunc->getAttributes()
                                      .getFnAttributes()));
 
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 1dac6b5b8b..114babd101 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -13,9 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Module.h"
-#include "llvm/DerivedTypes.h"
 #include "llvm/Constant.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Module.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 using namespace llvm;
 
@@ -38,10 +38,6 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
   New->setTargetTriple(M->getTargetTriple());
   New->setModuleInlineAsm(M->getModuleInlineAsm());
    
-  // Copy all of the dependent libraries over.
-  for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
-    New->addLibrary(*I);
-
   // Loop over all of the global variables, making corresponding globals in the
   // new module.  Here we add them to the VMap and to the new Module.  We
   // don't worry about attributes or initializers, they will come later.
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index 281714f4c1..a596df64fd 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -14,6 +14,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/CodeExtractor.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/RegionInfo.h"
+#include "llvm/Analysis/RegionIterator.h"
+#include "llvm/Analysis/Verifier.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Instructions.h"
@@ -21,18 +28,11 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/RegionInfo.h"
-#include "llvm/Analysis/RegionIterator.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <algorithm>
 #include <set>
 using namespace llvm;
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index 99b5830163..f8a0cafadc 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -8,10 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/Type.h"
-#include "llvm/ADT/DenseMap.h"
 using namespace llvm;
 
 /// DemoteRegToStack - This function takes a virtual register computed by an
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 303de56d95..c176cf1075 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -13,8 +13,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Attributes.h"
 #include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/IRBuilder.h"
@@ -22,12 +27,7 @@
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Intrinsics.h"
 #include "llvm/Module.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Support/CallSite.h"
-#include "llvm/DataLayout.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
index 55227e2714..67dcbe446b 100644
--- a/lib/Transforms/Utils/IntegerDivision.cpp
+++ b/lib/Transforms/Utils/IntegerDivision.cpp
@@ -15,11 +15,11 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "integer-division"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
 #include "llvm/Function.h"
+#include "llvm/IRBuilder.h"
 #include "llvm/Instructions.h"
 #include "llvm/Intrinsics.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Transforms/Utils/IntegerDivision.h"
 
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index 5e05c83c35..5dddb6e28a 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -29,17 +29,17 @@
 
 #define DEBUG_TYPE "lcssa"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Pass.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/PredIteratorCache.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
 
 STATISTIC(NumLCSSA, "Number of live out of a loop variables");
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index a954d82c05..0e56817a1b 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -13,8 +13,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Constants.h"
 #include "llvm/DIBuilder.h"
+#include "llvm/DataLayout.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/GlobalAlias.h"
@@ -26,20 +34,12 @@
 #include "llvm/MDBuilder.h"
 #include "llvm/Metadata.h"
 #include "llvm/Operator.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 9d9e201665..6a68416a3d 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -39,26 +39,26 @@
 
 #define DEBUG_TYPE "loop-simplify"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Type.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Type.h"
 using namespace llvm;
 
 STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 20237500c3..d24b334681 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -18,12 +18,12 @@
 
 #define DEBUG_TYPE "loop-unroll"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/BasicBlock.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/BasicBlock.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 67e17f4ca8..242e7fa021 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -23,12 +23,12 @@
 
 #define DEBUG_TYPE "loop-unroll"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/BasicBlock.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/BasicBlock.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index 02bdcda391..8756d26ca4 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -12,6 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "lower-expect-intrinsic"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
@@ -21,8 +23,6 @@
 #include "llvm/MDBuilder.h"
 #include "llvm/Metadata.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <vector>
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index 930555424d..7b89ffd401 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -36,6 +36,8 @@
 
 #define DEBUG_TYPE "lowerinvoke"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Constants.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Instructions.h"
@@ -43,12 +45,10 @@
 #include "llvm/LLVMContext.h"
 #include "llvm/Module.h"
 #include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <csetjmp>
 #include <set>
 using namespace llvm;
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 1547439b5c..74a457ce81 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -14,16 +14,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Constants.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/LLVMContext.h"
 #include "llvm/Pass.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index f4ca81af6d..70fbf13b97 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -14,12 +14,12 @@
 
 #define DEBUG_TYPE "mem2reg"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
-#include "llvm/Instructions.h"
 #include "llvm/Function.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Instructions.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 using namespace llvm;
 
 STATISTIC(NumPromoted, "Number of alloca's promoted");
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
index ada8e3b11e..363e9367f3 100644
--- a/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -13,9 +13,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Transforms/IPO.h"
 #include "llvm/DerivedTypes.h"
 #include "llvm/Function.h"
 #include "llvm/Module.h"
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 558de9d12e..b41f433659 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -27,26 +27,26 @@
 
 #define DEBUG_TYPE "mem2reg"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Constants.h"
+#include "llvm/DIBuilder.h"
 #include "llvm/DebugInfo.h"
 #include "llvm/DerivedTypes.h"
-#include "llvm/DIBuilder.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
 #include "llvm/IntrinsicInst.h"
 #include "llvm/Metadata.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <queue>
 using namespace llvm;
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 72d4199a2a..e1e7f4d668 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -12,12 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "ssaupdater"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CFG.h"
@@ -25,7 +26,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
 
 using namespace llvm;
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 6c34eed13d..3cae77227c 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -13,6 +13,14 @@
 
 #define DEBUG_TYPE "simplifycfg"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Constants.h"
 #include "llvm/DataLayout.h"
 #include "llvm/DerivedTypes.h"
@@ -25,15 +33,6 @@
 #include "llvm/Metadata.h"
 #include "llvm/Module.h"
 #include "llvm/Operator.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConstantRange.h"
@@ -42,9 +41,10 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Type.h"
 #include <algorithm>
-#include <set>
 #include <map>
+#include <set>
 using namespace llvm;
 
 static cl::opt<unsigned>
@@ -3511,22 +3511,44 @@ bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD,
 static bool ShouldBuildLookupTable(SwitchInst *SI,
                                    uint64_t TableSize,
                                    const DataLayout *TD,
+                                   const TargetTransformInfo *TTI,
                             const SmallDenseMap<PHINode*, Type*>& ResultTypes) {
-  // The table density should be at least 40%. This is the same criterion as for
-  // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
-  // FIXME: Find the best cut-off.
   if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
     return false; // TableSize overflowed, or mul below might overflow.
-  if (SI->getNumCases() * 10 >= TableSize * 4)
-    return true;
 
-  // If each table would fit in a register, we should build it anyway.
+  bool AllTablesFitInRegister = true;
+  bool HasIllegalType = false;
   for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(),
        E = ResultTypes.end(); I != E; ++I) {
-    if (!SwitchLookupTable::WouldFitInRegister(TD, TableSize, I->second))
-      return false;
+    Type *Ty = I->second;
+
+    // Saturate this flag to true.
+    HasIllegalType = HasIllegalType ||
+      !TTI->getScalarTargetTransformInfo()->isTypeLegal(Ty);
+
+    // Saturate this flag to false.
+    AllTablesFitInRegister = AllTablesFitInRegister &&
+      SwitchLookupTable::WouldFitInRegister(TD, TableSize, Ty);
+
+    // If both flags saturate, we're done. NOTE: This *only* works with
+    // saturating flags, and all flags have to saturate first due to the
+    // non-deterministic behavior of iterating over a dense map.
+    if (HasIllegalType && !AllTablesFitInRegister)
+      break;
   }
-  return true;
+
+  // If each table would fit in a register, we should build it anyway.
+  if (AllTablesFitInRegister)
+    return true;
+
+  // Don't build a table that doesn't fit in-register if it has illegal types.
+  if (HasIllegalType)
+    return false;
+
+  // The table density should be at least 40%. This is the same criterion as for
+  // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+  // FIXME: Find the best cut-off.
+  return SI->getNumCases() * 10 >= TableSize * 4;
 }
 
 /// SwitchToLookupTable - If the switch is only used to initialize one or more
@@ -3607,7 +3629,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
 
   APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
   uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
-  if (!ShouldBuildLookupTable(SI, TableSize, TD, ResultTypes))
+  if (!ShouldBuildLookupTable(SI, TableSize, TD, TTI, ResultTypes))
     return false;
 
   // Create the BB that does the lookups.
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 110f380857..5883293a81 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -15,18 +15,18 @@
 
 #define DEBUG_TYPE "indvars"
 
-#include "llvm/Instructions.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/DataLayout.h"
+#include "llvm/Instructions.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/DataLayout.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
 
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 65353dc460..8b2eeb9928 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -15,18 +15,18 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "instsimplify"
-#include "llvm/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/DataLayout.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
 #include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Type.h"
 using namespace llvm;
 
 STATISTIC(NumSimplified, "Number of redundant instructions removed");
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index 6ca1175c41..11a1ee5207 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -15,14 +15,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
-#include "llvm/DataLayout.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DataLayout.h"
 #include "llvm/Function.h"
 #include "llvm/IRBuilder.h"
 #include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
 #include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 
@@ -1409,6 +1409,254 @@ struct PrintFOpt : public LibCallOptimization {
   }
 };
 
+struct SPrintFOpt : public LibCallOptimization {
+  Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // Check for a fixed format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+      return 0;
+
+    // If we just have a format string (nothing else crazy) transform it.
+    if (CI->getNumArgOperands() == 2) {
+      // Make sure there's no % in the constant array.  We could try to handle
+      // %% -> % in the future if we cared.
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')
+          return 0; // we found a format specifier, bail out.
+
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                     ConstantInt::get(TD->getIntPtrType(*Context), // Copy the
+                                      FormatStr.size() + 1), 1);   // nul byte.
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumArgOperands() < 3)
+      return 0;
+
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+      Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+      Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
+      B.CreateStore(V, Ptr);
+      Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
+      B.CreateStore(B.getInt8(0), Ptr);
+
+      return ConstantInt::get(CI->getType(), 1);
+    }
+
+    if (FormatStr[1] == 's') {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+      if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
+
+      Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
+      if (!Len)
+        return 0;
+      Value *IncLen = B.CreateAdd(Len,
+                                  ConstantInt::get(Len->getType(), 1),
+                                  "leninc");
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+
+      // The sprintf result is the unincremented number of bytes in the string.
+      return B.CreateIntCast(Len, CI->getType(), false);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed pointer arguments and an integer result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+    // point arguments.
+    if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *SIPrintFFn =
+        M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(SIPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct FPrintFOpt : public LibCallOptimization {
+  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // All the optimizations depend on the format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+      return 0;
+
+    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+    if (CI->getNumArgOperands() == 2) {
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
+          return 0; // We found a format specifier.
+
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      Value *NewCI = EmitFWrite(CI->getArgOperand(1),
+                                ConstantInt::get(TD->getIntPtrType(*Context),
+                                                 FormatStr.size()),
+                                CI->getArgOperand(0), B, TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
+    }
+
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumArgOperands() < 3)
+      return 0;
+
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // fprintf(F, "%c", chr) --> fputc(chr, F)
+      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+      Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
+                               TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+    }
+
+    if (FormatStr[1] == 's') {
+      // fprintf(F, "%s", str) --> fputs(str, F)
+      if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
+        return 0;
+      return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed paramters as pointers and integer result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+    // floating point arguments.
+    if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *FIPrintFFn =
+        M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(FIPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct FWriteOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require a pointer, an integer, an integer, a pointer, returning integer.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        !FT->getParamType(2)->isIntegerTy() ||
+        !FT->getParamType(3)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    // Get the element size and count.
+    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!SizeC || !CountC) return 0;
+    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+
+    // If this is writing zero records, remove the call (it's a noop).
+    if (Bytes == 0)
+      return ConstantInt::get(CI->getType(), 0);
+
+    // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid, if the return value is unused.
+    if (Bytes == 1 && CI->use_empty()) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
+      Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
+      Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+    }
+
+    return 0;
+  }
+};
+
+struct FPutsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // Require two pointers.  Also, we can't optimize if return value is used.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !CI->use_empty())
+      return 0;
+
+    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+    uint64_t Len = GetStringLength(CI->getArgOperand(0));
+    if (!Len) return 0;
+    // Known to have no uses (see above).
+    return EmitFWrite(CI->getArgOperand(0),
+                      ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
+                      CI->getArgOperand(1), B, TD, TLI);
+  }
+};
+
+struct PutsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+        !(FT->getReturnType()->isIntegerTy() ||
+          FT->getReturnType()->isVoidTy()))
+      return 0;
+
+    // Check for a constant string.
+    StringRef Str;
+    if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+      return 0;
+
+    if (Str.empty() && CI->use_empty()) {
+      // puts("") -> putchar('\n')
+      Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    return 0;
+  }
+};
+
 } // End anonymous namespace.
 
 namespace llvm {
@@ -1464,6 +1712,11 @@ class LibCallSimplifierImpl {
 
   // Formatting and IO library call optimizations.
   PrintFOpt PrintF;
+  SPrintFOpt SPrintF;
+  FPrintFOpt FPrintF;
+  FWriteOpt FWrite;
+  FPutsOpt FPuts;
+  PutsOpt Puts;
 
   void initOptimizations();
   void addOpt(LibFunc::Func F, LibCallOptimization* Opt);
@@ -1588,6 +1841,11 @@ void LibCallSimplifierImpl::initOptimizations() {
 
   // Formatting and IO library call optimizations.
   addOpt(LibFunc::printf, &PrintF);
+  addOpt(LibFunc::sprintf, &SPrintF);
+  addOpt(LibFunc::fprintf, &FPrintF);
+  addOpt(LibFunc::fwrite, &FWrite);
+  addOpt(LibFunc::fputs, &FPuts);
+  addOpt(LibFunc::puts, &Puts);
 }
 
 Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index b1cad06dff..8cf62196cc 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -15,12 +15,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/BasicBlock.h"
 #include "llvm/Function.h"
 #include "llvm/Instructions.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/Type.h"
-#include "llvm/ADT/StringExtras.h"
 using namespace llvm;
 
 char UnifyFunctionExitNodes::ID = 0;
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index dacbc7f242..a48229132b 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -16,22 +16,13 @@
 
 #define BBV_NAME "bb-vectorize"
 #define DEBUG_TYPE BBV_NAME
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
@@ -39,14 +30,23 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Intrinsics.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Metadata.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/ValueHandle.h"
-#include "llvm/DataLayout.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TargetTransformInfo.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Type.h"
 #include <algorithm>
 #include <map>
 using namespace llvm;
@@ -2903,6 +2903,7 @@ namespace {
         K->mutateType(getVecTypeForPair(L->getType(), H->getType()));
 
       combineMetadata(K, H);
+      K->intersectOptionalDataWith(H);
 
       for (unsigned o = 0; o < NumOperands; ++o)
         K->setOperand(o, ReplacedOperands[o]);
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 55733f7f8a..feeececedb 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6,432 +6,50 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
-// and generates target-independent LLVM-IR. Legalization of the IR is done
-// in the codegen. However, the vectorizes uses (will use) the codegen
-// interfaces to generate IR that is likely to result in an optimal binary.
-//
-// The loop vectorizer combines consecutive loop iteration into a single
-// 'wide' iteration. After this transformation the index is incremented
-// by the SIMD vector width, and not by one.
-//
-// This pass has three parts:
-// 1. The main loop pass that drives the different parts.
-// 2. LoopVectorizationLegality - A unit that checks for the legality
-//    of the vectorization.
-// 3. SingleBlockLoopVectorizer - A unit that performs the actual
-//    widening of instructions.
-// 4. LoopVectorizationCostModel - A unit that checks for the profitability
-//    of vectorization. It decides on the optimal vector width, which
-//    can be one, if vectorization is not profitable.
-//
-//===----------------------------------------------------------------------===//
-//
-// The reduction-variable vectorization is based on the paper:
-//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
-//
-// Variable uniformity checks are inspired by:
-// Karrenberg, R. and Hack, S. Whole Function Vectorization.
-//
-// Other ideas/concepts are from:
-//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
-//
-//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
-//  Vectorizing Compilers.
-//
-//===----------------------------------------------------------------------===//
-#define LV_NAME "loop-vectorize"
-#define DEBUG_TYPE LV_NAME
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Value.h"
-#include "llvm/Function.h"
-#include "llvm/Analysis/Verifier.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
-#include "llvm/ADT/SmallVector.h"
+#include "LoopVectorize.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/TargetTransformInfo.h"
+#include "llvm/Analysis/Verifier.h"
+#include "llvm/Constants.h"
+#include "llvm/DataLayout.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/LLVMContext.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/DataLayout.h"
+#include "llvm/TargetTransformInfo.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include <algorithm>
-using namespace llvm;
+#include "llvm/Transforms/Vectorize.h"
+#include "llvm/Type.h"
+#include "llvm/Value.h"
 
 static cl::opt<unsigned>
 VectorizationFactor("force-vector-width", cl::init(0), cl::Hidden,
-          cl::desc("Set the default vectorization width. Zero is autoselect."));
-
-/// We don't vectorize loops with a known constant trip count below this number.
-const unsigned TinyTripCountThreshold = 16;
+                    cl::desc("Sets the SIMD width. Zero is autoselect."));
 
-/// When performing a runtime memory check, do not check more than this
-/// number of pointers. Notice that the check is quadratic!
-const unsigned RuntimeMemoryCheckThreshold = 2;
-
-/// This is the highest vector width that we try to generate.
-const unsigned MaxVectorSize = 8;
+static cl::opt<bool>
+EnableIfConversion("enable-if-conversion", cl::init(true), cl::Hidden,
+                   cl::desc("Enable if-conversion during vectorization."));
 
 namespace {
 
-// Forward declarations.
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
-
-/// SingleBlockLoopVectorizer vectorizes loops which contain only one basic
-/// block to a specified vectorization factor (VF).
-/// This class performs the widening of scalars into vectors, or multiple
-/// scalars. This class also implements the following features:
-/// * It inserts an epilogue loop for handling loops that don't have iteration
-///   counts that are known to be a multiple of the vectorization factor.
-/// * It handles the code generation for reduction variables.
-/// * Scalarization (implementation using scalars) of un-vectorizable
-///   instructions.
-/// SingleBlockLoopVectorizer does not perform any vectorization-legality
-/// checks, and relies on the caller to check for the different legality
-/// aspects. The SingleBlockLoopVectorizer relies on the
-/// LoopVectorizationLegality class to provide information about the induction
-/// and reduction variables that were found to a given vectorization factor.
-class SingleBlockLoopVectorizer {
-public:
-  /// Ctor.
-  SingleBlockLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
-                            DominatorTree *Dt, DataLayout *Dl,
-                            LPPassManager *Lpm,
-                            unsigned VecWidth):
-  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), LPM(Lpm), VF(VecWidth),
-  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
-
-  // Perform the actual loop widening (vectorization).
-  void vectorize(LoopVectorizationLegality *Legal) {
-    // Create a new empty loop. Unlink the old loop and connect the new one.
-    createEmptyLoop(Legal);
-    // Widen each instruction in the old loop to a new one in the new loop.
-    // Use the Legality module to find the induction and reduction variables.
-    vectorizeLoop(Legal);
-    // Register the new loop and update the analysis passes.
-    updateAnalysis();
- }
-
-private:
-  /// Add code that checks at runtime if the accessed arrays overlap.
-  /// Returns the comperator value or NULL if no check is needed.
-  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
-                         Instruction *Loc);
-  /// Create an empty loop, based on the loop ranges of the old loop.
-  void createEmptyLoop(LoopVectorizationLegality *Legal);
-  /// Copy and widen the instructions from the old loop.
-  void vectorizeLoop(LoopVectorizationLegality *Legal);
-  /// Insert the new loop to the loop hierarchy and pass manager
-  /// and update the analysis passes.
-  void updateAnalysis();
-
-  /// This instruction is un-vectorizable. Implement it as a sequence
-  /// of scalars.
-  void scalarizeInstruction(Instruction *Instr);
-
-  /// Create a broadcast instruction. This method generates a broadcast
-  /// instruction (shuffle) for loop invariant values and for the induction
-  /// value. If this is the induction variable then we extend it to N, N+1, ...
-  /// this is needed because each iteration in the loop corresponds to a SIMD
-  /// element.
-  Value *getBroadcastInstrs(Value *V);
-
-  /// This is a helper function used by getBroadcastInstrs. It adds 0, 1, 2 ..
-  /// for each element in the vector. Starting from zero.
-  Value *getConsecutiveVector(Value* Val);
-
-  /// When we go over instructions in the basic block we rely on previous
-  /// values within the current basic block or on loop invariant values.
-  /// When we widen (vectorize) values we place them in the map. If the values
-  /// are not within the map, they have to be loop invariant, so we simply
-  /// broadcast them into a vector.
-  Value *getVectorValue(Value *V);
-
-  /// Get a uniform vector of constant integers. We use this to get
-  /// vectors of ones and zeros for the reduction code.
-  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
-
-  typedef DenseMap<Value*, Value*> ValueMap;
-
-  /// The original loop.
-  Loop *OrigLoop;
-  // Scev analysis to use.
-  ScalarEvolution *SE;
-  // Loop Info.
-  LoopInfo *LI;
-  // Dominator Tree.
-  DominatorTree *DT;
-  // Data Layout.
-  DataLayout *DL;
-  // Loop Pass Manager;
-  LPPassManager *LPM;
-  // The vectorization factor to use.
-  unsigned VF;
-
-  // The builder that we use
-  IRBuilder<> Builder;
-
-  // --- Vectorization state ---
-
-  /// The vector-loop preheader.
-  BasicBlock *LoopVectorPreHeader;
-  /// The scalar-loop preheader.
-  BasicBlock *LoopScalarPreHeader;
-  /// Middle Block between the vector and the scalar.
-  BasicBlock *LoopMiddleBlock;
-  ///The ExitBlock of the scalar loop.
-  BasicBlock *LoopExitBlock;
-  ///The vector loop body.
-  BasicBlock *LoopVectorBody;
-  ///The scalar loop body.
-  BasicBlock *LoopScalarBody;
-  ///The first bypass block.
-  BasicBlock *LoopBypassBlock;
-
-  /// The new Induction variable which was added to the new block.
-  PHINode *Induction;
-  /// The induction variable of the old basic block.
-  PHINode *OldInduction;
-  // Maps scalars to widened vectors.
-  ValueMap WidenMap;
-};
-
-/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
-/// to what vectorization factor.
-/// This class does not look at the profitability of vectorization, only the
-/// legality. This class has two main kinds of checks:
-/// * Memory checks - The code in canVectorizeMemory checks if vectorization
-///   will change the order of memory accesses in a way that will change the
-///   correctness of the program.
-/// * Scalars checks - The code in canVectorizeBlock checks for a number
-///   of different conditions, such as the availability of a single induction
-///   variable, that all types are supported and vectorize-able, etc.
-/// This code reflects the capabilities of SingleBlockLoopVectorizer.
-/// This class is also used by SingleBlockLoopVectorizer for identifying
-/// induction variable and the different reduction variables.
-class LoopVectorizationLegality {
-public:
-  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl):
-  TheLoop(Lp), SE(Se), DL(Dl), Induction(0) { }
-
-  /// This represents the kinds of reductions that we support.
-  enum ReductionKind {
-    NoReduction, /// Not a reduction.
-    IntegerAdd,  /// Sum of numbers.
-    IntegerMult, /// Product of numbers.
-    IntegerOr,   /// Bitwise or logical OR of numbers.
-    IntegerAnd,  /// Bitwise or logical AND of numbers.
-    IntegerXor   /// Bitwise or logical XOR of numbers.
-  };
-
-  /// This POD struct holds information about reduction variables.
-  struct ReductionDescriptor {
-    // Default C'tor
-    ReductionDescriptor():
-    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
-
-    // C'tor.
-    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
-    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
-
-    // The starting value of the reduction.
-    // It does not have to be zero!
-    Value *StartValue;
-    // The instruction who's value is used outside the loop.
-    Instruction *LoopExitInstr;
-    // The kind of the reduction.
-    ReductionKind Kind;
-  };
-
-  // This POD struct holds information about the memory runtime legality
-  // check that a group of pointers do not overlap.
-  struct RuntimePointerCheck {
-    RuntimePointerCheck(): Need(false) {}
-
-    /// Reset the state of the pointer runtime information.
-    void reset() {
-      Need = false;
-      Pointers.clear();
-      Starts.clear();
-      Ends.clear();
-    }
-
-    /// Insert a pointer and calculate the start and end SCEVs.
-    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr) {
-      const SCEV *Sc = SE->getSCEV(Ptr);
-      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
-      assert(AR && "Invalid addrec expression");
-      const SCEV *Ex = SE->getExitCount(Lp, Lp->getHeader());
-      const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
-      Pointers.push_back(Ptr);
-      Starts.push_back(AR->getStart());
-      Ends.push_back(ScEnd);
-    }
-
-    /// This flag indicates if we need to add the runtime check.
-    bool Need;
-    /// Holds the pointers that we need to check.
-    SmallVector<Value*, 2> Pointers;
-    /// Holds the pointer value at the beginning of the loop.
-    SmallVector<const SCEV*, 2> Starts;
-    /// Holds the pointer value at the end of the loop.
-    SmallVector<const SCEV*, 2> Ends;
-  };
-
-  /// ReductionList contains the reduction descriptors for all
-  /// of the reductions that were found in the loop.
-  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
-
-  /// InductionList saves induction variables and maps them to the initial
-  /// value entring the loop.
-  typedef DenseMap<PHINode*, Value*> InductionList;
-
-  /// Returns true if it is legal to vectorize this loop.
-  /// This does not mean that it is profitable to vectorize this
-  /// loop, only that it is legal to do so.
-  bool canVectorize();
-
-  /// Returns the Induction variable.
-  PHINode *getInduction() {return Induction;}
-
-  /// Returns the reduction variables found in the loop.
-  ReductionList *getReductionVars() { return &Reductions; }
-
-  /// Returns the induction variables found in the loop.
-  InductionList *getInductionVars() { return &Inductions; }
-
-  /// Check if this  pointer is consecutive when vectorizing. This happens
-  /// when the last index of the GEP is the induction variable, or that the
-  /// pointer itself is an induction variable.
-  /// This check allows us to vectorize A[idx] into a wide load/store.
-  bool isConsecutivePtr(Value *Ptr);
-
-  /// Returns true if the value V is uniform within the loop.
-  bool isUniform(Value *V);
-
-  /// Returns true if this instruction will remain scalar after vectorization.
-  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
-
-  /// Returns the information that we collected about runtime memory check.
-  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
-private:
-  /// Check if a single basic block loop is vectorizable.
-  /// At this point we know that this is a loop with a constant trip count
-  /// and we only need to check individual instructions.
-  bool canVectorizeBlock(BasicBlock &BB);
-
-  /// When we vectorize loops we may change the order in which
-  /// we read and write from memory. This method checks if it is
-  /// legal to vectorize the code, considering only memory constrains.
-  /// Returns true if BB is vectorizable
-  bool canVectorizeMemory(BasicBlock &BB);
-
-  /// Returns True, if 'Phi' is the kind of reduction variable for type
-  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
-  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
-  /// Returns true if the instruction I can be a reduction variable of type
-  /// 'Kind'.
-  bool isReductionInstr(Instruction *I, ReductionKind Kind);
-  /// Returns True, if 'Phi' is an induction variable.
-  bool isInductionVariable(PHINode *Phi);
-  /// Return true if can compute the address bounds of Ptr within the loop.
-  bool hasComputableBounds(Value *Ptr);
-
-  /// The loop that we evaluate.
-  Loop *TheLoop;
-  /// Scev analysis.
-  ScalarEvolution *SE;
-  /// DataLayout analysis.
-  DataLayout *DL;
-
-  //  ---  vectorization state --- //
-
-  /// Holds the integer induction variable. This is the counter of the
-  /// loop.
-  PHINode *Induction;
-  /// Holds the reduction variables.
-  ReductionList Reductions;
-  /// Holds all of the induction variables that we found in the loop.
-  /// Notice that inductions don't need to start at zero and that induction
-  /// variables can be pointers.
-  InductionList Inductions;
-
-  /// Allowed outside users. This holds the reduction
-  /// vars which can be accessed from outside the loop.
-  SmallPtrSet<Value*, 4> AllowedExit;
-  /// This set holds the variables which are known to be uniform after
-  /// vectorization.
-  SmallPtrSet<Instruction*, 4> Uniforms;
-  /// We need to check that all of the pointers in this list are disjoint
-  /// at runtime.
-  RuntimePointerCheck PtrRtCheck;
-};
-
-/// LoopVectorizationCostModel - estimates the expected speedups due to
-/// vectorization.
-/// In many cases vectorization is not profitable. This can happen because
-/// of a number of reasons. In this class we mainly attempt to predict
-/// the expected speedup/slowdowns due to the supported instruction set.
-/// We use the VectorTargetTransformInfo to query the different backends
-/// for the cost of different operations.
-class LoopVectorizationCostModel {
-public:
-  /// C'tor.
-  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
-                             LoopVectorizationLegality *Leg,
-                             const VectorTargetTransformInfo *Vtti):
-  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
-
-  /// Returns the most profitable vectorization factor for the loop that is
-  /// smaller or equal to the VF argument. This method checks every power
-  /// of two up to VF.
-  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
-
-private:
-  /// Returns the expected execution cost. The unit of the cost does
-  /// not matter because we use the 'cost' units to compare different
-  /// vector widths. The cost that is returned is *not* normalized by
-  /// the factor width.
-  unsigned expectedCost(unsigned VF);
-
-  /// Returns the execution time cost of an instruction for a given vector
-  /// width. Vector width of one means scalar.
-  unsigned getInstructionCost(Instruction *I, unsigned VF);
-
-  /// A helper function for converting Scalar types to vector types.
-  /// If the incoming type is void, we return void. If the VF is 1, we return
-  /// the scalar type.
-  static Type* ToVectorTy(Type *Scalar, unsigned VF);
-
-  /// The loop that we evaluate.
-  Loop *TheLoop;
-  /// Scev analysis.
-  ScalarEvolution *SE;
-
-  /// Vectorization legality.
-  LoopVectorizationLegality *Legal;
-  /// Vector target information.
-  const VectorTargetTransformInfo *VTTI;
-};
-
+/// The LoopVectorize Pass.
 struct LoopVectorize : public LoopPass {
   static char ID; // Pass identification, replacement for typeid
 
@@ -460,7 +78,7 @@ struct LoopVectorize : public LoopPass {
           L->getHeader()->getParent()->getName() << "\"\n");
 
     // Check if it is legal to vectorize the loop.
-    LoopVectorizationLegality LVL(L, SE, DL);
+    LoopVectorizationLegality LVL(L, SE, DL, DT);
     if (!LVL.canVectorize()) {
       DEBUG(dbgs() << "LV: Not vectorizing.\n");
       return false;
@@ -491,7 +109,7 @@ struct LoopVectorize : public LoopPass {
           "\n");
 
     // If we decided that it is *legal* to vectorizer the loop then do it.
-    SingleBlockLoopVectorizer LB(L, SE, LI, DT, DL, &LPM, VF);
+    InnerLoopVectorizer LB(L, SE, LI, DT, DL, VF);
     LB.vectorize(&LVL);
 
     DEBUG(verifyFunction(*L->getHeader()->getParent()));
@@ -511,11 +129,44 @@ struct LoopVectorize : public LoopPass {
 
 };
 
-Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
+}// namespace
+
+//===----------------------------------------------------------------------===//
+// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
+// LoopVectorizationCostModel.
+//===----------------------------------------------------------------------===//
+
+void
+LoopVectorizationLegality::RuntimePointerCheck::insert(ScalarEvolution *SE,
+                                                       Loop *Lp, Value *Ptr) {
+  const SCEV *Sc = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+  assert(AR && "Invalid addrec expression");
+  const SCEV *Ex = SE->getExitCount(Lp, Lp->getLoopLatch());
+  const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+  Pointers.push_back(Ptr);
+  Starts.push_back(AR->getStart());
+  Ends.push_back(ScEnd);
+}
+
+Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   // Create the types.
   LLVMContext &C = V->getContext();
   Type *VTy = VectorType::get(V->getType(), VF);
   Type *I32 = IntegerType::getInt32Ty(C);
+
+  // Save the current insertion location.
+  Instruction *Loc = Builder.GetInsertPoint();
+
+  // We need to place the broadcast of invariant variables outside the loop.
+  Instruction *Instr = dyn_cast<Instruction>(V);
+  bool NewInstr = (Instr && Instr->getParent() == LoopVectorBody);
+  bool Invariant = OrigLoop->isLoopInvariant(V) && !NewInstr;
+
+  // Place the code for broadcasting invariant variables in the new preheader.
+  if (Invariant)
+    Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
+
   Constant *Zero = ConstantInt::get(I32, 0);
   Value *Zeros = ConstantAggregateZero::get(VectorType::get(I32, VF));
   Value *UndefVal = UndefValue::get(VTy);
@@ -523,27 +174,28 @@ Value *SingleBlockLoopVectorizer::getBroadcastInstrs(Value *V) {
   Value *SingleElem = Builder.CreateInsertElement(UndefVal, V, Zero);
   // Broadcast the scalar into all locations in the vector.
   Value *Shuf = Builder.CreateShuffleVector(SingleElem, UndefVal, Zeros,
-                                             "broadcast");
-  // We are accessing the induction variable. Make sure to promote the
-  // index for each consecutive SIMD lane. This adds 0,1,2 ... to all lanes.
-  if (V == Induction)
-    return getConsecutiveVector(Shuf);
+                                            "broadcast");
+
+  // Restore the builder insertion point.
+  if (Invariant)
+    Builder.SetInsertPoint(Loc);
+
   return Shuf;
 }
 
-Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
+Value *InnerLoopVectorizer::getConsecutiveVector(Value* Val, bool Negate) {
   assert(Val->getType()->isVectorTy() && "Must be a vector");
   assert(Val->getType()->getScalarType()->isIntegerTy() &&
          "Elem must be an integer");
   // Create the types.
   Type *ITy = Val->getType()->getScalarType();
   VectorType *Ty = cast<VectorType>(Val->getType());
-  unsigned VLen = Ty->getNumElements();
+  int VLen = Ty->getNumElements();
   SmallVector<Constant*, 8> Indices;
 
   // Create a vector of consecutive numbers from zero to VF.
-  for (unsigned i = 0; i < VLen; ++i)
-    Indices.push_back(ConstantInt::get(ITy, i));
+  for (int i = 0; i < VLen; ++i)
+    Indices.push_back(ConstantInt::get(ITy, Negate ? (-i): i ));
 
   // Add the consecutive indices to the vector value.
   Constant *Cv = ConstantVector::get(Indices);
@@ -554,10 +206,13 @@ Value *SingleBlockLoopVectorizer::getConsecutiveVector(Value* Val) {
 bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
   assert(Ptr->getType()->isPointerTy() && "Unexpected non ptr");
 
-  // If this pointer is an induction variable, return it.
+  // If this value is a pointer induction variable we know it is consecutive.
   PHINode *Phi = dyn_cast_or_null<PHINode>(Ptr);
-  if (Phi && getInductionVars()->count(Phi))
-    return true;
+  if (Phi && Inductions.count(Phi)) {
+    InductionInfo II = Inductions[Phi];
+    if (PtrInduction == II.IK)
+      return true;
+  }
 
   GetElementPtrInst *Gep = dyn_cast_or_null<GetElementPtrInst>(Ptr);
   if (!Gep)
@@ -571,7 +226,7 @@ bool LoopVectorizationLegality::isConsecutivePtr(Value *Ptr) {
     if (!SE->isLoopInvariant(SE->getSCEV(Gep->getOperand(i)), TheLoop))
       return false;
 
-  // We can emit wide load/stores only of the last index is the induction
+  // We can emit wide load/stores only if the last index is the induction
   // variable.
   const SCEV *Last = SE->getSCEV(LastIndex);
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Last)) {
@@ -590,7 +245,8 @@ bool LoopVectorizationLegality::isUniform(Value *V) {
   return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
 }
 
-Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
+Value *InnerLoopVectorizer::getVectorValue(Value *V) {
+  assert(V != Induction && "The new induction variable should not be used.");
   assert(!V->getType()->isVectorTy() && "Can't widen a vector");
   // If we saved a vectorized copy of V, use it.
   Value *&MapEntry = WidenMap[V];
@@ -604,11 +260,11 @@ Value *SingleBlockLoopVectorizer::getVectorValue(Value *V) {
 }
 
 Constant*
-SingleBlockLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
+InnerLoopVectorizer::getUniformVector(unsigned Val, Type* ScalarTy) {
   return ConstantVector::getSplat(VF, ConstantInt::get(ScalarTy, Val, true));
 }
 
-void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
   assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
   // Holds vector parameters or scalars, in case of uniform vals.
   SmallVector<Value*, 8> Params;
@@ -619,7 +275,7 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 
     // If we are accessing the old induction variable, use the new one.
     if (SrcOp == OldInduction) {
-      Params.push_back(getVectorValue(Induction));
+      Params.push_back(getVectorValue(SrcOp));
       continue;
     }
 
@@ -679,10 +335,10 @@ void SingleBlockLoopVectorizer::scalarizeInstruction(Instruction *Instr) {
 }
 
 Value*
-SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
-                                           Instruction *Loc) {
+InnerLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
+                                     Instruction *Loc) {
   LoopVectorizationLegality::RuntimePointerCheck *PtrRtCheck =
-    Legal->getRuntimePointerCheck();
+  Legal->getRuntimePointerCheck();
 
   if (!PtrRtCheck->Need)
     return NULL;
@@ -695,22 +351,21 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
   SCEVExpander Exp(*SE, "induction");
 
   // Use this type for pointer arithmetic.
-  Type* PtrArithTy = PtrRtCheck->Pointers[0]->getType();
+  Type* PtrArithTy = Type::getInt8PtrTy(Loc->getContext(), 0);
 
-  for (unsigned i=0; i < NumPointers; ++i) {
+  for (unsigned i = 0; i < NumPointers; ++i) {
     Value *Ptr = PtrRtCheck->Pointers[i];
     const SCEV *Sc = SE->getSCEV(Ptr);
 
     if (SE->isLoopInvariant(Sc, OrigLoop)) {
-      DEBUG(dbgs() << "LV1: Adding RT check for a loop invariant ptr:" <<
+      DEBUG(dbgs() << "LV: Adding RT check for a loop invariant ptr:" <<
             *Ptr <<"\n");
       Starts.push_back(Ptr);
       Ends.push_back(Ptr);
     } else {
       DEBUG(dbgs() << "LV: Adding RT check for range:" << *Ptr <<"\n");
 
-      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i],
-                                       PtrArithTy, Loc);
+      Value *Start = Exp.expandCodeFor(PtrRtCheck->Starts[i], PtrArithTy, Loc);
       Value *End = Exp.expandCodeFor(PtrRtCheck->Ends[i], PtrArithTy, Loc);
       Starts.push_back(Start);
       Ends.push_back(End);
@@ -719,10 +374,16 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
 
   for (unsigned i = 0; i < NumPointers; ++i) {
     for (unsigned j = i+1; j < NumPointers; ++j) {
+      Instruction::CastOps Op = Instruction::BitCast;
+      Value *Start0 = CastInst::Create(Op, Starts[i], PtrArithTy, "bc", Loc);
+      Value *Start1 = CastInst::Create(Op, Starts[j], PtrArithTy, "bc", Loc);
+      Value *End0 =   CastInst::Create(Op, Ends[i],   PtrArithTy, "bc", Loc);
+      Value *End1 =   CastInst::Create(Op, Ends[j],   PtrArithTy, "bc", Loc);
+
       Value *Cmp0 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                    Starts[i], Ends[j], "bound0", Loc);
+                                    Start0, End1, "bound0", Loc);
       Value *Cmp1 = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_ULE,
-                                    Starts[j], Ends[i], "bound1", Loc);
+                                    Start1, End0, "bound1", Loc);
       Value *IsConflict = BinaryOperator::Create(Instruction::And, Cmp0, Cmp1,
                                                  "found.conflict", Loc);
       if (MemoryRuntimeCheck)
@@ -740,32 +401,32 @@ SingleBlockLoopVectorizer::addRuntimeCheck(LoopVectorizationLegality *Legal,
 }
 
 void
-SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   /*
    In this function we generate a new loop. The new loop will contain
    the vectorized instructions while the old loop will continue to run the
    scalar remainder.
 
-    [ ] <-- vector loop bypass.
-  /  |
- /   v
-|   [ ]     <-- vector pre header.
-|    |
-|    v
-|   [  ] \
-|   [  ]_|   <-- vector loop.
-|    |
- \   v
+   [ ] <-- vector loop bypass.
+   /  |
+   /   v
+   |   [ ]     <-- vector pre header.
+   |    |
+   |    v
+   |   [  ] \
+   |   [  ]_|   <-- vector loop.
+   |    |
+   \   v
    >[ ]   <--- middle-block.
-  /  |
- /   v
-|   [ ]     <--- new preheader.
-|    |
-|    v
-|   [ ] \
-|   [ ]_|   <-- old scalar loop to handle remainder.
- \   |
-  \  v
+   /  |
+   /   v
+   |   [ ]     <--- new preheader.
+   |    |
+   |    v
+   |   [ ] \
+   |   [ ]_|   <-- old scalar loop to handle remainder.
+   \   |
+   \  v
    >[ ]     <-- exit block.
    ...
    */
@@ -781,10 +442,10 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // don't have a single induction variable.
   OldInduction = Legal->getInduction();
   Type *IdxTy = OldInduction ? OldInduction->getType() :
-    DL->getIntPtrType(SE->getContext());
+  DL->getIntPtrType(SE->getContext());
 
   // Find the loop boundaries.
-  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getHeader());
+  const SCEV *ExitCount = SE->getExitCount(OrigLoop, OrigLoop->getLoopLatch());
   assert(ExitCount != SE->getCouldNotCompute() && "Invalid loop count");
 
   // Get the total trip count from the count by adding 1.
@@ -803,10 +464,9 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // value from the induction PHI node. If we don't have an induction variable
   // then we know that it starts at zero.
   Value *StartIdx = OldInduction ?
-    OldInduction->getIncomingValueForBlock(BypassBlock):
-    ConstantInt::get(IdxTy, 0);
+  OldInduction->getIncomingValueForBlock(BypassBlock):
+  ConstantInt::get(IdxTy, 0);
 
-  assert(OrigLoop->getNumBlocks() == 1 && "Invalid loop");
   assert(BypassBlock && "Invalid loop structure");
 
   // Generate the code that checks in runtime if arrays overlap.
@@ -815,13 +475,13 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
 
   // Split the single block loop into the two loop structure described above.
   BasicBlock *VectorPH =
-      BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
+  BypassBlock->splitBasicBlock(BypassBlock->getTerminator(), "vector.ph");
   BasicBlock *VecBody =
-    VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
+  VectorPH->splitBasicBlock(VectorPH->getTerminator(), "vector.body");
   BasicBlock *MiddleBlock =
-    VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
+  VecBody->splitBasicBlock(VecBody->getTerminator(), "middle.block");
   BasicBlock *ScalarPH =
-    MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
+  MiddleBlock->splitBasicBlock(MiddleBlock->getTerminator(), "scalar.ph");
 
   // This is the location in which we add all of the logic for bypassing
   // the new vector loop.
@@ -878,8 +538,8 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // PHIs that are left in the scalar version of the loop.
   // The starting values of PHI nodes depend on the counter of the last
   // iteration in the vectorized loop.
-  // If we come from a bypass edge then we need to start from the original start
-  // value.
+  // If we come from a bypass edge then we need to start from the original
+  // start value.
 
   // This variable saves the new starting index for the scalar loop.
   PHINode *ResumeIndex = 0;
@@ -887,27 +547,54 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
   for (I = List->begin(), E = List->end(); I != E; ++I) {
     PHINode *OrigPhi = I->first;
+    LoopVectorizationLegality::InductionInfo II = I->second;
     PHINode *ResumeVal = PHINode::Create(OrigPhi->getType(), 2, "resume.val",
-                                           MiddleBlock->getTerminator());
+                                         MiddleBlock->getTerminator());
     Value *EndValue = 0;
-    if (OrigPhi->getType()->isIntegerTy()) {
+    switch (II.IK) {
+    case LoopVectorizationLegality::NoInduction:
+      llvm_unreachable("Unknown induction");
+    case LoopVectorizationLegality::IntInduction: {
       // Handle the integer induction counter:
+      assert(OrigPhi->getType()->isIntegerTy() && "Invalid type");
       assert(OrigPhi == OldInduction && "Unknown integer PHI");
       // We know what the end value is.
       EndValue = IdxEndRoundDown;
       // We also know which PHI node holds it.
       ResumeIndex = ResumeVal;
-    } else {
+      break;
+    }
+    case LoopVectorizationLegality::ReverseIntInduction: {
+      // Convert the CountRoundDown variable to the PHI size.
+      unsigned CRDSize = CountRoundDown->getType()->getScalarSizeInBits();
+      unsigned IISize = II.StartValue->getType()->getScalarSizeInBits();
+      Value *CRD = CountRoundDown;
+      if (CRDSize > IISize)
+        CRD = CastInst::Create(Instruction::Trunc, CountRoundDown,
+                               II.StartValue->getType(),
+                               "tr.crd", BypassBlock->getTerminator());
+      else if (CRDSize < IISize)
+        CRD = CastInst::Create(Instruction::SExt, CountRoundDown,
+                               II.StartValue->getType(),
+                               "sext.crd", BypassBlock->getTerminator());
+      // Handle reverse integer induction counter:
+      EndValue = BinaryOperator::CreateSub(II.StartValue, CRD, "rev.ind.end",
+                                           BypassBlock->getTerminator());
+      break;
+    }
+    case LoopVectorizationLegality::PtrInduction: {
       // For pointer induction variables, calculate the offset using
       // the end index.
-      EndValue = GetElementPtrInst::Create(I->second, CountRoundDown,
+      EndValue = GetElementPtrInst::Create(II.StartValue, CountRoundDown,
                                            "ptr.ind.end",
                                            BypassBlock->getTerminator());
+      break;
     }
+    }// end of case
 
     // The new PHI merges the original incoming value, in case of a bypass,
     // or the value at the end of the vectorized loop.
-    ResumeVal->addIncoming(I->second, BypassBlock);
+    ResumeVal->addIncoming(II.StartValue, BypassBlock);
     ResumeVal->addIncoming(EndValue, VecBody);
 
     // Fix the scalar body counter (PHI node).
@@ -956,19 +643,22 @@ SingleBlockLoopVectorizer::createEmptyLoop(LoopVectorizationLegality *Legal) {
   // Get ready to start creating new instructions into the vectorized body.
   Builder.SetInsertPoint(VecBody->getFirstInsertionPt());
 
-  // Register the new loop.
+  // Create and register the new vector loop.
   Loop* Lp = new Loop();
-  LPM->insertLoop(Lp, OrigLoop->getParentLoop());
-
-  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
-
   Loop *ParentLoop = OrigLoop->getParentLoop();
+
+  // Insert the new loop into the loop nest and register the new basic blocks.
   if (ParentLoop) {
+    ParentLoop->addChildLoop(Lp);
     ParentLoop->addBasicBlockToLoop(ScalarPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(VectorPH, LI->getBase());
     ParentLoop->addBasicBlockToLoop(MiddleBlock, LI->getBase());
+  } else {
+    LI->addTopLevelLoop(Lp);
   }
 
+  Lp->addBasicBlockToLoop(VecBody, LI->getBase());
+
   // Save the state.
   LoopVectorPreHeader = VectorPH;
   LoopScalarPreHeader = ScalarPH;
@@ -1000,8 +690,37 @@ getReductionIdentity(LoopVectorizationLegality::ReductionKind K) {
   }
 }
 
+static bool
+isTriviallyVectorizableIntrinsic(Instruction *Inst) {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst);
+  if (!II)
+    return false;
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::sqrt:
+  case Intrinsic::sin:
+  case Intrinsic::cos:
+  case Intrinsic::exp:
+  case Intrinsic::exp2:
+  case Intrinsic::log:
+  case Intrinsic::log10:
+  case Intrinsic::log2:
+  case Intrinsic::fabs:
+  case Intrinsic::floor:
+  case Intrinsic::ceil:
+  case Intrinsic::trunc:
+  case Intrinsic::rint:
+  case Intrinsic::nearbyint:
+  case Intrinsic::pow:
+  case Intrinsic::fma:
+    return true;
+  default:
+    return false;
+  }
+  return false;
+}
+
 void
-SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
+InnerLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   //===------------------------------------------------===//
   //
   // Notice: any optimization or new instruction that go
@@ -1009,14 +728,13 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // the cost-model.
   //
   //===------------------------------------------------===//
-  typedef SmallVector<PHINode*, 4> PhiVector;
   BasicBlock &BB = *OrigLoop->getHeader();
-  Constant *Zero = ConstantInt::get(
-    IntegerType::getInt32Ty(BB.getContext()), 0);
+  Constant *Zero =
+  ConstantInt::get(IntegerType::getInt32Ty(BB.getContext()), 0);
 
   // In order to support reduction variables we need to be able to vectorize
   // Phi nodes. Phi nodes have cycles, so we need to vectorize them in two
-  // steages. First, we create a new vector PHI node with no incoming edges.
+  // stages. First, we create a new vector PHI node with no incoming edges.
   // We use this value when we vectorize all of the instructions that use the
   // PHI. Next, after all of the instructions in the block are complete we
   // add the new incoming edges to the PHI. At this point all of the
@@ -1024,245 +742,17 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
   // construct the PHI.
   PhiVector RdxPHIsToFix;
 
-  // For each instruction in the old loop.
-  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-    Instruction *Inst = it;
-
-    switch (Inst->getOpcode()) {
-      case Instruction::Br:
-        // Nothing to do for PHIs and BR, since we already took care of the
-        // loop control flow instructions.
-        continue;
-      case Instruction::PHI:{
-        PHINode* P = cast<PHINode>(Inst);
-        // Handle reduction variables:
-        if (Legal->getReductionVars()->count(P)) {
-          // This is phase one of vectorizing PHIs.
-          Type *VecTy = VectorType::get(Inst->getType(), VF);
-          WidenMap[Inst] = PHINode::Create(VecTy, 2, "vec.phi",
-                                  LoopVectorBody->getFirstInsertionPt());
-          RdxPHIsToFix.push_back(P);
-          continue;
-        }
-
-        // This PHINode must be an induction variable.
-        // Make sure that we know about it.
-        assert(Legal->getInductionVars()->count(P) &&
-               "Not an induction variable");
-
-        if (P->getType()->isIntegerTy()) {
-          assert(P == OldInduction && "Unexpected PHI");
-          WidenMap[Inst] = getBroadcastInstrs(Induction);
-          continue;
-        }
-
-        // Handle pointer inductions.
-        assert(P->getType()->isPointerTy() && "Unexpected type.");
-        Value *StartIdx = OldInduction ?
-          Legal->getInductionVars()->lookup(OldInduction) :
-          ConstantInt::get(Induction->getType(), 0);
-
-        // This is the pointer value coming into the loop.
-        Value *StartPtr = Legal->getInductionVars()->lookup(P);
-
-        // This is the normalized GEP that starts counting at zero.
-        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
-                                                 "normalized.idx");
+  // Scan the loop in a topological order to ensure that defs are vectorized
+  // before users.
+  LoopBlocksDFS DFS(OrigLoop);
+  DFS.perform(LI);
 
-        // This is the vector of results. Notice that we don't generate vector
-        // geps because scalar geps result in better code.
-        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
-        for (unsigned int i = 0; i < VF; ++i) {
-          Constant *Idx = ConstantInt::get(Induction->getType(), i);
-          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx, "gep.idx");
-          Value *SclrGep = Builder.CreateGEP(StartPtr, GlobalIdx, "next.gep");
-          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
-                                               Builder.getInt32(i),
-                                               "insert.gep");
-        }
+  // Vectorize all of the blocks in the original loop.
+  for (LoopBlocksDFS::RPOIterator bb = DFS.beginRPO(),
+       be = DFS.endRPO(); bb != be; ++bb)
+    vectorizeBlockInLoop(Legal, *bb, &RdxPHIsToFix);
 
-        WidenMap[Inst] = VecVal;
-        continue;
-      }
-      case Instruction::Add:
-      case Instruction::FAdd:
-      case Instruction::Sub:
-      case Instruction::FSub:
-      case Instruction::Mul:
-      case Instruction::FMul:
-      case Instruction::UDiv:
-      case Instruction::SDiv:
-      case Instruction::FDiv:
-      case Instruction::URem:
-      case Instruction::SRem:
-      case Instruction::FRem:
-      case Instruction::Shl:
-      case Instruction::LShr:
-      case Instruction::AShr:
-      case Instruction::And:
-      case Instruction::Or:
-      case Instruction::Xor: {
-        // Just widen binops.
-        BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Value *B = getVectorValue(Inst->getOperand(1));
-
-        // Use this vector value for all users of the original instruction.
-        Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
-        WidenMap[Inst] = V;
-
-        // Update the NSW, NUW and Exact flags.
-        BinaryOperator *VecOp = cast<BinaryOperator>(V);
-        if (isa<OverflowingBinaryOperator>(BinOp)) {
-          VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
-          VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
-        }
-        if (isa<PossiblyExactOperator>(VecOp))
-          VecOp->setIsExact(BinOp->isExact());
-        break;
-      }
-      case Instruction::Select: {
-        // Widen selects.
-        // If the selector is loop invariant we can create a select
-        // instruction with a scalar condition. Otherwise, use vector-select.
-        Value *Cond = Inst->getOperand(0);
-        bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
-
-        // The condition can be loop invariant  but still defined inside the
-        // loop. This means that we can't just use the original 'cond' value.
-        // We have to take the 'vectorized' value and pick the first lane.
-        // Instcombine will make this a no-op.
-        Cond = getVectorValue(Cond);
-        if (InvariantCond)
-          Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
-
-        Value *Op0 = getVectorValue(Inst->getOperand(1));
-        Value *Op1 = getVectorValue(Inst->getOperand(2));
-        WidenMap[Inst] = Builder.CreateSelect(Cond, Op0, Op1);
-        break;
-      }
-
-      case Instruction::ICmp:
-      case Instruction::FCmp: {
-        // Widen compares. Generate vector compares.
-        bool FCmp = (Inst->getOpcode() == Instruction::FCmp);
-        CmpInst *Cmp = dyn_cast<CmpInst>(Inst);
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Value *B = getVectorValue(Inst->getOperand(1));
-        if (FCmp)
-          WidenMap[Inst] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
-        else
-          WidenMap[Inst] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
-        break;
-      }
-
-      case Instruction::Store: {
-        // Attempt to issue a wide store.
-        StoreInst *SI = dyn_cast<StoreInst>(Inst);
-        Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
-        Value *Ptr = SI->getPointerOperand();
-        unsigned Alignment = SI->getAlignment();
-
-        assert(!Legal->isUniform(Ptr) &&
-               "We do not allow storing to uniform addresses");
-
-        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
-        // This store does not use GEPs.
-        if (!Legal->isConsecutivePtr(Ptr)) {
-          scalarizeInstruction(Inst);
-          break;
-        }
-
-        if (Gep) {
-          // The last index does not have to be the induction. It can be
-          // consecutive and be a function of the index. For example A[I+1];
-          unsigned NumOperands = Gep->getNumOperands();
-          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
-          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
-          // Create the new GEP with the new induction variable.
-          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-          Gep2->setOperand(NumOperands - 1, LastIndex);
-          Ptr = Builder.Insert(Gep2);
-        } else {
-          // Use the induction element ptr.
-          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
-          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
-        }
-        Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
-        Value *Val = getVectorValue(SI->getValueOperand());
-        Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
-        break;
-      }
-      case Instruction::Load: {
-        // Attempt to issue a wide load.
-        LoadInst *LI = dyn_cast<LoadInst>(Inst);
-        Type *RetTy = VectorType::get(LI->getType(), VF);
-        Value *Ptr = LI->getPointerOperand();
-        unsigned Alignment = LI->getAlignment();
-        GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
-
-        // If the pointer is loop invariant or if it is non consecutive,
-        // scalarize the load.
-        bool Con = Legal->isConsecutivePtr(Ptr);
-        if (Legal->isUniform(Ptr) || !Con) {
-          scalarizeInstruction(Inst);
-          break;
-        }
-
-        if (Gep) {
-          // The last index does not have to be the induction. It can be
-          // consecutive and be a function of the index. For example A[I+1];
-          unsigned NumOperands = Gep->getNumOperands();
-          Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
-          LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
-
-          // Create the new GEP with the new induction variable.
-          GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-          Gep2->setOperand(NumOperands - 1, LastIndex);
-          Ptr = Builder.Insert(Gep2);
-        } else {
-          // Use the induction element ptr.
-          assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
-          Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
-        }
-
-        Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
-        LI = Builder.CreateLoad(Ptr);
-        LI->setAlignment(Alignment);
-        // Use this vector value for all users of the load.
-        WidenMap[Inst] = LI;
-        break;
-      }
-      case Instruction::ZExt:
-      case Instruction::SExt:
-      case Instruction::FPToUI:
-      case Instruction::FPToSI:
-      case Instruction::FPExt:
-      case Instruction::PtrToInt:
-      case Instruction::IntToPtr:
-      case Instruction::SIToFP:
-      case Instruction::UIToFP:
-      case Instruction::Trunc:
-      case Instruction::FPTrunc:
-      case Instruction::BitCast: {
-        /// Vectorize bitcasts.
-        CastInst *CI = dyn_cast<CastInst>(Inst);
-        Value *A = getVectorValue(Inst->getOperand(0));
-        Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
-        WidenMap[Inst] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
-        break;
-      }
-
-      default:
-        /// All other instructions are unsupported. Scalarize them.
-        scalarizeInstruction(Inst);
-        break;
-    }// end of switch.
-  }// end of for_each instr.
-
-  // At this point every instruction in the original loop is widended to
+  // At this point every instruction in the original loop is widened to
   // a vector form. We are almost done. Now, we need to fix the PHI nodes
   // that we vectorized. The PHI nodes are currently empty because we did
   // not want to introduce cycles. Notice that the remaining PHI nodes
@@ -1281,7 +771,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     assert(Legal->getReductionVars()->count(RdxPhi) &&
            "Unable to find the reduction variable");
     LoopVectorizationLegality::ReductionDescriptor RdxDesc =
-      (*Legal->getReductionVars())[RdxPhi];
+    (*Legal->getReductionVars())[RdxPhi];
 
     // We need to generate a reduction vector from the incoming scalar.
     // To do so, we need to generate the 'identity' vector and overide
@@ -1301,7 +791,7 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // This vector is the Identity vector where the first element is the
     // incoming scalar reduction.
     Value *VectorStart = Builder.CreateInsertElement(Identity,
-                                                    RdxDesc.StartValue, Zero);
+                                                     RdxDesc.StartValue, Zero);
 
     // Fix the vector-loop phi.
     // We created the induction variable so we know that the
@@ -1311,8 +801,8 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
     // Reductions do not have to start at zero. They can start with
     // any loop invariant values.
     VecRdxPhi->addIncoming(VectorStart, VecPreheader);
-    unsigned SelfEdgeIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
-    Value *Val = getVectorValue(RdxPhi->getIncomingValue(SelfEdgeIdx));
+    Value *Val =
+    getVectorValue(RdxPhi->getIncomingValueForBlock(OrigLoop->getLoopLatch()));
     VecRdxPhi->addIncoming(Val, LoopVectorBody);
 
     // Before each round, move the insertion point right between
@@ -1329,29 +819,29 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Extract the first scalar.
     Value *Scalar0 =
-      Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
+    Builder.CreateExtractElement(NewPhi, Builder.getInt32(0));
     // Extract and reduce the remaining vector elements.
     for (unsigned i=1; i < VF; ++i) {
       Value *Scalar1 =
-        Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
+      Builder.CreateExtractElement(NewPhi, Builder.getInt32(i));
       switch (RdxDesc.Kind) {
-        case LoopVectorizationLegality::IntegerAdd:
-          Scalar0 = Builder.CreateAdd(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerMult:
-          Scalar0 = Builder.CreateMul(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerOr:
-          Scalar0 = Builder.CreateOr(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerAnd:
-          Scalar0 = Builder.CreateAnd(Scalar0, Scalar1);
-          break;
-        case LoopVectorizationLegality::IntegerXor:
-          Scalar0 = Builder.CreateXor(Scalar0, Scalar1);
-          break;
-        default:
-          llvm_unreachable("Unknown reduction operation");
+      case LoopVectorizationLegality::IntegerAdd:
+        Scalar0 = Builder.CreateAdd(Scalar0, Scalar1, "add.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerMult:
+        Scalar0 = Builder.CreateMul(Scalar0, Scalar1, "mul.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerOr:
+        Scalar0 = Builder.CreateOr(Scalar0, Scalar1, "or.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerAnd:
+        Scalar0 = Builder.CreateAnd(Scalar0, Scalar1, "and.rdx");
+        break;
+      case LoopVectorizationLegality::IntegerXor:
+        Scalar0 = Builder.CreateXor(Scalar0, Scalar1, "xor.rdx");
+        break;
+      default:
+        llvm_unreachable("Unknown reduction operation");
       }
     }
 
@@ -1379,15 +869,373 @@ SingleBlockLoopVectorizer::vectorizeLoop(LoopVectorizationLegality *Legal) {
 
     // Fix the scalar loop reduction variable with the incoming reduction sum
     // from the vector body and from the backedge value.
-    int IncomingEdgeBlockIdx = (RdxPhi)->getBasicBlockIndex(LoopScalarBody);
-    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1); // The other block.
+    int IncomingEdgeBlockIdx =
+    (RdxPhi)->getBasicBlockIndex(OrigLoop->getLoopLatch());
+    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+    // Pick the other block.
+    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
     (RdxPhi)->setIncomingValue(SelfEdgeBlockIdx, Scalar0);
     (RdxPhi)->setIncomingValue(IncomingEdgeBlockIdx, RdxDesc.LoopExitInstr);
   }// end of for each redux variable.
 }
 
-void SingleBlockLoopVectorizer::updateAnalysis() {
-  // The original basic block.
+Value *InnerLoopVectorizer::createEdgeMask(BasicBlock *Src, BasicBlock *Dst) {
+  assert(std::find(pred_begin(Dst), pred_end(Dst), Src) != pred_end(Dst) &&
+         "Invalid edge");
+
+  Value *SrcMask = createBlockInMask(Src);
+
+  // The terminator has to be a branch inst!
+  BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
+  assert(BI && "Unexpected terminator found");
+
+  Value *EdgeMask = SrcMask;
+  if (BI->isConditional()) {
+    EdgeMask = getVectorValue(BI->getCondition());
+    if (BI->getSuccessor(0) != Dst)
+      EdgeMask = Builder.CreateNot(EdgeMask);
+  }
+
+  return Builder.CreateAnd(EdgeMask, SrcMask);
+}
+
+Value *InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
+  assert(OrigLoop->contains(BB) && "Block is not a part of a loop");
+
+  // Loop incoming mask is all-one.
+  if (OrigLoop->getHeader() == BB) {
+    Value *C = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 1);
+    return getVectorValue(C);
+  }
+
+  // This is the block mask. We OR all incoming edges, and with zero.
+  Value *Zero = ConstantInt::get(IntegerType::getInt1Ty(BB->getContext()), 0);
+  Value *BlockMask = getVectorValue(Zero);
+
+  // For each pred:
+  for (pred_iterator it = pred_begin(BB), e = pred_end(BB); it != e; ++it)
+    BlockMask = Builder.CreateOr(BlockMask, createEdgeMask(*it, BB));
+
+  return BlockMask;
+}
+
+void
+InnerLoopVectorizer::vectorizeBlockInLoop(LoopVectorizationLegality *Legal,
+                                          BasicBlock *BB, PhiVector *PV) {
+  Constant *Zero =
+  ConstantInt::get(IntegerType::getInt32Ty(BB->getContext()), 0);
+
+  // For each instruction in the old loop.
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    switch (it->getOpcode()) {
+    case Instruction::Br:
+      // Nothing to do for PHIs and BR, since we already took care of the
+      // loop control flow instructions.
+      continue;
+    case Instruction::PHI:{
+      PHINode* P = cast<PHINode>(it);
+      // Handle reduction variables:
+      if (Legal->getReductionVars()->count(P)) {
+        // This is phase one of vectorizing PHIs.
+        Type *VecTy = VectorType::get(it->getType(), VF);
+        WidenMap[it] =
+          PHINode::Create(VecTy, 2, "vec.phi",
+                          LoopVectorBody->getFirstInsertionPt());
+        PV->push_back(P);
+        continue;
+      }
+
+      // Check for PHI nodes that are lowered to vector selects.
+      if (P->getParent() != OrigLoop->getHeader()) {
+        // We know that all PHIs in non header blocks are converted into
+        // selects, so we don't have to worry about the insertion order and we
+        // can just use the builder.
+
+        // At this point we generate the predication tree. There may be
+        // duplications since this is a simple recursive scan, but future
+        // optimizations will clean it up.
+        Value *Cond = createBlockInMask(P->getIncomingBlock(0));
+        WidenMap[P] =
+          Builder.CreateSelect(Cond,
+                               getVectorValue(P->getIncomingValue(0)),
+                               getVectorValue(P->getIncomingValue(1)),
+                               "predphi");
+        continue;
+      }
+
+      // This PHINode must be an induction variable.
+      // Make sure that we know about it.
+      assert(Legal->getInductionVars()->count(P) &&
+             "Not an induction variable");
+
+      LoopVectorizationLegality::InductionInfo II =
+        Legal->getInductionVars()->lookup(P);
+
+      switch (II.IK) {
+      case LoopVectorizationLegality::NoInduction:
+        llvm_unreachable("Unknown induction");
+      case LoopVectorizationLegality::IntInduction: {
+        assert(P == OldInduction && "Unexpected PHI");
+        Value *Broadcasted = getBroadcastInstrs(Induction);
+        // After broadcasting the induction variable we need to make the
+        // vector consecutive by adding 0, 1, 2 ...
+        Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted);
+        WidenMap[OldInduction] = ConsecutiveInduction;
+        continue;
+      }
+      case LoopVectorizationLegality::ReverseIntInduction:
+      case LoopVectorizationLegality::PtrInduction:
+        // Handle reverse integer and pointer inductions.
+        Value *StartIdx = 0;
+        // If we have a single integer induction variable then use it.
+        // Otherwise, start counting at zero.
+        if (OldInduction) {
+          LoopVectorizationLegality::InductionInfo OldII =
+            Legal->getInductionVars()->lookup(OldInduction);
+          StartIdx = OldII.StartValue;
+        } else {
+          StartIdx = ConstantInt::get(Induction->getType(), 0);
+        }
+        // This is the normalized GEP that starts counting at zero.
+        Value *NormalizedIdx = Builder.CreateSub(Induction, StartIdx,
+                                                 "normalized.idx");
+
+        // Handle the reverse integer induction variable case.
+        if (LoopVectorizationLegality::ReverseIntInduction == II.IK) {
+          IntegerType *DstTy = cast<IntegerType>(II.StartValue->getType());
+          Value *CNI = Builder.CreateSExtOrTrunc(NormalizedIdx, DstTy,
+                                                 "resize.norm.idx");
+          Value *ReverseInd  = Builder.CreateSub(II.StartValue, CNI,
+                                                 "reverse.idx");
+
+          // This is a new value so do not hoist it out.
+          Value *Broadcasted = getBroadcastInstrs(ReverseInd);
+          // After broadcasting the induction variable we need to make the
+          // vector consecutive by adding  ... -3, -2, -1, 0.
+          Value *ConsecutiveInduction = getConsecutiveVector(Broadcasted,
+                                                             true);
+          WidenMap[it] = ConsecutiveInduction;
+          continue;
+        }
+
+        // Handle the pointer induction variable case.
+        assert(P->getType()->isPointerTy() && "Unexpected type.");
+
+        // This is the vector of results. Notice that we don't generate
+        // vector geps because scalar geps result in better code.
+        Value *VecVal = UndefValue::get(VectorType::get(P->getType(), VF));
+        for (unsigned int i = 0; i < VF; ++i) {
+          Constant *Idx = ConstantInt::get(Induction->getType(), i);
+          Value *GlobalIdx = Builder.CreateAdd(NormalizedIdx, Idx,
+                                               "gep.idx");
+          Value *SclrGep = Builder.CreateGEP(II.StartValue, GlobalIdx,
+                                             "next.gep");
+          VecVal = Builder.CreateInsertElement(VecVal, SclrGep,
+                                               Builder.getInt32(i),
+                                               "insert.gep");
+        }
+
+        WidenMap[it] = VecVal;
+        continue;
+      }
+
+    }// End of PHI.
+
+    case Instruction::Add:
+    case Instruction::FAdd:
+    case Instruction::Sub:
+    case Instruction::FSub:
+    case Instruction::Mul:
+    case Instruction::FMul:
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::FDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+    case Instruction::FRem:
+    case Instruction::Shl:
+    case Instruction::LShr:
+    case Instruction::AShr:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor: {
+      // Just widen binops.
+      BinaryOperator *BinOp = dyn_cast<BinaryOperator>(it);
+      Value *A = getVectorValue(it->getOperand(0));
+      Value *B = getVectorValue(it->getOperand(1));
+
+      // Use this vector value for all users of the original instruction.
+      Value *V = Builder.CreateBinOp(BinOp->getOpcode(), A, B);
+      WidenMap[it] = V;
+
+      // Update the NSW, NUW and Exact flags.
+      BinaryOperator *VecOp = cast<BinaryOperator>(V);
+      if (isa<OverflowingBinaryOperator>(BinOp)) {
+        VecOp->setHasNoSignedWrap(BinOp->hasNoSignedWrap());
+        VecOp->setHasNoUnsignedWrap(BinOp->hasNoUnsignedWrap());
+      }
+      if (isa<PossiblyExactOperator>(VecOp))
+        VecOp->setIsExact(BinOp->isExact());
+      break;
+    }
+    case Instruction::Select: {
+      // Widen selects.
+      // If the selector is loop invariant we can create a select
+      // instruction with a scalar condition. Otherwise, use vector-select.
+      Value *Cond = it->getOperand(0);
+      bool InvariantCond = SE->isLoopInvariant(SE->getSCEV(Cond), OrigLoop);
+
+      // The condition can be loop invariant  but still defined inside the
+      // loop. This means that we can't just use the original 'cond' value.
+      // We have to take the 'vectorized' value and pick the first lane.
+      // Instcombine will make this a no-op.
+      Cond = getVectorValue(Cond);
+      if (InvariantCond)
+        Cond = Builder.CreateExtractElement(Cond, Builder.getInt32(0));
+
+      Value *Op0 = getVectorValue(it->getOperand(1));
+      Value *Op1 = getVectorValue(it->getOperand(2));
+      WidenMap[it] = Builder.CreateSelect(Cond, Op0, Op1);
+      break;
+    }
+
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      // Widen compares. Generate vector compares.
+      bool FCmp = (it->getOpcode() == Instruction::FCmp);
+      CmpInst *Cmp = dyn_cast<CmpInst>(it);
+      Value *A = getVectorValue(it->getOperand(0));
+      Value *B = getVectorValue(it->getOperand(1));
+      if (FCmp)
+        WidenMap[it] = Builder.CreateFCmp(Cmp->getPredicate(), A, B);
+      else
+        WidenMap[it] = Builder.CreateICmp(Cmp->getPredicate(), A, B);
+      break;
+    }
+
+    case Instruction::Store: {
+      // Attempt to issue a wide store.
+      StoreInst *SI = dyn_cast<StoreInst>(it);
+      Type *StTy = VectorType::get(SI->getValueOperand()->getType(), VF);
+      Value *Ptr = SI->getPointerOperand();
+      unsigned Alignment = SI->getAlignment();
+
+      assert(!Legal->isUniform(Ptr) &&
+             "We do not allow storing to uniform addresses");
+
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+      // This store does not use GEPs.
+      if (!Legal->isConsecutivePtr(Ptr)) {
+        scalarizeInstruction(it);
+        break;
+      }
+
+      if (Gep) {
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands - 1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+        // Create the new GEP with the new induction variable.
+        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+        Gep2->setOperand(NumOperands - 1, LastIndex);
+        Ptr = Builder.Insert(Gep2);
+      } else {
+        // Use the induction element ptr.
+        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+      }
+      Ptr = Builder.CreateBitCast(Ptr, StTy->getPointerTo());
+      Value *Val = getVectorValue(SI->getValueOperand());
+      Builder.CreateStore(Val, Ptr)->setAlignment(Alignment);
+      break;
+    }
+    case Instruction::Load: {
+      // Attempt to issue a wide load.
+      LoadInst *LI = dyn_cast<LoadInst>(it);
+      Type *RetTy = VectorType::get(LI->getType(), VF);
+      Value *Ptr = LI->getPointerOperand();
+      unsigned Alignment = LI->getAlignment();
+      GetElementPtrInst *Gep = dyn_cast<GetElementPtrInst>(Ptr);
+
+      // If the pointer is loop invariant or if it is non consecutive,
+      // scalarize the load.
+      bool Con = Legal->isConsecutivePtr(Ptr);
+      if (Legal->isUniform(Ptr) || !Con) {
+        scalarizeInstruction(it);
+        break;
+      }
+
+      if (Gep) {
+        // The last index does not have to be the induction. It can be
+        // consecutive and be a function of the index. For example A[I+1];
+        unsigned NumOperands = Gep->getNumOperands();
+        Value *LastIndex = getVectorValue(Gep->getOperand(NumOperands -1));
+        LastIndex = Builder.CreateExtractElement(LastIndex, Zero);
+
+        // Create the new GEP with the new induction variable.
+        GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
+        Gep2->setOperand(NumOperands - 1, LastIndex);
+        Ptr = Builder.Insert(Gep2);
+      } else {
+        // Use the induction element ptr.
+        assert(isa<PHINode>(Ptr) && "Invalid induction ptr");
+        Ptr = Builder.CreateExtractElement(getVectorValue(Ptr), Zero);
+      }
+
+      Ptr = Builder.CreateBitCast(Ptr, RetTy->getPointerTo());
+      LI = Builder.CreateLoad(Ptr);
+      LI->setAlignment(Alignment);
+      // Use this vector value for all users of the load.
+      WidenMap[it] = LI;
+      break;
+    }
+    case Instruction::ZExt:
+    case Instruction::SExt:
+    case Instruction::FPToUI:
+    case Instruction::FPToSI:
+    case Instruction::FPExt:
+    case Instruction::PtrToInt:
+    case Instruction::IntToPtr:
+    case Instruction::SIToFP:
+    case Instruction::UIToFP:
+    case Instruction::Trunc:
+    case Instruction::FPTrunc:
+    case Instruction::BitCast: {
+      /// Vectorize bitcasts.
+      CastInst *CI = dyn_cast<CastInst>(it);
+      Value *A = getVectorValue(it->getOperand(0));
+      Type *DestTy = VectorType::get(CI->getType()->getScalarType(), VF);
+      WidenMap[it] = Builder.CreateCast(CI->getOpcode(), A, DestTy);
+      break;
+    }
+
+    case Instruction::Call: {
+      assert(isTriviallyVectorizableIntrinsic(it));
+      Module *M = BB->getParent()->getParent();
+      IntrinsicInst *II = cast<IntrinsicInst>(it);
+      Intrinsic::ID ID = II->getIntrinsicID();
+      SmallVector<Value*, 4> Args;
+      for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
+        Args.push_back(getVectorValue(II->getArgOperand(i)));
+      Type *Tys[] = { VectorType::get(II->getType()->getScalarType(), VF) };
+      Function *F = Intrinsic::getDeclaration(M, ID, Tys);
+      WidenMap[it] = Builder.CreateCall(F, Args);
+      break;
+    }
+
+    default:
+      // All other instructions are unsupported. Scalarize them.
+      scalarizeInstruction(it);
+      break;
+    }// end of switch.
+  }// end of for_each instr.
+}
+
+void InnerLoopVectorizer::updateAnalysis() {
+  // Forget the original basic block.
   SE->forgetLoop(OrigLoop);
 
   // Update the dominator tree information.
@@ -1404,45 +1252,90 @@ void SingleBlockLoopVectorizer::updateAnalysis() {
   DEBUG(DT->verifyAnalysis());
 }
 
-bool LoopVectorizationLegality::canVectorize() {
-  if (!TheLoop->getLoopPreheader()) {
-    assert(false && "No preheader!!");
-    DEBUG(dbgs() << "LV: Loop not normalized." << "\n");
+bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
+  if (!EnableIfConversion)
     return false;
+
+  assert(TheLoop->getNumBlocks() > 1 && "Single block loops are vectorizable");
+  std::vector<BasicBlock*> &LoopBlocks = TheLoop->getBlocksVector();
+
+  // Collect the blocks that need predication.
+  for (unsigned i = 0, e = LoopBlocks.size(); i < e; ++i) {
+    BasicBlock *BB = LoopBlocks[i];
+
+    // We must have at most two predecessors because we need to convert
+    // all PHIs to selects.
+    unsigned Preds = std::distance(pred_begin(BB), pred_end(BB));
+    if (Preds > 2)
+      return false;
+
+    // We must be able to predicate all blocks that need to be predicated.
+    if (blockNeedsPredication(BB) && !blockCanBePredicated(BB))
+      return false;
   }
 
-  // We can only vectorize single basic block loops.
+  // We can if-convert this loop.
+  return true;
+}
+
+bool LoopVectorizationLegality::canVectorize() {
+  assert(TheLoop->getLoopPreheader() && "No preheader!!");
+
+  // We can only vectorize innermost loops.
+  if (TheLoop->getSubLoopsVector().size())
+    return false;
+
+  // We must have a single backedge.
+  if (TheLoop->getNumBackEdges() != 1)
+    return false;
+
+  // We must have a single exiting block.
+  if (!TheLoop->getExitingBlock())
+    return false;
+
   unsigned NumBlocks = TheLoop->getNumBlocks();
-  if (NumBlocks != 1) {
-    DEBUG(dbgs() << "LV: Too many blocks:" << NumBlocks << "\n");
+
+  // Check if we can if-convert non single-bb loops.
+  if (NumBlocks != 1 && !canVectorizeWithIfConvert()) {
+    DEBUG(dbgs() << "LV: Can't if-convert the loop.\n");
     return false;
   }
 
   // We need to have a loop header.
-  BasicBlock *BB = TheLoop->getHeader();
-  DEBUG(dbgs() << "LV: Found a loop: " << BB->getName() << "\n");
+  BasicBlock *Latch = TheLoop->getLoopLatch();
+  DEBUG(dbgs() << "LV: Found a loop: " <<
+        TheLoop->getHeader()->getName() << "\n");
 
   // ScalarEvolution needs to be able to find the exit count.
-  const SCEV *ExitCount = SE->getExitCount(TheLoop, BB);
+  const SCEV *ExitCount = SE->getExitCount(TheLoop, Latch);
   if (ExitCount == SE->getCouldNotCompute()) {
     DEBUG(dbgs() << "LV: SCEV could not compute the loop exit count.\n");
     return false;
   }
 
   // Do not loop-vectorize loops with a tiny trip count.
-  unsigned TC = SE->getSmallConstantTripCount(TheLoop, BB);
+  unsigned TC = SE->getSmallConstantTripCount(TheLoop, Latch);
   if (TC > 0u && TC < TinyTripCountThreshold) {
     DEBUG(dbgs() << "LV: Found a loop with a very small trip count. " <<
           "This loop is not worth vectorizing.\n");
     return false;
   }
 
+  // Check if we can vectorize the instructions and CFG in this loop.
+  if (!canVectorizeInstrs()) {
+    DEBUG(dbgs() << "LV: Can't vectorize the instructions or CFG\n");
+    return false;
+  }
+
   // Go over each instruction and look at memory deps.
-  if (!canVectorizeBlock(*BB)) {
-    DEBUG(dbgs() << "LV: Can't vectorize this loop header\n");
+  if (!canVectorizeMemory()) {
+    DEBUG(dbgs() << "LV: Can't vectorize due to memory conflicts\n");
     return false;
   }
 
+  // Collect all of the variables that remain uniform after vectorization.
+  collectLoopUniforms();
+
   DEBUG(dbgs() << "LV: We can vectorize this loop" <<
         (PtrRtCheck.Need ? " (with a runtime bound check)" : "")
         <<"!\n");
@@ -1453,137 +1346,152 @@ bool LoopVectorizationLegality::canVectorize() {
   return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeBlock(BasicBlock &BB) {
-
+bool LoopVectorizationLegality::canVectorizeInstrs() {
   BasicBlock *PreHeader = TheLoop->getLoopPreheader();
+  BasicBlock *Header = TheLoop->getHeader();
 
-  // Scan the instructions in the block and look for hazards.
-  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-    Instruction *I = it;
-
-    if (PHINode *Phi = dyn_cast<PHINode>(I)) {
-      // This should not happen because the loop should be normalized.
-      if (Phi->getNumIncomingValues() != 2) {
-        DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
-        return false;
-      }
+  // For each block in the loop.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
 
-      // This is the value coming from the preheader.
-      Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+    // Scan the instructions in the block and look for hazards.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
 
-      // We only look at integer and pointer phi nodes.
-      if (Phi->getType()->isPointerTy() && isInductionVariable(Phi)) {
-        DEBUG(dbgs() << "LV: Found a pointer induction variable.\n");
-        Inductions[Phi] = StartValue;
-        continue;
-      } else if (!Phi->getType()->isIntegerTy()) {
-        DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
-        return false;
-      }
+      if (PHINode *Phi = dyn_cast<PHINode>(it)) {
+        // This should not happen because the loop should be normalized.
+        if (Phi->getNumIncomingValues() != 2) {
+          DEBUG(dbgs() << "LV: Found an invalid PHI.\n");
+          return false;
+        }
 
-      // Handle integer PHIs:
-      if (isInductionVariable(Phi)) {
-        if (Induction) {
-          DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+        // Check that this PHI type is allowed.
+        if (!Phi->getType()->isIntegerTy() &&
+            !Phi->getType()->isPointerTy()) {
+          DEBUG(dbgs() << "LV: Found an non-int non-pointer PHI.\n");
           return false;
         }
-        DEBUG(dbgs() << "LV: Found the induction PHI."<< *Phi <<"\n");
-        Induction = Phi;
-        Inductions[Phi] = StartValue;
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerAdd)) {
-        DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerMult)) {
-        DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerOr)) {
-        DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerAnd)) {
-        DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
-      if (AddReductionVar(Phi, IntegerXor)) {
-        DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
-        continue;
-      }
 
-      DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
-      return false;
-    }// end of PHI handling
+        // If this PHINode is not in the header block, then we know that we
+        // can convert it to select during if-conversion. No need to check if
+        // the PHIs in this block are induction or reduction variables.
+        if (*bb != Header)
+          continue;
 
-    // We still don't handle functions.
-    CallInst *CI = dyn_cast<CallInst>(I);
-    if (CI) {
-      DEBUG(dbgs() << "LV: Found a call site.\n");
-      return false;
-    }
+        // This is the value coming from the preheader.
+        Value *StartValue = Phi->getIncomingValueForBlock(PreHeader);
+        // Check if this is an induction variable.
+        InductionKind IK = isInductionVariable(Phi);
+
+        if (NoInduction != IK) {
+          // Int inductions are special because we only allow one IV.
+          if (IK == IntInduction) {
+            if (Induction) {
+              DEBUG(dbgs() << "LV: Found too many inductions."<< *Phi <<"\n");
+              return false;
+            }
+            Induction = Phi;
+          }
+
+          DEBUG(dbgs() << "LV: Found an induction variable.\n");
+          Inductions[Phi] = InductionInfo(StartValue, IK);
+          continue;
+        }
 
-    // We do not re-vectorize vectors.
-    if (!VectorType::isValidElementType(I->getType()) &&
-        !I->getType()->isVoidTy()) {
-      DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
-      return false;
-    }
+        if (AddReductionVar(Phi, IntegerAdd)) {
+          DEBUG(dbgs() << "LV: Found an ADD reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerMult)) {
+          DEBUG(dbgs() << "LV: Found a MUL reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerOr)) {
+          DEBUG(dbgs() << "LV: Found an OR reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerAnd)) {
+          DEBUG(dbgs() << "LV: Found an AND reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
+        if (AddReductionVar(Phi, IntegerXor)) {
+          DEBUG(dbgs() << "LV: Found a XOR reduction PHI."<< *Phi <<"\n");
+          continue;
+        }
 
-    // Reduction instructions are allowed to have exit users.
-    // All other instructions must not have external users.
-    if (!AllowedExit.count(I))
-      //Check that all of the users of the loop are inside the BB.
-      for (Value::use_iterator it = I->use_begin(), e = I->use_end();
-           it != e; ++it) {
-        Instruction *U = cast<Instruction>(*it);
-        // This user may be a reduction exit value.
-        BasicBlock *Parent = U->getParent();
-        if (Parent != &BB) {
-          DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
-          return false;
+        DEBUG(dbgs() << "LV: Found an unidentified PHI."<< *Phi <<"\n");
+        return false;
+      }// end of PHI handling
+
+      // We still don't handle functions.
+      CallInst *CI = dyn_cast<CallInst>(it);
+      if (CI && !isTriviallyVectorizableIntrinsic(it)) {
+        DEBUG(dbgs() << "LV: Found a call site.\n");
+        return false;
+      }
+
+      // We do not re-vectorize vectors.
+      if (!VectorType::isValidElementType(it->getType()) &&
+          !it->getType()->isVoidTy()) {
+        DEBUG(dbgs() << "LV: Found unvectorizable type." << "\n");
+        return false;
+      }
+
+      // Reduction instructions are allowed to have exit users.
+      // All other instructions must not have external users.
+      if (!AllowedExit.count(it))
+        //Check that all of the users of the loop are inside the BB.
+        for (Value::use_iterator I = it->use_begin(), E = it->use_end();
+             I != E; ++I) {
+          Instruction *U = cast<Instruction>(*I);
+          // This user may be a reduction exit value.
+          if (!TheLoop->contains(U)) {
+            DEBUG(dbgs() << "LV: Found an outside user for : "<< *U << "\n");
+            return false;
+          }
         }
-    }
-  } // next instr.
+    } // next instr.
+
+  }
 
   if (!Induction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     assert(getInductionVars()->size() && "No induction variables");
   }
 
-  // Don't vectorize if the memory dependencies do not allow vectorization.
-  if (!canVectorizeMemory(BB))
-    return false;
+  return true;
+}
 
+void LoopVectorizationLegality::collectLoopUniforms() {
   // We now know that the loop is vectorizable!
   // Collect variables that will remain uniform after vectorization.
   std::vector<Value*> Worklist;
+  BasicBlock *Latch = TheLoop->getLoopLatch();
 
   // Start with the conditional branch and walk up the block.
-  Worklist.push_back(BB.getTerminator()->getOperand(0));
+  Worklist.push_back(Latch->getTerminator()->getOperand(0));
 
   while (Worklist.size()) {
     Instruction *I = dyn_cast<Instruction>(Worklist.back());
     Worklist.pop_back();
 
-    // Look at instructions inside this block. Stop when reaching PHI nodes.
-    if (!I || I->getParent() != &BB || isa<PHINode>(I))
+    // Look at instructions inside this loop.
+    // Stop when reaching PHI nodes.
+    // TODO: we need to follow values all over the loop, not only in this block.
+    if (!I || !TheLoop->contains(I) || isa<PHINode>(I))
       continue;
 
     // This is a known uniform.
     Uniforms.insert(I);
 
     // Insert all operands.
-    for (int i=0, Op = I->getNumOperands(); i < Op; ++i) {
+    for (int i = 0, Op = I->getNumOperands(); i < Op; ++i) {
       Worklist.push_back(I->getOperand(i));
     }
   }
-
-  return true;
 }
 
-bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
+bool LoopVectorizationLegality::canVectorizeMemory() {
   typedef SmallVector<Value*, 16> ValueVector;
   typedef SmallPtrSet<Value*, 16> ValueSet;
   // Holds the Load and Store *instructions*.
@@ -1592,35 +1500,40 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   PtrRtCheck.Pointers.clear();
   PtrRtCheck.Need = false;
 
-  // Scan the BB and collect legal loads and stores.
-  for (BasicBlock::iterator it = BB.begin(), e = BB.end(); it != e; ++it) {
-    Instruction *I = it;
-
-    // If this is a load, save it. If this instruction can read from memory
-    // but is not a load, then we quit. Notice that we don't handle function
-    // calls that read or write.
-    if (I->mayReadFromMemory()) {
-      LoadInst *Ld = dyn_cast<LoadInst>(I);
-      if (!Ld) return false;
-      if (!Ld->isSimple()) {
-        DEBUG(dbgs() << "LV: Found a non-simple load.\n");
-        return false;
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+
+    // Scan the BB and collect legal loads and stores.
+    for (BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); it != e;
+         ++it) {
+
+      // If this is a load, save it. If this instruction can read from memory
+      // but is not a load, then we quit. Notice that we don't handle function
+      // calls that read or write.
+      if (it->mayReadFromMemory()) {
+        LoadInst *Ld = dyn_cast<LoadInst>(it);
+        if (!Ld) return false;
+        if (!Ld->isSimple()) {
+          DEBUG(dbgs() << "LV: Found a non-simple load.\n");
+          return false;
+        }
+        Loads.push_back(Ld);
+        continue;
       }
-      Loads.push_back(Ld);
-      continue;
-    }
 
-    // Save store instructions. Abort if other instructions write to memory.
-    if (I->mayWriteToMemory()) {
-      StoreInst *St = dyn_cast<StoreInst>(I);
-      if (!St) return false;
-      if (!St->isSimple()) {
-        DEBUG(dbgs() << "LV: Found a non-simple store.\n");
-        return false;
+      // Save 'store' instructions. Abort if other instructions write to memory.
+      if (it->mayWriteToMemory()) {
+        StoreInst *St = dyn_cast<StoreInst>(it);
+        if (!St) return false;
+        if (!St->isSimple()) {
+          DEBUG(dbgs() << "LV: Found a non-simple store.\n");
+          return false;
+        }
+        Stores.push_back(St);
       }
-      Stores.push_back(St);
-    }
-  } // next instr.
+    } // next instr.
+  } // next block.
 
   // Now we have two lists that hold the loads and the stores.
   // Next, we find the pointers that they use.
@@ -1628,8 +1541,8 @@ bool LoopVectorizationLegality::canVectorizeMemory(BasicBlock &BB) {
   // Check if we see any stores. If there are no stores, then we don't
   // care if the pointers are *restrict*.
   if (!Stores.size()) {
-        DEBUG(dbgs() << "LV: Found a read-only loop!\n");
-        return true;
+    DEBUG(dbgs() << "LV: Found a read-only loop!\n");
+    return true;
   }
 
   // Holds the read and read-write *pointers* that we find.
@@ -1770,11 +1683,13 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   if (Phi->getNumIncomingValues() != 2)
     return false;
 
-  // Find the possible incoming reduction variable.
-  BasicBlock *BB = Phi->getParent();
-  int SelfEdgeIdx = Phi->getBasicBlockIndex(BB);
-  int InEdgeBlockIdx = (SelfEdgeIdx ? 0 : 1); // The other entry.
-  Value *RdxStart = Phi->getIncomingValue(InEdgeBlockIdx);
+  // Reduction variables are only found in the loop header block.
+  if (Phi->getParent() != TheLoop->getHeader())
+    return false;
+
+  // Obtain the reduction start value from the value that comes from the loop
+  // preheader.
+  Value *RdxStart = Phi->getIncomingValueForBlock(TheLoop->getLoopPreheader());
 
   // ExitInstruction is the single value which is used outside the loop.
   // We only allow for a single reduction value to be used outside the loop.
@@ -1789,20 +1704,20 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
   // Also, we can't have multiple block-local users.
   Instruction *Iter = Phi;
   while (true) {
+    // If the instruction has no users then this is a broken
+    // chain and can't be a reduction variable.
+    if (Iter->use_empty())
+      return false;
+
     // Any reduction instr must be of one of the allowed kinds.
     if (!isReductionInstr(Iter, Kind))
       return false;
 
-    // Did we found a user inside this block ?
+    // Did we find a user inside this block ?
     bool FoundInBlockUser = false;
     // Did we reach the initial PHI node ?
     bool FoundStartPHI = false;
 
-    // If the instruction has no users then this is a broken
-    // chain and can't be a reduction variable.
-    if (Iter->use_empty())
-      return false;
-
     // For each of the *users* of iter.
     for (Value::use_iterator it = Iter->use_begin(), e = Iter->use_end();
          it != e; ++it) {
@@ -1812,14 +1727,23 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
         FoundStartPHI = true;
         continue;
       }
+
       // Check if we found the exit user.
       BasicBlock *Parent = U->getParent();
-      if (Parent != BB) {
-        // We must have a single exit instruction.
+      if (!TheLoop->contains(Parent)) {
+        // Exit if you find multiple outside users.
         if (ExitInstruction != 0)
           return false;
         ExitInstruction = Iter;
       }
+
+      // We allow in-loop PHINodes which are not the original reduction PHI
+      // node. If this PHI is the only user of Iter (happens in IF w/ no ELSE
+      // structure) then don't skip this PHI.
+      if (isa<PHINode>(U) && U->getParent() != TheLoop->getHeader() &&
+          TheLoop->contains(U) && Iter->getNumUses() > 1)
+        continue;
+
       // We can't have multiple inside users.
       if (FoundInBlockUser)
         return false;
@@ -1830,67 +1754,110 @@ bool LoopVectorizationLegality::AddReductionVar(PHINode *Phi,
     // We found a reduction var if we have reached the original
     // phi node and we only have a single instruction with out-of-loop
     // users.
-   if (FoundStartPHI && ExitInstruction) {
-     // This instruction is allowed to have out-of-loop users.
-     AllowedExit.insert(ExitInstruction);
-
-     // Save the description of this reduction variable.
-     ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
-     Reductions[Phi] = RD;
-     return true;
-   }
+    if (FoundStartPHI && ExitInstruction) {
+      // This instruction is allowed to have out-of-loop users.
+      AllowedExit.insert(ExitInstruction);
+
+      // Save the description of this reduction variable.
+      ReductionDescriptor RD(RdxStart, ExitInstruction, Kind);
+      Reductions[Phi] = RD;
+      return true;
+    }
+
+    // If we've reached the start PHI but did not find an outside user then
+    // this is dead code. Abort.
+    if (FoundStartPHI)
+      return false;
   }
 }
 
 bool
 LoopVectorizationLegality::isReductionInstr(Instruction *I,
                                             ReductionKind Kind) {
-    switch (I->getOpcode()) {
-    default:
-      return false;
-    case Instruction::PHI:
-      // possibly.
-      return true;
-    case Instruction::Add:
-    case Instruction::Sub:
-      return Kind == IntegerAdd;
-    case Instruction::Mul:
-      return Kind == IntegerMult;
-    case Instruction::And:
-      return Kind == IntegerAnd;
-    case Instruction::Or:
-      return Kind == IntegerOr;
-    case Instruction::Xor:
-      return Kind == IntegerXor;
-    }
+  switch (I->getOpcode()) {
+  default:
+    return false;
+  case Instruction::PHI:
+    // possibly.
+    return true;
+  case Instruction::Add:
+  case Instruction::Sub:
+    return Kind == IntegerAdd;
+  case Instruction::Mul:
+    return Kind == IntegerMult;
+  case Instruction::And:
+    return Kind == IntegerAnd;
+  case Instruction::Or:
+    return Kind == IntegerOr;
+  case Instruction::Xor:
+    return Kind == IntegerXor;
+  }
 }
 
-bool LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
+LoopVectorizationLegality::InductionKind
+LoopVectorizationLegality::isInductionVariable(PHINode *Phi) {
   Type *PhiTy = Phi->getType();
   // We only handle integer and pointer inductions variables.
   if (!PhiTy->isIntegerTy() && !PhiTy->isPointerTy())
-    return false;
+    return NoInduction;
 
   // Check that the PHI is consecutive and starts at zero.
   const SCEV *PhiScev = SE->getSCEV(Phi);
   const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PhiScev);
   if (!AR) {
     DEBUG(dbgs() << "LV: PHI is not a poly recurrence.\n");
-    return false;
+    return NoInduction;
   }
   const SCEV *Step = AR->getStepRecurrence(*SE);
 
   // Integer inductions need to have a stride of one.
-  if (PhiTy->isIntegerTy())
-    return Step->isOne();
+  if (PhiTy->isIntegerTy()) {
+    if (Step->isOne())
+      return IntInduction;
+    if (Step->isAllOnesValue())
+      return ReverseIntInduction;
+    return NoInduction;
+  }
 
   // Calculate the pointer stride and check if it is consecutive.
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Step);
-  if (!C) return false;
+  if (!C)
+    return NoInduction;
 
   assert(PhiTy->isPointerTy() && "The PHI must be a pointer");
   uint64_t Size = DL->getTypeAllocSize(PhiTy->getPointerElementType());
-  return (C->getValue()->equalsInt(Size));
+  if (C->getValue()->equalsInt(Size))
+    return PtrInduction;
+
+  return NoInduction;
+}
+
+bool LoopVectorizationLegality::blockNeedsPredication(BasicBlock *BB)  {
+  assert(TheLoop->contains(BB) && "Unknown block used");
+
+  // Blocks that do not dominate the latch need predication.
+  BasicBlock* Latch = TheLoop->getLoopLatch();
+  return !DT->dominates(BB, Latch);
+}
+
+bool LoopVectorizationLegality::blockCanBePredicated(BasicBlock *BB) {
+  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+    // We don't predicate loads/stores at the moment.
+    if (it->mayReadFromMemory() || it->mayWriteToMemory() || it->mayThrow())
+      return false;
+
+    // The isntructions below can trap.
+    switch (it->getOpcode()) {
+    default: continue;
+    case Instruction::UDiv:
+    case Instruction::SDiv:
+    case Instruction::URem:
+    case Instruction::SRem:
+             return false;
+    }
+  }
+
+  return true;
 }
 
 bool LoopVectorizationLegality::hasComputableBounds(Value *Ptr) {
@@ -1930,19 +1897,29 @@ LoopVectorizationCostModel::findBestVectorizationFactor(unsigned VF) {
 }
 
 unsigned LoopVectorizationCostModel::expectedCost(unsigned VF) {
-  // We can only estimate the cost of single basic block loops.
-  assert(1 == TheLoop->getNumBlocks() && "Too many blocks in loop");
-
-  BasicBlock *BB = TheLoop->getHeader();
   unsigned Cost = 0;
 
-  // For each instruction in the old loop.
-  for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
-    Instruction *Inst = it;
-    unsigned C = getInstructionCost(Inst, VF);
-    Cost += C;
-    DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF "<< VF <<
-          " For instruction: "<< *Inst << "\n");
+  // For each block.
+  for (Loop::block_iterator bb = TheLoop->block_begin(),
+       be = TheLoop->block_end(); bb != be; ++bb) {
+    unsigned BlockCost = 0;
+    BasicBlock *BB = *bb;
+
+    // For each instruction in the old loop.
+    for (BasicBlock::iterator it = BB->begin(), e = BB->end(); it != e; ++it) {
+      unsigned C = getInstructionCost(it, VF);
+      Cost += C;
+      DEBUG(dbgs() << "LV: Found an estimated cost of "<< C <<" for VF " <<
+            VF << " For instruction: "<< *it << "\n");
+    }
+
+    // We assume that if-converted blocks have a 50% chance of being executed.
+    // When the code is scalar then some of the blocks are avoided due to CF.
+    // When the code is vectorized we execute all code paths.
+    if (Legal->blockNeedsPredication(*bb) && VF == 1)
+      BlockCost /= 2;
+
+    Cost += BlockCost;
   }
 
   return Cost;
@@ -1960,147 +1937,156 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   Type *RetTy = I->getType();
   Type *VectorTy = ToVectorTy(RetTy, VF);
 
-
   // TODO: We need to estimate the cost of intrinsic calls.
   switch (I->getOpcode()) {
-    case Instruction::GetElementPtr:
-      // We mark this instruction as zero-cost because scalar GEPs are usually
-      // lowered to the intruction addressing mode. At the moment we don't
-      // generate vector geps.
-      return 0;
-    case Instruction::Br: {
-      return VTTI->getCFInstrCost(I->getOpcode());
-    }
-    case Instruction::PHI:
-      return 0;
-    case Instruction::Add:
-    case Instruction::FAdd:
-    case Instruction::Sub:
-    case Instruction::FSub:
-    case Instruction::Mul:
-    case Instruction::FMul:
-    case Instruction::UDiv:
-    case Instruction::SDiv:
-    case Instruction::FDiv:
-    case Instruction::URem:
-    case Instruction::SRem:
-    case Instruction::FRem:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor: {
-      return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
-    }
-    case Instruction::Select: {
-      SelectInst *SI = cast<SelectInst>(I);
-      const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
-      bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
-      Type *CondTy = SI->getCondition()->getType();
-      if (ScalarCond)
-        CondTy = VectorType::get(CondTy, VF);
-
-      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
-    }
-    case Instruction::ICmp:
-    case Instruction::FCmp: {
-      Type *ValTy = I->getOperand(0)->getType();
-      VectorTy = ToVectorTy(ValTy, VF);
-      return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
-    }
-    case Instruction::Store: {
-      StoreInst *SI = cast<StoreInst>(I);
-      Type *ValTy = SI->getValueOperand()->getType();
-      VectorTy = ToVectorTy(ValTy, VF);
-
-      if (VF == 1)
-        return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
-                              SI->getAlignment(), SI->getPointerAddressSpace());
-
-      // Scalarized stores.
-      if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
-        unsigned Cost = 0;
-        unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                              ValTy);
-        // The cost of extracting from the value vector.
-        Cost += VF * (ExtCost);
-        // The cost of the scalar stores.
-        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                           ValTy->getScalarType(),
-                                           SI->getAlignment(),
-                                           SI->getPointerAddressSpace());
-        return Cost;
-      }
-
-      // Wide stores.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+  case Instruction::GetElementPtr:
+    // We mark this instruction as zero-cost because scalar GEPs are usually
+    // lowered to the intruction addressing mode. At the moment we don't
+    // generate vector geps.
+    return 0;
+  case Instruction::Br: {
+    return VTTI->getCFInstrCost(I->getOpcode());
+  }
+  case Instruction::PHI:
+    //TODO: IF-converted IFs become selects.
+    return 0;
+  case Instruction::Add:
+  case Instruction::FAdd:
+  case Instruction::Sub:
+  case Instruction::FSub:
+  case Instruction::Mul:
+  case Instruction::FMul:
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::FDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+  case Instruction::FRem:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    return VTTI->getArithmeticInstrCost(I->getOpcode(), VectorTy);
+  case Instruction::Select: {
+    SelectInst *SI = cast<SelectInst>(I);
+    const SCEV *CondSCEV = SE->getSCEV(SI->getCondition());
+    bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
+    Type *CondTy = SI->getCondition()->getType();
+    if (ScalarCond)
+      CondTy = VectorType::get(CondTy, VF);
+
+    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+  }
+  case Instruction::ICmp:
+  case Instruction::FCmp: {
+    Type *ValTy = I->getOperand(0)->getType();
+    VectorTy = ToVectorTy(ValTy, VF);
+    return VTTI->getCmpSelInstrCost(I->getOpcode(), VectorTy);
+  }
+  case Instruction::Store: {
+    StoreInst *SI = cast<StoreInst>(I);
+    Type *ValTy = SI->getValueOperand()->getType();
+    VectorTy = ToVectorTy(ValTy, VF);
+
+    if (VF == 1)
+      return VTTI->getMemoryOpCost(I->getOpcode(), ValTy,
+                                   SI->getAlignment(),
                                    SI->getPointerAddressSpace());
+
+    // Scalarized stores.
+    if (!Legal->isConsecutivePtr(SI->getPointerOperand())) {
+      unsigned Cost = 0;
+      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                            ValTy);
+      // The cost of extracting from the value vector.
+      Cost += VF * (ExtCost);
+      // The cost of the scalar stores.
+      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                         ValTy->getScalarType(),
+                                         SI->getAlignment(),
+                                         SI->getPointerAddressSpace());
+      return Cost;
     }
-    case Instruction::Load: {
-      LoadInst *LI = cast<LoadInst>(I);
-
-      if (VF == 1)
-        return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
-                                     LI->getAlignment(),
-                                     LI->getPointerAddressSpace());
-
-      // Scalarized loads.
-      if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
-        unsigned Cost = 0;
-        unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
-        // The cost of inserting the loaded value into the result vector.
-        Cost += VF * (InCost);
-        // The cost of the scalar stores.
-        Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
-                                           RetTy->getScalarType(),
-                                           LI->getAlignment(),
-                                           LI->getPointerAddressSpace());
-        return Cost;
-      }
 
-      // Wide loads.
-      return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+    // Wide stores.
+    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, SI->getAlignment(),
+                                 SI->getPointerAddressSpace());
+  }
+  case Instruction::Load: {
+    LoadInst *LI = cast<LoadInst>(I);
+
+    if (VF == 1)
+      return VTTI->getMemoryOpCost(I->getOpcode(), RetTy,
+                                   LI->getAlignment(),
                                    LI->getPointerAddressSpace());
-    }
-    case Instruction::ZExt:
-    case Instruction::SExt:
-    case Instruction::FPToUI:
-    case Instruction::FPToSI:
-    case Instruction::FPExt:
-    case Instruction::PtrToInt:
-    case Instruction::IntToPtr:
-    case Instruction::SIToFP:
-    case Instruction::UIToFP:
-    case Instruction::Trunc:
-    case Instruction::FPTrunc:
-    case Instruction::BitCast: {
-      Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
-      return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
-    }
-    default: {
-      // We are scalarizing the instruction. Return the cost of the scalar
-      // instruction, plus the cost of insert and extract into vector
-      // elements, times the vector width.
+
+    // Scalarized loads.
+    if (!Legal->isConsecutivePtr(LI->getPointerOperand())) {
       unsigned Cost = 0;
+      unsigned InCost = VTTI->getInstrCost(Instruction::InsertElement, RetTy);
+      // The cost of inserting the loaded value into the result vector.
+      Cost += VF * (InCost);
+      // The cost of the scalar stores.
+      Cost += VF * VTTI->getMemoryOpCost(I->getOpcode(),
+                                         RetTy->getScalarType(),
+                                         LI->getAlignment(),
+                                         LI->getPointerAddressSpace());
+      return Cost;
+    }
+
+    // Wide loads.
+    return VTTI->getMemoryOpCost(I->getOpcode(), VectorTy, LI->getAlignment(),
+                                 LI->getPointerAddressSpace());
+  }
+  case Instruction::ZExt:
+  case Instruction::SExt:
+  case Instruction::FPToUI:
+  case Instruction::FPToSI:
+  case Instruction::FPExt:
+  case Instruction::PtrToInt:
+  case Instruction::IntToPtr:
+  case Instruction::SIToFP:
+  case Instruction::UIToFP:
+  case Instruction::Trunc:
+  case Instruction::FPTrunc:
+  case Instruction::BitCast: {
+    Type *SrcVecTy = ToVectorTy(I->getOperand(0)->getType(), VF);
+    return VTTI->getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+  }
+  case Instruction::Call: {
+    assert(isTriviallyVectorizableIntrinsic(I));
+    IntrinsicInst *II = cast<IntrinsicInst>(I);
+    Type *RetTy = ToVectorTy(II->getType(), VF);
+    SmallVector<Type*, 4> Tys;
+    for (unsigned i = 0, ie = II->getNumArgOperands(); i != ie; ++i)
+      Tys.push_back(ToVectorTy(II->getArgOperand(i)->getType(), VF));
+    return VTTI->getIntrinsicInstrCost(II->getIntrinsicID(), RetTy, Tys);
+  }
+  default: {
+    // We are scalarizing the instruction. Return the cost of the scalar
+    // instruction, plus the cost of insert and extract into vector
+    // elements, times the vector width.
+    unsigned Cost = 0;
 
-      bool IsVoid = RetTy->isVoidTy();
+    bool IsVoid = RetTy->isVoidTy();
 
-      unsigned InsCost = (IsVoid ? 0 :
-                          VTTI->getInstrCost(Instruction::InsertElement,
-                                             VectorTy));
+    unsigned InsCost = (IsVoid ? 0 :
+                        VTTI->getInstrCost(Instruction::InsertElement,
+                                           VectorTy));
 
-      unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
-                                            VectorTy);
+    unsigned ExtCost = VTTI->getInstrCost(Instruction::ExtractElement,
+                                          VectorTy);
 
-      // The cost of inserting the results plus extracting each one of the
-      // operands.
-      Cost += VF * (InsCost + ExtCost * I->getNumOperands());
+    // The cost of inserting the results plus extracting each one of the
+    // operands.
+    Cost += VF * (InsCost + ExtCost * I->getNumOperands());
 
-      // The cost of executing VF copies of the scalar instruction.
-      Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
-      return Cost;
-    }
+    // The cost of executing VF copies of the scalar instruction.
+    Cost += VF * VTTI->getInstrCost(I->getOpcode(), RetTy);
+    return Cost;
+  }
   }// end of switch.
 }
 
@@ -2110,8 +2096,6 @@ Type* LoopVectorizationCostModel::ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
-} // namespace
-
 char LoopVectorize::ID = 0;
 static const char lv_name[] = "Loop Vectorization";
 INITIALIZE_PASS_BEGIN(LoopVectorize, LV_NAME, lv_name, false, false)
@@ -2126,3 +2110,4 @@ namespace llvm {
   }
 }
 
+
diff --git a/lib/Transforms/Vectorize/LoopVectorize.h b/lib/Transforms/Vectorize/LoopVectorize.h
new file mode 100644
index 0000000000..9d6d80e22b
--- /dev/null
+++ b/lib/Transforms/Vectorize/LoopVectorize.h
@@ -0,0 +1,458 @@
+//===- LoopVectorize.h --- A Loop Vectorizer ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
+// and generates target-independent LLVM-IR. Legalization of the IR is done
+// in the codegen. However, the vectorizes uses (will use) the codegen
+// interfaces to generate IR that is likely to result in an optimal binary.
+//
+// The loop vectorizer combines consecutive loop iteration into a single
+// 'wide' iteration. After this transformation the index is incremented
+// by the SIMD vector width, and not by one.
+//
+// This pass has three parts:
+// 1. The main loop pass that drives the different parts.
+// 2. LoopVectorizationLegality - A unit that checks for the legality
+//    of the vectorization.
+// 3. InnerLoopVectorizer - A unit that performs the actual
+//    widening of instructions.
+// 4. LoopVectorizationCostModel - A unit that checks for the profitability
+//    of vectorization. It decides on the optimal vector width, which
+//    can be one, if vectorization is not profitable.
+//
+//===----------------------------------------------------------------------===//
+//
+// The reduction-variable vectorization is based on the paper:
+//  D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
+//
+// Variable uniformity checks are inspired by:
+// Karrenberg, R. and Hack, S. Whole Function Vectorization.
+//
+// Other ideas/concepts are from:
+//  A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
+//
+//  S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua.  An Evaluation of
+//  Vectorizing Compilers.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
+#define LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
+
+#define LV_NAME "loop-vectorize"
+#define DEBUG_TYPE LV_NAME
+
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IRBuilder.h" 
+
+#include <algorithm>
+using namespace llvm;
+
+/// We don't vectorize loops with a known constant trip count below this number.
+const unsigned TinyTripCountThreshold = 16;
+
+/// When performing a runtime memory check, do not check more than this
+/// number of pointers. Notice that the check is quadratic!
+const unsigned RuntimeMemoryCheckThreshold = 4;
+
+/// This is the highest vector width that we try to generate.
+const unsigned MaxVectorSize = 8;
+
+namespace llvm {
+
+// Forward declarations.
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class VectorTargetTransformInfo;
+
+/// InnerLoopVectorizer vectorizes loops which contain only one basic
+/// block to a specified vectorization factor (VF).
+/// This class performs the widening of scalars into vectors, or multiple
+/// scalars. This class also implements the following features:
+/// * It inserts an epilogue loop for handling loops that don't have iteration
+///   counts that are known to be a multiple of the vectorization factor.
+/// * It handles the code generation for reduction variables.
+/// * Scalarization (implementation using scalars) of un-vectorizable
+///   instructions.
+/// InnerLoopVectorizer does not perform any vectorization-legality
+/// checks, and relies on the caller to check for the different legality
+/// aspects. The InnerLoopVectorizer relies on the
+/// LoopVectorizationLegality class to provide information about the induction
+/// and reduction variables that were found to a given vectorization factor.
+class InnerLoopVectorizer {
+public:
+  /// Ctor.
+  InnerLoopVectorizer(Loop *Orig, ScalarEvolution *Se, LoopInfo *Li,
+                      DominatorTree *Dt, DataLayout *Dl, unsigned VecWidth):
+  OrigLoop(Orig), SE(Se), LI(Li), DT(Dt), DL(Dl), VF(VecWidth),
+  Builder(Se->getContext()), Induction(0), OldInduction(0) { }
+
+  // Perform the actual loop widening (vectorization).
+  void vectorize(LoopVectorizationLegality *Legal) {
+    // Create a new empty loop. Unlink the old loop and connect the new one.
+    createEmptyLoop(Legal);
+    // Widen each instruction in the old loop to a new one in the new loop.
+    // Use the Legality module to find the induction and reduction variables.
+    vectorizeLoop(Legal);
+    // Register the new loop and update the analysis passes.
+    updateAnalysis();
+  }
+
+private:
+  /// A small list of PHINodes.
+  typedef SmallVector<PHINode*, 4> PhiVector;
+
+  /// Add code that checks at runtime if the accessed arrays overlap.
+  /// Returns the comparator value or NULL if no check is needed.
+  Value *addRuntimeCheck(LoopVectorizationLegality *Legal,
+                         Instruction *Loc);
+  /// Create an empty loop, based on the loop ranges of the old loop.
+  void createEmptyLoop(LoopVectorizationLegality *Legal);
+  /// Copy and widen the instructions from the old loop.
+  void vectorizeLoop(LoopVectorizationLegality *Legal);
+
+  /// A helper function that computes the predicate of the block BB, assuming
+  /// that the header block of the loop is set to True. It returns the *entry*
+  /// mask for the block BB.
+  Value *createBlockInMask(BasicBlock *BB);
+  /// A helper function that computes the predicate of the edge between SRC
+  /// and DST.
+  Value *createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
+
+  /// A helper function to vectorize a single BB within the innermost loop.
+  void vectorizeBlockInLoop(LoopVectorizationLegality *Legal, BasicBlock *BB,
+                            PhiVector *PV);
+
+  /// Insert the new loop to the loop hierarchy and pass manager
+  /// and update the analysis passes.
+  void updateAnalysis();
+
+  /// This instruction is un-vectorizable. Implement it as a sequence
+  /// of scalars.
+  void scalarizeInstruction(Instruction *Instr);
+
+  /// Create a broadcast instruction. This method generates a broadcast
+  /// instruction (shuffle) for loop invariant values and for the induction
+  /// value. If this is the induction variable then we extend it to N, N+1, ...
+  /// this is needed because each iteration in the loop corresponds to a SIMD
+  /// element.
+  Value *getBroadcastInstrs(Value *V);
+
+  /// This function adds 0, 1, 2 ... to each vector element, starting at zero.
+  /// If Negate is set then negative numbers are added e.g. (0, -1, -2, ...).
+  Value *getConsecutiveVector(Value* Val, bool Negate = false);
+
+  /// When we go over instructions in the basic block we rely on previous
+  /// values within the current basic block or on loop invariant values.
+  /// When we widen (vectorize) values we place them in the map. If the values
+  /// are not within the map, they have to be loop invariant, so we simply
+  /// broadcast them into a vector.
+  Value *getVectorValue(Value *V);
+
+  /// Get a uniform vector of constant integers. We use this to get
+  /// vectors of ones and zeros for the reduction code.
+  Constant* getUniformVector(unsigned Val, Type* ScalarTy);
+
+  typedef DenseMap<Value*, Value*> ValueMap;
+
+  /// The original loop.
+  Loop *OrigLoop;
+  // Scev analysis to use.
+  ScalarEvolution *SE;
+  // Loop Info.
+  LoopInfo *LI;
+  // Dominator Tree.
+  DominatorTree *DT;
+  // Data Layout.
+  DataLayout *DL;
+  // The vectorization factor to use.
+  unsigned VF;
+
+  // The builder that we use
+  IRBuilder<> Builder;
+
+  // --- Vectorization state ---
+
+  /// The vector-loop preheader.
+  BasicBlock *LoopVectorPreHeader;
+  /// The scalar-loop preheader.
+  BasicBlock *LoopScalarPreHeader;
+  /// Middle Block between the vector and the scalar.
+  BasicBlock *LoopMiddleBlock;
+  ///The ExitBlock of the scalar loop.
+  BasicBlock *LoopExitBlock;
+  ///The vector loop body.
+  BasicBlock *LoopVectorBody;
+  ///The scalar loop body.
+  BasicBlock *LoopScalarBody;
+  ///The first bypass block.
+  BasicBlock *LoopBypassBlock;
+
+  /// The new Induction variable which was added to the new block.
+  PHINode *Induction;
+  /// The induction variable of the old basic block.
+  PHINode *OldInduction;
+  // Maps scalars to widened vectors.
+  ValueMap WidenMap;
+};
+
+/// LoopVectorizationLegality checks if it is legal to vectorize a loop, and
+/// to what vectorization factor.
+/// This class does not look at the profitability of vectorization, only the
+/// legality. This class has two main kinds of checks:
+/// * Memory checks - The code in canVectorizeMemory checks if vectorization
+///   will change the order of memory accesses in a way that will change the
+///   correctness of the program.
+/// * Scalars checks - The code in canVectorizeInstrs and canVectorizeMemory
+/// checks for a number of different conditions, such as the availability of a
+/// single induction variable, that all types are supported and vectorize-able,
+/// etc. This code reflects the capabilities of InnerLoopVectorizer.
+/// This class is also used by InnerLoopVectorizer for identifying
+/// induction variable and the different reduction variables.
+class LoopVectorizationLegality {
+public:
+  LoopVectorizationLegality(Loop *Lp, ScalarEvolution *Se, DataLayout *Dl,
+                            DominatorTree *Dt):
+  TheLoop(Lp), SE(Se), DL(Dl), DT(Dt), Induction(0) { }
+
+  /// This enum represents the kinds of reductions that we support.
+  enum ReductionKind {
+    NoReduction, /// Not a reduction.
+    IntegerAdd,  /// Sum of numbers.
+    IntegerMult, /// Product of numbers.
+    IntegerOr,   /// Bitwise or logical OR of numbers.
+    IntegerAnd,  /// Bitwise or logical AND of numbers.
+    IntegerXor   /// Bitwise or logical XOR of numbers.
+  };
+
+  /// This enum represents the kinds of inductions that we support.
+  enum InductionKind {
+    NoInduction,         /// Not an induction variable.
+    IntInduction,        /// Integer induction variable. Step = 1.
+    ReverseIntInduction, /// Reverse int induction variable. Step = -1.
+    PtrInduction         /// Pointer induction variable. Step = sizeof(elem).
+  };
+
+  /// This POD struct holds information about reduction variables.
+  struct ReductionDescriptor {
+    // Default C'tor
+    ReductionDescriptor():
+    StartValue(0), LoopExitInstr(0), Kind(NoReduction) {}
+
+    // C'tor.
+    ReductionDescriptor(Value *Start, Instruction *Exit, ReductionKind K):
+    StartValue(Start), LoopExitInstr(Exit), Kind(K) {}
+
+    // The starting value of the reduction.
+    // It does not have to be zero!
+    Value *StartValue;
+    // The instruction who's value is used outside the loop.
+    Instruction *LoopExitInstr;
+    // The kind of the reduction.
+    ReductionKind Kind;
+  };
+
+  // This POD struct holds information about the memory runtime legality
+  // check that a group of pointers do not overlap.
+  struct RuntimePointerCheck {
+    RuntimePointerCheck(): Need(false) {}
+
+    /// Reset the state of the pointer runtime information.
+    void reset() {
+      Need = false;
+      Pointers.clear();
+      Starts.clear();
+      Ends.clear();
+    }
+
+    /// Insert a pointer and calculate the start and end SCEVs.
+    void insert(ScalarEvolution *SE, Loop *Lp, Value *Ptr);
+
+    /// This flag indicates if we need to add the runtime check.
+    bool Need;
+    /// Holds the pointers that we need to check.
+    SmallVector<Value*, 2> Pointers;
+    /// Holds the pointer value at the beginning of the loop.
+    SmallVector<const SCEV*, 2> Starts;
+    /// Holds the pointer value at the end of the loop.
+    SmallVector<const SCEV*, 2> Ends;
+  };
+
+  /// A POD for saving information about induction variables.
+  struct InductionInfo {
+    /// Ctors.
+    InductionInfo(Value *Start, InductionKind K):
+    StartValue(Start), IK(K) {};
+    InductionInfo(): StartValue(0), IK(NoInduction) {};
+    /// Start value.
+    Value *StartValue;
+    /// Induction kind.
+    InductionKind IK;
+  };
+
+  /// ReductionList contains the reduction descriptors for all
+  /// of the reductions that were found in the loop.
+  typedef DenseMap<PHINode*, ReductionDescriptor> ReductionList;
+
+  /// InductionList saves induction variables and maps them to the
+  /// induction descriptor.
+  typedef DenseMap<PHINode*, InductionInfo> InductionList;
+
+  /// Returns true if it is legal to vectorize this loop.
+  /// This does not mean that it is profitable to vectorize this
+  /// loop, only that it is legal to do so.
+  bool canVectorize();
+
+  /// Returns the Induction variable.
+  PHINode *getInduction() {return Induction;}
+
+  /// Returns the reduction variables found in the loop.
+  ReductionList *getReductionVars() { return &Reductions; }
+
+  /// Returns the induction variables found in the loop.
+  InductionList *getInductionVars() { return &Inductions; }
+
+  /// Return true if the block BB needs to be predicated in order for the loop
+  /// to be vectorized.
+  bool blockNeedsPredication(BasicBlock *BB);
+
+  /// Check if this  pointer is consecutive when vectorizing. This happens
+  /// when the last index of the GEP is the induction variable, or that the
+  /// pointer itself is an induction variable.
+  /// This check allows us to vectorize A[idx] into a wide load/store.
+  bool isConsecutivePtr(Value *Ptr);
+
+  /// Returns true if the value V is uniform within the loop.
+  bool isUniform(Value *V);
+
+  /// Returns true if this instruction will remain scalar after vectorization.
+  bool isUniformAfterVectorization(Instruction* I) {return Uniforms.count(I);}
+
+  /// Returns the information that we collected about runtime memory check.
+  RuntimePointerCheck *getRuntimePointerCheck() {return &PtrRtCheck; }
+private:
+  /// Check if a single basic block loop is vectorizable.
+  /// At this point we know that this is a loop with a constant trip count
+  /// and we only need to check individual instructions.
+  bool canVectorizeInstrs();
+
+  /// When we vectorize loops we may change the order in which
+  /// we read and write from memory. This method checks if it is
+  /// legal to vectorize the code, considering only memory constrains.
+  /// Returns true if the loop is vectorizable
+  bool canVectorizeMemory();
+
+  /// Return true if we can vectorize this loop using the IF-conversion
+  /// transformation.
+  bool canVectorizeWithIfConvert();
+
+  /// Collect the variables that need to stay uniform after vectorization.
+  void collectLoopUniforms();
+
+  /// Return true if all of the instructions in the block can be speculatively
+  /// executed.
+  bool blockCanBePredicated(BasicBlock *BB);
+
+  /// Returns True, if 'Phi' is the kind of reduction variable for type
+  /// 'Kind'. If this is a reduction variable, it adds it to ReductionList.
+  bool AddReductionVar(PHINode *Phi, ReductionKind Kind);
+  /// Returns true if the instruction I can be a reduction variable of type
+  /// 'Kind'.
+  bool isReductionInstr(Instruction *I, ReductionKind Kind);
+  /// Returns the induction kind of Phi. This function may return NoInduction
+  /// if the PHI is not an induction variable.
+  InductionKind isInductionVariable(PHINode *Phi);
+  /// Return true if can compute the address bounds of Ptr within the loop.
+  bool hasComputableBounds(Value *Ptr);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+  /// DataLayout analysis.
+  DataLayout *DL;
+  // Dominators.
+  DominatorTree *DT;
+
+  //  ---  vectorization state --- //
+
+  /// Holds the integer induction variable. This is the counter of the
+  /// loop.
+  PHINode *Induction;
+  /// Holds the reduction variables.
+  ReductionList Reductions;
+  /// Holds all of the induction variables that we found in the loop.
+  /// Notice that inductions don't need to start at zero and that induction
+  /// variables can be pointers.
+  InductionList Inductions;
+
+  /// Allowed outside users. This holds the reduction
+  /// vars which can be accessed from outside the loop.
+  SmallPtrSet<Value*, 4> AllowedExit;
+  /// This set holds the variables which are known to be uniform after
+  /// vectorization.
+  SmallPtrSet<Instruction*, 4> Uniforms;
+  /// We need to check that all of the pointers in this list are disjoint
+  /// at runtime.
+  RuntimePointerCheck PtrRtCheck;
+};
+
+/// LoopVectorizationCostModel - estimates the expected speedups due to
+/// vectorization.
+/// In many cases vectorization is not profitable. This can happen because
+/// of a number of reasons. In this class we mainly attempt to predict
+/// the expected speedup/slowdowns due to the supported instruction set.
+/// We use the VectorTargetTransformInfo to query the different backends
+/// for the cost of different operations.
+class LoopVectorizationCostModel {
+public:
+  /// C'tor.
+  LoopVectorizationCostModel(Loop *Lp, ScalarEvolution *Se,
+                             LoopVectorizationLegality *Leg,
+                             const VectorTargetTransformInfo *Vtti):
+  TheLoop(Lp), SE(Se), Legal(Leg), VTTI(Vtti) { }
+
+  /// Returns the most profitable vectorization factor for the loop that is
+  /// smaller or equal to the VF argument. This method checks every power
+  /// of two up to VF.
+  unsigned findBestVectorizationFactor(unsigned VF = MaxVectorSize);
+
+private:
+  /// Returns the expected execution cost. The unit of the cost does
+  /// not matter because we use the 'cost' units to compare different
+  /// vector widths. The cost that is returned is *not* normalized by
+  /// the factor width.
+  unsigned expectedCost(unsigned VF);
+
+  /// Returns the execution time cost of an instruction for a given vector
+  /// width. Vector width of one means scalar.
+  unsigned getInstructionCost(Instruction *I, unsigned VF);
+
+  /// A helper function for converting Scalar types to vector types.
+  /// If the incoming type is void, we return void. If the VF is 1, we return
+  /// the scalar type.
+  static Type* ToVectorTy(Type *Scalar, unsigned VF);
+
+  /// The loop that we evaluate.
+  Loop *TheLoop;
+  /// Scev analysis.
+  ScalarEvolution *SE;
+
+  /// Vectorization legality.
+  LoopVectorizationLegality *Legal;
+  /// Vector target information.
+  const VectorTargetTransformInfo *VTTI;
+};
+
+}// namespace llvm
+
+#endif //LLVM_TRANSFORM_VECTORIZE_LOOP_VECTORIZE_H
+
diff --git a/lib/Transforms/Vectorize/Vectorize.cpp b/lib/Transforms/Vectorize/Vectorize.cpp
index d26973a7b3..3fb36cadea 100644
--- a/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/lib/Transforms/Vectorize/Vectorize.cpp
@@ -13,13 +13,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm-c/Transforms/Vectorize.h"
+#include "llvm/Transforms/Vectorize.h"
 #include "llvm-c/Initialization.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/PassManager.h"
+#include "llvm-c/Transforms/Vectorize.h"
 #include "llvm/Analysis/Passes.h"
 #include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Vectorize.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/PassManager.h"
 
 using namespace llvm;