77 files changed, 3419 insertions, 934 deletions
diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html
index 45a9cc5dec..fc3a8b71bd 100644
--- a/docs/ReleaseNotes.html
+++ b/docs/ReleaseNotes.html
@@ -473,15 +473,31 @@ Release Notes</a>.</h1>
     <b>-mllvm -force-vector-width=4</b>.
     The default value is <b>0</b> which means auto-select.
     <br/>
-    We can now vectorize this code:
+    We can now vectorize this function:
 
     <pre class="doc_code">
-    for (i=0; i&lt;n; i++) {
-      a[i] = b[i+1] + c[i+3] + i;
-      sum += d[i];
+    unsigned sum_arrays(int *A, int *B, int start, int end) {
+      unsigned sum = 0;
+      for (int i = start; i &lt; end; ++i)
+        sum += A[i] + B[i] + i;
+
+      return sum;
     }
     </pre>
 
+    We vectorize under the following loops:
+    <ul>
+    <li>The inner most loops must have a single basic block.</li>
+    <li>The number of iterations are known before the loop starts to execute.</li>
+    <li>The loop counter needs to be incrimented by one.</li>
+    <li>The loop trip count <b>can</b> be a variable.</li>
+    <li>Loops do <b>not</b> need to start at zero.</li>
+    <li>The induction variable can be used inside the loop.</li>
+    <li>Loop reductions are supported.</li>
+    <li>Arrays with affine access pattern do <b>not</b> need to be marked as 'noalias' and are checked at runtime.</li>
+    <li>...</li>
+    </ul>
+
 </p>
 
 <p>SROA - We've re-written SROA to be significantly more powerful.
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index 4c10e5114a..00eef270d6 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -167,6 +167,7 @@ public:
     VK_ARM_TPOFF,
     VK_ARM_GOTTPOFF,
     VK_ARM_TARGET1,
+    VK_ARM_TARGET2,
 
     VK_PPC_TOC,          // TOC base
     VK_PPC_TOC_ENTRY,    // TOC entry
diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h
index 2a0a43229f..a2c97d782e 100644
--- a/include/llvm/Target/TargetLibraryInfo.h
+++ b/include/llvm/Target/TargetLibraryInfo.h
@@ -276,12 +276,18 @@ namespace llvm {
       sqrtf,
       /// long double sqrtl(long double x);
       sqrtl,
+      /// char *stpcpy(char *s1, const char *s2);
+      stpcpy,
       /// char *strcat(char *s1, const char *s2);
       strcat,
       /// char *strchr(const char *s, int c);
       strchr,
+      /// int strcmp(const char *s1, const char *s2);
+      strcmp,
       /// char *strcpy(char *s1, const char *s2);
       strcpy,
+      /// size_t strcspn(const char *s1, const char *s2);
+      strcspn,
       /// char *strdup(const char *s1);
       strdup,
       /// size_t strlen(const char *s);
@@ -296,6 +302,29 @@ namespace llvm {
       strndup,
       /// size_t strnlen(const char *s, size_t maxlen);
       strnlen,
+      /// char *strpbrk(const char *s1, const char *s2);
+      strpbrk,
+      /// char *strrchr(const char *s, int c);
+      strrchr,
+      /// size_t strspn(const char *s1, const char *s2);
+      strspn,
+      /// char *strstr(const char *s1, const char *s2);
+      strstr,
+      /// double strtod(const char *nptr, char **endptr);
+      strtod,
+      /// float strtof(const char *nptr, char **endptr);
+      strtof,
+      /// long int strtol(const char *nptr, char **endptr, int base);
+      strtol,
+      /// long double strtold(const char *nptr, char **endptr);
+      strtold,
+      /// long long int strtoll(const char *nptr, char **endptr, int base);
+      strtoll,
+      /// unsigned long int strtoul(const char *nptr, char **endptr, int base);
+      strtoul,
+      /// unsigned long long int strtoull(const char *nptr, char **endptr,
+      ///                                 int base);
+      strtoull,
       /// double tan(double x);
       tan,
       /// float tanf(float x);
diff --git a/include/llvm/Transforms/Utils/SimplifyLibCalls.h b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
index 5db2d00181..fde452bca2 100644
--- a/include/llvm/Transforms/Utils/SimplifyLibCalls.h
+++ b/include/llvm/Transforms/Utils/SimplifyLibCalls.h
@@ -19,6 +19,7 @@ namespace llvm {
   class Value;
   class CallInst;
   class DataLayout;
+  class Instruction;
   class TargetLibraryInfo;
   class LibCallSimplifierImpl;
 
@@ -35,8 +36,16 @@ namespace llvm {
 
     /// optimizeCall - Take the given call instruction and return a more
     /// optimal value to replace the instruction with or 0 if a more
-    /// optimal form can't be found.
+    /// optimal form can't be found.  Note that the returned value may
+    /// be equal to the instruction being optimized.  In this case all
+    /// other instructions that use the given instruction were modified
+    /// and the given instruction is dead.
     Value *optimizeCall(CallInst *CI);
+
+    /// replaceAllUsesWith - This method is used when the library call
+    /// simplifier needs to replace instructions other than the library
+    /// call being modified.
+    virtual void replaceAllUsesWith(Instruction *I, Value *With) const;
   };
 } // End llvm namespace
 
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 5cac8ca3ba..91a5b84e8a 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -292,7 +292,10 @@ static bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset,
     unsigned IntBytes = unsigned(CI->getBitWidth()/8);
 
     for (unsigned i = 0; i != BytesLeft && ByteOffset != IntBytes; ++i) {
-      CurPtr[i] = (unsigned char)(Val >> (ByteOffset * 8));
+      int n = ByteOffset;
+      if (!TD.isLittleEndian())
+        n = IntBytes - n - 1;
+      CurPtr[i] = (unsigned char)(Val >> (n * 8));
       ++ByteOffset;
     }
     return true;
@@ -442,10 +445,19 @@ static Constant *FoldReinterpretLoadFromConstPtr(Constant *C,
                           BytesLoaded, TD))
     return 0;
 
-  APInt ResultVal = APInt(IntType->getBitWidth(), RawBytes[BytesLoaded-1]);
-  for (unsigned i = 1; i != BytesLoaded; ++i) {
-    ResultVal <<= 8;
-    ResultVal |= RawBytes[BytesLoaded-1-i];
+  APInt ResultVal = APInt(IntType->getBitWidth(), 0);
+  if (TD.isLittleEndian()) {
+    ResultVal = RawBytes[BytesLoaded - 1];
+    for (unsigned i = 1; i != BytesLoaded; ++i) {
+      ResultVal <<= 8;
+      ResultVal |= RawBytes[BytesLoaded-1-i];
+    }
+  } else {
+    ResultVal = RawBytes[0];
+    for (unsigned i = 1; i != BytesLoaded; ++i) {
+      ResultVal <<= 8;
+      ResultVal |= RawBytes[i];
+    }
   }
 
   return ConstantInt::get(IntType->getContext(), ResultVal);
@@ -521,10 +533,8 @@ Constant *llvm::ConstantFoldLoadFromConstPtr(Constant *C,
     }
   }
 
-  // Try hard to fold loads from bitcasted strange and non-type-safe things.  We
-  // currently don't do any of this for big endian systems.  It can be
-  // generalized in the future if someone is interested.
-  if (TD && TD->isLittleEndian())
+  // Try hard to fold loads from bitcasted strange and non-type-safe things.
+  if (TD)
     return FoldReinterpretLoadFromConstPtr(CE, *TD);
   return 0;
 }
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 65bc4af99e..4e75d892e5 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -146,6 +146,11 @@ void LiveIntervals::print(raw_ostream &OS, const Module* ) const {
       OS << PrintReg(Reg) << " = " << getInterval(Reg) << '\n';
   }
 
+  OS << "RegMasks:";
+  for (unsigned i = 0, e = RegMaskSlots.size(); i != e; ++i)
+    OS << ' ' << RegMaskSlots[i];
+  OS << '\n';
+
   printInstrs(OS);
 }
 
@@ -1257,10 +1262,15 @@ private:
     SmallVectorImpl<SlotIndex>::iterator RI =
       std::lower_bound(LIS.RegMaskSlots.begin(), LIS.RegMaskSlots.end(),
                        OldIdx);
-    assert(*RI == OldIdx && "No RegMask at OldIdx.");
-    *RI = NewIdx;
-    assert(*prior(RI) < *RI && *RI < *next(RI) &&
-           "RegSlots out of order. Did you move one call across another?");
+    assert(RI != LIS.RegMaskSlots.end() && *RI == OldIdx.getRegSlot() &&
+           "No RegMask at OldIdx.");
+    *RI = NewIdx.getRegSlot();
+    assert((RI == LIS.RegMaskSlots.begin() ||
+            SlotIndex::isEarlierInstr(*llvm::prior(RI), *RI)) &&
+            "Cannot move regmask instruction above another call");
+    assert((llvm::next(RI) == LIS.RegMaskSlots.end() ||
+            SlotIndex::isEarlierInstr(*RI, *llvm::next(RI))) &&
+            "Cannot move regmask instruction below another call");
   }
 
   // Return the last use of reg between NewIdx and OldIdx.
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index de16932c06..a4817d09c0 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -1583,6 +1583,7 @@ const char *ConvergingScheduler::getReasonStr(
   case NextDefUse:     return "DEF-USE   ";
   case NodeOrder:      return "ORDER     ";
   };
+  llvm_unreachable("Unknown reason!");
 }
 
 void ConvergingScheduler::traceCandidate(const SchedCandidate &Cand,
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index ffa79761f2..e0336342d6 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -60,7 +60,8 @@ void MCExpr::print(raw_ostream &OS) const {
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTOFF ||
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_TPOFF ||
         SRE.getKind() == MCSymbolRefExpr::VK_ARM_GOTTPOFF ||
-        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1)
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET1 ||
+        SRE.getKind() == MCSymbolRefExpr::VK_ARM_TARGET2)
       OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind());
     else if (SRE.getKind() != MCSymbolRefExpr::VK_None &&
              SRE.getKind() != MCSymbolRefExpr::VK_PPC_DARWIN_HA16 &&
@@ -199,6 +200,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_ARM_GOTTPOFF: return "(gottpoff)";
   case VK_ARM_TLSGD: return "(tlsgd)";
   case VK_ARM_TARGET1: return "(target1)";
+  case VK_ARM_TARGET2: return "(target2)";
   case VK_PPC_TOC: return "tocbase";
   case VK_PPC_TOC_ENTRY: return "toc";
   case VK_PPC_DARWIN_HA16: return "ha16";
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 43c68f4d1d..7e8b4a3d0d 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -3553,11 +3553,6 @@ void APFloat::toString(SmallVectorImpl<char> &Str,
 }
 
 bool APFloat::getExactInverse(APFloat *inv) const {
-  // We can only guarantee the existence of an exact inverse for IEEE floats.
-  if (semantics != &IEEEhalf && semantics != &IEEEsingle &&
-      semantics != &IEEEdouble && semantics != &IEEEquad)
-    return false;
-
   // Special floats and denormals have no exact inverse.
   if (category != fcNormal)
     return false;
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index 8d1a301a67..f67decc550 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -853,13 +853,28 @@ void ARMAsmPrinter::emitAttributes() {
     AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
                                ARMBuildAttrs::Allowed);
   } else if (CPUString == "generic") {
-    // FIXME: Why these defaults?
-    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
+    // For a generic CPU, we assume a standard v7a architecture in Subtarget.
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch_profile,
+                               ARMBuildAttrs::ApplicationProfile);
     AttrEmitter->EmitAttribute(ARMBuildAttrs::ARM_ISA_use,
                                ARMBuildAttrs::Allowed);
     AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
-                               ARMBuildAttrs::Allowed);
-  }
+                               ARMBuildAttrs::AllowThumb32);
+  } else if (Subtarget->hasV7Ops()) {
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v7);
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::THUMB_ISA_use,
+                               ARMBuildAttrs::AllowThumb32);
+  } else if (Subtarget->hasV6T2Ops())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6T2);
+  else if (Subtarget->hasV6Ops())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v6);
+  else if (Subtarget->hasV5TEOps())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5TE);
+  else if (Subtarget->hasV5TOps())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v5T);
+  else if (Subtarget->hasV4TOps())
+    AttrEmitter->EmitAttribute(ARMBuildAttrs::CPU_arch, ARMBuildAttrs::v4T);
 
   if (Subtarget->hasNEON() && emitFPU) {
     /* NEON is not exactly a VFP architecture, but GAS emit one of
@@ -1515,31 +1530,6 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     }
     return;
   }
-  case ARM::t2BMOVPCB_CALL: {
-    {
-      MCInst TmpInst;
-      TmpInst.setOpcode(ARM::tMOVr);
-      TmpInst.addOperand(MCOperand::CreateReg(ARM::LR));
-      TmpInst.addOperand(MCOperand::CreateReg(ARM::PC));
-      // Add predicate operands.
-      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
-      TmpInst.addOperand(MCOperand::CreateReg(0));
-      OutStreamer.EmitInstruction(TmpInst);
-    }
-    {
-      MCInst TmpInst;
-      TmpInst.setOpcode(ARM::t2B);
-      const GlobalValue *GV = MI->getOperand(0).getGlobal();
-      MCSymbol *GVSym = Mang->getSymbol(GV);
-      const MCExpr *GVSymExpr = MCSymbolRefExpr::Create(GVSym, OutContext);
-      TmpInst.addOperand(MCOperand::CreateExpr(GVSymExpr));
-      // Add predicate operands.
-      TmpInst.addOperand(MCOperand::CreateImm(ARMCC::AL));
-      TmpInst.addOperand(MCOperand::CreateReg(0));
-      OutStreamer.EmitInstruction(TmpInst);
-    }
-    return;
-  }
   case ARM::MOVi16_ga_pcrel:
   case ARM::t2MOVi16_ga_pcrel: {
     MCInst TmpInst;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index dec498a4f7..0893826427 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -1639,18 +1639,12 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (Subtarget->isThumb()) {
     if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
-    else if (doesNotRet && isDirect && !isARMFunc &&
-             Subtarget->hasRAS() && !Subtarget->isThumb1Only() &&
-             // Emit regular call when code size is the priority
-             !HasMinSizeAttr)
-      // "mov lr, pc; b _foo" to avoid confusing the RSP
-      CallOpc = ARMISD::CALL_NOLINK;
     else
       CallOpc = isARMFunc ? ARMISD::CALL : ARMISD::tCALL;
   } else {
-    if (!isDirect && !Subtarget->hasV5TOps()) {
+    if (!isDirect && !Subtarget->hasV5TOps())
       CallOpc = ARMISD::CALL_NOLINK;
-    } else if (doesNotRet && isDirect && Subtarget->hasRAS() &&
+    else if (doesNotRet && isDirect